Example #1
 def __init__(self, config, bulk_data_transport):
     PyThread.__init__(self)
     self.StripedServerURL = config["ServerURL"]
     self.RegistryAddress = (config["Registry"]["host"], config["Registry"]["port"])
     self.NWorkers = config["NWorkers"]
     self.Tag = config.get("Tag", "default")
     self.CacheLimit = config.get("CacheLimit", 1.0) * 1.0e9
     self.NJobsRunning = config.get("RunningJobs", 2)
     self.QueueCapacity = config.get("JobQueueCapacity", 10)
     self.ModuleStorage = config.get("ModuleStorage", "/tmp/modules")
     if self.ModuleStorage not in sys.path:
         sys.path.insert(0, self.ModuleStorage)
     self.Sock = None
     self.Workers = []
     self.Stop = False
     self.Accumulators = TaskQueue(self.NJobsRunning, capacity=self.QueueCapacity)
     self.BulkDataTransport = bulk_data_transport
     
     self.WorkerLogFileTemplate = None
     self.LogFile = None
     self.LogDir = config.get("LogDir")
     if self.LogDir is not None:
         if not os.path.isdir(self.LogDir):
             os.makedirs(self.LogDir, 0o755)
         self.WorkerLogFileTemplate = "%s/worker.%%(wid)d.log" % (self.LogDir,)
         self.LogFile = LogFile("%s/worker_master.log" % (self.LogDir,), keep=3)
Example #2
class Controller(object):
    def __init__(self, batch, max_workers, config, schema, bucket_name,
                 dataset_name, data_reader_class, existing_frames, dry_run):
        self.Batch = batch
        self.TaskQueue = TaskQueue(max_workers)
        self.ExistingFrames = existing_frames
        self.CBConfig = config
        self.Schema = schema
        self.BucketName = bucket_name
        self.DatasetName = dataset_name
        self.DataReaderClass = data_reader_class
        self.DryRun = dry_run

    def run(self):
        for i, segment in enumerate(self.Batch):
            frames = set(segment.frameIDs()) - self.ExistingFrames
            if frames:
                task = LoaderTask(
                    self,
                    i,
                    segment,
                    self.CBConfig,
                    self.Schema,
                    None,  # FIX ME: metadata is None for now
                    frames,
                    self.BucketName,
                    self.DatasetName,
                    self.DataReaderClass,
                    self.DryRun)
                self.TaskQueue.add(task)
        self.TaskQueue.waitUntilDone()
Example #3
 def __init__(self, batch, max_workers, config, schema, bucket_name,
              dataset_name, data_reader_class, existing_frames, dry_run):
     self.Batch = batch
     self.TaskQueue = TaskQueue(max_workers)
     self.ExistingFrames = existing_frames
     self.CBConfig = config
     self.Schema = schema
     self.BucketName = bucket_name
     self.DatasetName = dataset_name
     self.DataReaderClass = data_reader_class
     self.DryRun = dry_run
Example #4
 def __init__(self, host, port, worker_registry, authenticator,
              data_server_url, bulk_transport_port, queue_capacity,
              max_jobs_running, source_archive, log_file_dir):
     PyThread.__init__(self)
     self.DataServerURL = data_server_url
     self.WorkerRegistry = worker_registry
     self.Sock = None
     self.Port = port
     self.Host = host
     self.Stop = False
     self.Authenticator = authenticator
     self.MaxJobs = max_jobs_running
     self.QueueCapacity = queue_capacity
     self.JobQueue = TaskQueue(max_jobs_running, capacity=queue_capacity)
     self.JIDPrefix = "%03d" % (os.getpid() % 1000, )
     self.NextJID = 1
     self.DataClient = StripedClient(data_server_url)
     self.SourceArchive = source_archive
     self.LogFileDir = log_file_dir
     self.JobHistory = []
     self.BulkTransportPort = bulk_transport_port
Example #5
 def __init__(self, wid, nworkers, striped_server_url, logfile_template, cache_limit, module_storage):
     multiprocessing.Process.__init__(self)
     self.ID = wid
     self.NWorkers = nworkers
     self.Client = StripedClient(striped_server_url, cache="long", cache_limit=cache_limit, log=self.log)
     self.ModuleStorage = module_storage
     self.Stop = False
     self.LogFile = None
     self.Sock = socket(AF_INET, SOCK_STREAM)
     self.Sock.bind(("127.0.0.1", 0))
     self.Port = self.Sock.getsockname()[1]
     self.Address = self.Sock.getsockname()
     self.Tasks = TaskQueue(2, capacity=10)
     if logfile_template is not None:
         self.LogFile = LogFile(logfile_template % {"wid":self.ID}, keep=3)
     self.log("created at port %d" % (self.Port,))
Example #6
class WorkerMaster(PyThread):

    def __init__(self, config, bulk_data_transport):
        PyThread.__init__(self)
        self.StripedServerURL = config["ServerURL"]
        self.RegistryAddress = (config["Registry"]["host"], config["Registry"]["port"])
        self.NWorkers = config["NWorkers"]
        self.Tag = config.get("Tag", "default")
        self.CacheLimit = config.get("CacheLimit", 1.0) * 1.0e9
        self.NJobsRunning = config.get("RunningJobs", 2)
        self.QueueCapacity = config.get("JobQueueCapacity", 10)
        self.ModuleStorage = config.get("ModuleStorage", "/tmp/modules")
        if self.ModuleStorage not in sys.path:
            sys.path.insert(0, self.ModuleStorage)
        self.Sock = None
        self.Workers = []
        self.Stop = False
        self.Accumulators = TaskQueue(self.NJobsRunning, capacity=self.QueueCapacity)
        self.BulkDataTransport = bulk_data_transport
        
        self.WorkerLogFileTemplate = None
        self.LogFile = None
        self.LogDir = config.get("LogDir")
        if self.LogDir is not None:
            if not os.path.isdir(self.LogDir):
                os.makedirs(self.LogDir, 0o755)
            self.WorkerLogFileTemplate = "%s/worker.%%(wid)d.log" % (self.LogDir,)
            self.LogFile = LogFile("%s/worker_master.log" % (self.LogDir,), keep=3)

        
    def log(self, msg):
        print("Worker master: %s" % (msg,))
        if self.LogFile is not None:
            self.LogFile.log("Worker master: %s" % (msg,))
        
        
    def run(self):
        signal.signal(signal.SIGINT, self.sigint)

        self.Sock = socket(AF_INET, SOCK_STREAM)
        self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
        self.Sock.bind(("", 0))
        self.Sock.listen(10)

        port = self.Sock.getsockname()[1]
        pinger = WorkerRegistryPinger(self.RegistryAddress, port, self.Tag)
        pinger.start()

        #
        # Start workers
        #
        
        self.Workers = [Worker(i, self.NWorkers, self.StripedServerURL, self.WorkerLogFileTemplate, self.CacheLimit, self.ModuleStorage) 
                for i in range(self.NWorkers)]
        for w in self.Workers:
            w.start()
            self.log("startied worker %d with pid %d" % (w.ID, w.pid))
        nrunning = self.NWorkers
        
        while not self.Stop:
            sock, addr = self.Sock.accept()
            dxsock = DataExchangeSocket(sock)
            close_sock = True
            #print "Client connected: %s" % (addr,)
            self.log("Client connected: %s" % (addr,))
            
            # read job description JSON
            #print "reading params..."
            
            try:
                msg = dxsock.recv()
            except:
                self.log("Cannot read initial message. Closing the connection. Error:\n%s" %
                            (traceback.format_exc(),))
                msg = None
            if msg and msg.Type == 'request':
                try:
                    request = WorkerRequest.fromDXMsg(msg)
                    self.log("Request received:\n  jid/wid: %s/%s\n  dataset: %s\n  data_url: %s\n  frames: %s\n" % (
                                request.JID, request.WID, request.DatasetName, request.DataServerURL,
                                request.RGIDs)
                    )
                    signature, t, salt, alg = msg["worker_authenticator"].split(":")
                    #print "worker_authenticator:", (signature, t, salt, alg)
                    key = pinger.Key
                    verified, reason = request.verifySignature(key, signature, t, salt, alg)
                    if not verified:
                        self.log("Signature verification failed: %s" % (reason,))
                        dxsock.send(DXMessage("exception").append(info="Authentication failed: %s" % (reason,)))
                    else:
                        self.Accumulators << AccumulatorDriver(dxsock, request, self.Workers, self.ModuleStorage, self.BulkDataTransport, self.LogFile)
                        close_sock = False
                except:
                    self.log("Error processing the request. Closing the connection\n%s" % (traceback.format_exc(),))

            if close_sock:
                dxsock.close()

                    
    def sigint(self, signum, frame):
        print("SIGINT received. Terminating...")
        self.terminate()

    @synchronized
    def terminate(self):
        self.Stop = True
        self.Accumulators.hold()
        for t in self.Accumulators.activeTasks():
            t.terminate()
        self.Accumulators.flush()
        self.Sock.close()
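
Examples #6 and #8 enqueue work with the << operator and bound the backlog with capacity=..., while other examples call addTask. Below is a minimal, self-contained sketch of that bounded-queue form. It assumes TaskQueue and Task come from the pythreader package (none of the examples show the import) and that a task is an object whose run() method the queue calls on one of its worker threads; SleepTask is a hypothetical stand-in for AccumulatorDriver / JobTask.

import time
from pythreader import Task, TaskQueue     # assumed import; the examples never show where TaskQueue comes from

class SleepTask(Task):                      # hypothetical stand-in for AccumulatorDriver / JobTask
    def __init__(self, tid, seconds):
        Task.__init__(self)
        self.TID = tid
        self.Seconds = seconds

    def run(self):                          # the queue calls run() on one of its worker threads
        time.sleep(self.Seconds)
        print("task %d done" % (self.TID,))

queue = TaskQueue(2, capacity=10)           # at most 2 tasks run at once, at most 10 more wait in the queue
for i in range(5):
    queue << SleepTask(i, 0.1)              # << enqueues a task, as in self.Accumulators << AccumulatorDriver(...)
queue.waitUntilEmpty()                      # block until everything queued has finished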
Example #7
def run_batch(argv):
    from DataReader import DataReader

    Usage = """
    python run_batch.py [options] <batch file> <bucket name> <dataset name>

    Options:
        -c <CouchBase config file>, default - value of the COUCHBASE_BACKEND_CFG environment variable
        -m <max workers>, default = 5
        -O - override existing frames
        -s <stagger>, default = 1 (seconds)
        -n - dry run
    """

    opts, args = getopt.getopt(argv, "m:s:c:On")
    opts = dict(opts)
    MaxWorkers = int(opts.get("-m", 5))
    Stagger = float(opts.get("-s", 1))
    Config = opts.get("-c", os.environ.get("COUCHBASE_BACKEND_CFG"))
    Override = "-O" in opts
    DryRun = "-n" in opts

    if not Config:
        print(
            "Couchbase config file must be specified either with -c, or using COUCHBASE_BACKEND_CFG env. variable"
        )
        print()
        print(Usage)
        sys.exit(1)

    if len(args) < 3:
        print(Usage)
        sys.exit(2)

    batch_file, bucket_name, dataset_name = args

    batch = Batch.load(batch_file)
    backend = CouchBaseBackend(bucket_name, print_errors=True, config=Config)
    schema = backend.schema(dataset_name)

    if not schema:
        print("Empty schema")
        sys.exit(1)

    existing_frames = set()

    if not Override:
        existing_frames = set(backend.RGIDs(dataset_name))

    if existing_frames:
        print("The following frames exist and will not be overriden:",
              sorted(list(existing_frames)))

    task_queue = TaskQueue(MaxWorkers, stagger=Stagger)
    printer = Printer(batch)
    for i, segment in enumerate(batch):
        #print "segment:", i, segment, segment.frameIDs()
        frames = set(segment.frameIDs()) - existing_frames
        #print "segment:", i, segment, segment.frameIDs(), frames
        if frames:
            task = LoaderTask(
                printer,
                i,
                segment,
                Config,
                schema,
                None,  # FIX ME: metadata is None for now
                frames,
                bucket_name,
                dataset_name,
                DataReader,
                DryRun)
            task_queue.addTask(task)
    task_queue.waitUntilEmpty()
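
Example #7 drives the same per-segment LoaderTask objects as Example #2, but spaces out their start times with stagger=Stagger and feeds the queue with addTask. A minimal sketch of that loop, under the same assumptions as above (pythreader-style TaskQueue and Task; the stagger is presumably the delay in seconds between consecutive task starts, per the usage text; CopyTask is a hypothetical stand-in for LoaderTask):

from pythreader import Task, TaskQueue     # assumed import

class CopyTask(Task):                       # hypothetical stand-in for LoaderTask
    def __init__(self, index, segment):
        Task.__init__(self)
        self.Index = index
        self.Segment = segment

    def run(self):
        print("processing segment %d: %r" % (self.Index, self.Segment))

queue = TaskQueue(5, stagger=1.0)           # 5 concurrent workers, ~1 second between task starts
for i, segment in enumerate(["seg0", "seg1", "seg2"]):
    queue.addTask(CopyTask(i, segment))     # same call as task_queue.addTask(task) above
queue.waitUntilEmpty()                      # returns once the queue has drained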
Example #8
class JobServer(PyThread):
    def __init__(self, host, port, worker_registry, authenticator,
                 data_server_url, bulk_transport_port, queue_capacity,
                 max_jobs_running, source_archive, log_file_dir):
        PyThread.__init__(self)
        self.DataServerURL = data_server_url
        self.WorkerRegistry = worker_registry
        self.Sock = None
        self.Port = port
        self.Host = host
        self.Stop = False
        self.Authenticator = authenticator
        self.MaxJobs = max_jobs_running
        self.QueueCapacity = queue_capacity
        self.JobQueue = TaskQueue(max_jobs_running, capacity=queue_capacity)
        self.JIDPrefix = "%03d" % (os.getpid() % 1000, )
        self.NextJID = 1
        self.DataClient = StripedClient(data_server_url)
        self.SourceArchive = source_archive
        self.LogFileDir = log_file_dir
        self.JobHistory = []
        self.BulkTransportPort = bulk_transport_port

    @synchronized
    def purgeJobHistory(self):
        now = time.time()
        self.JobHistory = list(
            filter(lambda j, tmax=now - 24 * 3600: j.Ended and j.Ended > tmax,
                   self.JobHistory))

    @synchronized
    def jid(self):
        t = "%s%04d" % (
            self.JIDPrefix,
            self.NextJID,
        )
        self.NextJID = (self.NextJID + 1) % 10000
        return t

    def log(self, msg):
        log("[server]: %s" % (msg, ))

    def run(self):
        self.Sock = socket(AF_INET, SOCK_STREAM)
        self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
        self.Sock.bind(('', self.Port))
        self.Sock.listen(5)
        data_exchange_listener = DataExchangeSocket(self.Sock)

        while not self.Stop:
            data_exchange = None
            try:
                data_exchange = data_exchange_listener.accept()
                msg = data_exchange.recv()
                #print "msg:", msg.Type
                if msg and msg.Type == 'job_request':
                    job_description = JobDescription.fromDXMsg(msg)
                    exists = self.DataClient.dataset(
                        job_description.DatasetName).exists
                    #print "exists:", exists
                    if not exists:
                        self.log("Dataset not found: %s" %
                                 (job_description.DatasetName, ))
                        data_exchange.send(
                            DXMessage("job_failed").append(
                                reason="Dataset '%s' not found" %
                                (job_description.DatasetName, )))
                    else:
                        jid = self.jid()
                        self.log(
                            "Job description received. Job id %s assigned" %
                            (jid, ))
                        job_log_file_path = None if self.LogFileDir is None else "%s/job_%s.log" % (
                            self.LogFileDir, jid)
                        jt = JobTask(self, jid, job_description,
                                     self.DataServerURL,
                                     self.BulkTransportPort, self.DataClient,
                                     data_exchange, job_log_file_path)
                        self.JobQueue << jt
                        data_exchange = None  # the job task owns it now !
                        if self.SourceArchive is not None:
                            with open("%s/ws_%s.txt" % (self.SourceArchive, jid), "w") as f:
                                f.write(job_description.WorkerText)
                self.purgeJobHistory()
            except:
                dump = traceback.format_exc()
                self.log("Uncaught exception: %s" % (dump, ))
                if data_exchange is not None:
                    data_exchange.send(
                        DXMessage("job_failed").append(reason="Exception: %s" %
                                                       (dump, )))
            finally:
                if data_exchange is not None:
                    data_exchange.close()
                    data_exchange = None

    def workers(self, tags=None):
        return self.WorkerRegistry.workers(tags=tags)

    def validate_job(self, job_description):
        validated, identity = self.Authenticator.validate(
            job_description.AuthToken, job_description.Username)
        if validated:
            job_description.Identity = identity
        return validated

    @synchronized
    def jobStarted(self, job_task):
        jid = job_task.JID
        self.log("Jobs running: " +
                 ",".join([j.JID for j in self.JobQueue.activeTasks()]))

    @synchronized
    def jobs(self):
        self.purgeJobHistory()
        queued, running = self.JobQueue.tasks()[:]
        ids = set([j.JID for j in queued + running])
        return queued, running, [
            j for j in self.JobHistory if j.JID not in ids
        ]

    @synchronized
    def jobEnded(self, job_task):
        self.JobHistory.append(job_task)
        jid = job_task.JID
        self.log("Jobs running: " + ",".join(
            [j.JID for j in self.JobQueue.activeTasks() if j.JID != jid]))

    @synchronized
    def jobFailed(self, job_task, reason):
        self.JobHistory.append(job_task)
        jid = job_task.JID
        self.log("Jobs running: " + ",".join(
            [j.JID for j in self.JobQueue.activeTasks() if j.JID != jid]))
Example #9
 def __init__(self, port):
     PyThread.__init__(self)
     self.Port = port
     self.Stop = False
     self.TaskQueue = TaskQueue(10, capacity=10)
     self.Data = {}
Example #10
def create_batch(argv):
    from DataReader import DataReader

    Usage = """
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> @<file with input file list>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <directory path>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <file> <file> ...

    Options:
        -O (reuse|REUSE|allocate) - override an existing batch description file and either
                              reuse the same frame id range or
                              allocate a new range
        -c <couchbase config file>
        -n <target frame size>, default = 10000
        -p <path prefix> - prefix to add to the file paths read from the file or given as the list
        -k <n> - how many of the lowest path components, in addition to the file name,
                 to keep in the provenance data, default=0, i.e. keep the file name only
        -x <extension> - if the input is specified as a directory, then this is the extension of data files
                         under the directory. Default = "root"
        -m <n readers> - run multiple data reader threads in parallel, default = 1
        -q - be quiet
    """

    opts, args = getopt.getopt(argv, "n:p:k:x:O:qc:m:")
    opts = dict(opts)
    Config = opts.get("-c")
    FrameSize = int(opts.get("-n", 10000))
    Prefix = opts.get("-p")
    Keep = int(opts.get("-k", 0))
    Extension = opts.get("-x", "root")
    Override = "-O" in opts
    OverrideMode = opts.get("-O")
    MaxReaders = int(opts.get("-m", 1))

    Quiet = "-q" in opts

    if len(args) < 4 or OverrideMode not in (None, "reuse", "allocate",
                                             "REUSE"):
        print(Usage)
        sys.exit(1)

    BatchFile, BucketName, DatasetName = args[:3]

    exists = False
    try:
        exists = os.path.isfile(BatchFile)
    except:
        pass

    old_batch = None

    if exists:

        if not Override:
            print()
            print(
                "ERROR: File %s exists. Use -O (reuse|allocate) to override." %
                (BatchFile, ))
            print("Old file left unchanged.")
            print()
            print(Usage)
            sys.exit(1)

        old_batch = Batch.load(BatchFile)
        #print len(old_batch)

    source = args[3]
    if source[0] == '@':
        paths = [
            f
            for f in [ff.strip() for ff in open(source[1:], "r").readlines()]
            if f
        ]
    elif stat.S_ISDIR(os.stat(source).st_mode):
        assert not Prefix, "\nERROR: Cannot use a path prefix when the input is specified as a directory\n"
        if Extension[0] == '.':
            Extension = Extension[1:]
        paths = sorted(glob.glob(
            "%s/*.%s" %
            (source, Extension)))  # local directory - assume root files
    else:
        paths = args[3:]  # explicit file path list

    if Prefix: paths = [Prefix + f for f in paths]

    provenance_names = []
    for fp in paths:
        parts = fp.split("/")
        provenance_names.append("/".join(parts[-1 - Keep:]))

    if not Quiet:
        print("Building frame map from %d files..." % (len(paths, )))

    backend = CouchBaseBackend(BucketName, print_errors=True, config=Config)
    schema = backend.schema(DatasetName)

    if not schema:
        print("Dataset %s not found" % (DatasetName, ))
        sys.exit(1)

    class FileCounter(object):
        def __init__(self, ntotal, show_tqdm):
            self.T = None if not (use_tqdm and show_tqdm) else tqdm(
                total=ntotal)

        def tick(self, n=1):
            if self.T is not None:
                self.T.update(n)

    file_counter = FileCounter(len(paths), not Quiet)
    file_infos = [
        FileInfo(path, None, prov)
        for path, prov in zip(paths, provenance_names)
    ]
    queue = TaskQueue(MaxReaders,
                      tasks=[
                          GetNEventsTask(fi, schema, file_counter.tick)
                          for fi in file_infos
                      ])
    queue.waitUntilEmpty()

    batch = Batch().build(DataReader, schema, FrameSize, file_infos)

    NFrames = len(batch)

    if not Quiet:
        print("Frame map with %d frames generated" % (NFrames, ))
    start_frame_id = None
    if old_batch is not None:
        nold = len(old_batch)
        if OverrideMode.lower() == "reuse":
            if nold < NFrames and OverrideMode != "REUSE":
                print()
                print(
                    "ERROR: Can not reuse old frame id range because old range (%d) is shorter than needed (%d)"
                    % (nold, NFrames))
                print("       Use -O REUSE (capitals) to override")
                print()
                sys.exit(1)
            if nold > NFrames:
                print()
                print(
                    "WARNING: old frame id range (%d) is larger than new one (%d)"
                    % (nold, NFrames))
                print()
            start_frame_id = old_batch.StartFrameID
            if not Quiet:
                print("Frame ID range starting at %d will be reused" %
                      (start_frame_id, ))

    if start_frame_id is None:
        start_frame_id = backend.allocateRGIDs(DatasetName, NFrames)
        if not Quiet:
            print("Frame ID range is allocated starting at %d" %
                  (start_frame_id, ))

    batch.setStartFrameID(start_frame_id)

    batch.save(BatchFile)

    if not Quiet: print("Batch saved to file: %s" % (BatchFile, ))