class Controller(object):

    def __init__(self, batch, max_workers, config, schema, bucket_name, dataset_name,
                 data_reader_class, existing_frames, dry_run):
        self.Batch = batch
        self.TaskQueue = TaskQueue(max_workers)
        self.ExistingFrames = existing_frames
        self.CBConfig = config
        self.Schema = schema
        self.BucketName = bucket_name
        self.DatasetName = dataset_name
        self.DataReaderClass = data_reader_class
        self.DryRun = dry_run

    def run(self):
        for i, segment in enumerate(self.Batch):
            # load only the frames that are not already in the backend
            frames = set(segment.frameIDs()) - self.ExistingFrames
            if frames:
                task = LoaderTask(self, i, segment, self.CBConfig, self.Schema,
                                  None,     # FIX ME: metadata is None for now
                                  frames, self.BucketName, self.DatasetName,
                                  self.DataReaderClass, self.DryRun)
                self.TaskQueue.addTask(task)
        self.TaskQueue.waitUntilEmpty()
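# A minimal driver sketch for Controller. The file names, dataset name, and the
# config value are illustrative assumptions; the object names mirror the
# run_batch() driver later in this file and assume the same imports:
#
#   config = "couchbase.cfg"                            # hypothetical config file path
#   batch = Batch.load("my_batch.json")                 # hypothetical batch file
#   backend = CouchBaseBackend("my_bucket", config=config)
#   controller = Controller(batch,
#                           max_workers=5,
#                           config=config,
#                           schema=backend.schema("my_dataset"),
#                           bucket_name="my_bucket",
#                           dataset_name="my_dataset",
#                           data_reader_class=DataReader,
#                           existing_frames=set(backend.RGIDs("my_dataset")),
#                           dry_run=False)
#   controller.run()   # queues one LoaderTask per segment, blocks until all finish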
def __init__(self, wid, nworkers, striped_server_url, logfile_template, cache_limit, module_storage):
    multiprocessing.Process.__init__(self)
    self.ID = wid
    self.NWorkers = nworkers
    self.Client = StripedClient(striped_server_url, cache="long", cache_limit=cache_limit, log=self.log)
    self.ModuleStorage = module_storage
    self.Stop = False
    self.LogFile = None
    # bind to an ephemeral port on the loopback interface; the master
    # connects to this address to dispatch work
    self.Sock = socket(AF_INET, SOCK_STREAM)
    self.Sock.bind(("127.0.0.1", 0))
    self.Port = self.Sock.getsockname()[1]
    self.Address = self.Sock.getsockname()
    self.Tasks = TaskQueue(2, capacity=10)
    if logfile_template is not None:
        self.LogFile = LogFile(logfile_template % {"wid": self.ID}, keep=3)
    self.log("created at port %d" % (self.Port,))
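# The constructor passes self.log to StripedClient and calls it at the end, but
# the method itself is outside this excerpt. A minimal sketch, modeled on
# WorkerMaster.log below; the real implementation may differ:

def log(self, msg):
    # prefix messages with the worker id so interleaved output is attributable
    print("Worker %d: %s" % (self.ID, msg))
    if self.LogFile is not None:
        self.LogFile.log("Worker %d: %s" % (self.ID, msg))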
class WorkerMaster(PyThread):

    def __init__(self, config, bulk_data_transport):
        PyThread.__init__(self)
        self.StripedServerURL = config["ServerURL"]
        self.RegistryAddress = (config["Registry"]["host"], config["Registry"]["port"])
        self.NWorkers = config["NWorkers"]
        self.Tag = config.get("Tag", "default")
        self.CacheLimit = config.get("CacheLimit", 1.0) * 1.0e9    # configured in GB, stored in bytes
        self.NJobsRunning = config.get("RunningJobs", 2)
        self.QueueCapacity = config.get("JobQueueCapacity", 10)
        self.ModuleStorage = config.get("ModuleStorage", "/tmp/modules")
        if not self.ModuleStorage in sys.path:
            sys.path.insert(0, self.ModuleStorage)
        self.Sock = None
        self.Workers = []
        self.Stop = False
        self.Accumulators = TaskQueue(self.NJobsRunning, capacity=self.QueueCapacity)
        self.BulkDataTransport = bulk_data_transport
        self.WorkerLogFileTemplate = None
        self.LogFile = None
        self.LogDir = config.get("LogDir")
        if self.LogDir is not None:
            if not os.path.isdir(self.LogDir):
                os.makedirs(self.LogDir, 0o755)
            self.WorkerLogFileTemplate = "%s/worker.%%(wid)d.log" % (self.LogDir,)
            self.LogFile = LogFile("%s/worker_master.log" % (self.LogDir,), keep=3)

    def log(self, msg):
        print("Worker master: %s" % (msg,))
        if self.LogFile is not None:
            self.LogFile.log("Worker master: %s" % (msg,))

    def run(self):
        signal.signal(signal.SIGINT, self.sigint)
        self.Sock = socket(AF_INET, SOCK_STREAM)
        self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
        self.Sock.bind(("", 0))
        self.Sock.listen(10)
        port = self.Sock.getsockname()[1]
        pinger = WorkerRegistryPinger(self.RegistryAddress, port, self.Tag)
        pinger.start()
        #
        # Start workers
        #
        self.Workers = [Worker(i, self.NWorkers, self.StripedServerURL,
                               self.WorkerLogFileTemplate, self.CacheLimit, self.ModuleStorage)
                        for i in range(self.NWorkers)]
        for w in self.Workers:
            w.start()
            self.log("started worker %d with pid %d" % (w.ID, w.pid))
        nrunning = self.NWorkers
        while not self.Stop:
            sock, addr = self.Sock.accept()
            dxsock = DataExchangeSocket(sock)
            close_sock = True
            self.log("Client connected: %s" % (addr,))
            # read the job request message
            try:
                msg = dxsock.recv()
            except:
                self.log("Can not read initial message. Closing the connection. Error:\n%s"
                         % (traceback.format_exc(),))
                msg = None
            if msg and msg.Type == 'request':
                try:
                    request = WorkerRequest.fromDXMsg(msg)
                    self.log("Request received:\n  jid/wid: %s/%s\n  dataset: %s\n  data_url: %s\n  frames: %s\n" % (
                             request.JID, request.WID, request.DatasetName, request.DataServerURL, request.RGIDs))
                    # verify the request signature using the key received from the registry
                    signature, t, salt, alg = msg["worker_authenticator"].split(":")
                    key = pinger.Key
                    verified, reason = request.verifySignature(key, signature, t, salt, alg)
                    if not verified:
                        self.log("Signature verification failed: %s" % (reason,))
                        dxsock.send(DXMessage("exception").append(info="Authentication failed: %s" % (reason,)))
                    else:
                        self.Accumulators << AccumulatorDriver(dxsock, request, self.Workers,
                                                               self.ModuleStorage, self.BulkDataTransport,
                                                               self.LogFile)
                        close_sock = False      # the accumulator driver owns the socket now
                except:
                    self.log("Error processing the request. Closing the connection\n%s"
                             % (traceback.format_exc(),))
            if close_sock:
                dxsock.close()

    def sigint(self, signum, frame):
        print("SIGINT received. Terminating...")
        self.terminate()

    @synchronized
    def terminate(self):
        self.Stop = True
        self.Accumulators.hold()
        for t in self.Accumulators.activeTasks():
            t.terminate()
        self.Accumulators.flush()
        self.Sock.close()
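# A minimal configuration sketch for WorkerMaster, covering only the keys the
# constructor above actually reads. The URL, host names, and paths are
# illustrative assumptions:
#
#   worker_master_config = {
#       "ServerURL":        "http://striped-server:8000",             # required
#       "Registry":         {"host": "registry-host", "port": 7555},  # required
#       "NWorkers":         4,                   # required: worker processes to fork
#       "Tag":              "default",           # optional
#       "CacheLimit":       1.0,                 # optional, GB; converted to bytes
#       "RunningJobs":      2,                   # optional: concurrent accumulator drivers
#       "JobQueueCapacity": 10,                  # optional: TaskQueue capacity
#       "ModuleStorage":    "/tmp/modules",      # optional: prepended to sys.path
#       "LogDir":           "/var/log/striped",  # optional: enables log files
#   }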
def run_batch(argv):
    from DataReader import DataReader
    Usage = """
    python run_batch.py [options] <batch file> <bucket name> <dataset name>

    Options:
        -c <CouchBase config file>, default - value of the COUCHBASE_BACKEND_CFG environment variable
        -m <max workers>, default = 5
        -O - override existing frames
        -s <stagger>, default = 1 (seconds)
        -n - dry run
    """
    opts, args = getopt.getopt(argv, "m:s:c:On")
    opts = dict(opts)
    MaxWorkers = int(opts.get("-m", 5))
    Stagger = float(opts.get("-s", 1))
    Config = opts.get("-c", os.environ.get("COUCHBASE_BACKEND_CFG"))
    Override = "-O" in opts
    DryRun = "-n" in opts

    if not Config:
        print("Couchbase config file must be specified either with -c or with the COUCHBASE_BACKEND_CFG environment variable")
        print()
        print(Usage)
        sys.exit(1)

    if len(args) < 3:
        print(Usage)
        sys.exit(2)

    batch_file, bucket_name, dataset_name = args
    batch = Batch.load(batch_file)
    backend = CouchBaseBackend(bucket_name, print_errors=True, config=Config)
    schema = backend.schema(dataset_name)
    if not schema:
        print("Empty schema")
        sys.exit(1)

    existing_frames = set()
    if not Override:
        existing_frames = set(backend.RGIDs(dataset_name))
        if existing_frames:
            print("The following frames exist and will not be overridden:",
                  sorted(list(existing_frames)))

    task_queue = TaskQueue(MaxWorkers, stagger=Stagger)
    printer = Printer(batch)
    for i, segment in enumerate(batch):
        # load only the frames that do not exist yet
        frames = set(segment.frameIDs()) - existing_frames
        if frames:
            task = LoaderTask(printer, i, segment, Config, schema,
                              None,     # FIX ME: metadata is None for now
                              frames, bucket_name, dataset_name, DataReader, DryRun)
            task_queue.addTask(task)
    task_queue.waitUntilEmpty()
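# Example invocation (file, bucket, and dataset names are illustrative):
#
#   python run_batch.py -m 8 -s 2 -c couchbase.cfg my_batch.json my_bucket my_dataset
#
# loads the frames described in my_batch.json into dataset "my_dataset" in bucket
# "my_bucket" with 8 parallel loader tasks, staggering task starts by 2 seconds.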
class JobServer(PyThread):

    def __init__(self, host, port, worker_registry, authenticator, data_server_url,
                 bulk_transport_port, queue_capacity, max_jobs_running, source_archive, log_file_dir):
        PyThread.__init__(self)
        self.DataServerURL = data_server_url
        self.WorkerRegistry = worker_registry
        self.Sock = None
        self.Port = port
        self.Host = host
        self.Stop = False
        self.Authenticator = authenticator
        self.MaxJobs = max_jobs_running
        self.QueueCapacity = queue_capacity
        self.JobQueue = TaskQueue(max_jobs_running, capacity=queue_capacity)
        self.JIDPrefix = "%03d" % (os.getpid() % 1000,)
        self.NextJID = 1
        self.DataClient = StripedClient(data_server_url)
        self.SourceArchive = source_archive
        self.LogFileDir = log_file_dir
        self.JobHistory = []
        self.BulkTransportPort = bulk_transport_port

    @synchronized
    def purgeJobHistory(self):
        # keep only jobs that ended within the last 24 hours
        now = time.time()
        self.JobHistory = [j for j in self.JobHistory if j.Ended and j.Ended > now - 24 * 3600]

    @synchronized
    def jid(self):
        t = "%s%04d" % (self.JIDPrefix, self.NextJID)
        self.NextJID = (self.NextJID + 1) % 10000
        return t

    def log(self, msg):
        log("[server]: %s" % (msg,))

    def run(self):
        self.Sock = socket(AF_INET, SOCK_STREAM)
        self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
        self.Sock.bind(('', self.Port))
        self.Sock.listen(5)
        data_exchange_listener = DataExchangeSocket(self.Sock)

        while not self.Stop:
            data_exchange = None
            try:
                data_exchange = data_exchange_listener.accept()
                msg = data_exchange.recv()
                if msg and msg.Type == 'job_request':
                    job_description = JobDescription.fromDXMsg(msg)
                    exists = self.DataClient.dataset(job_description.DatasetName).exists
                    if not exists:
                        self.log("Dataset not found: %s" % (job_description.DatasetName,))
                        data_exchange.send(DXMessage("job_failed").append(
                                reason="Dataset '%s' not found" % (job_description.DatasetName,)))
                    else:
                        jid = self.jid()
                        self.log("Job description received. Job id %s assigned" % (jid,))
                        job_log_file_path = None if self.LogFileDir is None \
                                else "%s/job_%s.log" % (self.LogFileDir, jid)
                        jt = JobTask(self, jid, job_description, self.DataServerURL,
                                     self.BulkTransportPort, self.DataClient, data_exchange,
                                     job_log_file_path)
                        self.JobQueue << jt
                        data_exchange = None        # the job task owns it now!
                        if self.SourceArchive is not None:
                            # archive the worker source text for this job
                            with open("%s/ws_%s.txt" % (self.SourceArchive, jid), "w") as f:
                                f.write(job_description.WorkerText)
                        self.purgeJobHistory()
            except:
                dump = traceback.format_exc()
                self.log("Uncaught exception: %s" % (dump,))
                if data_exchange is not None:
                    data_exchange.send(DXMessage("job_failed").append(reason="Exception: %s" % (dump,)))
            finally:
                if data_exchange is not None:
                    data_exchange.close()
                    data_exchange = None

    def workers(self, tags=None):
        return self.WorkerRegistry.workers(tags=tags)

    def validate_job(self, job_description):
        validated, identity = self.Authenticator.validate(job_description.AuthToken,
                                                          job_description.Username)
        if validated:
            job_description.Identity = identity
        return validated

    @synchronized
    def jobStarted(self, job_task):
        jid = job_task.JID
        self.log("Jobs running: " + ",".join([j.JID for j in self.JobQueue.activeTasks()]))

    @synchronized
    def jobs(self):
        self.purgeJobHistory()
        queued, running = self.JobQueue.tasks()
        ids = set(j.JID for j in queued + running)
        return queued, running, [j for j in self.JobHistory if not j.JID in ids]

    @synchronized
    def jobEnded(self, job_task):
        self.JobHistory.append(job_task)
        jid = job_task.JID
        self.log("Jobs running: " + ",".join(
                [j.JID for j in self.JobQueue.activeTasks() if j.JID != jid]))

    @synchronized
    def jobFailed(self, job_task, reason):
        self.JobHistory.append(job_task)
        jid = job_task.JID
        self.log("Jobs running: " + ",".join(
                [j.JID for j in self.JobQueue.activeTasks() if j.JID != jid]))
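# A minimal instantiation sketch for JobServer. Every value below is an
# assumption about the surrounding deployment, shown only to document the
# constructor's signature; worker_registry and authenticator stand in for
# real WorkerRegistry and authenticator objects:
#
#   server = JobServer(
#       host="0.0.0.0", port=8765,                     # listen address
#       worker_registry=worker_registry,
#       authenticator=authenticator,                   # validates (AuthToken, Username)
#       data_server_url="http://striped-server:8000",  # passed to StripedClient
#       bulk_transport_port=8766,
#       queue_capacity=10, max_jobs_running=2,         # TaskQueue sizing
#       source_archive=None,                           # or a directory to archive worker source
#       log_file_dir=None,                             # or a directory for per-job logs
#   )
#   server.start()    # PyThread: runs run() in its own thread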
def __init__(self, port):
    PyThread.__init__(self)
    self.Port = port
    self.Stop = False
    self.TaskQueue = TaskQueue(10, capacity=10)
    self.Data = {}
def create_batch(argv):
    from DataReader import DataReader
    Usage = """
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> @<file with input file list>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <directory path>
    python create_batch.py [options] <batch_file> <bucket name> <dataset name> <file> <file> ...

    Options:
        -O (reuse|REUSE|allocate) - override existing batch description file and either
               reuse the same frame id range or allocate a new range
        -c <couchbase config file>
        -n <target frame size>, default = 10000
        -p <path prefix> - prefix to add to the file paths read from the file or given as the list
        -k <n> - how many lowest path components, in addition to the file name, to keep
               in the provenance data, default = 0, i.e. keep the file name only
        -x <extension> - if the input is specified as a directory, then this is the extension
               of data files under the directory. Default = "root"
        -m <n readers> - run multiple data reader threads in parallel, default = 1
        -q - be quiet
    """
    opts, args = getopt.getopt(argv, "n:p:k:x:O:qc:m:")
    opts = dict(opts)
    Config = opts.get("-c")
    FrameSize = int(opts.get("-n", 10000))
    Prefix = opts.get("-p")
    Keep = int(opts.get("-k", 0))
    Extension = opts.get("-x", "root")
    Override = "-O" in opts
    OverrideMode = opts.get("-O")
    MaxReaders = int(opts.get("-m", 1))
    Quiet = "-q" in opts

    if len(args) < 4 or not OverrideMode in (None, "reuse", "allocate", "REUSE"):
        print(Usage)
        sys.exit(1)

    BatchFile, BucketName, DatasetName = args[:3]

    exists = False
    try:
        exists = os.path.isfile(BatchFile)
    except:
        pass

    old_batch = None
    if exists:
        if not Override:
            print()
            print("ERROR: File %s exists. Use -O (reuse|allocate) to override." % (BatchFile,))
            print("Old file left unchanged.")
            print()
            print(Usage)
            sys.exit(1)
        old_batch = Batch.load(BatchFile)

    source = args[3]
    if source[0] == '@':
        # input is a file with the list of paths, one per line
        paths = [f for f in [ff.strip() for ff in open(source[1:], "r").readlines()] if f]
    elif stat.S_ISDIR(os.stat(source).st_mode):
        assert not Prefix, "\nERROR: Can not use path prefix with the input specified as the directory\n"
        if Extension[0] == '.':
            Extension = Extension[1:]
        paths = sorted(glob.glob("%s/*.%s" % (source, Extension)))  # local directory - assume root files
    else:
        paths = args[3:]    # explicit file path list

    if Prefix:
        paths = [Prefix + f for f in paths]

    # keep the file name plus the lowest Keep path components as provenance
    provenance_names = []
    for fp in paths:
        parts = fp.split("/")
        provenance_names.append("/".join(parts[-1 - Keep:]))

    if not Quiet:
        print("Building frame map from %d files..." % (len(paths),))

    backend = CouchBaseBackend(BucketName, print_errors=True, config=Config)
    schema = backend.schema(DatasetName)
    if not schema:
        print("Dataset %s not found" % (DatasetName,))
        sys.exit(1)

    class FileCounter(object):
        # progress display; use_tqdm is a module-level flag set where tqdm is imported
        def __init__(self, ntotal, show_tqdm):
            self.T = None if not (use_tqdm and show_tqdm) else tqdm(total=ntotal)

        def tick(self, n=1):
            if self.T is not None:
                self.T.update(n)

    file_counter = FileCounter(len(paths), not Quiet)
    file_infos = [FileInfo(path, None, prov) for path, prov in zip(paths, provenance_names)]
    queue = TaskQueue(MaxReaders,
                      tasks=[GetNEventsTask(fi, schema, file_counter.tick) for fi in file_infos])
    queue.waitUntilEmpty()

    batch = Batch().build(DataReader, schema, FrameSize, file_infos)
    NFrames = len(batch)
    if not Quiet:
        print("Frame map with %d frames generated" % (NFrames,))

    start_frame_id = None
    if old_batch is not None:
        nold = len(old_batch)
        if OverrideMode.lower() == "reuse":
            if nold < NFrames and OverrideMode != "REUSE":
                print()
                print("ERROR: Can not reuse old frame id range because old range (%d) is shorter than needed (%d)"
                      % (nold, NFrames))
                print("       Use -O REUSE (capitals) to override")
                print()
                sys.exit(1)
            if nold > NFrames:
                print()
                print("WARNING: old frame id range (%d) is larger than new one (%d)" % (nold, NFrames))
                print()
            start_frame_id = old_batch.StartFrameID
            if not Quiet:
                print("Frame ID range starting at %d will be reused" % (start_frame_id,))

    if start_frame_id is None:
        start_frame_id = backend.allocateRGIDs(DatasetName, NFrames)
        if not Quiet:
            print("Frame ID range is allocated starting at %d" % (start_frame_id,))

    batch.setStartFrameID(start_frame_id)
    batch.save(BatchFile)
    if not Quiet:
        print("Batch saved to file: %s" % (BatchFile,))
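# Example invocations (file, bucket, and dataset names are illustrative):
#
#   python create_batch.py -n 20000 -m 4 -c couchbase.cfg my_batch.json my_bucket my_dataset @files.list
#   python create_batch.py -x root my_batch.json my_bucket my_dataset /data/my_dataset
#
# The first builds a frame map from the paths listed in files.list with a target
# frame size of 20000 events and 4 parallel reader threads; the second scans
# /data/my_dataset for *.root files.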