def jobs(self, **kwargs):
    """
    :param path: base path to read jobs from (defaults to ``self.default_path``)
    :param data_path: base path of the underlying data files (defaults to ``self.default_path``)
    :param source: either "processed" (read already converted CSV jobs) or "raw" (convert the raw stream)
    :param pattern: regular expression used to match processed job files
    :param stateful: whether the raw stream is read statefully
    :return: generator over jobs
    """
    path = kwargs.get("path", self.default_path)
    if "processed" in kwargs.get("source", "processed"):
        converter = CSVReader()
        for base_path, workernode, run, filename in relevant_directories(path=path):
            current_path = os.path.join(base_path, workernode, run)
            if filename:
                for job in self.read_job(path=current_path, name=filename, converter=converter):
                    yield job
            else:
                for dir_entry in sorted(os.listdir(current_path)):
                    matches = re.match(kwargs.get("pattern", r"(\d*)-process\.csv"), dir_entry)
                    if matches:
                        for job in self.read_job(path=current_path,
                                                 name=matches.group(1),
                                                 converter=converter):
                            yield job
    else:
        # convert raw data
        for base_path, workernode, run, _ in relevant_directories(path=path):
            current_path = os.path.join(base_path, workernode, run)
            converter = CSVReader()
            parser = ProcessStreamParser(workernode=workernode, run=run, data_source=self,
                                         path=current_path, data_reader=converter)
            # reader and parser reference each other: the reader feeds rows into the parser
            converter.parser = parser
            for job in self._read_stream(
                    path=current_path,
                    data_path=os.path.join(kwargs.get("data_path", self.default_path),
                                           workernode, run),
                    workernode=workernode,
                    run=run,
                    stateful=kwargs.get("stateful", False),
                    pattern=r"^[0-9]{10}-process\.log-[0-9]{8}",
                    converter=converter):
                yield job
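# Usage sketch (illustrative only, not part of the original module). It assumes a
# FileDataSource instance with a configured ``default_path``; the ``consume``
# helper is hypothetical:
#
#     source = FileDataSource()
#     # read jobs that were already converted to <workernode>/<run>/<id>-process.csv
#     for job in source.jobs(source="processed"):
#         consume(job)
#     # or convert the raw process stream on the fly via ProcessStreamParser
#     for job in source.jobs(source="raw", stateful=True):
#         consume(job)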
def network_statistics(self, **kwargs):
    """
    :param path: base path to read raw data from (defaults to ``self.default_path``)
    :param stateful: whether the raw stream is read statefully
    :return: generator over network statistics
    """
    path = kwargs.get("path", self.default_path)
    for base_path, workernode, run, _ in relevant_directories(path=path):
        current_path = os.path.join(base_path, workernode, run)
        converter = CSVReader()
        parser = NetworkStatisticsParser(workernode=workernode, run=run, data_source=self,
                                         path=current_path, data_reader=converter)
        converter.parser = parser
        for statistics in self._read_stream(
                path=current_path,
                workernode=workernode,
                run=run,
                stateful=kwargs.get("stateful", False),
                pattern=r"^[0-9]{10}-(process|traffic)\.log-[0-9]{8}",
                converter=converter):
            yield statistics
def create_payloads():
    """
    For CMS pilots there is a very simple (but quick-n-dirty) approach to
    recognising the actual payloads. The method is based on the names of
    processes. As soon as there is a reliable and automatic solution, I need
    to switch to that one.

    The extractor looks into the different CMS jobs whose payloads have not
    yet been identified and extracts those data. The result is saved to
    `processed/payloads`. The payload ids are built from the job id from the
    database plus the payload count.
    """
    print
    print "Starting to extract payloads from CMS pilots"
    path = eval_input_path()
    output_path = eval_output_path()
    count = eval_cores()
    level = directory_level(path)
    if level == RUN_LEVEL:
        count = 1
    do_multicore(
        count=count,
        target=_create_payloads,
        data=[{
            "path": os.path.join(element[0], element[1], element[2]),
            "output_path": output_path
        } for element in relevant_directories(path)])
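# Sketch of the work items handed to do_multicore above (illustrative; the
# concrete directory names are made up). relevant_directories(path) yields
# (base_path, workernode, run, filename) tuples, so every worker process
# receives one dictionary per run directory, e.g.
#
#     {"path": "/data/raw/worker-01/run-01", "output_path": output_path}
#
# When the input path already points at a single run (RUN_LEVEL), there is
# effectively only one such work item, which is why count is forced to 1.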
def archive_jobs():
    """
    Function that starts archival of written jobs into a single zip file.
    """
    print
    print "Starting to archive valid and complete jobs"
    path = eval_input_path()
    count = eval_cores()
    names = ["path", "workernode", "run"]
    data = [dict(zip(names, element)) for element in relevant_directories(path)]
    do_multicore(count=count, target=_archive_jobs, data=data)
def jobs(self, **kwargs):
    """
    :param path: base path to read jobs from (defaults to ``self.default_path``)
    :param data_path: base path of the underlying data files
    :param source: either "processed" (look up jobs via the database) or "raw" (delegate to FileDataSource)
    :return: generator over jobs
    """
    if "raw" in kwargs.get("source", "processed"):
        for job in FileDataSource.jobs(self, **kwargs):
            yield job
    else:
        with SQLCommand(dataSource=self._db_data_source) as sql_command:
            path = kwargs.get("path", self.default_path)
            level = directory_level(path)
            job_object = DBJobObject(valid=True, completed=True)
            if level == RUN_LEVEL:
                _, workernode, run, _ = next(relevant_directories(path=path),
                                             (None, None, None, None))
                job_object.run = run
                workernode_object = self._db_operator.load_or_create_workernode(data=workernode)
                job_object.workernode_id = workernode_object.id_value
            elif level == WORKERNODE_LEVEL:
                workernode = os.path.split(path)[1]
                workernode_object = self._db_operator.load_or_create_workernode(data=workernode)
                job_object.workernode_id = workernode_object.id_value
            elif level == FILE_LEVEL:
                job_object = DBJobObject(
                    id=os.path.basename(path).split("-")[0], valid=True, completed=True)
            for job_result in sql_command.find(job_object):
                current_path = path
                if level == BASE_LEVEL:
                    # join different workernodes and runs
                    workernode_object = self._db_operator.load_one(
                        data=DBWorkernodeObject(id=job_result.workernode_id))
                    current_path = os.path.join(path, workernode_object.name, job_result.run)
                elif level == WORKERNODE_LEVEL:
                    # join different runs
                    current_path = os.path.join(path, job_result.run)
                elif level == FILE_LEVEL:
                    current_path = os.path.dirname(path)
                for job in FileDataSource.read_job(
                        self, path=current_path, name=job_result.id_value):
                    yield job
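# Sketch of the directory levels dispatched on above (illustrative; the level
# constants come from the surrounding module, the example layout is assumed
# from the file pattern used in FileDataSource.jobs):
#
#     BASE_LEVEL        <base>/                          append <workernode>/<run> per job result
#     WORKERNODE_LEVEL  <base>/<workernode>/             append <run> per job result
#     RUN_LEVEL         <base>/<workernode>/<run>/       path is already complete
#     FILE_LEVEL        .../<run>/<job_id>-process.csv   job id taken from the file name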
def create_payloads(ctx, paths, output_path, pcount):
    data = []
    for path in paths:
        # prepare data
        for folder, workernode_subdir, run_subdir, _ in relevant_directories(path):
            # get all relevant files
            current_path = os.path.join(folder, workernode_subdir, run_subdir)
            data.extend([{
                "path": filename,
                "output_path": output_path
            } for filename in glob.glob("%s/*-process.csv" % current_path)])
    if pcount > 1:
        do_multicore(count=pcount, target=_create_payloads, data=data)
    else:
        for element in data:
            _create_payloads(element)
def prepare_raw_data(ctx, paths, output_path, pcount):
    data = []
    for path in paths:
        # prepare data
        # TODO: is this called for every filename?!
        for folder, workernode_subdir, run_subdir, _ in relevant_directories(path):
            data.append({
                "path": os.path.join(folder, workernode_subdir, run_subdir),
                "output_path": output_path
            })
    if pcount > 1:
        do_multicore(count=pcount, target=_prepare_raw_data, data=data)
    else:
        for element in data:
            _prepare_raw_data(element)
def prepare_raw_data():
    """
    Function that starts the splitting of raw data that was directly
    retrieved by the GNM tool.
    """
    print
    print "Starting to split data stream into jobs"
    path = eval_input_path()
    output_path = eval_output_path()
    count = eval_cores()
    level = directory_level(path)
    if level == RUN_LEVEL:
        count = 1
    do_multicore(
        count=count,
        target=_prepare_raw_data,
        data=[{
            "path": os.path.join(element[0], element[1], element[2]),
            "output_path": output_path
        } for element in relevant_directories(path)])
def generate_network_statistics():
    """
    Function that starts generation of network statistics based on available
    raw data.
    """
    print
    print "Starting to generate network statistics"
    path = eval_input_path()
    output_path = eval_output_path()
    count = eval_cores()
    level = directory_level(path)
    if level == RUN_LEVEL:
        count = 1
    do_multicore(
        count=count,
        target=_generate_network_statistics,
        data=[{
            "path": os.path.join(element[0], element[1], element[2]),
            "output_path": output_path
        } for element in relevant_directories(path)])
def traffics(self, **kwargs):
    """
    :param path: base path to read traffic data from (defaults to ``self.default_path``)
    :param data_path: base path of the underlying data files (defaults to ``self.default_path``)
    :param source: either "processed" or "raw" (currently only "raw" is handled)
    :param stateful: whether the raw stream is read statefully
    :return: generator over traffic objects
    """
    path = kwargs.get("path", self.default_path)
    if "processed" in kwargs.get("source", "processed"):
        # reading already processed traffic data is not implemented yet
        pass
    else:
        # convert raw data
        for base_path, workernode, run, _ in relevant_directories(path=path):
            current_path = os.path.join(base_path, workernode, run)
            converter = CSVReader()
            parser = TrafficStreamParser(workernode=workernode, run=run, data_source=self,
                                         path=current_path, data_reader=converter)
            converter.parser = parser
            for traffic in self._read_stream(
                    path=current_path,
                    data_path=os.path.join(kwargs.get("data_path", self.default_path),
                                           workernode, run),
                    workernode=workernode,
                    run=run,
                    stateful=kwargs.get("stateful", False),
                    pattern=r"^[0-9]{10}-traffic\.log-[0-9]{8}",
                    converter=converter):
                yield traffic