def jobs(self, **kwargs):
     """
     :param path:
     :param source:
     :param pattern:
     :param stateful:
     :return:
     """
     path = kwargs.get("path", self.default_path)
     if "processed" in kwargs.get("source", "processed"):
         converter = CSVReader()
         for base_path, workernode, run, filename in relevant_directories(
                 path=path):
             current_path = os.path.join(base_path, workernode, run)
             if filename:
                 for job in self.read_job(path=current_path,
                                          name=filename,
                                          converter=converter):
                     yield job
             else:
                 for dir_entry in sorted(os.listdir(current_path)):
                     matches = re.match(
                         kwargs.get("pattern", r"(\d*)-process\.csv"),
                         dir_entry)
                     if matches:
                         for job in self.read_job(path=current_path,
                                                  name=matches.group(1),
                                                  converter=converter):
                             yield job
     else:
         # convert raw data
         for base_path, workernode, run, _ in relevant_directories(
                 path=path):
             current_path = os.path.join(base_path, workernode, run)
             converter = CSVReader()
             parser = ProcessStreamParser(workernode=workernode,
                                          run=run,
                                          data_source=self,
                                          path=current_path,
                                          data_reader=converter)
             converter.parser = parser
             for job in self._read_stream(
                     path=current_path,
                     data_path=os.path.join(
                         kwargs.get("data_path", self.default_path),
                         workernode, run),
                     workernode=workernode,
                     run=run,
                     stateful=kwargs.get("stateful", False),
                     pattern="^[0-9]{10}-process.log-[0-9]{8}",
                     converter=converter):
                 yield job
 def network_statistics(self, **kwargs):
     """
     :param path:
     :param stateful:
     :return:
     """
     path = kwargs.get("path", self.default_path)
     for base_path, workernode, run, _ in relevant_directories(path=path):
         current_path = os.path.join(base_path, workernode, run)
         converter = CSVReader()
         parser = NetworkStatisticsParser(workernode=workernode,
                                          run=run,
                                          data_source=self,
                                          path=current_path,
                                          data_reader=converter)
         converter.parser = parser
         for statistics in self._read_stream(
                 path=current_path,
                 workernode=workernode,
                 run=run,
                 stateful=kwargs.get("stateful", False),
                 pattern="^[0-9]{10}-(process|traffic).log-[0-9]{8}",
                 converter=converter):
             yield statistics
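The two jobs() branches above rely on fixed file-name conventions: processed jobs live in per-run directories as <id>-process.csv, while raw data arrives as timestamped log streams. A small, self-contained check of the default patterns against made-up file names (only the patterns themselves come from the code above, the names are illustrative):

import re

# processed jobs: "<id>-process.csv", the id is captured in group 1
print(re.match(r"(\d*)-process\.csv", "4711-process.csv").group(1))  # -> 4711

# raw streams: "<10-digit unix timestamp>-process.log-<YYYYMMDD>"
print(bool(re.match(r"^[0-9]{10}-process\.log-[0-9]{8}",
                    "1388530800-process.log-20140101")))             # -> True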
Example #3
def create_payloads():
    """
    For CMS pilots there is a very simple (but quick-n-dirty) approach to recognise the actual
    payloads. The method is based on the name of processes. As soon as there is a reliable but
    automatic solution, I need to switch to this one.

    The extractor looks into the different CMS jobs, which payloads still have not been identified
    and extracts those data. It is saved to `processed/payloads`.

    The payload ids are build from the job id from database and additionally the payload count.
    """
    print()
    print("Starting to extract payloads from CMS pilots")
    path = eval_input_path()
    output_path = eval_output_path()
    count = eval_cores()
    level = directory_level(path)
    if level == RUN_LEVEL:
        count = 1

    do_multicore(count=count,
                 target=_create_payloads,
                 data=[{
                     "path": os.path.join(element[0], element[1], element[2]),
                     "output_path": output_path
                 } for element in relevant_directories(path)])
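The _create_payloads worker is not part of this listing; the stub below only sketches the call contract implied by the data dicts built above (one call per run directory). Everything inside the body is an assumption for illustration, not the actual implementation.

def _create_payloads(config):
    # config is one of the dicts assembled above
    run_path = config["path"]            # <base>/<workernode>/<run> directory
    output_path = config["output_path"]  # target for processed/payloads
    # ... the real worker identifies payloads by process name within run_path
    # and writes the extracted data below output_path (see docstring above)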
Example #4
def archive_jobs():
    """
    Function that starts archival of written jobs into a single zip file.
    """
    print()
    print("Starting to archive valid and complete jobs")
    path = eval_input_path()
    count = eval_cores()
    names = ["path", "workernode", "run"]
    # relevant_directories yields 4-tuples; zipping with three names drops the
    # trailing filename element
    data = [
        dict(zip(names, element))
        for element in relevant_directories(path)
    ]
    do_multicore(count=count, target=_archive_jobs, data=data)
Example #5
    def jobs(self, **kwargs):
        """
        :param path:
        :param data_path:
        :param source:
        :return:
        """
        if "raw" in kwargs.get("source", "processed"):
            for job in FileDataSource.jobs(self, **kwargs):
                yield job
        else:
            with SQLCommand(dataSource=self._db_data_source) as sql_command:
                path = kwargs.get("path", self.default_path)
                level = directory_level(path)
                job_object = DBJobObject(valid=True, completed=True)
                if level == RUN_LEVEL:
                    _, workernode, run, _ = next(
                        relevant_directories(path=path), (None, None, None, None))
                    job_object.run = run
                    workernode_object = self._db_operator.load_or_create_workernode(data=workernode)
                    job_object.workernode_id = workernode_object.id_value
                elif level == WORKERNODE_LEVEL:
                    workernode = os.path.split(path)[1]
                    workernode_object = self._db_operator.load_or_create_workernode(data=workernode)
                    job_object.workernode_id = workernode_object.id_value
                elif level == FILE_LEVEL:
                    job_object = DBJobObject(
                        id=os.path.basename(path).split("-")[0], valid=True, completed=True)

                for job_result in sql_command.find(job_object):
                    current_path = path
                    if level == BASE_LEVEL:
                        # join different workernodes and runs
                        workernode_object = self._db_operator.load_one(
                            data=DBWorkernodeObject(id=job_result.workernode_id)
                        )
                        current_path = os.path.join(
                            path, workernode_object.name, job_result.run)
                    elif level == WORKERNODE_LEVEL:
                        # join different runs
                        current_path = os.path.join(path, job_result.run)
                    elif level == FILE_LEVEL:
                        current_path = os.path.dirname(path)

                    for job in FileDataSource.read_job(
                            self,
                            path=current_path,
                            name=job_result.id_value):
                        yield job
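To make the FILE_LEVEL branch above concrete, here is a small, self-contained illustration of how the job id is derived from a file path; the path is made up but follows the <id>-process.csv naming convention used throughout these examples.

import os

# made-up example path following the "<id>-process.csv" convention
path = "/data/gnm/processed/node042/20140101/4711-process.csv"
job_id = os.path.basename(path).split("-")[0]
print(job_id)  # -> 4711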
def create_payloads(ctx, paths, output_path, pcount):
    data = []
    for path in paths:
        # prepare data
        for folder, workernode_subdir, run_subdir, _ in relevant_directories(
                path):
            # get all relevant files
            current_path = os.path.join(folder, workernode_subdir, run_subdir)
            data.extend([{
                "path": filename,
                "output_path": output_path
            } for filename in glob.glob("%s/*-process.csv" % current_path)])
    if pcount > 1:
        do_multicore(count=pcount, target=_create_payloads, data=data)
    else:
        for element in data:
            _create_payloads(element)
def prepare_raw_data(ctx, paths, output_path, pcount):
    data = []
    for path in paths:
        # prepare data
        # TODO: is this called for every filename?!
        for folder, workernode_subdir, run_subdir, _ in relevant_directories(
                path):
            data.append({
                "path": os.path.join(folder, workernode_subdir, run_subdir),
                "output_path": output_path
            })
    if pcount > 1:
        do_multicore(count=pcount, target=_prepare_raw_data, data=data)
    else:
        for element in data:
            _prepare_raw_data(element)
Example #8
def prepare_raw_data():
    """
    Function that starts the splitting of raw data that was directly retrieved by GNM tool.
    """
    print
    print "Starting to split data stream into jobs"
    path = eval_input_path()
    output_path = eval_output_path()
    count = eval_cores()
    level = directory_level(path)
    if level == RUN_LEVEL:
        count = 1
    do_multicore(count=count,
                 target=_prepare_raw_data,
                 data=[{
                     "path": os.path.join(element[0], element[1], element[2]),
                     "output_path": output_path
                 } for element in relevant_directories(path)])
Example #9
def generate_network_statistics():
    """
    Function that starts generation of network statistics based on available raw data.
    """
    print()
    print("Starting to generate network statistics")
    path = eval_input_path()
    output_path = eval_output_path()
    count = eval_cores()
    level = directory_level(path)
    if level == RUN_LEVEL:
        count = 1
    do_multicore(count=count,
                 target=_generate_network_statistics,
                 data=[{
                     "path": os.path.join(element[0], element[1], element[2]),
                     "output_path": output_path
                 } for element in relevant_directories(path)])
Example #10
 def traffics(self, **kwargs):
     """
     :param path:
     :param data_path:
     :param source:
     :param stateful:
     :return:
     """
     path = kwargs.get("path", self.default_path)
     if "processed" in kwargs.get("source", "processed"):
         pass
     else:
         # convert raw data
         for base_path, workernode, run, _ in relevant_directories(
                 path=path):
             current_path = os.path.join(base_path, workernode, run)
             converter = CSVReader()
             parser = TrafficStreamParser(workernode=workernode,
                                          run=run,
                                          data_source=self,
                                          path=current_path,
                                          data_reader=converter)
             converter.parser = parser
             for traffic in self._read_stream(
                     path=current_path,
                     data_path=os.path.join(
                         kwargs.get("data_path", self.default_path),
                         workernode, run),
                     workernode=workernode,
                     run=run,
                     stateful=kwargs.get("stateful", False),
                     pattern="^[0-9]{10}-traffic.log-[0-9]{8}",
                     converter=converter):
                 yield traffic
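A minimal usage sketch for traffics(); data_source is assumed to be an instance of the class these methods belong to (the class itself is not shown in this listing), and the paths are made-up examples.

# data_source: assumed instance of the data source class defining traffics()
for traffic in data_source.traffics(path="/data/gnm/raw",
                                    data_path="/data/gnm/processed",
                                    source="raw",      # the "processed" branch is a no-op
                                    stateful=False):
    print(traffic)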