Ejemplo n.º 1
0
def build(config, archiver, operators):
    """
    Build the history given a archiver and collection of operators.

    :param config: The wily configuration
    :type  config: :namedtuple:`wily.config.WilyConfig`

    :param archiver: The archiver to use
    :type  archiver: :namedtuple:`wily.archivers.Archiver`

    :param operators: The list of operators to execute
    :type operators: `list` of :namedtuple:`wily.operators.Operator`
    """
    try:
        logger.debug(f"Using {archiver.name} archiver module")
        archiver = archiver.cls(config)
        revisions = archiver.revisions(config.path, config.max_revisions)
    except InvalidGitRepositoryError:
        # TODO: This logic shouldn't really be here (SoC)
        logger.info(f"Defaulting back to the filesystem archiver, not a valid git repo")
        archiver = FilesystemArchiver(config)
        revisions = archiver.revisions(config.path, config.max_revisions)
    except Exception as e:
        if hasattr(e, "message"):
            logger.error(f"Failed to setup archiver: '{e.message}'")
        else:
            logger.error(f"Failed to setup archiver: '{type(e)} - {e}'")
        exit(1)

    state = State(config, archiver=archiver)
    # Check for existence of cache, else provision
    state.ensure_exists()

    index = state.index[archiver.name]

    # remove existing revisions from the list
    revisions = [revision for revision in revisions if revision not in index][::-1]

    logger.info(
        f"Found {len(revisions)} revisions from '{archiver.name}' archiver in '{config.path}'."
    )

    _op_desc = ",".join([operator.name for operator in operators])
    logger.info(f"Running operators - {_op_desc}")

    bar = Bar("Processing", max=len(revisions) * len(operators))
    state.operators = operators

    # Index all files the first time, only scan changes afterward
    seed = True
    prev_roots = None
    try:
        with multiprocessing.Pool(processes=len(operators)) as pool:
            for revision in revisions:
                # Checkout target revision
                archiver.checkout(revision, config.checkout_options)
                stats = {"operator_data": {}}

                if seed:
                    targets = config.targets
                else:  # Only target changed files
                    # TODO : Check that changed files are children of the targets
                    targets = [
                        str(pathlib.Path(config.path) / pathlib.Path(file))
                        for file in revision.files
                        # if any([True for target in config.targets if
                        #         target in pathlib.Path(pathlib.Path(config.path) / pathlib.Path(file)).parents])
                    ]

                # Run each operator as a separate process
                data = pool.starmap(
                    run_operator,
                    [(operator, revision, config, targets) for operator in operators],
                )
                # data is a list of tuples, where for each operator, it is a tuple of length 2,
                operator_data_len = 2
                # second element in the tuple, i.e data[i][1]) has the collected data
                for i in range(0, len(operators)):
                    if i < len(data) and len(data[i]) >= operator_data_len and len(data[i][1]) == 0:
                        logger.warn(f"In revision {revision.key}, for operator {operators[i].name}: No data collected")

                # Map the data back into a dictionary
                for operator_name, result in data:
                    # find all unique directories in the results
                    roots = {pathlib.Path(entry).parents[0] for entry in result.keys()}
                    indices = set(result.keys())

                    # For a seed run, there is no previous change set, so use current
                    if seed:
                        prev_roots = roots
                        prev_indices = indices
                    roots = prev_roots | roots

                    # Copy the ir from any unchanged files from the prev revision
                    if not seed:
                        missing_indices = prev_indices - indices
                        # TODO: Check existence of file path.
                        for missing in missing_indices:
                            # Don't copy aggregate keys as their values may have changed
                            if missing in roots:
                                continue
                            # previous index may not have that operator
                            if operator_name not in prev_stats["operator_data"]:
                                continue
                            # previous index may not have file either
                            if (
                                missing
                                not in prev_stats["operator_data"][operator_name]
                            ):
                                continue
                            result[missing] = prev_stats["operator_data"][
                                operator_name
                            ][missing]

                    # Aggregate metrics across all root paths using the aggregate function in the metric
                    for root in roots:
                        # find all matching entries recursively
                        aggregates = [
                            path
                            for path in result.keys()
                            if root in pathlib.Path(path).parents
                        ]
                        result[str(root)] = {"total": {}}
                        # aggregate values
                        for metric in resolve_operator(operator_name).cls.metrics:
                            func = metric.aggregate
                            values = [
                                result[aggregate]["total"][metric.name]
                                for aggregate in aggregates
                                if aggregate in result
                                and metric.name in result[aggregate]["total"]
                            ]
                            if len(values) > 0:
                                result[str(root)]["total"][metric.name] = func(values)

                    prev_indices = set(result.keys())
                    prev_roots = roots
                    stats["operator_data"][operator_name] = result
                    bar.next()

                prev_stats = stats
                seed = False
                ir = index.add(revision, operators=operators)
                ir.store(config, archiver, stats)
        index.save()
        bar.finish()
    except Exception as e:
        logger.error(f"Failed to build cache: {type(e)}: '{e}'")
        raise e
    finally:
        # Reset the archive after every run back to the head of the branch
        archiver.finish()
Ejemplo n.º 2
0
def build(config, archiver, operators):
    """
    Build the history given a archiver and collection of operators.

    :param config: The wily configuration
    :type  config: :namedtuple:`wily.config.WilyConfig`

    :param archiver: The archiver to use
    :type  archiver: :namedtuple:`wily.archivers.Archiver`

    :param operators: The list of operators to execute
    :type operators: `list` of :namedtuple:`wily.operators.Operator`
    """
    try:
        logger.debug(f"Using {archiver.name} archiver module")
        archiver = archiver.cls(config)
        revisions = archiver.revisions(config.path, config.max_revisions)
    except InvalidGitRepositoryError:
        # TODO: This logic shouldn't really be here (SoC)
        logger.info(
            f"Defaulting back to the filesystem archiver, not a valid git repo"
        )
        archiver = FilesystemArchiver(config)
        revisions = archiver.revisions(config.path, config.max_revisions)
    except Exception as e:
        if hasattr(e, "message"):
            logger.error(f"Failed to setup archiver: '{e.message}'")
        else:
            logger.error(f"Failed to setup archiver: '{type(e)} - {e}'")
        exit(1)

    state = State(config, archiver=archiver)
    # Check for existence of cache, else provision
    state.ensure_exists()

    index = state.index[archiver.name]

    # remove existing revisions from the list
    revisions = [revision for revision in revisions if revision not in index]

    logger.info(
        f"Found {len(revisions)} revisions from '{archiver.name}' archiver in '{config.path}'."
    )

    _op_desc = ",".join([operator.name for operator in operators])
    logger.info(f"Running operators - {_op_desc}")

    bar = Bar("Processing", max=len(revisions) * len(operators))
    state.operators = operators
    try:
        with multiprocessing.Pool(processes=len(operators)) as pool:
            for revision in revisions:
                # Checkout target revision
                archiver.checkout(revision, config.checkout_options)
                stats = {"operator_data": {}}

                # Run each operator as a seperate process
                data = pool.starmap(
                    run_operator,
                    [(operator, revision, config) for operator in operators],
                )

                # Map the data back into a dictionary
                for operator_name, result in data:
                    # aggregate values to directories
                    roots = []

                    # find all unique directories in the results
                    for entry in result.keys():
                        parent = pathlib.Path(entry).parents[0]
                        if parent not in roots:
                            roots.append(parent)

                    for root in roots:
                        # find all matching entries recursively
                        aggregates = [
                            path for path in result.keys()
                            if root in pathlib.Path(path).parents
                        ]
                        result[str(root)] = {"total": {}}
                        # aggregate values
                        for metric in resolve_operator(
                                operator_name).cls.metrics:
                            func = metric.aggregate
                            values = [
                                result[aggregate]["total"][metric.name]
                                for aggregate in aggregates
                                if aggregate in result
                                and metric.name in result[aggregate]["total"]
                            ]
                            if len(values) > 0:
                                result[str(root)]["total"][metric.name] = func(
                                    values)

                    stats["operator_data"][operator_name] = result
                    bar.next()

                ir = index.add(revision, operators=operators)
                ir.store(config, archiver, stats)
        index.save()
        bar.finish()
    except Exception as e:
        logger.error(f"Failed to build cache: '{e}'")
        raise e
    finally:
        # Reset the archive after every run back to the head of the branch
        archiver.finish()