Beispiel #1
0
def serial_worker(force: bool, pipeline: _Pipeline,
                  mappers: Tuple[_Mapper, ...],
                  root_data_dir_path: Path) -> Tuple[Path, Path]:
    """Run a single pipeline and report where its CSV output is expected.

    A ``PipelineStorage`` is created for ``pipeline.id`` under
    ``root_data_dir_path``; the pipeline is then wrapped in a
    ``PipelineWrapper`` and run with the supplied ``force`` flag and
    ``mappers``.

    Returns:
        A ``(edges_csv_path, nodes_csv_path)`` pair, both located directly
        under the storage's ``loaded_data_dir_path``.
    """
    storage = PipelineStorage(pipeline_id=pipeline.id,
                              root_data_dir_path=root_data_dir_path)

    PipelineWrapper(pipeline, storage).run(force=force, mappers=mappers)

    # Edges first, nodes second: callers unpack the tuple positionally.
    return (
        storage.loaded_data_dir_path / 'edges.csv',
        storage.loaded_data_dir_path / 'nodes.csv',
    )
Beispiel #2
0
def test_swow_pipeline(pipeline_storage, sample_archive_path,
                       sample_swow_edges, sample_swow_nodes):
    """Extract and transform the sample SWOW archive, then compare the graph.

    Everything yielded by the transform step is sorted into a node set or an
    edge set, and both sets must equal the corresponding sample fixtures.
    """
    wrapper = PipelineWrapper(
        SwowPipeline(swow_archive_path=sample_archive_path),
        pipeline_storage)

    extracted = wrapper.extract()
    transformed = wrapper.transform(**extracted)

    seen_nodes, seen_edges = set(), set()
    for item in transformed:
        if isinstance(item, KgNode):
            seen_nodes.add(item)
            continue
        if isinstance(item, KgEdge):
            seen_edges.add(item)

    assert seen_nodes == sample_swow_nodes
    assert seen_edges == sample_swow_edges
Beispiel #3
0
def test_rpi_combined_pipeline(pipeline_storage, graph_generator):
    """Check that the combined pipeline yields every row of its sub-pipelines.

    Three mock pipelines are built, each fed the next ``rows_per_pipeline``
    items taken from the shared ``graph_generator``. The combined pipeline is
    run non-parallel and its transform output must contain exactly
    ``number_of_pipelines * rows_per_pipeline`` items.
    """
    rows_per_pipeline = 6

    mock_pipelines = []
    for pipe_num in (1, 2, 3):
        # Slices are taken sequentially from the same generator, so each
        # mock pipeline receives a distinct run of rows.
        mock_pipelines.append(
            MockPipeline(id=f'pipe_{pipe_num}',
                         single_source=False,
                         transformer=MockTransformer(
                             tuple(islice(graph_generator,
                                          rows_per_pipeline)))))
    pipelines = tuple(mock_pipelines)

    wrapper = PipelineWrapper(
        RpiCombinedPipeline(pipelines=pipelines, parallel=False),
        pipeline_storage)
    extracted = wrapper.extract()
    graph = tuple(wrapper.transform(**extracted))

    assert len(graph) == len(pipelines) * rows_per_pipeline
def test_web_child_pipeline(
    pipeline_storage,
    part_whole_zip_url,
    part_whole_archive_filenames,
    wordnet_sense_url,
    web_child_test_http_client,
):
    """Build a ``WebChildPipeline`` from the fixtures and wrap it.

    The pipeline receives the test HTTP client, the part-whole and WordNet
    sense URLs, and ``part_whole_archive_filenames`` expanded as keyword
    arguments.

    NOTE(review): the visible body stops after the wrapper is constructed,
    with no run step and no assertions; the rest of this test appears to be
    missing from this excerpt. Confirm against the original file.
    """
    pipeline = WebChildPipeline(http_client=web_child_test_http_client,
                                part_whole_url=part_whole_zip_url,
                                wordnet_sense_url=wordnet_sense_url,
                                **part_whole_archive_filenames)
    pipeline_wrapper = PipelineWrapper(pipeline, pipeline_storage)
    def __call__(self, args):
        """Instantiate the pipeline named by ``args`` and run it.

        The pipeline class is looked up by ``args.pipeline_module``, built
        via ``__instantiate_pipeline``, and wrapped together with a
        ``PipelineStorage`` rooted at the directory returned by
        ``__create_data_dir_path``. The ``force`` and
        ``skip_whole_graph_check`` options are read from ``args`` (missing
        attributes count as ``False``) and forwarded to the wrapper's
        ``run``.

        Raises:
            ValueError: if ``args.pipeline_module`` is ``None``.
        """
        module_name = args.pipeline_module
        if module_name is None:
            raise ValueError("must specify a pipeline module")

        pipeline_class = self.__pipeline_class_dict[module_name]
        pipeline = self.__instantiate_pipeline(args, pipeline_class)

        storage = PipelineStorage(
            pipeline_id=pipeline.id,
            root_data_dir_path=self.__create_data_dir_path(args),
        )
        wrapper = PipelineWrapper(pipeline=pipeline, storage=storage)

        run_options = {
            flag: bool(getattr(args, flag, False))
            for flag in ("force", "skip_whole_graph_check")
        }

        # Classes are compared by name rather than identity: the odd imports
        # make this necessary.
        if pipeline_class.__name__ != RpiCombinedPipeline.__name__:
            with Mappers() as mappers:
                wrapper.run(mappers=mappers, **run_options)
            return

        # Combined pipeline does its own mapping
        wrapper.run(**run_options)
Beispiel #6
0
def run(node_edge_sequence: Tuple[Union[KgNode, KgEdge], ...],
        pipeline_storage: PipelineStorage):
    """Run a ``MockPipeline`` built from ``node_edge_sequence``.

    The mock pipeline is wrapped with ``pipeline_storage`` and whatever the
    wrapper's ``run()`` returns is passed back to the caller.
    """
    mock_pipeline = MockPipeline(node_edge_sequence)
    wrapper = PipelineWrapper(mock_pipeline, pipeline_storage)
    return wrapper.run()
Beispiel #7
0
def test_has_part_pipeline(pipeline_storage):
    """Run a default ``HasPartPipeline`` without forcing.

    There are no assertions; the test passes if ``run`` does not raise.
    """
    PipelineWrapper(HasPartPipeline(), pipeline_storage).run(force=False)
def test_food_on_pipeline(pipeline_storage):
    """Run a default ``FoodOnPipeline`` without forcing.

    There are no assertions; the test passes if ``run`` does not raise.
    """
    PipelineWrapper(FoodOnPipeline(), pipeline_storage).run(force=False)
Beispiel #9
0
def test_mcs_benchmark_pipeline(pipeline_storage):
    """Run ``McsBenchmarkPipeline`` with ``bzip=False`` using default run options.

    There are no assertions; the test passes if ``run`` does not raise.
    """
    PipelineWrapper(McsBenchmarkPipeline(bzip=False), pipeline_storage).run()
Beispiel #10
0
def test_sentic_pipeline(
    pipeline_storage,
    sentic_zip_url,
    full_sentic_zip_client,
    full_sentic_zip_owl_filename,
):
    """Check that every concept node links to a concept, a primitive and a sentic.

    The pipeline is configured through its own command-line arguments, then
    extracted and transformed. Nodes are indexed by id (with primitive and
    sentic ids tracked separately) and edges are grouped by subject. Each
    concept node must then have at least one outgoing edge in each of the
    three categories.
    """
    arg_parser = ArgParser()
    SenticPipeline.add_arguments(arg_parser)

    cli_tokens = [
        "--sentic_zip_url",
        sentic_zip_url,
        "--owl_filename",
        full_sentic_zip_owl_filename,
    ]
    parsed_args = arg_parser.parse_args(cli_tokens)
    pipeline_kwds = dict(vars(parsed_args))

    wrapper = PipelineWrapper(
        SenticPipeline(http_client=full_sentic_zip_client, **pipeline_kwds),
        pipeline_storage,
    )
    extracted = wrapper.extract()
    transformed = wrapper.transform(**extracted)

    nodes_by_id = {}
    primitive_ids = set()
    sentic_ids = set()
    edges_by_subject = {}
    for item in transformed:
        if isinstance(item, KgNode):
            nodes_by_id[item.id] = item
            # The node kind is the second colon-separated field of the id.
            node_kind = item.id.split(":", 2)[1]
            if node_kind == sentic_types.PRIMITIVE:
                primitive_ids.add(item.id)
            elif node_kind == sentic_types.SENTIC:
                sentic_ids.add(item.id)
            continue
        if isinstance(item, KgEdge):
            edges_by_subject.setdefault(item.subject, []).append(item)

    # Every concept node must be related to sentics, primitives and other
    # concepts.
    for node_id, node in nodes_by_id.items():
        if node.id.split(":", 2)[1] != sentic_types.CONCEPT:
            continue
        assert node_id in edges_by_subject

        to_concepts, to_primitives, to_sentics = [], [], []
        for edge in edges_by_subject[node_id]:
            target = edge.object
            if target in primitive_ids:
                bucket = to_primitives
            elif target in sentic_ids:
                bucket = to_sentics
            else:
                # Anything that is neither a primitive nor a sentic is
                # counted as a concept edge.
                bucket = to_concepts
            bucket.append(edge)

        assert len(to_concepts) > 0, node
        assert len(to_primitives) > 0, node
        assert len(to_sentics) > 0, node