def serial_worker(force: bool, pipeline: _Pipeline, mappers: Tuple[_Mapper, ...], root_data_dir_path: Path) -> Tuple[Path, Path]:
    """Run one pipeline to completion and return the paths of its loaded CSV files.

    :param force: forwarded to ``PipelineWrapper.run`` to force re-execution
    :param pipeline: pipeline to execute
    :param mappers: mappers applied during the run
    :param root_data_dir_path: root directory for this pipeline's data storage
    :return: tuple of (edges.csv path, nodes.csv path) in the loaded-data directory
    """
    storage = PipelineStorage(
        pipeline_id=pipeline.id,
        root_data_dir_path=root_data_dir_path,
    )
    wrapper = PipelineWrapper(pipeline, storage)
    wrapper.run(force=force, mappers=mappers)
    loaded_dir = storage.loaded_data_dir_path
    return loaded_dir / 'edges.csv', loaded_dir / 'nodes.csv'
def test_swow_pipeline(pipeline_storage, sample_archive_path, sample_swow_edges, sample_swow_nodes):
    """Extract and transform the SWOW sample archive and check the generated graph.

    The transform output is partitioned into node and edge sets, which must
    exactly equal the expected sample fixtures.
    """
    swow_pipeline = SwowPipeline(swow_archive_path=sample_archive_path)
    wrapper = PipelineWrapper(swow_pipeline, pipeline_storage)
    generated_nodes = set()
    generated_edges = set()
    for generated in wrapper.transform(**wrapper.extract()):
        if isinstance(generated, KgNode):
            generated_nodes.add(generated)
        elif isinstance(generated, KgEdge):
            generated_edges.add(generated)
    assert generated_nodes == sample_swow_nodes
    assert generated_edges == sample_swow_edges
def test_rpi_combined_pipeline(pipeline_storage, graph_generator):
    """A serially-run combined pipeline yields every row of each constituent pipeline."""
    rows_per_pipeline = 6
    # Three mock pipelines, each transformer pre-loaded with a slice of the generator.
    mock_pipelines = tuple(
        MockPipeline(
            id=f'pipe_{n}',
            single_source=False,
            transformer=MockTransformer(tuple(islice(graph_generator, rows_per_pipeline))),
        )
        for n in range(1, 4)
    )
    combined = RpiCombinedPipeline(pipelines=mock_pipelines, parallel=False)
    wrapper = PipelineWrapper(combined, pipeline_storage)
    combined_graph = tuple(wrapper.transform(**wrapper.extract()))
    assert len(combined_graph) == len(mock_pipelines) * rows_per_pipeline
def test_web_child_pipeline(
    pipeline_storage,
    part_whole_zip_url,
    part_whole_archive_filenames,
    wordnet_sense_url,
    web_child_test_http_client,
):
    """Construct a WebChildPipeline and its wrapper from test fixtures.

    Only construction is exercised here — the test passes if neither the
    pipeline nor the wrapper raises while being built.
    """
    web_child_pipeline = WebChildPipeline(
        http_client=web_child_test_http_client,
        part_whole_url=part_whole_zip_url,
        wordnet_sense_url=wordnet_sense_url,
        **part_whole_archive_filenames,
    )
    wrapper = PipelineWrapper(web_child_pipeline, pipeline_storage)
def __call__(self, args):
    """Instantiate the pipeline selected on the command line and run it.

    :param args: parsed CLI namespace; must carry ``pipeline_module`` and may
        carry ``force`` and ``skip_whole_graph_check`` flags
    :raises ValueError: if no pipeline module was specified
    """
    if args.pipeline_module is None:
        raise ValueError("must specify a pipeline module")
    pipeline_class = self.__pipeline_class_dict[args.pipeline_module]
    pipeline = self.__instantiate_pipeline(args, pipeline_class)
    storage = PipelineStorage(
        pipeline_id=pipeline.id,
        root_data_dir_path=self.__create_data_dir_path(args),
    )
    wrapper = PipelineWrapper(pipeline=pipeline, storage=storage)
    run_kwds = {
        "force": bool(getattr(args, "force", False)),
        "skip_whole_graph_check": bool(getattr(args, "skip_whole_graph_check", False)),
    }
    # Compared by class name rather than identity — "the odd imports make this
    # necessary" per the original author. A combined pipeline does its own
    # mapping, so it runs without externally-supplied mappers.
    if pipeline_class.__name__ == RpiCombinedPipeline.__name__:
        wrapper.run(**run_kwds)
    else:
        with Mappers() as mappers:
            wrapper.run(mappers=mappers, **run_kwds)
def run(node_edge_sequence: Tuple[Union[KgNode, KgEdge], ...], pipeline_storage: PipelineStorage):
    """Run a ``MockPipeline`` over *node_edge_sequence* and return the run result."""
    mock_pipeline = MockPipeline(node_edge_sequence)
    wrapper = PipelineWrapper(mock_pipeline, pipeline_storage)
    return wrapper.run()
def test_has_part_pipeline(pipeline_storage):
    """HasPartPipeline runs end-to-end (without forcing re-execution)."""
    wrapper = PipelineWrapper(HasPartPipeline(), pipeline_storage)
    wrapper.run(force=False)
def test_food_on_pipeline(pipeline_storage):
    """FoodOnPipeline runs end-to-end (without forcing re-execution)."""
    wrapper = PipelineWrapper(FoodOnPipeline(), pipeline_storage)
    wrapper.run(force=False)
def test_mcs_benchmark_pipeline(pipeline_storage):
    """McsBenchmarkPipeline (bzip disabled) runs end-to-end with default options."""
    benchmark_pipeline = McsBenchmarkPipeline(bzip=False)
    PipelineWrapper(benchmark_pipeline, pipeline_storage).run()
def test_sentic_pipeline(
    pipeline_storage,
    sentic_zip_url,
    full_sentic_zip_client,
    full_sentic_zip_owl_filename,
):
    """Run the Sentic pipeline over a full zip fixture and validate the graph.

    Asserts that every concept node has outgoing edges to at least one other
    concept, one primitive node, and one sentic node.
    """
    # NOTE(review): renamed locals that shadowed builtins `type` and `id` and
    # the stdlib module name `argparse`; behavior is unchanged.
    arg_parser = ArgParser()
    SenticPipeline.add_arguments(arg_parser)
    args = arg_parser.parse_args(
        [
            "--sentic_zip_url",
            sentic_zip_url,
            "--owl_filename",
            full_sentic_zip_owl_filename,
        ]
    )
    pipeline_kwds = vars(args).copy()
    sentic_pipeline = SenticPipeline(
        http_client=full_sentic_zip_client, **pipeline_kwds
    )
    pipeline_wrapper = PipelineWrapper(sentic_pipeline, pipeline_storage)
    extract_kwds = pipeline_wrapper.extract()
    graph_generator = pipeline_wrapper.transform(**extract_kwds)

    nodes_by_id = {}
    primitive_ids = set()
    sentic_ids = set()
    edges_by_subject = {}
    for node_or_edge in graph_generator:
        if isinstance(node_or_edge, KgNode):
            node = node_or_edge
            nodes_by_id[node.id] = node
            # Second colon-delimited segment of the id is compared against the
            # sentic_types constants, so it carries the node's type.
            node_type = node.id.split(":", 2)[1]
            if node_type == sentic_types.PRIMITIVE:
                primitive_ids.add(node.id)
            elif node_type == sentic_types.SENTIC:
                sentic_ids.add(node.id)
        elif isinstance(node_or_edge, KgEdge):
            edge = node_or_edge
            edges_by_subject.setdefault(edge.subject, []).append(edge)

    # Assert that all concept nodes are related to sentics, primitives and
    # other concepts.
    for node_id, node in nodes_by_id.items():
        node_type = node.id.split(":", 2)[1]
        if node_type != sentic_types.CONCEPT:
            continue
        assert node_id in edges_by_subject
        concept_edges, primitive_edges, sentic_edges = [], [], []
        for edge in edges_by_subject[node_id]:
            if edge.object in primitive_ids:
                primitive_edges.append(edge)
            elif edge.object in sentic_ids:
                sentic_edges.append(edge)
            else:
                concept_edges.append(edge)
        assert len(concept_edges) > 0, node
        assert len(primitive_edges) > 0, node
        assert len(sentic_edges) > 0, node