def process_article(self, raw_article, database, entity_properties):
    """
    Deserialize one article and add all of its relations to the database.

    Every relation of every sentence in the article is handed to
    ``database.add_relation`` together with the sentence it belongs to and
    the entity properties. If ``PIPELINE_DEBUG`` is set in the task config,
    progress messages are printed.

    :param raw_article: Raw unserialized article.
    :type raw_article: str
    :param database: Neo4j luigi target.
    :type database: bwg.db.neo4j.Neo4jTarget
    :param entity_properties: Wikidata properties of all entities as
        dictionary.
    :type entity_properties: dict
    """
    debug = self.task_config.get("PIPELINE_DEBUG", False)
    encoding = self.task_config["CORPUS_ENCODING"]

    article = deserialize_line(raw_article, encoding)
    article_meta, article_data = article["meta"], article["data"]
    task_name = self.__class__.__name__

    if debug:
        print(
            "{} processing article '{}'...".format(
                task_name, article_meta["title"]
            )
        )

    for sentence_id, sentence_json in article_data.items():
        sentence_data = sentence_json["data"]

        # Only the relation payloads are needed here, not their ids.
        for relation_json in sentence_data["relations"].values():
            database.add_relation(
                relation_json,
                sentence_data["sentence"],
                entity_properties,
            )

        # Report completion only after all relations of the sentence have
        # actually been handed to the database.
        if debug:
            print("{} finished sentence #{}.".format(task_name, sentence_id))
def test_just_dump():
    """
    Check that ``just_dump`` produces a string that round-trips.

    Both the default and the ``pretty=True`` output must be of type ``str``
    and must load back to the original object via ``json.loads``; the
    default output must also round-trip through ``deserialize_line``.
    """
    payload = {"sentence": RAW_SENTENCE}
    dump_variants = ({}, {"pretty": True})

    # The dump is always a plain string, pretty-printed or not.
    for dump_kwargs in dump_variants:
        assert type(just_dump(payload, **dump_kwargs)) is str

    # Loading the dump again yields the original object.
    for dump_kwargs in dump_variants:
        assert payload == json.loads(just_dump(payload, **dump_kwargs))

    assert payload == deserialize_line(just_dump(payload))
def run(self):
    """
    Collect the ids of all input articles and write the run information.

    Each line of the input target is deserialized and the ``id`` found in
    its ``meta`` entry is collected. The ids are passed to
    ``self._generate_run_information`` and the dumped result is written as
    a single line to the output target.
    """
    encoding = self.task_config["CORPUS_ENCODING"]

    with self.input().open("r") as input_file, self.output().open(
            "w") as output_file:
        article_ids = [
            deserialize_line(raw_line, encoding)["meta"]["id"]
            for raw_line in input_file
        ]

        run_info = self._generate_run_information(article_ids)
        dumped_info = just_dump(run_info)
        output_file.write("{}\n".format(dumped_info))
def _read_pipeline_run_info(self, pri_file):
    """
    Read the current pipeline run info and store it on this instance.

    Only the first line of the file is deserialized; the result is
    assigned to ``self.pipeline_run_info``. If the file yields no lines,
    the attribute is left untouched. Nothing is returned.

    :param pri_file: File with pipeline run info.
    :type pri_file: Luigi.target.
    """
    encoding = self.task_config["CORPUS_ENCODING"]

    # Sentinel distinguishes "no line at all" from any real line value.
    nothing = object()
    first_line = next(iter(pri_file), nothing)
    if first_line is nothing:
        return

    self.pipeline_run_info = deserialize_line(first_line, encoding)
def _read_properties_file(self, properties_file):
    """
    Read all Wikidata properties from properties file.

    Every line is deserialized and the properties extracted from its
    ``data`` entry via ``self._extract_properties`` are merged into one
    dictionary. On key collisions, later lines overwrite earlier ones.

    :param properties_file: File with Wikidata properties.
    :type properties_file: luigi.Target.
    :return: Wikidata properties of all entities as dictionary.
    :rtype: dict
    """
    encoding = self.task_config["CORPUS_ENCODING"]
    entity_properties = {}

    for line in properties_file:
        # Only the "data" part of an article is relevant for properties.
        article_data = deserialize_line(line, encoding)["data"]
        entity_properties.update(self._extract_properties(article_data))

    return entity_properties