예제 #1
0
    def process_article(self, raw_article, database, entity_properties):
        """
        Write all relations of one serialized article to the database.

        The article is deserialized using the ``CORPUS_ENCODING`` entry of the
        task config. Every relation of every sentence is then passed to
        ``database.add_relation`` together with the sentence it belongs to and
        the given entity properties. If ``PIPELINE_DEBUG`` is set in the task
        config, progress messages are printed.

        :param raw_article: Raw (still serialized) article.
        :type raw_article: str
        :param database: Neo4j luigi target.
        :type database: bwg.db.neo4j.Neo4jTarget
        :param entity_properties: Wikidata properties of all entities as dictionary.
        :type entity_properties: dict
        """
        debug = self.task_config.get("PIPELINE_DEBUG", False)
        encoding = self.task_config["CORPUS_ENCODING"]
        article = deserialize_line(raw_article, encoding)
        article_meta, article_data = article["meta"], article["data"]
        task_name = self.__class__.__name__

        if debug:
            print("{} processing article '{}'...".format(
                task_name, article_meta["title"]))

        for sentence_id, sentence_json in article_data.items():
            sentence_data = sentence_json["data"]

            # Only the relation payloads are needed here, not their IDs.
            for relation_json in sentence_data["relations"].values():
                database.add_relation(relation_json,
                                      sentence_data["sentence"],
                                      entity_properties)

            # Report completion only after the relations were actually added.
            if debug:
                print("{} finished sentence #{}.".format(
                    task_name, sentence_id))
예제 #2
0
 def test_just_dump():
     """
     Check that just_dump returns a string that round-trips to its input.

     Both the default and the ``pretty=True`` output are checked.
     """
     json_object = {"sentence": RAW_SENTENCE}

     # Dump once per mode and reuse the results in all assertions below.
     plain_dump = just_dump(json_object)
     pretty_dump = just_dump(json_object, pretty=True)

     # isinstance is the idiomatic type check (instead of type(x) == str).
     assert isinstance(plain_dump, str)
     assert isinstance(pretty_dump, str)

     # The dumped text has to be valid JSON that restores the original object.
     assert json_object == json.loads(plain_dump)
     assert json_object == json.loads(pretty_dump)
     assert json_object == deserialize_line(plain_dump)
    def run(self):
        """
        Collect the IDs of all input articles and write the run information.

        Every line of the input target is deserialized using the
        ``CORPUS_ENCODING`` entry of the task config and its ``meta.id`` is
        collected. The collected IDs are passed to
        ``_generate_run_information`` and the dumped result is written as a
        single line to the output target.
        """
        encoding = self.task_config["CORPUS_ENCODING"]

        with self.input().open("r") as source, \
                self.output().open("w") as sink:
            article_ids = [
                deserialize_line(raw_line, encoding)["meta"]["id"]
                for raw_line in source
            ]

            summary = self._generate_run_information(article_ids)
            sink.write("{}\n".format(just_dump(summary)))
예제 #4
0
    def _read_pipeline_run_info(self, pri_file):
        """
        Read the current pipeline run info.

        Only the first line of the file is read. Its deserialized content is
        stored in ``self.pipeline_run_info`` and also returned. If the file
        contains no lines, ``self.pipeline_run_info`` is left untouched and
        ``None`` is returned.

        :param pri_file: File with pipeline run info.
        :type pri_file: luigi.Target
        :return: Pipeline run info or None if the file is empty.
        :rtype: dict or None
        """
        encoding = self.task_config["CORPUS_ENCODING"]

        # Fetch just the first line; any further lines are ignored.
        first_line = next(iter(pri_file), None)

        if first_line is None:
            return None

        self.pipeline_run_info = deserialize_line(first_line, encoding)
        return self.pipeline_run_info
예제 #5
0
    def _read_properties_file(self, properties_file):
        """
        Read all Wikidata properties from properties file.

        Every line is deserialized using the ``CORPUS_ENCODING`` entry of the
        task config. The properties extracted from its ``data`` part by
        ``_extract_properties`` are merged into one dictionary; entries of
        later lines replace entries of earlier lines with the same key.

        :param properties_file: File with Wikidata properties.
        :type properties_file: luigi.Target
        :return: Wikidata properties of all entities as dictionary.
        :rtype: dict
        """
        encoding = self.task_config["CORPUS_ENCODING"]
        entity_properties = {}

        for line in properties_file:
            article = deserialize_line(line, encoding)

            # Only the data part is needed; the article meta is not used here.
            entity_properties.update(
                self._extract_properties(article["data"]))

        return entity_properties