Example #1
    def _default_configs(cls) -> Config:
        # pylint: disable=protected-access
        merged = Config(cls.default_configs(), {}, allow_new_hparam=True)
        for base in cls.__bases__:
            if hasattr(base, "_default_configs"):
                merged = Config(
                    merged,
                    base._default_configs().todict(),  # type: ignore
                    allow_new_hparam=True,
                )
                break
        return merged
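As a quick illustration of the merge direction relied on above: a sketch assuming that Config(hparams, default_hparams, allow_new_hparam=True) lets the first argument override the second and keeps keys present on only one side (the import path is the usual Forte one; the dictionaries are made up):

from forte.common.configuration import Config

child_defaults = {"learning_rate": 0.01, "batch_size": 32}  # hypothetical
base_defaults = {"batch_size": 10, "device": "cpu"}         # hypothetical

merged = Config(child_defaults, base_defaults, allow_new_hparam=True)
# Expected: learning_rate=0.01, batch_size=32, device="cpu" -- the subclass
# defaults win on shared keys, and keys from either side are kept.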
Example #2
    def initialize(self, resources: Resources, configs: Config):
        # Populate the _request. The self._request_ready flag helps avoid
        # parsing the feature scheme multiple times during `initialize`.
        if not self._request_ready:
            for key, value in configs.items():
                if key == "feature_scheme":
                    self._request["schemes"] = parse_feature_extractors(
                        configs.feature_scheme)
                else:
                    self._request[key] = value
            self._request_ready = True

        batcher_config = configs.batcher
        # Assign the context type here to make sure the batcher uses the
        # same context type as the predictor.
        batcher_context = configs["batcher"].get("context_type", None)
        if (batcher_context is None
                or batcher_context == self._request["context_type"]):
            batcher_config.context_type = self._request["context_type"]
        else:
            raise ProcessorConfigError(
                "The 'context_type' configuration value should be the same "
                "for the processor and the batcher: the processor uses "
                f"{self._request['context_type']} while the batcher uses "
                f"{batcher_context}. It is also fine to leave this value "
                "empty in the batcher config.")
        self.do_eval = configs.do_eval

        # This needs to be called after the batcher config has been populated.
        super().initialize(resources, configs)
        for tag, scheme in self._request["schemes"].items():
            # Add input feature to the batcher.
            if scheme["type"] == extractor_utils.DATA_INPUT:
                self.batcher.add_feature_scheme(tag, scheme)  # type: ignore
Example #3
    def __init__(self):
        self.resources: Resources = Resources()
        self.configs: Config = Config({}, {})
        # Determine whether to check type consistency between components.
        self._check_type_consistency: bool = False
        # The flag indicating whether the component is initialized.
        self.__is_initialized: bool = False
Example #4
    def __init__(self, config: Optional[Union[Dict, Config]] = None):
        super().__init__()
        self._config = Config(hparams=config,
                              default_hparams=self.default_configs())
        self._meta_data: Dict[int, str] = {}

        index_type = self._config.index_type
        device = self._config.device
        dim = self._config.dim

        if device.lower().startswith("gpu"):
            if isinstance(index_type, str) and not index_type.startswith("Gpu"):
                index_type = "Gpu" + index_type

            index_class = utils.get_class(index_type, module_paths=["faiss"])
            gpu_resource = faiss.StandardGpuResources()
            gpu_id = int(device[3:])
            if faiss.get_num_gpus() <= gpu_id:
                gpu_id = 0
                logging.warning("Cannot create the index on device %s. "
                                "Total number of GPUs on this machine is "
                                "%s. Using gpu0 for the index.",
                                self._config.device, faiss.get_num_gpus())
            config_class_name = \
                self.INDEX_TYPE_TO_CONFIG.get(index_class.__name__)
            config = utils.get_class(config_class_name,  # type: ignore
                                     module_paths=["faiss"])()
            config.device = gpu_id
            self._index = index_class(gpu_resource, dim, config)

        else:
            index_class = utils.get_class(index_type, module_paths=["faiss"])
            self._index = index_class(dim)
Example #5
    def initialize(self, resources: Resources, configs: Config):
        self.resources = resources
        self.config = Config(configs, self.default_configs())

        # TODO: At the time of writing, there is no way in Texar to set the
        # encoder in `texar.torch.modules.classifiers.BertClassifier`. Ideally
        # we should not be changing a private variable.
        # pylint: disable=protected-access
        BERTClassifier._ENCODER_CLASS = BERTEncoder
        # pylint: enable=protected-access

        cache_dir = os.path.join(os.path.dirname(__file__),
                                 self.config.model_dir)

        self.device = torch.device('cuda:0') \
            if torch.cuda.is_available() else torch.device('cpu')

        self.model = BERTClassifier(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=cache_dir,
            hparams=self.config).to(self.device)

        self.tokenizer = BERTTokenizer(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=cache_dir,
            hparams=None)
Example #6
def main(input_path: str, output_path: str, max_packs: int = -1):
    pl = Pipeline[DataPack]()
    pl.set_reader(Mimic3DischargeNoteReader(),
                  config={'max_num_notes': max_packs})
    pl.add(NLTKSentenceSegmenter())

    config = yaml.safe_load(open("bio_ner_config.yml", "r"))
    config = Config(config, default_hparams=None)

    pl.add(BERTTokenizer(), config=config.BERTTokenizer)
    pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)
    pl.add(ElasticSearchPackIndexProcessor())

    pl.add(
        PackIdJsonPackWriter(), {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
            'drop_record': True,
            'zip_pack': True
        })

    pl.initialize()

    for idx, pack in enumerate(pl.process_dataset(input_path)):
        if (idx + 1) % 50 == 0:
            print(f"{time.strftime('%m-%d %H:%M')}: Processed {idx + 1} packs")
Example #7
    def initialize(self, config: Optional[Union[Config, Dict]] = None):
        # pylint: disable=attribute-defined-outside-init,unused-argument
        self._config = Config(config, default_hparams=self.default_configs())
        self._user_request = self._config.request
        self._validate_config()
        self._parse_request(self._user_request)
        self._build_vocab()
Example #8
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = Config(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_configs())
    pl.add(component=StandfordNLPProcessor(models_path), config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
Example #9
    def _query_tweets(self, query: str):
        """
        This function searches tweets using Tweepy.

        Args:
            query: the user's input query for the Twitter API search.

        Returns:
            List of tweets
        """
        credentials = yaml.safe_load(open(self.configs.credential_file, "r"))
        credentials = Config(credentials, default_hparams=None)

        auth = tw.OAuthHandler(credentials.consumer_key,
                               credentials.consumer_secret)
        auth.set_access_token(credentials.access_token,
                              credentials.access_token_secret)

        api = tw.API(auth, wait_on_rate_limit=True)

        # Collect tweets
        tweets = tw.Cursor(
            api.search,
            q=query,
            lang=self.configs.lang,
            since=self.configs.date_since,
            result_type=self.configs.result_type,
            tweet_mode="extended",
        ).items(self.configs.num_tweets_returned)

        return tweets
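The credential file is read into a flat Config, so a YAML file with the following keys would satisfy the attribute accesses above. This is only a sketch: the key names come from the code, everything else is a placeholder.

import yaml
from forte.common.configuration import Config

credential_yaml = """
consumer_key: YOUR_CONSUMER_KEY
consumer_secret: YOUR_CONSUMER_SECRET
access_token: YOUR_ACCESS_TOKEN
access_token_secret: YOUR_ACCESS_TOKEN_SECRET
"""
credentials = Config(yaml.safe_load(credential_yaml), default_hparams=None)
assert credentials.consumer_key == "YOUR_CONSUMER_KEY"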
Example #10
    def _parse_configs(self, configs):
        parsed_configs = self.default_configs()
        parsed_configs["batch_size"] = configs.batch_size
        parsed_configs["scope"] = get_class(configs.scope)
        parsed_configs["do_eval"] = configs.do_eval
        parsed_configs["feature_scheme"] = {}
        for tag, scheme in configs.feature_scheme.items():
            parsed_configs["feature_scheme"][tag] = {}
            if scheme["type"] == "data_input":
                parsed_configs["feature_scheme"][tag][
                    "type"] = TrainPreprocessor.DATA_INPUT
            elif scheme["type"] == "data_output":
                parsed_configs["feature_scheme"][tag][
                    "type"] = TrainPreprocessor.DATA_OUTPUT

            extractor = get_class(scheme["extractor"]["class_name"])()
            extractor.initialize(config=scheme["extractor"]["config"])
            if "vocab_path" in scheme["extractor"]:
                vocab_file = open(scheme["extractor"]["vocab_path"], "rb")
                extractor.vocab = pickle.load(vocab_file)
                vocab_file.close()
            parsed_configs["feature_scheme"][tag]["extractor"] = extractor

            if "converter" not in scheme:
                parsed_configs["feature_scheme"][tag]["converter"] = Converter(
                    {})
            else:
                parsed_configs["feature_scheme"][tag]["converter"] = scheme[
                    "converter"]
        return Config(parsed_configs, default_hparams=self.default_configs())
Example #11
    def __init__(self, reader, reader_config, indexer_config=None):
        self.reader = reader
        self.reader_config = reader_config
        self.config = indexer_config if indexer_config is not None \
            else self.default_config()
        self.config = Config(self.config, default_hparams=None)
        self.create_pipeline()
Example #12
    def __init__(self):
        super().__init__()
        self.current_batch: Dict = {}
        self.data_pack_pool: List[PackType] = []
        self.current_batch_sources: List[int] = []

        self._cross_pack: bool = True
        self.configs: Config = Config({}, {})
Example #13
    def initialize(self, resources: Resources, configs: Config):
        self.resources = resources
        self.config = Config(configs, self.default_configs())

        self.device = torch.device('cuda:0') \
            if torch.cuda.is_available() else torch.device('cpu')

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_name).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
Example #14
    def initialize(self, config: Optional[Union[Config, Dict]] = None):
        self._config = Config(
            config,
            default_hparams=self.default_configs(),
            allow_new_hparam=True,
        )
        self._user_request = self._config.request
        self._validate_config()
        self._parse_request(self._user_request)
        self._build_vocab()
Example #15
def parse_feature_extractors(scheme_configs: Config) -> Dict[str, Any]:
    feature_requests: Dict[str, Any] = {}

    for tag, scheme_config in scheme_configs.items():
        assert (
            "extractor" in scheme_config
        ), "Field not found for data request scheme: `extractor`"
        assert (
            "type" in scheme_config
        ), "Field not found for data request scheme: `type`"
        assert scheme_config["type"] in [
            "data_input",
            "data_output",
        ], "Type field must be either data_input or data_output."

        feature_requests[tag] = {}

        if scheme_config["type"] == "data_input":
            feature_requests[tag]["type"] = DATA_INPUT
        elif scheme_config["type"] == "data_output":
            feature_requests[tag]["type"] = DATA_OUTPUT

        extractor_class = get_class(scheme_config["extractor"]["class_name"])
        extractor: BaseExtractor = extractor_class()
        if not isinstance(extractor, BaseExtractor):
            raise RuntimeError(
                "Invalid extractor: ", scheme_config["extractor"]
            )

        extractor.initialize(config=scheme_config["extractor"]["config"])

        # Load vocab from disk if provided.
        if "vocab_path" in scheme_config["extractor"]:
            with open(
                scheme_config["extractor"]["vocab_path"], "rb"
            ) as vocab_file:
                extractor.vocab = pickle.load(vocab_file)

        feature_requests[tag]["extractor"] = extractor

        if "converter" not in scheme_config:
            # Create default converter if there is no given converter
            feature_requests[tag]["converter"] = Converter({})
        else:
            converter_class = get_class(
                scheme_config["converter"]["class_name"]
            )
            converter: Converter = converter_class()
            if not isinstance(converter, Converter):
                raise RuntimeError(
                    "Invalid converter: ", scheme_config["converter"]
                )
            feature_requests[tag]["converter"] = converter

    return feature_requests
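For reference, a scheme_configs value that would pass the checks above might look like the sketch below. Only the top-level field names are dictated by the asserts; the extractor class name, its config, and the tag are placeholders.

# Placeholder request: "class_name" and the extractor config are illustrative.
scheme_configs = Config(
    {
        "text_tag": {
            "type": "data_input",
            "extractor": {
                "class_name": "forte.data.extractors.AttributeExtractor",
                "config": {"entry_type": "ft.onto.base_ontology.Token"},
                # "vocab_path": "vocab.pkl",  # optional pickled vocabulary
            },
            # "converter": {...},  # optional; defaults to Converter({})
        },
    },
    default_hparams=None,
)
schemes = parse_feature_extractors(scheme_configs)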
Example #16
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())

    config = yaml.safe_load(open("config.yml", "r"))

    config = Config(config, default_hparams=None)

    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second)."
    )

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [
            (token.text, token.pos) for token in pack.get(Token, sentence)
        ]
        entities = [
            (entity.text, entity.ner_type)
            for entity in pack.get(EntityMention, sentence)
        ]
        print(colored("Tokens:", "red"), tokens, "\n")
        print(colored("EntityMentions:", "red"), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", "red"))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()
            child: PredicateArgument = link.get_child()
            print(
                f'  - "{child.text}" is role {link.arg_type} of '
                f'predicate "{parent.text}"'
            )
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", "green"))
Example #17
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        self.resources = resources
        self.config = Config(configs, self.default_configs())
        if not self.config.pretrained_model_name:
            raise ValueError("Please specify a pretrained bert model")
        self.tokenizer = BERTTokenizer(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=None,
            hparams=None,
        )
Example #18
    def initialize(self, config: Union[Dict, Config]):
        self.config = Config(config, self.default_configs())

        if self.config.vocab_method != "custom":
            self._vocab = Vocabulary(
                method=self.config.vocab_method,
                use_pad=self.config.need_pad,
                use_unk=self.config.vocab_use_unk,
                pad_value=self.config.pad_value,
                unk_value=self.config.unk_value,
            )
        else:
            self._vocab = None
        self._vocab_method = self.config.vocab_method
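For context, a config that reaches the non-"custom" branch above could look like the following sketch; the key names mirror the attribute accesses in initialize, while the values are purely illustrative.

# Illustrative values; any vocab_method other than "custom" builds a Vocabulary.
extractor_config = {
    "vocab_method": "indexing",
    "need_pad": True,
    "vocab_use_unk": True,
    "pad_value": 0,
    "unk_value": 1,
}
# extractor.initialize(extractor_config)  # hypothetical extractor instance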
Example #19
    def __init__(self, config: Union[Dict, Config]):
        self.config = Config(config, self.default_configs())

        if self.config.entry_type is None:
            raise AttributeError("entry_type needs to be specified in "
                                 "the configuration of an extractor.")

        if self.config.vocab_method != "raw":
            self.vocab: Optional[Vocabulary] = \
                Vocabulary(method=self.config.vocab_method,
                           need_pad=self.config.need_pad,
                           use_unk=self.config.vocab_use_unk)
        else:
            self.vocab = None
Example #20
def main(args):
    """
    Build a pipeline to process the MS MARCO dataset with
    MSMarcoPassageReader and build an Elasticsearch index.
    """
    # config_file = os.path.join(os.path.dirname(__file__), 'config.yml')
    config_file = args.config_file
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)

    pipeline = Pipeline[DataPack]()

    pipeline.set_reader(MSMarcoPassageReader())
    pipeline.add(ElasticSearchTextIndexProcessor(), config=config.create_index)
    pipeline.run(args.data_dir)
Example #21
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        # Validate multi_pack project config:
        #   A `multi_pack` project must have `multi_ontology` set.
        if self.configs.project_type != "single_pack" and (
                self.configs.project_type != "multi_pack"
                or self.configs.multi_ontology is None):
            raise ProcessorConfigError("Invalid project type configuration.")

        # Generate default configurations
        self.configs.project_configs = Config(
            hparams=self.configs.project_configs,
            default_hparams=self._default_project_configs(),
        )
        self.configs.multi_ontology = \
            self.configs.multi_ontology or Config({}, {})
        self.configs.project_path = os.path.abspath(
            self.configs.project_path or self.configs.project_name)

        self._viewer = StaveViewer(
            project_path=self.configs.project_path,
            host=self.configs.host,
            port=self.configs.port,
            thread_daemon=self.configs.server_thread_daemon,
        )

        # Write metadata to the project folder.
        self._project_writer = StaveProjectWriter(
            project_path=self.configs.project_path,
            project_name=self.configs.project_name,
            project_type=self.configs.project_type,
            ontology=self.resources.get("onto_specs_dict"),
            project_configs=self.configs.project_configs.todict(),
            multi_ontology=self.configs.multi_ontology.todict(),
        )
Example #22
    def make_configs(
        cls,
        configs: Optional[Union[Config, Dict[str, Any]]],
    ) -> Config:
        """
        Create the configuration by merging the
        provided config with the `default_configs`.

        The following config conventions are expected:
          - The top level key can be a special `@config_path`.

          - `@config_path` should point to a file system path, which will
             be a YAML file containing configurations.

          - Other key values in the configs will be considered as parameters.

        Args:
            configs: The input config to be merged with the default config.

        Returns:
            The merged configuration.
        """
        merged_configs: Dict = {}

        if configs is not None:
            if isinstance(configs, Config):
                configs = configs.todict()

            if configs.get("@config_path", None) is not None:
                with open(configs.pop("@config_path"), encoding="utf-8") as f:
                    filebased_configs = yaml.safe_load(f)
            else:
                filebased_configs = {}

            merged_configs.update(filebased_configs)

            merged_configs.update(configs)

        try:
            final_configs = Config(merged_configs,
                                   cls._default_configs().todict())
        except ValueError as e:
            raise ProcessorConfigError(
                f"Configuration error for the processor "
                f"{get_full_module_name(cls)}.") from e

        return final_configs
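A small usage sketch of the `@config_path` convention described in the docstring; the file name, the processor class, and the parameter key below are hypothetical.

# Hypothetical input: "proc_config.yml" holds additional key/value pairs.
configs = {
    "@config_path": "proc_config.yml",
    "batch_size": 16,  # inline keys override keys loaded from the YAML file
}
# final_config = SomeProcessor.make_configs(configs)
# The YAML contents are applied first, the remaining inline keys second, and
# the merged dict is validated against SomeProcessor._default_configs().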
Example #23
def main(dataset_dir: str):
    config = yaml.safe_load(open("config.yml", "r"))
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", "red"), pack.pack_name)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", "red"), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [
                (token.text, token.pos) for token in pack.get(Token, sentence)
            ]
            entities = [
                (entity.text, entity.ner_type)
                for entity in pack.get(EntityMention, sentence)
            ]
            print(colored("Tokens:", "red"), tokens, "\n")
            print(colored("EntityMentions:", "red"), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", "red"))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(
                    f'  - "{child.text}" is role {link.arg_type} of '
                    f'predicate "{parent.text}"'
                )
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", "green"))
Example #24
    def __init__(self,
                 pack_iterator: Iterator[DataPack],
                 request: Dict,
                 config: Optional[Union[Config, Dict]] = None):
        self._config: Config = \
            Config(config, default_hparams=self.default_configs())
        self._validate_config()

        self._pack_iterator: Iterator[DataPack] = pack_iterator
        self._cached_packs: List[DataPack] = []

        self._user_request: Dict = request
        self._request: Dict = {}
        self._request_ready: bool = False
        self._vocab_ready: bool = False

        self._parse_request(self._user_request)
        self._build_vocab()
Example #25
    def initialize(self, config: Union[Dict, Config]):
        # pylint: disable=attribute-defined-outside-init
        self.config = Config(config, self.default_configs())
        if self.config.entry_type is None:
            raise AttributeError("`entry_type` needs to be specified in "
                                 "the configuration of an extractor.")
        self._entry_type = get_class(self.config.entry_type)

        if self.config.vocab_method != "custom":
            self._vocab = Vocabulary(
                method=self.config.vocab_method,
                use_pad=self.config.need_pad,
                use_unk=self.config.vocab_use_unk,
                pad_value=self.config.pad_value,
                unk_value=self.config.unk_value,
            )
        else:
            self._vocab = None
        self._vocab_method = self.config.vocab_method
Example #26
    def make_configs(
            cls, configs: Optional[Union[Config, Dict[str, Any]]]) -> Config:
        """
        Create the component configuration for this class by merging the
        provided config with the ``default_config``.

        The following config conventions are expected:
          - The top level key can be a special `config_path`.
          - `config_path` should point to a file system path, which will
             be a YAML file containing configurations.
          - Other key values in the configs will be considered as parameters.

        Args:
            configs: The input config to be merged with the default config.

        Returns:
            The merged configuration.
        """
        merged_configs: Dict = {}

        if configs is not None:
            if isinstance(configs, Config):
                configs = configs.todict()

            if "config_path" in configs and not configs["config_path"] is None:
                filebased_configs = yaml.safe_load(
                    open(configs.pop("config_path")))
            else:
                filebased_configs = {}

            merged_configs.update(filebased_configs)

            merged_configs.update(configs)

        try:
            final_configs = Config(merged_configs, cls.default_configs())
        except ValueError as e:
            raise ProcessorConfigError(
                f'Configuration error for the processor '
                f'{get_full_module_name(cls)}.') from e

        return final_configs
Example #27
    def load_pretrained_config(
        self,
        pretrained_model_name: Optional[str] = None,
        cache_dir: Optional[str] = None,
        hparams=None,
    ):
        r"""Load paths and configurations of the pre-trained model.

        Args:
            pretrained_model_name (optional): A str with the name
                of a pre-trained model to load. If `None`, will use the model
                name in :attr:`hparams`.
            cache_dir (optional): The path to a folder in which the
                fine-tuned model is present.
            hparams (dict or HParams, optional): Hyperparameters. Missing
                hyperparameters will be set to default values. See
                :meth:`default_hparams` for the hyperparameter structure
                and default values.
        """

        self.pretrained_model_name = (hparams["pretrained_model_name"]
                                      if pretrained_model_name is None else
                                      pretrained_model_name)

        rel_dir = hparams["model_dir"] if cache_dir is None else cache_dir
        self.cache_dir = os.path.join(os.path.dirname(__file__), rel_dir)

        if self.pretrained_model_name is None or self.cache_dir is None:
            raise ValueError("Pre-trained model name and directory should"
                             "be defined in the fine tuned BERT model.")

        self.pretrained_model_dir = os.path.join(self.cache_dir,
                                                 self.pretrained_model_name)

        pretrained_model_hparams = self._transform_config(
            self.pretrained_model_name, self.pretrained_model_dir)

        super_params = self.default_hparams()
        if "prefix" not in super_params:
            super_params["prefix"] = "_encoder.encoder."
        self._hparams = Config(pretrained_model_hparams, super_params)
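Roughly how this might be called on a fine-tuned checkpoint; the model name, directory layout, and the `encoder` instance below are assumptions, not taken from the source.

# Sketch assuming a hypothetical `encoder` instance of the class above and a
# checkpoint stored under <module dir>/models/bert-base-uncased/.
encoder.load_pretrained_config(
    pretrained_model_name="bert-base-uncased",  # placeholder model name
    cache_dir="models",                         # placeholder directory
)
# Equivalently, both values can come from hparams:
encoder.load_pretrained_config(
    hparams={"pretrained_model_name": "bert-base-uncased",
             "model_dir": "models"},
)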
Example #28
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())

    config = Config(
        {
            "processors": "tokenize,pos,lemma,depparse",
            "lang": lang,
            # Language code for the language to build the Pipeline
            "use_gpu": False,
        },
        StandfordNLPProcessor.default_configs(),
    )
    pl.add(component=StandfordNLPProcessor(), config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", "red"), tokens, "\n")

        print(colored("Dependency Relations:", "red"))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(
                colored(child.text, "cyan"),
                "has relation",
                colored(link.rel_type, "green"),  # type: ignore
                "of parent",
                colored(parent.text, "cyan"),
            )

        print("\n----------------------\n")
Example #29
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs: str, output_path: str):
    # Load redirects.
    print_progress('Loading redirects', '\n')
    logging.info("Loading redirects")
    redirect_pickle = os.path.join(output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)
    print_progress('\nLoading redirects', '\n')
    logging.info("Done loading.")

    # The datasets are read in two steps.
    raw_pack_dir = os.path.join(output_path, 'nif_raw')

    # First, we create the NIF reader that reads the NIF data in order.
    nif_pl = Pipeline[DataPack]()
    nif_pl.resource.update(redirects=redirect_map)

    nif_pl.set_reader(DBpediaWikiReader(), config=Config(
        {
            'redirect_path': redirects,
            'nif_page_structure': nif_page_structure,
            'nif_text_links': nif_text_links,
        },
        DBpediaWikiReader.default_configs()
    ))

    nif_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': raw_pack_dir,
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))

    nif_pl.initialize()
    logging.info('Start running the DBpedia text pipeline.')
    print_progress('Start running the DBpedia text pipeline.', '\n')
    nif_pl.run(nif_context)

    # Second, we add info boxes to the packs with NIF.
    ib_pl = Pipeline[DataPack]()
    ib_pl.resource.update(redirects=redirect_map)
    ib_pl.set_reader(DBpediaInfoBoxReader(), config=Config(
        {
            'pack_index': os.path.join(raw_pack_dir, 'article.idx'),
            'pack_dir': raw_pack_dir,
            'mapping_literals': mapping_literals,
            'mapping_objects': mapping_objects,
            'reading_log': os.path.join(output_path, 'infobox.log')
        },
        DBpediaInfoBoxReader.default_configs()
    ))

    ib_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': os.path.join(output_path, 'nif_info_box'),
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))

    # Now we run the info box pipeline.
    ib_pl.run(info_boxs)
Example #30
    def __init__(self) -> None:
        super().__init__()
        self.config = Config(None, self.default_configs())