Example 1
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    ner_predictor = CoNLLNERPredictor()

    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entries in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entries in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
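These listings show only the driver functions; a hedged sketch of the imports they rely on follows (the module paths are assumptions and have shifted between Forte releases):

import os
import yaml
from termcolor import colored
from texar.torch import HParams

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.pipeline import Pipeline
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.data.readers import (
    StringReader, PlainTextReader, MultiPackTerminalReader)
from ft.onto.base_ontology import (
    Sentence, Token, EntityMention, PredicateLink,
    PredicateMention, PredicateArgument)
# Processor and predictor paths are the most version-dependent, e.g.:
# from forte.processors.nltk_processors import (
#     NLTKSentenceSegmenter, NLTKWordTokenizer, NLTKPOSTagger)
# from forte.processors.ner_predictor import CoNLLNERPredictor
# from forte.processors.srl_predictor import SRLPredictor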
Example 2
def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add(
        component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + '_0'

    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    return query_pipeline
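A hedged usage sketch for setup(); the YAML file name and its keys are assumptions modeled on Examples 5 and 6:

# Hypothetical driver for Example 2.
config = Config(yaml.safe_load(open("config.yml", "r")), default_hparams=None)
pipeline = setup(config)
for m_pack in pipeline.process_dataset():
    # MultiPackTerminalReader prompts on the terminal for each query,
    # so process_dataset() takes no arguments here (as in Example 6).
    print(m_pack.get_pack(config.back_translator.in_pack_name).text)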
Example 3
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entries in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entries in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
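A hypothetical entry point for this example; all three paths are placeholders, not values from the source:

if __name__ == "__main__":
    # Placeholder paths for the plain-text corpus and the trained models.
    main("data/", "ner_model/", "srl_model/")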
Example 4
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entries in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entries in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
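Example 4 reads its predictor settings from config.yml. A minimal sketch of the equivalent configuration built directly in Python; the paths are placeholders, and the storage_path layout follows Example 1:

# Hypothetical equivalent of a minimal config.yml for Example 4.
config = HParams({
    "NER": {"storage_path": "ner_model/resources.pkl"},
    "SRL": {"storage_path": "srl_model/"},
}, default_hparams=None)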
Example 5
def main(dataset_dir: str):
    config = yaml.safe_load(open("config.yml", "r"))
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entries in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entries in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
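A hypothetical entry point for Example 5; the directory is a placeholder, and config.yml must sit next to the script:

if __name__ == "__main__":
    main("data/")  # placeholder dataset directory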
Example 6
def main():

    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(reader=MultiPackTerminalReader(),
                              config=config.reader)

    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.translator)
    query_pipeline.add_processor(processor=BertBasedQueryCreator(),
                                 config=config.query_creator)
    query_pipeline.add_processor(processor=SearchProcessor(),
                                 config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(),
        config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
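Example 6 reads several sections from config.yml. A sketch of that structure as the equivalent Python dict; every value below is a placeholder guessed from how the keys are used, not the authors' settings:

# Hypothetical configuration for Example 6; values are placeholders.
config = HParams({
    "reader": {},                                  # MultiPackTerminalReader
    "translator": {"in_pack_name": "query"},       # plus API credentials
    "query_creator": {},
    "indexer": {"response_pack_name": ["doc_0"]},  # consumed as [0] above
    "SRL": {"storage_path": "srl_model/"},
    "back_translator": {"in_pack_name": "response"},
}, default_hparams=None)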
Example 7
def main():

    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print(f"Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print(f"Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)
        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx+1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #    processor=CoNLLNERPredictor(), config=config.NER,
    #    selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
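Example 7 uses device and get_embeddings() without showing their definitions. A minimal sketch under the assumption that the helper simply runs texar's BERTEncoder forward pass, which returns (sequence_output, pooled_output):

import torch

# Assumed to be defined at module level in the original script.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_embeddings(encoder, input_ids, segment_ids):
    # Embed without gradient tracking; the caller keeps only the
    # pooled output for indexing.
    with torch.no_grad():
        return encoder(inputs=input_ids.to(device),
                       segment_ids=segment_ids.to(device))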