Example #1
class DSLSharedTaskDataset(DatasetReader):
    def __init__(self):
        super(DSLSharedTaskDataset, self).__init__(lazy=False)
        self.tokenizer = SpacyTokenizer()
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    def _read(self, text_path: str) -> Iterable[Instance]:
        with open(text_path, "r") as text_data:
            text_data = text_data.read().splitlines()
            for line in text_data:
                try:
                    text, label = line.strip().split('\t')
                except ValueError:
                    # Skip malformed lines rather than reusing values from a previous iteration.
                    print(line)
                    continue
                text_field = TextField(self.tokenizer.tokenize(text),
                                       self.token_indexers)
                label_field = LabelField(label)
                fields = {'text': text_field, 'label': label_field}
                yield Instance(fields)

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        tokens = self.tokenizer.tokenize(text)
        text_field = TextField(tokens, self.token_indexers)
        fields = {'text': text_field}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
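A minimal sketch of how this reader might be driven, assuming a tab-separated file of "text<TAB>label" lines; the file path and the vocabulary step are illustrative additions, not part of the original example:

from allennlp.data import Vocabulary

# Hypothetical usage of the reader above; "dsl_train.tsv" is a made-up path.
reader = DSLSharedTaskDataset()
instances = list(reader.read("dsl_train.tsv"))   # instances yielded by _read
vocab = Vocabulary.from_instances(instances)     # build token and label vocabularies from the data
print(len(instances), vocab.get_vocab_size("tokens"))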
Example #2
def read_dataset(file_path):
    with open(file_path) as dataset_file:
        tokenizer = SpacyTokenizer()
        dataset_json = json.load(dataset_file)
        dialogs = []
        for dialog in dataset_json:
            dialog_idx = dialog["dialogue_idx"]
            dialog = dialog['dialogue']
            dialog_context = None
            for turn_i, turn in enumerate(dialog):
                sys_utt = turn['system_transcript']
                user_utt = turn['transcript']
                tokenized_sys_utt = tokenizer.tokenize(sys_utt)
                if turn_i != 0:
                    tokenized_sys_utt = [Token(text="<S>", lemma_="<S>")
                                         ] + tokenized_sys_utt
                tokenized_user_utt = tokenizer.tokenize(user_utt)
                if turn_i != len(dialog) - 1:
                    tokenized_user_utt = tokenized_user_utt + [
                        Token(text="</S>", lemma_="</S>")
                    ]
                if dialog_context is None:
                    dialog_context = tokenized_sys_utt + tokenized_user_utt
                else:
                    dialog_context += tokenized_sys_utt + tokenized_user_utt
            dialog_context = [t.text for t in dialog_context]
            dialogs.append((dialog_idx, [dialog_context]))
    return dialogs
Example #3
    def test_squad_with_unwordpieceable_passage(self):

        tokenizer = SpacyTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = (
            "There were four major HDTV systems tested by SMPTE in the late 1970s, "
            "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
        )
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = (
            "Broca, being what today would be called a neurosurgeon, "
            "had taken an interest in the pathology of speech. He wanted "
            "to localize the difference between man and the other animals, "
            "which appeared to reside in speech. He discovered the speech "
            "center of the human brain, today called Broca's area after him. "
            "His interest was mainly in Biological anthropology, but a German "
            "philosopher specializing in psychology, Theodor Waitz, took up the "
            "theme of general and social anthropology in his six-volume work, "
            "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
            """soon translated as "The Anthropology of Primitive Peoples". """
            "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import (
            make_reading_comprehension_instance, )

        instance1 = make_reading_comprehension_instance(
            tokenizer.tokenize(question1),
            tokenizer.tokenize(passage1),
            {"bert": token_indexer},
            passage1,
        )

        instance2 = make_reading_comprehension_instance(
            tokenizer.tokenize(question2),
            tokenizer.tokenize(passage2),
            {"bert": token_indexer},
            passage2,
        )

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example #4
    def test_keep_spacy_tokens(self):
        word_tokenizer = SpacyTokenizer()
        sentence = "This should be an allennlp Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
Example #5
 def test_empty_list_can_be_tensorized(self):
     tokenizer = SpacyTokenizer()
     tokens = tokenizer.tokenize("Foo")
     text_field = TextField(tokens, self.word_indexer)
     list_field = ListField([text_field.empty_field()])
     fields = {
         "list": list_field,
         "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
     }
     instance = Instance(fields)
     instance.index_fields(self.vocab)
     instance.as_tensor_dict()
Example #6
    def test_never_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = SpacyTokenizer()

        #            2 15 10 11  6
        sentence = "the laziest fox"

        tokens = tokenizer.tokenize(sentence)
        tokens.append(Token("[PAD]"))  # have to do this b/c tokenizer splits it in three

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # PAD should get recognized and not lowercased      # [PAD]
        assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 0, 17]

        # Unless we manually override the set of tokens that are never lowercased
        token_indexer = PretrainedBertIndexer(
            str(vocab_path), do_lowercase=True, never_lowercase=()
        )
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # now PAD should get lowercased and be UNK          # [UNK]
        assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 1, 17]
Example #7
 def test_char_span_to_token_span_handles_hard_cases(self):
     # An earlier version of the code had a hard time when the answer was the last token in the
     # passage.  This tests that case, on the instance that used to fail.
     tokenizer = SpacyTokenizer()
     passage = (
         "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
         +
         'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
         +
         "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
         +
         "Z's girlfriend in the music video for the song, which would further fuel "
         +
         "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
         +
         "married without publicity. As of April 2014, the couple have sold a combined 300 "
         +
         "million records together. The couple are known for their private relationship, "
         +
         "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
         +
         'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
         +
         "she had ever endured. She returned to the studio and wrote music in order to cope "
         +
         "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
         +
         "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
     )
     start = 912
     end = 912 + len("Paris.")
     tokens = tokenizer.tokenize(passage)
     offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
     token_span = util.char_span_to_token_span(offsets, (start, end))[0]
     assert token_span == (184, 185)
Example #8
def read(fn: str) -> Iterable[List[Extraction]]:
    tokenizer = SpacyTokenizer(pos_tags=True)
    prev_sent: List[Extraction] = []

    with open(fn) as fin:
        for line in tqdm(fin):
            data = line.strip().split("\t")
            confidence = data[0]
            if not all(data[2:5]):
                # Make sure that all required elements are present
                continue
            arg1, rel, args2 = (parse_element(e) for e in data[2:5])

            # Exactly one subject and one relation
            # and at least one object
            if len(rel) == 1 and len(arg1) == 1 and len(args2) >= 1:
                sent = data[5]
                cur_ex = Extraction(
                    sent=sent,
                    toks=tokenizer.tokenize(sent),
                    arg1=arg1[0],
                    rel=rel[0],
                    args2=args2,
                    confidence=confidence,
                )

                # Decide whether to append or yield
                if not prev_sent or prev_sent[0].sent == sent:
                    prev_sent.append(cur_ex)
                else:
                    yield prev_sent
                    prev_sent = [cur_ex]
    if prev_sent:
        # Yield last element
        yield prev_sent
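The generator above groups extractions by their source sentence; a hedged sketch of consuming it (the input path is hypothetical):

# Hypothetical driver for read(); "extractions.tsv" is a placeholder path.
for extractions in read("extractions.tsv"):
    # every Extraction in one yielded group shares the same source sentence
    print(len(extractions), extractions[0].sent)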
Example #9
def search(
    tables_directory: str,
    data: JsonDict,
    output_path: str,
    max_path_length: int,
    max_num_logical_forms: int,
    use_agenda: bool,
    output_separate_files: bool,
    conservative_agenda: bool,
) -> None:
    print(f"Starting search with {len(data)} instances", file=sys.stderr)
    language_logger = logging.getLogger("allennlp.semparse.domain_languages.wikitables_language")
    language_logger.setLevel(logging.ERROR)
    tokenizer = SpacyTokenizer()
    if output_separate_files and not os.path.exists(output_path):
        os.makedirs(output_path)
    if not output_separate_files:
        output_file_pointer = open(output_path, "w")
    for instance_data in data:
        utterance = instance_data["question"]
        question_id = instance_data["id"]
        if utterance.startswith('"') and utterance.endswith('"'):
            utterance = utterance[1:-1]
        # For example: csv/200-csv/47.csv -> tagged/200-tagged/47.tagged
        table_file = instance_data["table_filename"].replace("csv", "tagged")
        target_list = instance_data["target_values"]
        tokenized_question = tokenizer.tokenize(utterance)
        table_file = f"{tables_directory}/{table_file}"
        context = TableQuestionContext.read_from_file(table_file, tokenized_question)
        world = WikiTablesLanguage(context)
        walker = ActionSpaceWalker(world, max_path_length=max_path_length)
        correct_logical_forms = []
        if use_agenda:
            agenda = world.get_agenda(conservative=conservative_agenda)
            allow_partial_match = not conservative_agenda
            all_logical_forms = walker.get_logical_forms_with_agenda(
                agenda=agenda, max_num_logical_forms=10000, allow_partial_match=allow_partial_match
            )
        else:
            all_logical_forms = walker.get_all_logical_forms(max_num_logical_forms=10000)
        for logical_form in all_logical_forms:
            if world.evaluate_logical_form(logical_form, target_list):
                correct_logical_forms.append(logical_form)
        if output_separate_files and correct_logical_forms:
            with gzip.open(f"{output_path}/{question_id}.gz", "wt") as output_file_pointer:
                for logical_form in correct_logical_forms:
                    print(logical_form, file=output_file_pointer)
        elif not output_separate_files:
            print(f"{question_id} {utterance}", file=output_file_pointer)
            if use_agenda:
                print(f"Agenda: {agenda}", file=output_file_pointer)
            if not correct_logical_forms:
                print("NO LOGICAL FORMS FOUND!", file=output_file_pointer)
            for logical_form in correct_logical_forms[:max_num_logical_forms]:
                print(logical_form, file=output_file_pointer)
            print(file=output_file_pointer)
    if not output_separate_files:
        output_file_pointer.close()
Example #10
 def test_passes_through_correctly(self):
     tokenizer = SpacyTokenizer()
     sentence = "this (sentence) has 'crazy' \"punctuation\"."
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     expected_tokens = [
         "this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"",
         "punctuation", "\"", "."
     ]
     self.assertSequenceEqual(tokens, expected_tokens)
Example #11
 def test_crashes_with_empty_feature_value_and_no_default(self):
     tokenizer = SpacyTokenizer(parse=True)
     tokens = tokenizer.tokenize("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
     vocab.add_token_to_namespace("NONE", namespace="dep_labels")
     indexer = SingleIdTokenIndexer(namespace="dep_labels",
                                    feature_name="dep_")
     with pytest.raises(ValueError):
         indexer.tokens_to_indices([tokens[-1]], vocab)
Example #12
    def test_no_namespace_means_no_counting(self):
        tokenizer = SpacyTokenizer(parse=True)
        tokens = tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = SingleIdTokenIndexer(namespace=None, feature_name="text_id")

        def fail():
            assert False

        counter = defaultdict(fail)
        for token in tokens:
            indexer.count_vocab_items(token, counter)
Example #13
class SentenceClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer()

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance(tokens)
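A hedged sketch of using this predictor with a trained model; load_archive and the archive path are assumptions here, and archive.dataset_reader requires an AllenNLP version whose Archive carries the dataset reader:

from allennlp.models.archival import load_archive

# Hypothetical usage; "model.tar.gz" is a placeholder for a trained archive.
archive = load_archive("model.tar.gz")
predictor = SentenceClassifierPredictor(archive.model, archive.dataset_reader)
print(predictor.predict("This movie was surprisingly good."))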
Example #14
class CitationDataSetReader(DatasetReader):
    """
    We implement this CitationDataSetReader class by subclassing DatasetReader class,
    we also need to override some super class methods

    This CitationDataSetReader class reads the datasets(train|dev|test) and converts them to a collection of Instances.
    We used the default SpacyTokenizer for this project.

    We also need to register this dataset reader, for the Config files to be able to use this class.
    """
    def __init__(self):
        super().__init__()
        # default Spacy Tokenizer
        self.tokenizer = SpacyTokenizer()

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """

        This function reads the JSON Lines file, tokenize the text for each data point
         and returns a collection of Instances, each instance with tokens and label

        :param file_path: takes the file path as an Argument
        :return: returns a collection of Instances
        """
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text,
                                        intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str, intent: str) -> Instance:
        """
        :param citation_text: text from the data point
        :param intent: true label of the data instance
        :return: returns Instance class object with tokens & label fields.
        """

        citation_tokens = self.tokenizer.tokenize(citation_text)
        # Use ELMO Token Characters Indexer
        token_indexers = {
            "elmo": ELMoTokenCharactersIndexer(),
            "tokens": SingleIdTokenIndexer()
        }

        fields = {
            'tokens': TextField(citation_tokens, token_indexers),
            'label': LabelField(intent)
        }

        return Instance(fields)
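As the class docstring notes, the reader has to be registered before a config file can refer to it; a minimal sketch, with the registration name chosen here purely for illustration:

from allennlp.data import DatasetReader

# Hypothetical registration; config files could then use "type": "citation_dataset_reader".
@DatasetReader.register("citation_dataset_reader")
class CitationDataSetReader(DatasetReader):
    ...  # body as defined above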
Example #15
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace("s", "characters")
        self.vocab.add_token_to_namespace("e", "characters")
        self.vocab.add_token_to_namespace("n", "characters")
        self.vocab.add_token_to_namespace("t", "characters")
        self.vocab.add_token_to_namespace("c", "characters")
        for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
            self.vocab.add_token_to_namespace(label, "labels")

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words":
            SingleIdTokenIndexer("words"),
            "characters":
            TokenCharactersIndexer("characters", min_padding_length=1),
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field(
        )

        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        empty_list_field = ListField([text_field.empty_field()])
        empty_fields = {"list_tensor": empty_list_field}
        self.empty_instance = Instance(empty_fields)

        non_empty_list_field = ListField([text_field])
        non_empty_fields = {"list_tensor": non_empty_list_field}
        self.non_empty_instance = Instance(non_empty_fields)

        super().setUp()
Example #16
class SentenceClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(language='en_core_web_sm',
                                         pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance([str(t) for t in tokens])
Example #17
    def test_enumerate_spans_enumerates_all_spans(self):
        tokenizer = SpacyTokenizer(pos_tags=True)
        sentence = tokenizer.tokenize("This is a sentence.")

        spans = span_utils.enumerate_spans(sentence)
        assert spans == [
            (0, 0),
            (0, 1),
            (0, 2),
            (0, 3),
            (0, 4),
            (1, 1),
            (1, 2),
            (1, 3),
            (1, 4),
            (2, 2),
            (2, 3),
            (2, 4),
            (3, 3),
            (3, 4),
            (4, 4),
        ]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2)
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (2, 4),
                         (3, 4)]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2,
                                           offset=20)
        assert spans == [(20, 21), (20, 22), (21, 22), (21, 23), (22, 23),
                         (22, 24), (23, 24)]

        def no_prefixed_punctuation(tokens: List[Token]):
            # Only include spans which don't start or end with punctuation.
            return tokens[0].pos_ != "PUNCT" and tokens[-1].pos_ != "PUNCT"

        spans = span_utils.enumerate_spans(
            sentence,
            max_span_width=3,
            min_span_width=2,
            filter_function=no_prefixed_punctuation)

        # No longer includes (2, 4) or (3, 4) as these include punctuation
        # as their last element.
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]
Example #18
class PlainTextReader(DatasetReader):
    def __init__(self):
        super().__init__()
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self._tokenizer = SpacyTokenizer()

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as input_file:
            for line in input_file:
                yield self.text_to_instance(line)

    def text_to_instance(self, line: str) -> Instance:  # type: ignore

        tokens = self._tokenizer.tokenize(line)
        return Instance({"line": TextField(tokens, self._token_indexers)})
Example #19
 def test_tokens_to_indices_with_non_default_feature_name(self):
     tokenizer = SpacyTokenizer(parse=True)
     tokens = tokenizer.tokenize("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     root_index = vocab.add_token_to_namespace("ROOT",
                                               namespace="dep_labels")
     none_index = vocab.add_token_to_namespace("NONE",
                                               namespace="dep_labels")
     indexer = SingleIdTokenIndexer(namespace="dep_labels",
                                    feature_name="dep_",
                                    default_value="NONE")
     assert indexer.tokens_to_indices([tokens[1]], vocab) == {
         "tokens": [root_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab) == {
         "tokens": [none_index]
     }
Example #20
    def test_count_vocab_items_with_non_default_feature_name(self):
        tokenizer = SpacyTokenizer(parse=True)
        tokens = tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = SingleIdTokenIndexer(namespace="dep_labels",
                                       feature_name="dep_",
                                       default_value="NONE")
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {
            "ROOT": 1,
            "nsubj": 1,
            "det": 1,
            "NONE": 2,
            "attr": 1,
            "punct": 1,
        }
Example #21
 def test_char_span_to_token_span_handles_easy_cases(self):
     # These are _inclusive_ spans, on both sides.
     tokenizer = SpacyTokenizer()
     passage = (
         "On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy "
         + "Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four "
         + "nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her "
         + "first performances since giving birth to Blue Ivy."
     )
     tokens = tokenizer.tokenize(passage)
     offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
     # "January 7, 2012"
     token_span = util.char_span_to_token_span(offsets, (3, 18))[0]
     assert token_span == (1, 4)
     # "Lenox Hill Hospital"
     token_span = util.char_span_to_token_span(offsets, (91, 110))[0]
     assert token_span == (22, 24)
     # "Lenox Hill Hospital in New York."
     token_span = util.char_span_to_token_span(offsets, (91, 123))[0]
     assert token_span == (22, 28)
Example #22
    def test_token_type_ids(self):
        tokenizer = SpacyTokenizer()

        sentence = "the laziest  fox"

        tokens = tokenizer.tokenize(sentence)
        #           2   15 10 11  6   17    2   15 10 11  6
        #           the laziest   fox [SEP] the laziest   fox
        tokens = (
            tokens + [Token("[SEP]")] + tokens
        )  # have to do this b/c tokenizer splits `[SEP]` in three

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        #                                          [CLS] 2, 15, 10, 11, 6, 17, 2  15, 10, 11, 6, [SEP]
        assert indexed_tokens["token_type_ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
Example #23
    def test_char_span_to_token_span_handles_out_of_bounds_start_end(self):
        tokenizer = SpacyTokenizer()
        passage = "This sentence is just for testing purposes"
        tokens = tokenizer.tokenize(passage)
        offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

        # scenario 1: negative start character span (this should really never happen)
        start = -1
        end = start + len("This")
        expected_span = (0, 0)
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 2: end character span exceeds sentence length, for whichever reason
        start = 34
        end = start + len("purposes") + 1
        expected_span = (6, 6)
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error
Example #24
    def test_char_span_to_token_span_handles_undertokenization(self):
        tokenizer = SpacyTokenizer()
        passage = "This sentence will have two under tokenized tokens, one#here and one at the#end"
        tokens = tokenizer.tokenize(passage)
        offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

        # scenario 1: under tokenized in the middle of the sentence, look for the first part of the token
        start = 52
        end = start + len("one")
        expected_span = (9, 9)  # the indices of the whole "one#here" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 2: under tokenized in the middle of the sentence, look for the second part of the token
        start = 56
        end = start + len("here")
        expected_span = (9, 9)  # the indices of the whole "one#here" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 3: under tokenized at the end of the sentence, look for the first part of the token
        start = 72
        end = start + len("the")
        expected_span = (13, 13)  # the indices of the whole "the#end" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 4: under tokenized at the end of the sentence, look for the second part of the token
        # this used to cause an IndexError
        start = 76
        end = start + len("end")
        expected_span = (13, 13)  # the indices of the whole "the#end" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error
Example #25
class MyReader(DatasetReader):
    """
    Just reads in a text file and sticks each line
    in a `TextField` with the specified name.
    """
    def __init__(self, field_name: str) -> None:
        super().__init__()
        self.field_name = field_name
        self.tokenizer = SpacyTokenizer()
        self.token_indexers: Dict[str, TokenIndexer] = {
            "tokens": SingleIdTokenIndexer()
        }

    def text_to_instance(self, sentence: str) -> Instance:  # type: ignore

        tokens = self.tokenizer.tokenize(sentence)
        return Instance(
            {self.field_name: TextField(tokens, self.token_indexers)})

    def _read(self, file_path: str):
        with open(file_path) as data_file:
            for line in data_file:
                yield self.text_to_instance(line)
Example #26
    def test_do_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = SpacyTokenizer()

        # Quick is UNK because of capitalization
        #           2   1     5     6   8      9    2  15 10 11 14   1
        sentence = "the Quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=False)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # Quick should get 1 == OOV
        assert indexed_tokens["input_ids"] == [16, 2, 1, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]

        # Does lowercasing by default
        token_indexer = PretrainedBertIndexer(str(vocab_path))
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # Now Quick should get indexed correctly as 3 ( == "quick")
        assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
Example #27
class OpenIePredictor(Predictor):
    """
    Predictor for the [`SemanticRoleLabeler`](../models/semantic_role_labeler.md) model
    (in its Open Information variant).
    Used by online demo and for prediction on an input file using command line.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(pos_tags=True)

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"sentence": "...", "predicate_index": "..."}`.
        Assumes sentence is tokenized, and that predicate_index points to a specific
        predicate (word index) within the sentence, for which to produce Open IE extractions.
        """
        tokens = json_dict["sentence"]
        predicate_index = int(json_dict["predicate_index"])
        verb_labels = [0 for _ in tokens]
        verb_labels[predicate_index] = 1
        return self._dataset_reader.text_to_instance(tokens, verb_labels)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Create instance(s) after predicting the format. One sentence containing multiple verbs
        will lead to multiple instances.

        Expects JSON that looks like `{"sentence": "..."}`

        Returns a JSON that looks like

        .. code-block:: js

            {"tokens": [...],
             "tag_spans": [{"ARG0": "...",
                            "V": "...",
                            "ARG1": "...",
                             ...}]}
        """
        sent_tokens = self._tokenizer.tokenize(inputs["sentence"])

        # Find all verbs in the input sentence
        pred_ids = [i for (i, t) in enumerate(sent_tokens) if t.pos_ == "VERB"]

        # Create instances
        instances = [
            self._json_to_instance({
                "sentence": sent_tokens,
                "predicate_index": pred_id
            }) for pred_id in pred_ids
        ]

        # Run model
        outputs = [[
            sanitize_label(label)
            for label in self._model.forward_on_instance(instance)["tags"]
        ] for instance in instances]

        # Consolidate predictions
        pred_dict = consolidate_predictions(outputs, sent_tokens)

        # Build and return output dictionary
        results = {"verbs": [], "words": sent_tokens}

        for tags in pred_dict.values():
            # Join multi-word predicates
            tags = join_mwp(tags)

            # Create description text
            description = make_oie_string(sent_tokens, tags)

            # Add a predicate prediction to the return dictionary.
            results["verbs"].append({
                "verb":
                get_predicate_text(sent_tokens, tags),
                "description":
                description,
                "tags":
                tags,
            })

        return sanitize(results)
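A hedged sketch of calling the predictor; `model` and `reader` are placeholders for whatever trained Open IE model and dataset reader the surrounding project provides:

# Hypothetical usage; model and reader stand in for a trained Model
# and its DatasetReader (e.g. loaded from an archive).
predictor = OpenIePredictor(model=model, dataset_reader=reader)
output = predictor.predict_json({"sentence": "John gave Mary a book in the park."})
print(output["words"], [verb["description"] for verb in output["verbs"]])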
Example #28
class TestSpacyTokenizer(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.word_tokenizer = SpacyTokenizer()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = [
            "this",
            "(",
            "sentence",
            ")",
            "has",
            "'",
            "crazy",
            "'",
            '"',
            "punctuation",
            '"',
            ".",
        ]
        tokens = self.word_tokenizer.tokenize(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "ain't" is split into "ai" and "n't", and "joe's" into "joe" and "'s".
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = [
            "it",
            "ai",
            "n't",
            "joe",
            "'s",
            "problem",
            ";",
            "would",
            "been",
            "yesterday",
        ]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones'   house  \x0b  55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = [
            "Mr.",
            "and",
            "Mrs.",
            "Jones",
            ",",
            "etc",
            ".",
            ",",
            "went",
            "to",
            ",",
            "e.g.",
            ",",
            "the",
            "store",
        ]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        sentences = [
            "This is     a sentence",
            "This isn't a sentence.",
            "This is the 3rd     sentence." "Here's the 'fourth' sentence.",
        ]
        batch_split = self.word_tokenizer.batch_tokenize(sentences)
        separately_split = [self.word_tokenizer.tokenize(sentence) for sentence in sentences]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
                assert batch_word.text == separate_word.text

    def test_keep_spacy_tokens(self):
        word_tokenizer = SpacyTokenizer()
        sentence = "This should be an allennlp Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)

    def test_to_params(self):
        tokenizer = SpacyTokenizer()
        params = tokenizer.to_params()
        assert isinstance(params, Params)
        assert params.params == {
            "type": "spacy",
            "language": tokenizer._language,
            "pos_tags": tokenizer._pos_tags,
            "parse": tokenizer._parse,
            "ner": tokenizer._ner,
            "keep_spacy_tokens": tokenizer._keep_spacy_tokens,
            "split_on_spaces": tokenizer._split_on_spaces,
            "start_tokens": tokenizer._start_tokens,
            "end_tokens": tokenizer._end_tokens,
        }
Example #29
class TestTableQuestionContext(SemparseTestCase):
    def setup_method(self):
        super().setup_method()
        self.tokenizer = SpacyTokenizer(pos_tags=True)

    def test_table_data(self):
        question = "what was the attendance when usl a league played?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        assert table_question_context.table_data == [
            {
                "date_column:year": Date(2001, -1, -1),
                "number_column:year": 2001.0,
                "string_column:year": "2001",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_a_league",
                "string_column:regular_season": "4th_western",
                "number_column:regular_season": 4.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "did_not_qualify",
                "number_column:open_cup": None,
                "number_column:avg_attendance": 7169.0,
                "string_column:avg_attendance": "7_169",
            },
            {
                "date_column:year": Date(2005, -1, -1),
                "number_column:year": 2005.0,
                "string_column:year": "2005",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_first_division",
                "string_column:regular_season": "5th",
                "number_column:regular_season": 5.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "4th_round",
                "number_column:open_cup": 4.0,
                "number_column:avg_attendance": 6028.0,
                "string_column:avg_attendance": "6_028",
            },
        ]

    def test_table_data_from_untagged_file(self):
        question = "what was the attendance when usl a league played?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/wikitables/sample_table.tsv"
        table_lines = [line.strip() for line in open(test_file).readlines()]
        table_question_context = TableQuestionContext.read_from_lines(
            table_lines, question_tokens)
        # The content in the table represented by the untagged file we are reading here is the same as the one we
        # had in the tagged file above, except that we have a "Score" column instead of "Avg. Attendance" column,
        # which is changed to test the num2 extraction logic. I've shown the values not being extracted here as
        # well and commented them out.
        assert table_question_context.table_data == [
            {
                "number_column:year": 2001.0,
                # The value extraction logic we have for untagged lines does
                # not extract this value as a date.
                # 'date_column:year': Date(2001, -1, -1),
                "string_column:year": "2001",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_a_league",
                "string_column:regular_season": "4th_western",
                # We only check for strings that are entirely numbers. So 4.0
                # will not be extracted.
                # 'number_column:regular_season': 4.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "did_not_qualify",
                # 'number_column:open_cup': None,
                "number_column:score": 20.0,
                "num2_column:score": 30.0,
                "string_column:score": "20_30",
            },
            {
                "number_column:year": 2005.0,
                # 'date_column:year': Date(2005, -1, -1),
                "string_column:year": "2005",
                "number_column:division": 2.0,
                "string_column:division": "2",
                "string_column:league": "usl_first_division",
                "string_column:regular_season": "5th",
                # Same here as in the "division" column for the first row.
                # 5.0 will not be extracted from "5th".
                # 'number_column:regular_season': 5.0,
                "string_column:playoffs": "quarterfinals",
                "string_column:open_cup": "4th_round",
                # 'number_column:open_cup': 4.0,
                "number_column:score": 50.0,
                "num2_column:score": 40.0,
                "string_column:score": "50_40",
            },
        ]

    def test_number_extraction(self):
        question = """how many players on the 191617 illinois fighting illini men's basketball team
                      had more than 100 points scored?"""
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        _, number_entities = table_question_context.get_entities_from_question(
        )
        assert number_entities == [("191617", 5), ("100", 16)]

    def test_date_extraction(self):
        question = "how many laps did matt kenset complete on february 26, 2006."
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-8.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        _, number_entities = table_question_context.get_entities_from_question(
        )
        assert number_entities == [("2", 8), ("26", 9), ("2006", 11)]

    def test_date_extraction_2(self):
        question = """how many different players scored for the san jose earthquakes during their
                      1979 home opener against the timbers?"""
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-6.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        _, number_entities = table_question_context.get_entities_from_question(
        )
        assert number_entities == [("1979", 12)]

    def test_multiword_entity_extraction(self):
        question = "was the positioning better the year of the france venue or the year of the south korea venue?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-3.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        entities, _ = table_question_context.get_entities_from_question()
        assert entities == [
            ("string:france", ["string_column:venue"]),
            ("string:south_korea", ["string_column:venue"]),
        ]

    def test_rank_number_extraction(self):
        question = "what was the first tamil-language film in 1943?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-1.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        _, numbers = table_question_context.get_entities_from_question()
        assert numbers == [("1", 3), ("1943", 9)]

    def test_null_extraction(self):
        question = "on what date did the eagles score the least points?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-2.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        entities, numbers = table_question_context.get_entities_from_question()
        # "Eagles" does not appear in the table.
        assert entities == []
        assert numbers == []

    def test_numerical_column_type_extraction(self):
        question = """how many players on the 191617 illinois fighting illini men's basketball team
                      had more than 100 points scored?"""
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "number_column:games_played" in column_names
        assert "number_column:field_goals" in column_names
        assert "number_column:free_throws" in column_names
        assert "number_column:points" in column_names

    def test_date_column_type_extraction_1(self):
        question = "how many were elected?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-5.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "date_column:first_elected" in column_names

    def test_date_column_type_extraction_2(self):
        question = "how many were elected?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-9.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "date_column:date_of_appointment" in column_names
        assert "date_column:date_of_election" in column_names

    def test_string_column_types_extraction(self):
        question = "how many were elected?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-10.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        column_names = table_question_context.column_names
        assert "string_column:birthplace" in column_names
        assert "string_column:advocate" in column_names
        assert "string_column:notability" in column_names
        assert "string_column:name" in column_names

    def test_number_and_entity_extraction(self):
        question = "other than m1 how many notations have 1 in them?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        string_entities, number_entities = table_question_context.get_entities_from_question(
        )
        assert string_entities == [
            ("string:m1", ["string_column:notation"]),
            ("string:1", ["string_column:position"]),
        ]
        assert number_entities == [("1", 2), ("1", 7)]

    def test_get_knowledge_graph(self):
        question = "other than m1 how many notations have 1 in them?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        knowledge_graph = table_question_context.get_table_knowledge_graph()
        entities = knowledge_graph.entities
        # -1 is not in entities because there are no date columns in the table.
        assert sorted(entities) == [
            "1",
            "number_column:notation",
            "number_column:position",
            "string:1",
            "string:m1",
            "string_column:mnemonic",
            "string_column:notation",
            "string_column:position",
            "string_column:short_name",
            "string_column:swara",
        ]
        neighbors = knowledge_graph.neighbors
        # Each number extracted from the question will have all number and date columns as
        # neighbors. Each string entity extracted from the question will only have the corresponding
        # column as the neighbor.
        neighbors_with_sets = {
            key: set(value)
            for key, value in neighbors.items()
        }
        assert neighbors_with_sets == {
            "1": {"number_column:position", "number_column:notation"},
            "string_column:mnemonic": set(),
            "string_column:short_name": set(),
            "string_column:swara": set(),
            "number_column:position": {"1"},
            "number_column:notation": {"1"},
            "string:m1": {"string_column:notation"},
            "string:1": {"string_column:position"},
            "string_column:notation": {"string:m1"},
            "string_column:position": {"string:1"},
        }
        entity_text = knowledge_graph.entity_text
        assert entity_text == {
            "1": "1",
            "string:m1": "m1",
            "string:1": "1",
            "string_column:notation": "notation",
            "number_column:notation": "notation",
            "string_column:mnemonic": "mnemonic",
            "string_column:short_name": "short name",
            "string_column:swara": "swara",
            "number_column:position": "position",
            "string_column:position": "position",
        }

    def test_knowledge_graph_has_correct_neighbors(self):
        question = "when was the attendance greater than 5000?"
        question_tokens = self.tokenizer.tokenize(question)
        test_file = f"{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged"
        table_question_context = TableQuestionContext.read_from_file(
            test_file, question_tokens)
        knowledge_graph = table_question_context.get_table_knowledge_graph()
        neighbors = knowledge_graph.neighbors
        # '5000' is neighbors with number and date columns. '-1' is in entities because there is a
        # date column, which is its only neighbor.
        assert set(neighbors.keys()) == {
            "date_column:year",
            "number_column:year",
            "string_column:year",
            "number_column:division",
            "string_column:division",
            "string_column:league",
            "string_column:regular_season",
            "number_column:regular_season",
            "string_column:playoffs",
            "string_column:open_cup",
            "number_column:open_cup",
            "number_column:avg_attendance",
            "string_column:avg_attendance",
            "5000",
            "-1",
        }
        assert set(neighbors["date_column:year"]) == {"5000", "-1"}
        assert neighbors["number_column:year"] == ["5000"]
        assert neighbors["string_column:year"] == []
        assert neighbors["number_column:division"] == ["5000"]
        assert neighbors["string_column:division"] == []
        assert neighbors["string_column:league"] == []
        assert neighbors["string_column:regular_season"] == []
        assert neighbors["number_column:regular_season"] == ["5000"]
        assert neighbors["string_column:playoffs"] == []
        assert neighbors["string_column:open_cup"] == []
        assert neighbors["number_column:open_cup"] == ["5000"]
        assert neighbors["number_column:avg_attendance"] == ["5000"]
        assert neighbors["string_column:avg_attendance"] == []
        assert set(neighbors["5000"]) == {
            "date_column:year",
            "number_column:year",
            "number_column:division",
            "number_column:avg_attendance",
            "number_column:regular_season",
            "number_column:open_cup",
        }
        assert neighbors["-1"] == ["date_column:year"]
Example #30
class IOBDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        self.token_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6),
            'pos_tags':
            SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                 feature_name='tag_'),
            'ner_tags':
            SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                 feature_name='ent_type_')
        }

        self.slot_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6)
        }

    def text_to_instance(self,
                         tokens: List[Token],
                         slot: List[Token],
                         s1_tags: List[str] = None,
                         tags: List[str] = None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        slot_field = TextField(slot, self.slot_indexers)

        fields = {"sentence": sentence_field, "slot": slot_field}

        if s1_tags:
            s1_field = SequenceLabelField(labels=s1_tags,
                                          sequence_field=sentence_field,
                                          label_namespace="s1_labels")
            fields["s1_labels"] = s1_field
        if tags:
            label_field = SequenceLabelField(labels=tags,
                                             sequence_field=sentence_field)
            fields["labels"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as f:
            for line in f:
                sentence, s1_label, description, tags = line.strip().split(
                    '\t')
                yield self.text_to_instance(
                    self.tokenizer_spacy.tokenize(sentence),
                    self.tokenizer_spacy.tokenize(description),
                    [iob for iob in s1_label.split()],
                    [iob for iob in tags.split()])