class TestDepLabelIndexer(AllenNlpTestCase):
    """Checks ``DepLabelIndexer`` on tokens produced by a parsing ``SpacyWordSplitter``."""

    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(parse=True)

    def test_count_vocab_items_uses_pos_tags(self):
        parsed = self.tokenizer.split_words("This is a sentence.")
        # The two hand-made boundary tokens are the ones counted under "NONE" below.
        tokens = [Token("<S>"), *parsed, Token("</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        expected_counts = {"ROOT": 1, "nsubj": 1, "det": 1, "NONE": 2, "attr": 1, "punct": 1}
        assert counter["dep_labels"] == expected_counts

    def test_tokens_to_indices_uses_pos_tags(self):
        parsed = self.tokenizer.split_words("This is a sentence.")
        tokens = [*parsed, Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
        none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
        indexer = DepLabelIndexer()

        # Second parsed token is expected to index as ROOT.
        second_token_indices = indexer.tokens_to_indices([tokens[1]], vocab, "tokens1")
        assert second_token_indices == {"tokens1": [root_index]}

        # The appended boundary token is expected to index as NONE.
        boundary_indices = indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1")
        assert boundary_indices == {"tokens-1": [none_index]}

    def test_padding_functions(self):
        indexer = DepLabelIndexer()
        padding_token = indexer.get_padding_token()
        assert padding_token == 0
        padding_lengths = indexer.get_padding_lengths(0)
        assert padding_lengths == {}

    def test_as_array_produces_token_sequence(self):
        indexer = DepLabelIndexer()
        unpadded = {'key': [1, 2, 3, 4, 5]}
        desired_lengths = {'key': 10}
        padded_tokens = indexer.pad_token_sequence(unpadded, desired_lengths, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
class TestNerTagIndexer(AllenNlpTestCase):
    """Checks ``NerTagIndexer`` through its single-token ``token_to_indices`` interface."""

    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tagged = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [Token("<S>"), *tagged, Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        expected_counts = {'PERSON': 2, 'ORG': 1, 'NONE': 6}
        assert counter["ner_tags"] == expected_counts

    def test_token_to_indices_uses_ner_tags(self):
        tagged = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [*tagged, Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer()

        second_token_index = indexer.token_to_indices(tokens[1], vocab)
        assert second_token_index == person_index
        boundary_index = indexer.token_to_indices(tokens[-1], vocab)
        assert boundary_index == none_index

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        padding_token = indexer.get_padding_token()
        assert padding_token == 0
        padding_lengths = indexer.get_padding_lengths(0)
        assert padding_lengths == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        unpadded = [1, 2, 3, 4, 5]
        padded_tokens = indexer.pad_token_sequence(unpadded, 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
class TestPosTagIndexer(AllenNlpTestCase):
    """Checks ``PosTagIndexer`` with both fine-grained and coarse tags (dict-based indexing API)."""

    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tagged = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>"), *tagged, Token("</S>")]
        indexer = PosTagIndexer()

        def count_pos_tags():
            # Fresh counter per call so the two tag granularities do not mix.
            counts = defaultdict(lambda: defaultdict(int))
            for token in tokens:
                indexer.count_vocab_items(token, counts)
            return counts["pos_tags"]

        assert count_pos_tags() == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        assert count_pos_tags() == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}

    def test_tokens_to_indices_uses_pos_tags(self):
        tagged = self.tokenizer.split_words("This is a sentence.")
        tokens = [*tagged, Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Every token is indexed below, so the remaining coarse tags must be in the vocabulary too.
        for extra_tag in ('DET', 'NOUN', 'PUNCT'):
            vocab.add_token_to_namespace(extra_tag, namespace='pos_tags')

        indexer = PosTagIndexer(coarse_tags=True)
        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        token_indices = indices["tokens"]
        assert token_indices[1] == verb_index
        assert token_indices[-1] == none_index

        # Switch the same indexer to fine-grained tags and re-index the second token.
        indexer._coarse_tags = False  # pylint: disable=protected-access
        fine_grained = indexer.tokens_to_indices([tokens[1]], vocab, "coarse")
        assert fine_grained == {"coarse": [cop_index]}

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        padding_token = indexer.get_padding_token()
        assert padding_token == 0
        padding_lengths = indexer.get_padding_lengths(0)
        assert padding_lengths == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        unpadded = {'key': [1, 2, 3, 4, 5]}
        desired_lengths = {'key': 10}
        padded_tokens = indexer.pad_token_sequence(unpadded, desired_lengths, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
class TestNerTagIndexer(AllenNlpTestCase):
    """Checks ``NerTagIndexer`` through the dict-based ``tokens_to_indices`` interface."""

    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tagged = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [Token("<S>"), *tagged, Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        expected_counts = {'PERSON': 2, 'ORG': 1, 'NONE': 6}
        assert counter["ner_tokens"] == expected_counts

    def test_tokens_to_indices_uses_ner_tags(self):
        tagged = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [*tagged, Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        # The indexer is pointed at the namespace that was just populated.
        indexer = NerTagIndexer(namespace='ner_tags')

        second_token_indices = indexer.tokens_to_indices([tokens[1]], vocab, "tokens1")
        assert second_token_indices == {"tokens1": [person_index]}
        boundary_indices = indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1")
        assert boundary_indices == {"tokens-1": [none_index]}

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        padding_token = indexer.get_padding_token()
        assert padding_token == 0
        padding_lengths = indexer.get_padding_lengths(0)
        assert padding_lengths == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        unpadded = {'key': [1, 2, 3, 4, 5]}
        desired_lengths = {'key': 10}
        padded_tokens = indexer.pad_token_sequence(unpadded, desired_lengths, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}

    def test_blank_ner_tag(self):
        tokens = [Token(word) for word in "allennlp is awesome .".split(" ")]
        for token in tokens:
            token.ent_type_ = ""
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        # spaCy marks "no NER tag" with an empty string;
        # the indexer is expected to count those under "NONE".
        assert counter["ner_tokens"]["NONE"] == 4

        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index('NONE', 'ner_tokens')
        # Indexing the blank-tagged tokens should not raise.
        indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
        expected = {"ner": [none_index, none_index, none_index, none_index]}
        assert expected == indices
class DialogQAPredictor(Predictor):
    """Predictor that turns one QuAC-style JSON record into a dialog-QA instance."""

    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language)

    def predict(self, jsonline: str) -> JsonDict:
        """
        Make a dialog-style question answering prediction on the supplied input.
        The supplied input json must contain a list of
        question answer pairs, containing question, answer, yesno, followup, id
        as well as the context (passage).

        Parameters
        ----------
        jsonline: ``str``
            A json line that has the same format as the quac data file.

        Returns
        -------
        A dictionary that represents the prediction made by the system.
        The answer string will be under the "best_span_str" key.
        """
        parsed_line = json.loads(jsonline)
        return self.predict_json(parsed_line)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Build an instance from json shaped like the original quac data file.

        Only the first entry of ``json_dict["paragraphs"]`` is read.
        """
        paragraph_json = json_dict["paragraphs"][0]
        paragraph = paragraph_json['context']
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        qas = paragraph_json['qas']

        instance_ids = [qa['id'] for qa in qas]
        question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
        answer_texts_list = [[answer['text'] for answer in qa['answers']] for qa in qas]
        metadata = {
                "instance_id": instance_ids,
                "answer_texts_list": answer_texts_list,
                "question": question_text_list,
        }

        span_starts_list = [[answer['answer_start'] for answer in qa['answers']] for qa in qas]
        # Each end offset is the answer's start offset plus the length of its text.
        span_ends_list = [
                [start + len(answer) for start, answer in zip(starts, answers)]
                for starts, answers in zip(span_starts_list, answer_texts_list)
        ]

        yesno_list = [str(qa['yesno']) for qa in qas]
        followup_list = [str(qa['followup']) for qa in qas]
        return self._dataset_reader.text_to_instance(question_text_list,
                                                     paragraph,
                                                     span_starts_list,
                                                     span_ends_list,
                                                     tokenized_paragraph,
                                                     yesno_list,
                                                     followup_list,
                                                     metadata)
class TestPosTagIndexer(AllenNlpTestCase):
    """Checks ``PosTagIndexer`` through its single-token ``token_to_indices`` interface."""

    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tagged = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>"), *tagged, Token("</S>")]
        indexer = PosTagIndexer()

        def count_pos_tags():
            # Fresh counter per call so the two tag granularities do not mix.
            counts = defaultdict(lambda: defaultdict(int))
            for token in tokens:
                indexer.count_vocab_items(token, counts)
            return counts["pos_tags"]

        assert count_pos_tags() == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        assert count_pos_tags() == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}

    def test_token_to_indices_uses_pos_tags(self):
        tagged = self.tokenizer.split_words("This is a sentence.")
        tokens = [*tagged, Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        indexer = PosTagIndexer(coarse_tags=True)

        second_token = tokens[1]
        assert indexer.token_to_indices(second_token, vocab) == verb_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index

        # Same token, now indexed by its fine-grained tag.
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.token_to_indices(second_token, vocab) == cop_index

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        padding_token = indexer.get_padding_token()
        assert padding_token == 0
        padding_lengths = indexer.get_padding_lengths(0)
        assert padding_lengths == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        unpadded = [1, 2, 3, 4, 5]
        padded_tokens = indexer.pad_token_sequence(unpadded, 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
class TestSpacyWordSplitter(AllenNlpTestCase):
    """Tokenization checks for a default-configured ``SpacyWordSplitter``."""

    def setUp(self):
        super().setUp()
        self.word_splitter = SpacyWordSplitter()

    def _split_to_text(self, sentence):
        # Helper: tokenize and keep only the surface text of each token.
        return [token.text for token in self.word_splitter.split_words(sentence)]

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = ["this", "(", "sentence", ")", "has", "'", "crazy", "'", '"',
                           "punctuation", '"', "."]
        tokens = self.word_splitter.split_words(sentence)
        assert [token.text for token in tokens] == expected_tokens
        # Each token's character offset must point back at its own text in the sentence.
        for token in tokens:
            begin = token.idx
            assert sentence[begin:begin + len(token.text)] == token.text

    def test_tokenize_handles_contraction(self):
        # "ain't" is expected to split into "ai" + "n't", and "joe's" into "joe" + "'s".
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = ["it", "ai", "n't", "joe", "'s", "problem", ";", "would", "been",
                           "yesterday"]
        assert self._split_to_text(sentence) == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        assert self._split_to_text(sentence) == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        assert self._split_to_text(sentence) == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones' house \x0b 55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        assert self._split_to_text(sentence) == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # "etc." is expected to come out as two tokens ("etc", ".") while "e.g." stays whole;
        # this could be special-cased if desired.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = ["Mr.", "and", "Mrs.", "Jones", ",", "etc", ".", ",", "went", "to", ",",
                           "e.g.", ",", "the", "store"]
        assert self._split_to_text(sentence) == expected_tokens
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """Wrap ``sentence`` as ``{"sentence": ...}`` and delegate to ``predict_json``."""
        payload = {"sentence": sentence}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Splits the sentence into words and hands the tokens to the dataset reader.
        """
        tokens = self._tokenizer.split_words(json_dict["sentence"])
        return self._dataset_reader.text_to_instance(tokens)
def setUp(self):
    """Run the parent set-up, then store a default ``SpacyWordSplitter`` on ``self.word_splitter``."""
    super(TestSpacyWordSplitter, self).setUp()
    self.word_splitter = SpacyWordSplitter()
def normal_tokenizer(x: str):
    """
    Split ``x`` into words with spaCy (no POS tagging) and return the token texts,
    keeping at most the first ``max_seq_len`` tokens.

    ``max_seq_len`` is not a parameter; it is read from the surrounding scope.
    """
    splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False)
    truncated = splitter.split_words(x)[:max_seq_len]
    return [word.text for word in truncated]
def text_to_instance(
        self,
        para_id: str,
        sentence_texts: List[str],
        participants: List[str],
        states: List[
            List[str]] = None,  # states[i][j] is ith participant at time j
        filename: str = '',
        score: float = None) -> Instance:
    """
    Build an ``Instance`` for one paragraph and its participants.

    Parameters
    ----------
    para_id : ``str``
        Stored as metadata under ``"para_id"``.
    sentence_texts : ``List[str]``
        The sentences of the paragraph; each is tokenized separately, and their
        space-joined concatenation is tokenized again as the ``"paragraph"`` field.
    participants : ``List[str]``
        One string per participant.  After tokenization, ``";"`` tokens split a
        participant into several alternative targets to look for in the sentences.
    states : ``List[List[str]]``, optional (default = None)
        ``states[i][j]`` is the ith participant at time j.  When given, the
        ``"actions"``, ``"before_locations"`` and ``"after_locations"`` fields are added.
    filename : ``str``, optional (default = '')
        Stored as metadata under ``"filename"``.
    score : ``float``, optional (default = None)
        Stored as metadata under ``"score"`` only when it is not ``None``.

    Returns
    -------
    An ``Instance`` with sentence-level and paragraph-level fields (see the field
    assignments below for the complete list).
    """
    tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(
        pos_tags=True))

    paragraph = " ".join(sentence_texts)

    # Tokenize the sentences
    sentences = [
        tokenizer.tokenize(sentence_text)
        for sentence_text in sentence_texts
    ]

    # Find the verbs
    verb_indexes = [[
        1 if token.pos_ == "VERB" else 0 for token in sentence
    ] for sentence in sentences]

    # NOTE: `actions` and `location_spans` are only bound when `states` is given;
    # they are only read again under the same `states is not None` check further down.
    if states is not None:
        # Actions is (num_participants, num_events)
        actions = [_infer_actions(states_i) for states_i in states]

        tokenized_states = [[
            tokenizer.tokenize(state_ij) for state_ij in states_i
        ] for states_i in states]

        location_spans = [
            _compute_location_spans(states_i, sentences)
            for states_i in tokenized_states
        ]

    # Create indicators for the participants.
    participant_tokens = [
        tokenizer.tokenize(participant) for participant in participants
    ]
    participant_indicators: List[List[List[int]]] = []

    for participant_i_tokens in participant_tokens:
        # Split the participant's tokens on ";" tokens; each remaining group is one target.
        targets = [
            list(token_group) for is_semicolon, token_group in
            itertools.groupby(participant_i_tokens, lambda t: t.text == ";")
            if not is_semicolon
        ]

        participant_i_indicators: List[List[int]] = []

        for sentence in sentences:
            sentence_indicator = [0 for _ in sentence]

            for target in targets:
                start = 0
                # Repeatedly search from `start`; a negative span_start ends the search.
                while True:
                    span_start, span_end = _find_span(target,
                                                      sentence,
                                                      start,
                                                      target_is_noun=True)
                    if span_start >= 0:
                        # span_end is treated as inclusive here.
                        for j in range(span_start, span_end + 1):
                            sentence_indicator[j] = 1
                        # Resume one token after the match start so later matches are found too.
                        start = span_start + 1
                    else:
                        break

            participant_i_indicators.append(sentence_indicator)

        participant_indicators.append(participant_i_indicators)

    fields: Dict[str, Field] = {}
    fields["paragraph"] = TextField(tokenizer.tokenize(paragraph),
                                    self._token_indexers)
    fields["participants"] = ListField([
        TextField(tokenizer.tokenize(participant), self._token_indexers)
        for participant in participants
    ])

    # One per sentence
    fields["sentences"] = ListField([
        TextField(sentence, self._token_indexers) for sentence in sentences
    ])

    # One per sentence
    fields["verbs"] = ListField([
        SequenceLabelField(verb_indexes[i],
                           fields["sentences"].field_list[i])
        for i in range(len(sentences))
    ])
    # And also at the paragraph level
    # NOTE(review): the flattened per-sentence labels are attached to the separately
    # tokenized paragraph — presumably both tokenizations have the same length; confirm.
    fields["paragraph_verbs"] = SequenceLabelField([
        verb_indicator for verb_indexes_i in verb_indexes
        for verb_indicator in verb_indexes_i
    ], fields["paragraph"])

    if states is not None:
        # Outer ListField is one per participant
        fields["actions"] = ListField([
            # Inner ListField is one per sentence
            ListField([
                # action is an Enum, so call .value to get an int
                LabelField(action.value, skip_indexing=True)
                for action in participant_actions
            ]) for participant_actions in actions
        ])

        # Each entry of a participant's location spans unpacks as
        # ((before_start, before_end), (after_start, after_end)).
        # Outer ListField is one per participant
        fields["before_locations"] = ListField([
            # Inner ListField is one per sentence
            ListField([
                SpanField(start, end, fields["sentences"].field_list[i])
                for i, ((start, end),
                        _) in enumerate(participant_location_spans)
            ]) for participant_location_spans in location_spans
        ])
        # Outer ListField is one per participant
        fields["after_locations"] = ListField([
            # Inner ListField is one per sentence
            ListField([
                SpanField(start, end, fields["sentences"].field_list[i])
                for i, (_, (start,
                            end)) in enumerate(participant_location_spans)
            ]) for participant_location_spans in location_spans
        ])

    # one per participant
    fields["participant_indicators"] = ListField([
        # one per sentence
        ListField([
            SequenceLabelField(sentence_indicator,
                               fields["sentences"].field_list[i])
            for i, sentence_indicator in enumerate(participant_i_indicators)
        ]) for participant_i_indicators in participant_indicators
    ])

    # and also at the paragraph level
    # one per participant
    fields["paragraph_participant_indicators"] = ListField([
        SequenceLabelField([
            indicator for sentence_indicator in participant_i_indicators
            for indicator in sentence_indicator
        ], fields["paragraph"])
        for participant_i_indicators in participant_indicators
    ])

    # Finally, we want to indicate before / inside / after for each sentence.
    # Labels: 0 for tokens before sentence i, 1 for tokens in it, 2 for tokens after it.
    paragraph_sentence_indicators: List[SequenceLabelField] = []
    for i in range(len(sentences)):
        before_length = sum(len(sentence) for sentence in sentences[:i])
        sentence_length = len(sentences[i])
        after_length = sum(
            len(sentence) for sentence in sentences[(i + 1):])
        paragraph_sentence_indicators.append(
            SequenceLabelField([0] * before_length +
                               [1] * sentence_length + [2] * after_length,
                               fields["paragraph"]))

    fields["paragraph_sentence_indicators"] = ListField(
        paragraph_sentence_indicators)

    # These fields are passed on to the decoder trainer that internally uses it
    # to compute commonsense scores for predicted actions
    fields["para_id"] = MetadataField(para_id)
    fields["participant_strings"] = MetadataField(participants)
    fields["filename"] = MetadataField(filename)

    if score is not None:
        fields["score"] = MetadataField(score)

    return Instance(fields)
class MTClassifierDatasetReader(DatasetReader):
    """
    Reads a file in the MT Classifier assignment format: one example per line with
    tab-separated ``source``, ``candidate`` and ``label`` columns.

    Parameters
    ----------
    source_language : ``str``, optional, (default = 'de_core_news_sm')
        The name of the spaCy model used to tokenize the source sentences.
        Models can be found here <https://spacy.io/models/>.
    candidate_language : ``str``, optional (default = 'en_core_web_sm')
        The name of the spaCy model used to tokenize the candidate sentences.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        The token indexers to be applied to the words TextField.
    lazy : ``bool``, optional (default = False)
        Passed through unchanged to the ``DatasetReader`` constructor.
    """

    def __init__(self,
                 source_language: str = 'de_core_news_sm',
                 candidate_language: str = 'en_core_web_sm',
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._source_tokenizer = SpacyWordSplitter(language=source_language)
        self._candidate_tokenizer = \
            SpacyWordSplitter(language=candidate_language)

    @overrides
    def _read(self, file_path: str):
        """
        Yield one instance per non-blank line of ``file_path``.

        Raises ``IndexError`` for a non-blank line with fewer than three
        tab-separated columns.
        """
        file_path = cached_path(file_path)
        with open(file_path, 'r') as mt_file:
            logger.info("Reading MT instances dataset at: %s", file_path)
            for line in mt_file:
                # Lines from the file iterator keep their trailing newline, so they are
                # never falsy as-is; strip first so blank lines are really skipped.
                line = line.strip()
                if not line:
                    continue
                inputs = line.split("\t")
                source = inputs[0]
                candidate = inputs[1]
                label = inputs[2]
                yield self.text_to_instance(source, candidate, label)

    @overrides
    def text_to_instance(self,  # type: ignore
                         source: str,
                         candidate: str,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        source : ``str``, required
            The translation's source sentence.
        candidate : ``str``, required
            The translation candidate.
        label : ``str``, optional (default = None)
            Whether the candidate is human- or machine-translated, if known.

        Returns
        -------
        An ``Instance`` with ``candidate``, ``source`` and ``candidate_pos`` text fields,
        a ``features`` array of eight hand-written counts, ``metadata``, and a ``label``
        field when ``label`` is truthy.
        """
        fields: Dict[str, Field] = {}
        source_tokens = self._source_tokenizer.split_words(source)
        candidate_tokens = self._candidate_tokenizer.split_words(candidate)
        fields["candidate"] = TextField(candidate_tokens, self._token_indexers)
        fields["source"] = TextField(source_tokens, self._token_indexers)

        # NLTK POS tags of the candidate, joined into a string and re-tokenized
        # so they can go into a TextField.
        tags = " ".join([c[1] for c in nltk.pos_tag(nltk.word_tokenize(candidate))])
        tag_tokens = self._candidate_tokenizer.split_words(tags)
        fields["candidate_pos"] = TextField(tag_tokens, self._token_indexers)

        # Measure the token *text*: len() of a spaCy token is its character count, but
        # that does not hold for every token type, so do not rely on Token.__len__.
        source_lengths = [len(token.text) for token in source_tokens]
        candidate_lengths = [len(token.text) for token in candidate_tokens]

        # Single-character tokens are used as a cheap proxy for punctuation.
        source_punctuation = [1 if length == 1 else 0 for length in source_lengths]
        candidate_punctuation = [1 if length == 1 else 0 for length in candidate_lengths]

        # Candidate tokens that still contain German-specific characters ...
        german_characters = ["ä", "ö", "ü", "ß"]
        german_character_hits = [1 if any(c in token.text for c in german_characters) else 0
                                 for token in candidate_tokens]

        # ... or German-looking substrings.
        german_substrings = ["lich", "enz", "ionen", "jek", "stech", "nik"]
        german_substring_hits = [1 if any(c in token.text for c in german_substrings) else 0
                                 for token in candidate_tokens]

        features = [
                len(source_tokens),
                len(candidate_tokens),
                sum(source_lengths),
                sum(candidate_lengths),
                sum(source_punctuation),
                sum(candidate_punctuation),
                sum(german_character_hits),
                sum(german_substring_hits)
        ]
        fields["features"] = ArrayField(np.array(features))

        if label:
            fields["label"] = LabelField(label)

        fields["metadata"] = MetadataField({"source": source, "candidate": candidate})
        return Instance(fields)
def setUp(self):
    """Run the parent set-up, then store a default ``SpacyWordSplitter`` on ``self.word_splitter``."""
    super().setUp()
    self.word_splitter = SpacyWordSplitter()
def setUp(self):
    """Run the parent set-up, then store an NER-enabled ``SpacyWordSplitter`` on ``self.tokenizer``."""
    super(TestNerTagIndexer, self).setUp()
    self.tokenizer = SpacyWordSplitter(ner=True)
class BiaffineDependencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.BiaffineDependencyParser` model.
    """

    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = "en_core_web_sm") -> None:
        super().__init__(model, dataset_reader)
        # The language is a constructor argument for now.
        # TODO(Mark) Base it on a model attribute instead.
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.

        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos:  # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        """Run the model on one instance and add a ``"hierplane_tree"`` entry to its output."""
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        outputs["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        """Run the model on a batch and add a ``"hierplane_tree"`` entry to every output."""
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            words = output["words"]
            pos = output["pos"]
            heads = output["predicted_heads"]
            tags = output["predicted_dependencies"]
            output["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @staticmethod
    def _build_hierplane_tree(words: List[str],
                              heads: List[int],
                              tags: List[str],
                              pos: List[str]) -> Dict[str, Any]:
        """
        Parameters
        ----------
        words : the tokens of the sentence.
        heads : for each word, the 1-based position of its head; 0 marks the root.
        tags : the dependency label of each word.
        pos : the part-of-speech tag of each word.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.

        Raises
        ------
        ``ValueError`` if no entry of ``heads`` is 0.
        """
        # Character span of each word inside " ".join(words); the +1 accounts for
        # the separating space.
        word_index_to_cumulative_indices: Dict[int, Tuple[int, int]] = {}
        cumulative_index = 0
        for i, word in enumerate(words):
            word_length = len(word) + 1
            word_index_to_cumulative_indices[i] = (cumulative_index, cumulative_index + word_length)
            cumulative_index += word_length

        # Group the children of every word once, instead of rescanning ``heads`` for each
        # node.  Heads are 1-based, so the parent's word index is ``head - 1``; children
        # stay in sentence order because ``heads`` is walked left to right.
        children_by_parent: Dict[int, List[int]] = {}
        for child_index, head in enumerate(heads):
            children_by_parent.setdefault(head - 1, []).append(child_index)

        def node_constructor(index: int):
            children = [node_constructor(child_index)
                        for child_index in children_by_parent.get(index, [])]

            # These are the icons which show up in the bottom right
            # corner of the node.
            attributes = [pos[index]]
            start, end = word_index_to_cumulative_indices[index]

            hierplane_node = {
                    "word": words[index],
                    # The type of the node - all nodes with the same
                    # type have a unified colour.
                    "nodeType": tags[index],
                    # Attributes of the node.
                    "attributes": attributes,
                    # The link between the node and its parent.
                    "link": tags[index],
                    "spans": [{"start": start, "end": end}],
            }
            if children:
                hierplane_node["children"] = children
            return hierplane_node

        # We are guaranteed that there is a single word pointing to
        # the root index, so we can find it just by searching for 0 in the list.
        root_index = heads.index(0)
        hierplane_tree = {
                "text": " ".join(words),
                "root": node_constructor(root_index),
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "linkToPosition": LINK_TO_POSITION,
        }
        return hierplane_tree
class QaSrlParserPredictor(Predictor):
    """
    Predictor that finds the non-auxiliary verbs of a sentence, detects answer spans
    for each verb with ``model.span_detector`` and generates a question for each span
    with ``model.question_predictor``.

    Words that are out-of-vocabulary for the model but present in the pretrained
    vector file are added to the vocabulary and to both token embedding matrices
    before predicting.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)
        self._model_vocab = model.vocab

        self._verb_map = read_verb_file("data/wiktionary/en_verb_inflections.txt")

        self._pretrained_vectors = read_pretrained_file(
                "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz")

    def _sentence_to_qasrl_instances(
            self, json_dict: JsonDict) -> Tuple[List[Instance], JsonDict, List[str], List[int]]:
        """
        Tokenize ``json_dict["sentence"]`` and build one instance per non-auxiliary verb.

        Returns
        -------
        ``(instances, result_dict, words, verb_indexes)``.  ``instances``,
        ``result_dict["verbs"]`` and ``verb_indexes`` are parallel lists.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        words = [token.text for token in tokens]
        text = " ".join(words)

        result_dict: JsonDict = {"words": words, "verbs": []}

        instances: List[Instance] = []
        verb_indexes = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB" and word.text.lower() not in AUX_VERBS:
                result_dict["verbs"].append(word.text)
                instances.append(self._dataset_reader._make_instance_from_text(text, i))
                verb_indexes.append(i)

        return instances, result_dict, words, verb_indexes

    @staticmethod
    def _extend_token_embeddings(text_field_embedder, added_weights, num_added_words: int) -> None:
        """Append ``num_added_words`` rows taken from ``added_weights`` to the embedder's token weights."""
        old_weights = text_field_embedder.token_embedder_tokens.weight.data
        num_words, embsize = old_weights.size()
        new_weights = old_weights.new().resize_(num_words + num_added_words, embsize)
        new_weights[:num_words].copy_(old_weights)
        # ``added_weights`` is the concatenation of the added vectors; reshape it to one
        # row per added word.  The sizes must be ints (true division yields floats).
        new_weights[num_words:].copy_(torch.reshape(added_weights, (num_added_words, embsize)))
        text_field_embedder.token_embedder_tokens.weight = Parameter(new_weights)

    def _expand_vocabulary(self, words: List[str]) -> None:
        """Add pretrained vectors for words of ``words`` that the model vocabulary does not know."""
        added_words = []
        added_vectors = []
        seen = set()
        for w in cleanse_sentence_text(words):
            w = w.lower()
            # A repeated word must only be added once: the vocabulary ignores the second
            # insertion, so a second embedding row would shift every later index.
            if w in seen:
                continue
            # Index 1 is treated as the out-of-vocabulary index.
            if self._model_vocab.get_token_index(w) == 1 and w in self._pretrained_vectors:
                seen.add(w)
                added_words.append(w)
                added_vectors.append(self._pretrained_vectors[w])

        if not added_words:
            return

        for w in added_words:
            self._model_vocab.add_token_to_namespace(w, "tokens")

        num_added_words = len(added_words)
        added_weights = torch.cat(added_vectors, dim=0)
        self._extend_token_embeddings(self._model.span_detector.text_field_embedder,
                                      added_weights, num_added_words)
        self._extend_token_embeddings(self._model.question_predictor.text_field_embedder,
                                      added_weights, num_added_words)

    @overrides
    def predict_json(self, inputs: JsonDict, cuda_device: int = 0) -> JsonDict:
        """
        Predict question/answer-span pairs for every verb of ``inputs["sentence"]``.

        ``cuda_device`` is accepted for interface compatibility and is not used here.

        Returns
        -------
        ``{"words": [...], "verbs": [{"verb", "qa_pairs", "index"}, ...]}`` with one
        ``"verbs"`` entry per verb that received at least one span scoring >= 0.5.
        """
        instances, results, words, verb_indexes = self._sentence_to_qasrl_instances(inputs)

        self._expand_vocabulary(words)

        verbs_for_instances = results["verbs"]
        results["verbs"] = []

        # Only instances with at least one confident span go on to question prediction.
        # Their verbs and verb indexes are filtered alongside them so that everything
        # stays aligned with the question predictor's outputs.
        instances_with_spans = []
        instance_spans = []
        verbs_with_spans = []
        verb_indexes_with_spans = []
        if instances:
            span_outputs = self._model.span_detector.forward_on_instances(instances)

            for instance, span_output, verb, index in zip(instances, span_outputs,
                                                          verbs_for_instances, verb_indexes):
                field_dict = instance.fields
                text_field = field_dict['text']

                spans = [s[0] for s in span_output['spans'] if s[1] >= 0.5]
                if spans:
                    instance_spans.append(spans)
                    verbs_with_spans.append(verb)
                    verb_indexes_with_spans.append(index)

                    field_dict['labeled_spans'] = ListField(
                            [SpanField(span.start(), span.end(), text_field) for span in spans])
                    instances_with_spans.append(Instance(field_dict))

        if instances_with_spans:
            outputs = self._model.question_predictor.forward_on_instances(instances_with_spans)

            for output, spans, verb, index in zip(outputs, instance_spans,
                                                  verbs_with_spans, verb_indexes_with_spans):
                # Group the spans by generated question text, keeping first-seen order.
                questions = {}
                for question, span in zip(output['questions'], spans):
                    question_text = self.make_question_text(question, verb)
                    span_text = " ".join(words[i] for i in range(span.start(), span.end() + 1))
                    span_rep = {"start": span.start(), "end": span.end(), "text": span_text}
                    questions.setdefault(question_text, []).append(span_rep)

                qa_pairs = [{"question": question_text, "spans": question_spans}
                            for question_text, question_spans in questions.items()]

                results["verbs"].append({"verb": verb, "qa_pairs": qa_pairs, "index": index})

        return results

    def make_question_text(self, slots, verb):
        """
        Join the question ``slots`` into a sentence-cased question ending in ``"?"``.

        The last word of slot 3 is replaced by an inflection of ``verb`` looked up in the
        verb map (keyed by that last word), or by the lower-cased ``verb`` itself when the
        verb is not in the map.  Slots equal to ``"_"`` are dropped.
        """
        slots = list(slots)
        verb_slot = slots[3]
        split = verb_slot.split(" ")
        verb = verb.lower()
        if verb in self._verb_map:
            split[-1] = self._verb_map[verb][split[-1]]
        else:
            split[-1] = verb
        slots[3] = " ".join(split)
        sent_text = " ".join([slot for slot in slots if slot != "_"]) + "?"
        sent_text = sent_text[0].upper() + sent_text[1:]
        return sent_text
def __init__(
        self,
        knowledge_graph: KnowledgeGraph,
        utterance_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        tokenizer: Tokenizer = None,
        feature_extractors: List[str] = None,
        entity_tokens: List[List[Token]] = None,
        linking_features: List[List[List[float]]] = None,
        include_in_vocab: bool = True,
        max_table_tokens: int = None,
) -> None:
    """
    Store the knowledge graph and utterance, tokenize the entity texts, resolve the
    feature extractors and obtain the linking features.

    Parameters
    ----------
    knowledge_graph
        Graph whose ``entities`` and ``entity_text`` are read here.
    utterance_tokens
        Tokens of the utterance; stored unchanged.
    token_indexers
        Indexers stored on the field.
    tokenizer
        Used to batch-tokenize the lower-cased entity texts. When falsy, a
        ``WordTokenizer`` wrapping ``SpacyWordSplitter(pos_tags=True)`` is created.
    feature_extractors
        Names of feature extractors. Each name is looked up on ``self`` with a leading
        underscore. When ``None``, the built-in list below is used.
    entity_tokens
        Pre-tokenized entity texts; when truthy, tokenization is skipped.
    linking_features
        Pre-computed linking features; when truthy they are stored as-is and the
        entity lookup dictionaries are not built.
    include_in_vocab, max_table_tokens
        Stored on the instance for later use.

    Raises
    ------
    ConfigurationError
        If a feature extractor name does not resolve to a truthy attribute on ``self``.
    """
    self.knowledge_graph = knowledge_graph
    self._tokenizer = tokenizer or WordTokenizer(
        word_splitter=SpacyWordSplitter(pos_tags=True))

    if entity_tokens:
        self.entity_texts = entity_tokens
    else:
        lowered_texts = [
            knowledge_graph.entity_text[entity].lower()
            for entity in knowledge_graph.entities
        ]
        # TODO(mattg): Because we do tagging on each of these entities in addition to just
        # tokenizations, this is quite slow, and about half of our data processing time just
        # goes to this (~15 minutes when there are 7k instances).  The reason we do tagging is
        # so that we can add lemma features.  If we can remove the need for lemma / other
        # hand-written features, like with a CNN, we can cut down our data processing time by a
        # factor of 2.
        self.entity_texts = self._tokenizer.batch_tokenize(lowered_texts)

    self.utterance_tokens = utterance_tokens
    self._token_indexers: Dict[str, TokenIndexer] = token_indexers
    self._include_in_vocab = include_in_vocab
    self._indexed_entity_texts: Dict[str, TokenList] = None
    self._max_table_tokens = max_table_tokens

    if feature_extractors is None:
        feature_extractors = [
            "number_token_match",
            "exact_token_match",
            "contains_exact_token_match",
            "lemma_match",
            "contains_lemma_match",
            "edit_distance",
            "related_column",
            "related_column_lemma",
            "span_overlap_fraction",
            "span_lemma_overlap_fraction",
        ]

    # Each extractor name maps to the method called ``_<name>`` on this object.
    self._feature_extractors: List[Callable[
        [str, List[Token], Token, int, List[Token]], float]] = []
    for feature_extractor_name in feature_extractors:
        extractor = getattr(self, "_" + feature_extractor_name, None)
        if not extractor:
            raise ConfigurationError(
                f"Invalid feature extractor name: {feature_extractor_name}"
            )
        self._feature_extractors.append(extractor)

    if linking_features:
        self.linking_features = linking_features
    else:
        # For quicker lookups in our feature functions, we'll additionally store some
        # dictionaries that map entity strings to useful information about the entity.
        self._entity_text_map: Dict[str, List[Token]] = {
            entity: entity_text
            for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts)
        }
        self._entity_text_exact_text: Dict[str, Set[str]] = {
            entity: set(token.text for token in entity_text)
            for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts)
        }
        self._entity_text_lemmas: Dict[str, Set[str]] = {
            entity: set(token.lemma_ for token in entity_text)
            for entity, entity_text in zip(knowledge_graph.entities, self.entity_texts)
        }
        self.linking_features = self._compute_linking_features()
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
    """
    Initialise the predictor, a POS-tagging word tokenizer and a spaCy pipeline.
    """
    super().__init__(model, dataset_reader)
    pos_splitter = SpacyWordSplitter(pos_tags=True)
    self._tokenizer = WordTokenizer(word_splitter=pos_splitter)
    # The pipeline is loaded with its 'parser' and 'ner' components disabled.
    disabled_components = ['parser', 'ner']
    self.nlp = spacy.load('en_core_web_sm', disable=disabled_components)
class ConstituencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a constituency parse for the given sentence.

        Parameters
        ----------
        sentence
            The sentence to parse.

        Returns
        -------
        A dictionary representation of the constituency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.

        The sentence is split with the spaCy word splitter; the token texts and their
        ``tag_`` values are handed to the dataset reader.  The second element of the
        returned tuple is an empty dictionary that later receives the model outputs.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        pos_tags = [token.tag_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags), {}

    def _add_tree_outputs(self, return_dict: JsonDict, output: JsonDict) -> None:
        """
        Merge ``output`` into ``return_dict`` and replace the ``"trees"`` entry with
        its single-line string form, adding a ``"hierplane_tree"`` rendering next to it.
        """
        return_dict.update(output)
        # format the NLTK tree as a string on a single line.
        tree = return_dict.pop("trees")
        return_dict["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        return_dict["trees"] = tree.pformat(margin=1000000)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Run the model on a single ``{"sentence": "..."}`` input and return the
        sanitized outputs, including ``"trees"`` and ``"hierplane_tree"``.
        """
        instance, return_dict = self._json_to_instance(inputs)
        outputs = self._model.forward_on_instance(instance)
        self._add_tree_outputs(return_dict, outputs)
        return sanitize(return_dict)

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Run the model on a batch of ``{"sentence": "..."}`` inputs.

        An empty batch returns an empty list; without this guard, unpacking the
        result of ``zip(*[])`` into two names raises ``ValueError``.
        """
        if not inputs:
            return []
        instances, return_dicts = zip(*self._batch_json_to_instances(inputs))
        outputs = self._model.forward_on_instances(instances)
        for output, return_dict in zip(outputs, return_dicts):
            self._add_tree_outputs(return_dict, output)
        return sanitize(return_dicts)

    def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(self._build_hierplane_tree(child, index, is_root=False))
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        hierplane_node = {
            "word": span,
            "nodeType": label,
            "attributes": [label],
            "link": label
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                "linkNameToLabel": LINK_TO_LABEL,
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "text": span,
                "root": hierplane_node
            }
        return hierplane_node
from allennlp.data.instance import Instance
from allennlp.data.fields import TextField
from allennlp.data.vocabulary import Vocabulary
import torch
import bert_indexer

# Called without a name, ``logging.getLogger()`` returns the root logger.
logger = logging.getLogger()
#
# parser = argparse.ArgumentParser('description: experiments on datasets')
# parser.add_argument('input_file')
# parser.add_argument('output_file')
# args = parser.parse_args()

# Word tokenizer backed by a spaCy splitter with POS tags and NER enabled.
tokenizer = WordTokenizer(
    word_splitter=SpacyWordSplitter(pos_tags=True, ner=True))
# NOTE(review): the vocabulary file name says "uncased" while ``do_lowercase`` is
# False -- confirm this combination is intended.
token_indexer = bert_indexer.PretrainedBertIndexer(
    '../TransformerCoqa/bert-base-uncased-vocab.txt',
    do_lowercase=False,
    max_pieces=8,
    doc_stride=3)
# Embedder built from a local archive, addressed relative to the working directory.
token_embedder = PretrainedBertEmbedder(
    '../TransformerCoqa/bert-base-uncased.tar.gz')

# NOTE(review): the disabled snippet below opens the input file in 'w' mode and then
# tries to read JSON from it; switch to 'r' before re-enabling.
# with open(args.input_file, 'w') as f:
#     data = json.load(f)['data']
#
# for article in data:
#     story = article['story']

# Sample sentence for ad-hoc experiments.
a = "the man went to the store and bought a gallon of milk"
def __init__(self, model, dataset_reader):
    """
    Initialise the base predictor and attach a spaCy word splitter that uses the
    ``en_core_web_sm`` model with POS tags enabled.
    """
    super(SentenceTaggerPredictor, self).__init__(model, dataset_reader)
    word_splitter = SpacyWordSplitter(language=u'en_core_web_sm', pos_tags=True)
    self._tokenizer = word_splitter
def setUp(self):
    """Run the base-class set-up, then create a splitter with POS tags enabled."""
    super(TestPosTagIndexer, self).setUp()
    pos_splitter = SpacyWordSplitter(pos_tags=True)
    self.tokenizer = pos_splitter
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
    """
    Initialise the base predictor and attach a spaCy word splitter that uses the
    ``en_core_web_sm`` model with POS tags enabled.
    """
    super().__init__(model, dataset_reader)
    # TODO(Mark) Make the language configurable and based on a model attribute.
    word_splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)
    self._tokenizer = word_splitter
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """Tag ``sentence`` by delegating to ``predict_json``."""
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Splits the sentence into tokens and lets the dataset reader build the instance.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(tokens)

    @overrides
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:

        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org

        We create three instances.

        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-Loc  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org

        A ``B`` tag that is never closed by an ``L`` tag is treated as a span running
        to the last token instead of raising ``IndexError``.
        """
        predicted_tags = outputs['tags']
        num_tags = len(predicted_tags)
        predicted_spans = []

        i = 0
        while i < num_tags:
            tag = predicted_tags[i]
            # if its a U, add it to the list
            if tag[0] == 'U':
                current_tags = [
                    t if idx == i else 'O'
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if its a B, keep going until you hit an L.
            elif tag[0] == 'B':
                begin_idx = i
                # Bound the scan so a malformed sequence without a closing L tag
                # cannot index past the end of the predictions.
                while tag[0] != 'L' and i + 1 < num_tags:
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else 'O'
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance['tokens']  # type: ignore
            new_instance.add_field('tags',
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            instances.append(new_instance)

        return instances
class SimpleSeq2SeqPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.encoder_decoder.simple_seq2seq` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        """
        Run the model on one instance and return the sanitized outputs with the
        ``"logits"`` and ``"class_probabilities"`` entries removed.
        """
        outputs = self._model.forward_on_instance(instance)
        del outputs["logits"]
        del outputs["class_probabilities"]
        return sanitize(outputs)

    def predict(self, source: str) -> JsonDict:
        """Wrap ``source`` as ``{"source": source}`` and delegate to ``predict_json``."""
        pred_json = self.predict_json({"source": source})
        return pred_json

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Reads the ``"questions"``, ``"answers"`` and ``"id"`` entries of ``json_dict``
        and yields one instance per question (at most 15), whose source text is the
        BiDAF++ rationale, the question tag and the question joined by spaces.

        NOTE(review): this body contains ``yield``, so calling it returns a generator
        rather than the annotated ``Instance``.  It also does not read the ``"source"``
        key that ``predict`` supplies -- confirm which input format is intended.
        """
        # print(json_dict)
        paragraph_json = json_dict
        all_questions = paragraph_json['questions']
        golden_answers = paragraph_json['answers']
        paragraph_id = paragraph_json['id']

        # READ THE BIDAF++ OUTPUTS
        # NOTE(review): ``file_path`` is not assigned in this method; unless it exists
        # at module level this line raises NameError -- confirm.
        bidafplus_output_filename = os.path.join(
            os.path.dirname(os.path.realpath(file_path)),
            'bidafplus_output_formatted.json')
        with open(bidafplus_output_filename) as bidafplus_outputs:
            best_span_str_json = json.load(bidafplus_outputs)
        best_span_str = best_span_str_json['data']
        # extractive outputs from BIDAF++
        best_span_str_list = best_span_str[paragraph_id]
        # metadata
        # NOTE(review): ``metadata`` is filled in but never used afterwards.
        metadata = {}
        metadata['paragraph_id'] = paragraph_id
        metadata['questions'] = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]

        # Every per-turn list below is capped at the first 15 entries.
        questions_list = [
            ques["input_text"].strip().replace("\n", "")
            for ques in all_questions
        ][:15]
        # NOTE(review): ``golden_rationale_list`` is computed but not used.
        golden_rationale_list = [
            answer['span_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        answers_list = [
            answer['input_text'].strip().replace("\n", "")
            for answer in golden_answers
        ][:15]
        bidafplus_rationale_list = [
            answer['answer_text'].strip().replace("\n", "")
            for answer in best_span_str_list
        ][:15]
        # NOTE(review): ``self.question_tag`` is not set in ``__init__`` above --
        # confirm where it is defined.
        ques_rat_list = [
            ' '.join([
                bidafplus_rationale_list[i], self.question_tag,
                questions_list[i]
            ]) for i in range(len(questions_list))
        ]

        for i in range(len(questions_list)):
            yield self.text_to_instance(ques_rat_list[i], answers_list[i],
                                        paragraph_id, i)
            # yield self.text_to_instance(rationale_list[i], answers_list[i])

    def text_to_instance(self,
                         source_string: str,
                         target_string: str = None,
                         paragraph_id: str = None,
                         turn_id: int = 0) -> Instance:  # type: ignore
        """
        Build an instance whose only field, ``"source_tokens"``, holds the tokenized
        source wrapped in the start and end symbols.

        ``paragraph_id`` and ``turn_id`` are accepted but not used.  When
        ``target_string`` is given it is tokenized into a field as well, but that
        field is not added to the returned instance: both branches return the same
        source-only instance.
        """
        # pylint: disable=arguments-differ
        # NOTE(review): ``self._tokenizer`` is a ``SpacyWordSplitter`` (see ``__init__``)
        # and ``self._token_indexers`` is not set in this class -- confirm that
        # ``tokenize`` and the indexers are available at runtime.
        tokenized_source = self._tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._token_indexers)
        if target_string is not None:
            tokenized_target = self._tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            # ``target_field`` is built but intentionally or accidentally left unused.
            target_field = TextField(tokenized_target, self._token_indexers)
            return Instance({"source_tokens": source_field})
        else:
            return Instance({"source_tokens": source_field})
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
    """
    Initialise the base predictor and attach a spaCy word splitter that uses the
    ``en_core_web_sm`` model with POS tags enabled.
    """
    super().__init__(model, dataset_reader)
    # TODO(Mark) Make the language configurable and based on a model attribute.
    word_splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)
    self._tokenizer = word_splitter
class SemanticRoleLabelerPredictor(Predictor):
    """
    Wrapper for the :class:`~allennlp.models.bidaf.SemanticRoleLabeler` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    @staticmethod
    def make_srl_string(words: List[str], tags: List[str]) -> str:
        """
        Render one SRL frame as text: each ``B-X`` tag opens a bracketed chunk
        ``[X: word ...]``, ``I-`` tags extend the open chunk and ``O`` words are
        emitted as-is.  Words whose tag is none of these are left out.
        """
        frame = []
        chunk = []

        for (token, tag) in zip(words, tags):
            if tag.startswith("I-"):
                chunk.append(token)
            else:
                # Any non-continuation tag closes the chunk that is currently open.
                if chunk:
                    frame.append("[" + " ".join(chunk) + "]")
                    chunk = []

                if tag.startswith("B-"):
                    chunk.append(tag[2:] + ": " + token)
                elif tag == "O":
                    frame.append(token)

        if chunk:
            frame.append("[" + " ".join(chunk) + "]")

        return " ".join(frame)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict):
        raise NotImplementedError(
            "The SRL model uses a different API for creating instances.")

    def _sentence_to_srl_instances(
            self, json_dict: JsonDict) -> Tuple[List[Instance], JsonDict]:
        """
        The SRL model has a slightly different API from other models, as the model is run
        forward for every verb in the sentence. This means that for a single sentence, we need
        to generate a ``List[Instance]``, where the length of this list corresponds to the number
        of verbs in the sentence. Additionally, all of these verbs share the same return dictionary
        after being passed through the model (as really we care about all the frames of the sentence
        together, rather than separately).

        Parameters
        ----------
        json_dict : ``JsonDict``, required.
            JSON that looks like ``{"sentence": "..."}``.

        Returns
        -------
        instances : ``List[Instance]``
            One instance per verb.
        result_dict : ``JsonDict``
            A dictionary containing the words of the sentence and the verbs extracted
            by the Spacy POS tagger. These will be replaced in ``predict_json`` with the
            SRL frame for the verb.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        words = [token.text for token in tokens]
        result_dict: JsonDict = {"words": words, "verbs": []}
        instances: List[Instance] = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB":
                verb = word.text
                result_dict["verbs"].append(verb)
                # One-hot indicator marking the position of this verb.
                verb_labels = [0 for _ in words]
                verb_labels[i] = 1
                instance = self._dataset_reader.text_to_instance(
                    tokens, verb_labels)
                instances.append(instance)
        return instances, result_dict

    @overrides
    def predict_batch_json(self,
                           inputs: List[JsonDict],
                           cuda_device: int = -1) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]

        An empty ``inputs`` list returns an empty list.
        """
        # Unpacking ``zip(*[])`` into two names raises ValueError, so an empty
        # batch is answered directly.
        if not inputs:
            return []

        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by taking the batch size which we use to be the number of sentences
        # we are given.
        batch_size = len(inputs)
        instances_per_sentence, return_dicts = zip(
            *[self._sentence_to_srl_instances(sentence_json) for sentence_json in inputs])

        flattened_instances = [
            instance for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            return sanitize(return_dicts)

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1]
            if instance is not None
        ]
        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch, cuda_device))

        # Position in the flat ``outputs`` list; it advances once per verb, across sentences.
        output_index = 0
        for results in return_dicts:
            # We just added the verbs to the list in _sentence_to_srl_instances
            # but we actually want to replace them with their frames, so we
            # reset them here.
            verbs_for_sentence: List[str] = results["verbs"]
            results["verbs"] = []
            # The verbs are in order, but nested as we have multiple sentences.
            # The outputs are already flattened from running through the model,
            # so we just index into this flat list for each verb, updating as we go.
            for verb in verbs_for_sentence:
                output = outputs[output_index]
                tags = output['tags']
                description = self.make_srl_string(results["words"], tags)
                results["verbs"].append({
                    "verb": verb,
                    "description": description,
                    "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)

    @overrides
    def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
        """
        Expects JSON that looks like ``{"sentence": "..."}``
        and returns JSON that looks like

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        """
        instances, results = self._sentence_to_srl_instances(inputs)
        # We just added the verbs to the list in _sentence_to_srl_instances
        # but we actually want to replace them with their frames, so we
        # reset them here.
        verbs_for_instances: List[str] = results["verbs"]
        results["verbs"] = []

        if not instances:
            return sanitize(results)

        outputs = self._model.forward_on_instances(instances, cuda_device)

        for output, verb in zip(outputs, verbs_for_instances):
            tags = output['tags']
            description = self.make_srl_string(results["words"], tags)
            results["verbs"].append({
                "verb": verb,
                "description": description,
                "tags": tags,
            })

        return sanitize(results)
def train(model_dir,
          train_path='/mnt/DATA/ML/data/corpora/QA/CoQA/stories_only/coqa-train-v1.0_extract100.json',
          validation_path='/mnt/DATA/ML/data/corpora/QA/CoQA/stories_only/coqa-dev-v1.0.json'):
    """
    Train the language model and store vocabulary and weights under ``model_dir``.

    Parameters
    ----------
    model_dir
        Directory holding ``vocab`` and ``model.th``.  It is created when missing;
        an existing vocabulary and existing weights in it are loaded and reused.
    train_path
        Location of the training data; defaults to the path previously hard-coded.
    validation_path
        Location of the validation data; defaults to the path previously hard-coded.
    """
    # prepare data
    reader = SimpleLanguageModelingDatasetReader(tokenizer=WordTokenizer(
        word_splitter=SpacyWordSplitter(language='en_core_web_sm')))
    train_dataset = reader.read(cached_path(train_path))
    validation_dataset = reader.read(cached_path(validation_path))

    vocab = None
    model_fn = os.path.join(model_dir, 'model.th')
    vocab_fn = os.path.join(model_dir, 'vocab')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    elif os.path.exists(vocab_fn):
        logging.info('load vocab from: %s...', vocab_fn)
        vocab = Vocabulary.from_files(vocab_fn)
    if vocab is None:
        # The vocabulary is built from the training data only.
        vocab = Vocabulary.from_instances(train_dataset)
        #TODO: re-add!
        #vocab.extend_from_instances(validation_dataset)
        logging.info('save vocab to: %s...', vocab_fn)
        vocab.save_to_files(vocab_fn)
    logging.info('data prepared')

    model = create_model(vocab)

    # Resume from previously saved weights when they exist.
    if os.path.exists(model_fn):
        logging.info('load model wheights from: %s...', model_fn)
        with open(model_fn, 'rb') as f:
            model.load_state_dict(torch.load(f))
    logging.info('model prepared')

    # prepare training
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=10)
    logging.info('training prepared')

    trainer.train()

    logging.info('save model to: %s...', model_fn)
    with open(model_fn, 'wb') as f:
        torch.save(model.state_dict(), f)
from numpy import dot
from numpy.linalg import norm
import json
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# Module-level splitter that keeps the original spaCy token objects.
tokenizer = SpacyWordSplitter(language='en_core_web_sm', keep_spacy_tokens=True)


def cosine_similarity(a, b):
    """Return the dot product of ``a`` and ``b`` divided by the product of their norms."""
    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return numerator / denominator


def extract_keys(lines, key: str):
    """Return ``record[key]`` for every record in ``lines``, in order."""
    return [record[key] for record in lines]


def get_from_rankings(rankings, dictionary):
    """Look up the second element of every ranking pair in ``dictionary``."""
    return [dictionary[position] for _, position in rankings]


def split_data(data, dev_start: float, test_start: float, is_test: bool):
    """
    Split ``data`` three ways and return ``(train, test)`` when ``is_test`` is
    truthy, otherwise ``(train, dev)``.
    """
    train_part, dev_part, test_part = split_all_data(data, dev_start, test_start)
    held_out = test_part if is_test else dev_part
    return train_part, held_out


def split_all_data(data, dev_start: float, test_start: float):
    """
    Slice ``data`` into ``(train, dev, test)``; ``dev_start`` and ``test_start`` are
    fractions of ``len(data)`` marking where the dev and test slices begin.
    """
    total = len(data)
    dev_begin = int(dev_start * total)
    test_begin = int(test_start * total)
    return (data[:dev_begin], data[dev_begin:test_begin], data[test_begin:])
def tokenizer(x: str):
    """
    Split ``x`` into words and return the text of each token.

    A new ``SpacyWordSplitter`` (``en_core_web_sm``, POS tags off) is created on
    every call.
    """
    splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False)
    words = splitter.split_words(x)
    return [word.text for word in words]
class TestSpacyWordSplitter(AllenNlpTestCase):
    """Tests for ``SpacyWordSplitter.split_words`` and ``batch_split_words``."""

    def setUp(self):
        super().setUp()
        self.word_splitter = SpacyWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = [
            "this",
            "(",
            "sentence",
            ")",
            "has",
            "'",
            "crazy",
            "'",
            '"',
            "punctuation",
            '"',
            ".",
        ]
        tokens = self.word_splitter.split_words(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        # Each token's character offset must point back at its own text.
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # "ain't" is split into "ai" / "n't" and the possessive "'s" is split off.
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = [
            "it",
            "ai",
            "n't",
            "joe",
            "'s",
            "problem",
            ";",
            "would",
            "been",
            "yesterday",
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones'   house  \x0b  55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = [
            "Mr.",
            "and",
            "Mrs.",
            "Jones",
            ",",
            "etc",
            ".",
            ",",
            "went",
            "to",
            ",",
            "e.g.",
            ",",
            "the",
            "store",
        ]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        # The comma after the third sentence matters: without it Python joins the
        # third and fourth literals into a single string.
        sentences = [
            "This is a sentence",
            "This isn't a sentence.",
            "This is the 3rd sentence.",
            "Here's the 'fourth' sentence.",
        ]
        batch_split = self.word_splitter.batch_split_words(sentences)
        separately_split = [self.word_splitter.split_words(sentence) for sentence in sentences]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
                assert batch_word.text == separate_word.text

    def test_keep_spacy_tokens(self):
        # By default the splitter returns allennlp ``Token`` objects.
        word_splitter = SpacyWordSplitter()
        sentence = "This should be an allennlp Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        # With ``keep_spacy_tokens=True`` the raw spaCy tokens are returned instead.
        word_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
class CoQAPredictor(Predictor):
    """
    Predictor that turns one CoQA paragraph (story, questions and answers) into a
    single multi-turn instance.
    """
    # At most this many turns per paragraph are kept.
    _MAX_TURNS = 15

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language="en_core_web_sm")

    def predict(self, jsonline: str) -> JsonDict:
        """Parse ``jsonline`` as JSON and delegate to ``predict_json``."""
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like one paragraph entry of the original data file,
        with the keys ``"story"``, ``"questions"``, ``"answers"`` and ``"id"``.

        Negative answer spans are first normalised in place by
        ``handle_unknown_answers``.  All per-turn lists are capped at the first
        ``_MAX_TURNS`` entries before being passed to the dataset reader.
        """
        # The leftover dataset-file read was removed: it referenced the local
        # ``file_path`` before assignment (UnboundLocalError on every call) and its
        # result was never used -- the paragraph comes straight from ``json_dict``.
        max_turns = self._MAX_TURNS
        paragraph_json = json_dict
        paragraph = paragraph_json["story"]
        tokenized_paragraph = self._tokenizer.split_words(paragraph)
        questions = paragraph_json["questions"]
        golden_answers = paragraph_json["answers"]
        self.handle_unknown_answers(golden_answers, len(paragraph))

        paragraph_id = paragraph_json["id"]
        metadata = {}
        metadata["instance_id"] = [str(paragraph_id) + "_" + str(ques["turn_id"])
                                   for ques in questions][:max_turns]
        question_text_list = [ques["input_text"].strip().replace("\n", "")
                              for ques in questions][:max_turns]
        answer_texts_list = [[answer["span_text"]] for answer in golden_answers][:max_turns]
        metadata["question"] = question_text_list
        metadata["answer_texts_list"] = answer_texts_list

        # Start and end lists are truncated alike so they always stay aligned.
        span_start_list = [[answer["span_start"]] for answer in golden_answers][:max_turns]
        span_end_list = [[answer["span_end"]] for answer in golden_answers][:max_turns]

        # Placeholder yes/no and follow-up markers, one per kept turn.
        yesno_list = ["x" for _ in questions][:max_turns]
        followup_list = ["n" for _ in questions][:max_turns]
        instance = self._dataset_reader.text_to_instance(question_text_list,
                                                         paragraph,
                                                         span_start_list,
                                                         span_end_list,
                                                         tokenized_paragraph,
                                                         yesno_list,
                                                         followup_list,
                                                         metadata)
        return instance

    def text_to_instance(self,  # type: ignore
                         question_text_list: List[str],
                         passage_text: str,
                         start_span_list: List[List[int]] = None,
                         end_span_list: List[List[int]] = None,
                         passage_tokens: List[Token] = None,
                         yesno_list: List[int] = None,
                         followup_list: List[int] = None,
                         additional_metadata: Dict[str, Any] = None) -> Instance:
        """
        Convert character-level answer spans into token spans and build a QuAC-style
        reading-comprehension instance.

        NOTE(review): ``_json_to_instance`` calls the dataset reader's
        ``text_to_instance`` rather than this method.  This method also relies on
        ``self._tokenizer.tokenize``, ``self._token_indexers`` and
        ``self._num_context_answers``, none of which are set up in ``__init__``
        above -- confirm whether it is still in use.
        """
        # pylint: disable=arguments-differ
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we"ll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = my_util.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end))
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        additional_metadata["answer_texts_list"] = [
            util.handle_cannot(ans_list)
            for ans_list in additional_metadata["answer_texts_list"]
        ]
        return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                             passage_tokens,
                                                             self._token_indexers,
                                                             passage_text,
                                                             answer_token_span_list,
                                                             yesno_list,
                                                             followup_list,
                                                             additional_metadata,
                                                             self._num_context_answers)

    def handle_unknown_answers(self, answers, plen):
        """
        Replace negative span offsets in place: a negative ``span_start`` becomes 0
        and a negative ``span_end`` becomes ``plen - 1``.
        """
        for ans in answers:
            if ans["span_start"] < 0:
                ans["span_start"] = 0
            if ans["span_end"] < 0:
                ans["span_end"] = plen - 1
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: str = 'en_core_web_sm') -> None:
    """
    Initialise the base predictor and attach a spaCy word splitter for ``language``
    with POS tags enabled.
    """
    super().__init__(model, dataset_reader)
    word_splitter = SpacyWordSplitter(language=language, pos_tags=True)
    self._tokenizer = word_splitter
class SemanticRoleLabelerPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.bidaf.SemanticRoleLabeler` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        """
        Parameters
        ----------
        model : ``Model``
            Forwarded to the parent initializer.
        dataset_reader : ``DatasetReader``
            Forwarded to the parent initializer.
        language : ``str``, optional (default = 'en_core_web_sm')
            Passed to ``SpacyWordSplitter`` as its ``language`` argument.
            The default reproduces the previously hard-coded value, so
            existing two-argument callers are unaffected.
        """
        super().__init__(model, dataset_reader)
        # POS tags are required: verbs are located through ``token.pos_``.
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence and returns a dictionary
        with the results.

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}

        Parameters
        ----------
        sentence, ``str``
            The sentence to parse via semantic role labeling.

        Returns
        -------
        A dictionary representation of the semantic roles in the sentence.
        """
        return self.predict_json({"sentence" : sentence})

    @staticmethod
    def make_srl_string(words: List[str], tags: List[str]) -> str:
        """
        Render one tagged frame as a single human-readable string.

        A ``B-<label>`` tag opens a bracketed chunk prefixed with
        ``<label>: ``, ``I-`` tags extend the open chunk, and ``O`` tokens are
        emitted as-is. Tokens carrying any other tag are dropped.

        Parameters
        ----------
        words : ``List[str]``
            The tokens of the sentence.
        tags : ``List[str]``
            One tag per token; pairs are formed with ``zip``, so extra items
            in the longer list are ignored.

        Returns
        -------
        The chunks and plain tokens joined by single spaces.
        """
        frame = []
        chunk = []

        for (token, tag) in zip(words, tags):
            if tag.startswith("I-"):
                chunk.append(token)
            else:
                # Any non-continuation tag closes the chunk currently open.
                if chunk:
                    frame.append("[" + " ".join(chunk) + "]")
                    chunk = []

                if tag.startswith("B-"):
                    chunk.append(tag[2:] + ": " + token)
                elif tag == "O":
                    frame.append(token)

        # Flush a chunk that runs to the end of the sentence.
        if chunk:
            frame.append("[" + " ".join(chunk) + "]")

        return " ".join(frame)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict):
        raise NotImplementedError("The SRL model uses a different API for creating instances.")

    def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
        """
        The SRL model has a slightly different API from other models, as the model is run
        forward for every verb in the sentence. This means that for a single sentence, we need
        to generate a ``List[Instance]``, where the length of this list corresponds to the number
        of verbs in the sentence. Additionally, all of these verbs share the same return dictionary
        after being passed through the model (as really we care about all the frames of the sentence
        together, rather than separately).

        Parameters
        ----------
        json_dict : ``JsonDict``, required.
            JSON that looks like ``{"sentence": "..."}``.

        Returns
        -------
        instances : ``List[Instance]``
            One instance per verb.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        words = [token.text for token in tokens]
        instances: List[Instance] = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB":
                # One-hot indicator marking which token is the predicate.
                verb_labels = [0 for _ in words]
                verb_labels[i] = 1
                instance = self._dataset_reader.text_to_instance(tokens, verb_labels)
                instances.append(instance)
        return instances

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like ``[{"sentence": "..."}, {"sentence": "..."}, ...]``
        and returns JSON that looks like

        .. code-block:: js

            [
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]},
                {"words": [...],
                 "verbs": [
                    {"verb": "...", "description": "...", "tags": [...]},
                    ...
                    {"verb": "...", "description": "...", "tags": [...]},
                ]}
            ]
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by taking the batch size which we use to be the number of sentences
        # we are given.
        batch_size = len(inputs)
        instances_per_sentence = [self._sentence_to_srl_instances(json_dict)
                                  for json_dict in inputs]

        flattened_instances = [instance
                               for sentence_instances in instances_per_sentence
                               for instance in sentence_instances]

        # No verbs anywhere in the batch: nothing to run through the model.
        if not flattened_instances:
            return sanitize([{"verbs": [], "words": self._tokenizer.split_words(x["sentence"])}
                             for x in inputs])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size, None)
        batched_instances[-1] = [instance for instance in batched_instances[-1]
                                 if instance is not None]

        # Run the model on the batches.
        outputs = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

        # ``outputs`` is flat; walk it with a running cursor and hand each
        # sentence as many outputs as it had verbs.
        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.split_words(inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output['tags']
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                        "verb": output["verb"],
                        "description": description,
                        "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Expects JSON that looks like ``{"sentence": "..."}``
        and returns JSON that looks like

        .. code-block:: js

            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        """
        instances = self._sentence_to_srl_instances(inputs)

        # A sentence without verbs yields no instances; return just its tokens.
        if not instances:
            return sanitize({"verbs": [], "words": self._tokenizer.split_words(inputs["sentence"])})

        outputs = self._model.forward_on_instances(instances)

        # The words are read from the first output only.
        results = {"verbs": [], "words": outputs[0]["words"]}
        for output in outputs:
            tags = output['tags']
            description = self.make_srl_string(output["words"], tags)
            results["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
            })

        return sanitize(results)
def setUp(self):
    """Run the parent fixture setup, then build a splitter with NER enabled."""
    # Zero-argument ``super()`` resolves the enclosing class automatically,
    # so the call no longer repeats the test class name.
    super().setUp()
    self.tokenizer = SpacyWordSplitter(ner=True)
class MTBDatasetReader(DatasetReader):
    """
    Reads a JSON file of relation episodes and turns each episode into an
    ``Instance`` in which the head and tail entities of every sentence are
    wrapped in marker tokens.

    As read by ``_read``, the file holds a two-element sequence: element 0 is
    the list of episodes and element 1 the list of labels, paired by position.
    Each episode is a dict with two keys:

        ``"meta_train"``: a list of groups, each a list of relation dicts
        ``"meta_test"``: a single relation dict

    A relation dict is accessed through its ``"tokens"``, ``'h'`` and ``'t'``
    entries. For ``'h'`` / ``'t'``, index 0 is used as the entity text and the
    length of index 2 as the expected number of occurrences
    (NOTE(review): the exact schema is not visible here - confirm).

    The resulting ``Instance`` has the fields:
        sentences: nested ``ListField`` of ``TextField``
        locations: nested ``ListField`` of ``MetadataField`` (head / tail marker indices)
        clean_tokens: nested ``ListField`` of ``MetadataField`` (the marked-up tokens)
        test: ``TextField``
        test_location: ``MetadataField``
        test_clean_text: ``MetadataField``
        label: ``IndexField`` into ``sentences`` (only when a label is given)

    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.
    tokenizer : ``Tokenizer``, optional
        Tokenizer applied to the space-joined tokens of each sentence and to
        the entity names. Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        # Used only as a fallback when an entity name cannot be located with
        # the main tokenizer (see ``_find_entity_name``).
        self.spacy_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
        # Keys of the two parts of every episode dict.
        self.TRAIN_DATA = "meta_train"
        self.TEST_DATA = "meta_test"

    @overrides
    def _read(self, file_path):
        """Yield one ``Instance`` per (episode, label) pair found in ``file_path``."""
        with open(cached_path(file_path), "r") as data_file:
            # Note: this logs the opened file object, not ``file_path``.
            logger.info("Reading instances from json files at: %s", data_file)
            data = json.load(data_file)
            # Element 1 holds the labels, element 0 the episodes.
            labels = data[1]
            data = data[0]
            for x, l in zip(data, labels):
                yield self.text_to_instance(x, l)

    @overrides
    def text_to_instance(
            self, data: dict,
            relation_type: int = None) -> Instance:  # type: ignore # pylint: disable=arguments-differ
        """
        Build the ``Instance`` for one episode.

        Parameters
        ----------
        data : ``dict``
            Episode dict holding the ``"meta_train"`` groups and the
            ``"meta_test"`` relation.
        relation_type : ``int``, optional
            When given, stored as an ``IndexField`` pointing into the
            ``sentences`` list field.

        Returns
        -------
        An ``Instance`` with the fields listed in the class docstring.
        """
        N_relations = []
        location_list = []
        all_tokens_sentences = []
        for i, K_examples in enumerate(data[self.TRAIN_DATA]):
            toknized_sentences = []
            sentences_location = []
            clean_text_for_debug = []
            for relation in K_examples:
                tokenized_tokens = self._tokenizer.tokenize(" ".join(
                    relation["tokens"]))
                # Inserts the four marker tokens into ``tokenized_tokens`` in place.
                head_location, tail_location = self.addStartEntityTokens(
                    tokenized_tokens, relation['h'], relation['t'])
                # Sanity check: the returned indices must point at the start markers.
                assert tokenized_tokens[head_location].text == head_start_token
                assert tokenized_tokens[tail_location].text == tail_start_token
                field_of_tokens = TextField(tokenized_tokens, self._token_indexers)
                locations_of_entities = MetadataField({
                    "head": head_location,
                    "tail": tail_location
                })
                clean_text_for_debug.append(MetadataField(tokenized_tokens))
                sentences_location.append(locations_of_entities)
                toknized_sentences.append(field_of_tokens)
            assert len(sentences_location) == len(toknized_sentences) == len(
                clean_text_for_debug)
            # Wrap the per-group python lists so they can be nested in an outer ListField.
            sentences_location = ListField(sentences_location)
            clean_text_for_debug = ListField(clean_text_for_debug)
            toknized_sentences = ListField(toknized_sentences)
            all_tokens_sentences.append(clean_text_for_debug)
            location_list.append(sentences_location)
            N_relations.append(toknized_sentences)
        assert len(N_relations) == len(location_list) == len(
            all_tokens_sentences)
        N_relations = ListField(N_relations)
        location_list = ListField(location_list)
        all_tokens_sentences = ListField(all_tokens_sentences)
        fields = {
            'sentences': N_relations,
            "locations": location_list,
            "clean_tokens": all_tokens_sentences
        }
        # The single test relation goes through the same marker insertion
        # (without the start-marker assertions used above).
        test_dict = data[self.TEST_DATA]
        tokenized_tokens = self._tokenizer.tokenize(" ".join(
            test_dict["tokens"]))
        head_location, tail_location = self.addStartEntityTokens(
            tokenized_tokens, test_dict['h'], test_dict['t'])
        test_clean_text_for_debug = MetadataField(tokenized_tokens)
        locations_of_entities = MetadataField({
            "head": head_location,
            "tail": tail_location
        })
        field_of_tokens = TextField(tokenized_tokens, self._token_indexers)
        fields['test'] = field_of_tokens
        fields['test_location'] = locations_of_entities
        fields['test_clean_text'] = test_clean_text_for_debug
        if relation_type is not None:
            fields['label'] = IndexField(relation_type, N_relations)
        return Instance(fields)

    def addStartEntityTokens(self, tokens_list, head_full_data,
                             tail_full_data):
        """
        Insert the head/tail start and end marker tokens into ``tokens_list``
        (mutated in place) and return the indices of the two start markers as
        ``(head_start_index, tail_start_index)``.
        """
        if len(head_full_data[0]) > len(
                tail_full_data[0]
        ):  # This handles nested tail and head entities,
            # for example: head = NEC and tail = NEC corp.
            # Solution: make sure no overlapping entity mentions are used.
            head_start_location, head_end_location = self.find_locations(
                head_full_data, tokens_list)
            tail_start_location, tail_end_location = self.find_locations(
                tail_full_data, tokens_list)
            # If the first tail match starts inside the first head match,
            # look for the tail again after the head.
            if tail_start_location[0] >= head_start_location[
                    0] and tail_start_location[0] <= head_end_location[0]:
                tail_end_location, tail_start_location = self.deny_overlapping(
                    tokens_list, head_end_location, tail_full_data)
        else:
            # Mirror image of the branch above with the head and tail roles swapped.
            tail_start_location, tail_end_location = self.find_locations(
                tail_full_data, tokens_list)
            head_start_location, head_end_location = self.find_locations(
                head_full_data, tokens_list)
            if head_start_location[0] >= tail_start_location[
                    0] and head_start_location[0] <= tail_end_location[0]:
                head_end_location, head_start_location = self.deny_overlapping(
                    tokens_list, tail_end_location, head_full_data)
        # TODO: try different approaches for choosing which entity location to use.
        # NOTE(review): the helper is defined outside this class; its results are
        # used as plain integer indices below - confirm it reduces the lists to ints.
        h_start_location, head_end_location, tail_start_location, tail_end_location = find_closest_distance_between_entities \
            (head_start_location, head_end_location, tail_start_location, tail_end_location)
        x = self._tokenizer.tokenize(head_start_token)
        y = self._tokenizer.tokenize(head_end_token)
        z = self._tokenizer.tokenize(tail_start_token)
        w = self._tokenizer.tokenize(tail_end_token)
        # The two head markers are inserted first, so every tail index shifts
        # by 2 when the tail comes after the head (bool * 2 -> 0 or 2).
        offset_tail = 2 * (tail_start_location > h_start_location)
        tokens_list.insert(h_start_location,
                           x[0])  # arbitrarily use the first token of the marker
        tokens_list.insert(head_end_location + 1 + 1,
                           y[0])  # arbitrarily use the first token of the marker
        tokens_list.insert(tail_start_location + offset_tail,
                           z[0])  # arbitrarily use the first token of the marker
        tokens_list.insert(tail_end_location + 2 + offset_tail,
                           w[0])  # arbitrarily use the first token of the marker
        # When the tail precedes the head (offset_tail == 0) the two tail
        # markers push the head start marker two positions to the right.
        return h_start_location + 2 - offset_tail, tail_start_location + offset_tail

    def deny_overlapping(self, tokens_list, longest_entity_end_location,
                         shortest_entity_full_data):
        """
        Search for the shorter entity again, only in the tokens after the
        first occurrence of the longer one.

        Returns ``(end_location, start_location)`` - note the order.
        """
        start_location, end_location = self.find_locations(
            shortest_entity_full_data,
            tokens_list[longest_entity_end_location[0] + 1:])
        # Translate the first match from slice-relative to absolute indices;
        # only element 0 of each list is adjusted.
        # NOTE(review): the slice starts at ``end + 1`` but only ``end`` is
        # added back here - looks like a possible off-by-one, please confirm.
        start_location[0] = start_location[0] + longest_entity_end_location[0]
        end_location[0] = end_location[0] + longest_entity_end_location[0]
        return end_location, start_location

    def return_lower_text_from_tokens(self, tokens):
        """Return the lower-cased ``.text`` of every token as a list."""
        return list(map(lambda x: x.text.lower(), tokens))

    def compare_two_token_lists(self, x, y):
        """Case-insensitively compare two token lists by their text."""
        return self.return_lower_text_from_tokens(
            x) == self.return_lower_text_from_tokens(y)

    def spacy_work_toknizer(self, text):
        """Split ``text`` with the spaCy splitter and return the token texts."""
        return list(
            map(lambda x: x.text, self.spacy_splitter.split_words(text)))

    def find_locations(self, head_full_data, token_list):
        """
        Locate the occurrences of an entity in ``token_list``.

        Returns ``(start_location, end_location)``, two equally long lists of
        token indices (end indices are inclusive). Fails with
        ``AssertionError`` when the number of matches differs from
        ``len(head_full_data[2])``.
        """
        end_location, start_location = self._find_entity_name(
            token_list, head_full_data)
        # Nothing found: retry after pre-splitting the entity name with spaCy.
        if len(end_location) == 0 or len(start_location) == 0:
            end_location, start_location = self._find_entity_name(
                token_list, head_full_data, True)
        assert len(start_location) == len(end_location)
        assert len(start_location) == len(head_full_data[2])
        return start_location, end_location

    def _find_entity_name(self,
                          token_list,
                          head_full_data,
                          use_spacy_toknizer_before=False):
        """
        Scan ``token_list`` for case-insensitive matches of the tokenized
        entity name ``head_full_data[0]``.

        Returns ``(end_location, start_location)`` - note the order. The scan
        stops once ``len(head_full_data[2])`` matches have been collected.
        """
        if use_spacy_toknizer_before:
            # Split with spaCy first, re-join with spaces, then tokenize as usual.
            spacy_head_tokens = self.spacy_work_toknizer(head_full_data[0])
            head = self._tokenizer.tokenize(" ".join(spacy_head_tokens))
        else:
            head = self._tokenizer.tokenize(" ".join([head_full_data[0]]))
        start_head_entity_name = head[0]
        start_location = []
        end_location = []
        for i, token in enumerate(token_list):
            # Cheap first-token check before comparing the whole window.
            if self.compare_two_token_lists([token], [start_head_entity_name]):
                if self.compare_two_token_lists(token_list[i:i + len(head)],
                                                head):
                    start_location.append(i)
                    end_location.append(i + len(head) - 1)
                    if len(start_location) == len(head_full_data[2]):
                        break
        return end_location, start_location
def __init__(self, language):
    """
    Run the parent initializer and create the spaCy word splitter.

    ``language`` is handed to ``SpacyWordSplitter`` as its ``language``
    argument; POS tagging is always enabled.
    """
    super().__init__()
    word_splitter = SpacyWordSplitter(pos_tags=True, language=language)
    self.tokenizer = word_splitter
def setUp(self):
    """Run the parent fixture setup, then build a POS-tagging word splitter."""
    super().setUp()
    pos_splitter = SpacyWordSplitter(pos_tags=True)
    self.tokenizer = pos_splitter
class BiaffineDependencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.BiaffineDependencyParser` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        """
        Parameters
        ----------
        model : ``Model``
            Forwarded to the parent initializer.
        dataset_reader : ``DatasetReader``
            Forwarded to the parent initializer.
        language : ``str``, optional (default = 'en_core_web_sm')
            Passed to ``SpacyWordSplitter`` as its ``language`` argument.
            The default reproduces the previously hard-coded value, so
            existing two-argument callers are unaffected.
        """
        super().__init__(model, dataset_reader)
        # TODO(Mark) Base the language on a model attribute rather than only
        # on this constructor argument.
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.

        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.split_words(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos: # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        """Run the model on one instance and attach a ``hierplane_tree`` entry."""
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        outputs["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        """Run the model on a batch and attach a ``hierplane_tree`` entry to each output."""
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            words = output["words"]
            pos = output["pos"]
            heads = output["predicted_heads"]
            tags = output["predicted_dependencies"]
            output["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @staticmethod
    def _build_hierplane_tree(words: List[str],
                              heads: List[int],
                              tags: List[str],
                              pos: List[str]) -> Dict[str, Any]:
        """
        Parameters
        ----------
        words : ``List[str]``
            The tokens of the sentence.
        heads : ``List[int]``
            For each token, the 1-based index of its head; 0 marks the root.
        tags : ``List[str]``
            The dependency label of each token.
        pos : ``List[str]``
            The part-of-speech tag of each token.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.

        Raises
        ------
        ValueError
            If ``heads`` contains no 0, i.e. no token points at the root.
        """
        # Character span of every word inside ``" ".join(words)``. The extra
        # +1 accounts for the separating space, so each span's end includes it.
        word_index_to_cumulative_indices: Dict[int, Tuple[int, int]] = {}
        cumulative_index = 0
        for i, word in enumerate(words):
            word_length = len(word) + 1
            word_index_to_cumulative_indices[i] = (cumulative_index, cumulative_index + word_length)
            cumulative_index += word_length

        def node_constuctor(index: int):
            # ``heads`` is 1-based, so the children of the word at ``index``
            # are the positions whose head equals ``index + 1``.
            children = []
            for next_index, child in enumerate(heads):
                if child == index + 1:
                    children.append(node_constuctor(next_index))

            # These are the icons which show up in the bottom right
            # corner of the node.
            attributes = [pos[index]]
            start, end = word_index_to_cumulative_indices[index]

            hierplane_node = {
                    "word": words[index],
                    # The type of the node - all nodes with the same
                    # type have a unified colour.
                    "nodeType": tags[index],
                    # Attributes of the node.
                    "attributes": attributes,
                    # The link between the node and its parent.
                    "link": tags[index],
                    "spans": [{"start": start, "end": end}]
            }
            if children:
                hierplane_node["children"] = children
            return hierplane_node

        # We are guaranteed that there is a single word pointing to
        # the root index, so we can find it just by searching for 0 in the list.
        root_index = heads.index(0)

        hierplane_tree = {
                "text": " ".join(words),
                "root": node_constuctor(root_index),
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "linkToPosition": LINK_TO_POSITION
        }
        return hierplane_tree
class TestPosTagIndexer(AllenNlpTestCase):
    """Tests for ``PosTagIndexer`` using tokens produced by a POS-tagging spaCy splitter."""

    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyWordSplitter(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        # Sentence tokens surrounded by two untagged boundary tokens.
        sentence_tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>"), *sentence_tokens, Token("</S>")]
        indexer = PosTagIndexer()

        # Default configuration.
        fine_counter = defaultdict(lambda: defaultdict(int))
        for tok in tokens:
            indexer.count_vocab_items(tok, fine_counter)
        expected_fine = {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
        assert fine_counter["pos_tokens"] == expected_fine

        # Same tokens again after switching the indexer to coarse tags.
        indexer._coarse_tags = True  # pylint: disable=protected-access
        coarse_counter = defaultdict(lambda: defaultdict(int))
        for tok in tokens:
            indexer.count_vocab_items(tok, coarse_counter)
        expected_coarse = {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
        assert coarse_counter["pos_tokens"] == expected_coarse

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = list(self.tokenizer.split_words("This is a sentence."))
        tokens.append(Token("</S>"))

        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        for extra_tag in ('DET', 'NOUN', 'PUNCT'):
            vocab.add_token_to_namespace(extra_tag, namespace='pos_tags')

        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        token_indices = indices["tokens"]
        assert token_indices[1] == verb_index
        assert token_indices[-1] == none_index

        # With coarse tags switched off the second token maps to 'VBZ'.
        indexer._coarse_tags = False  # pylint: disable=protected-access
        single = indexer.tokens_to_indices([tokens[1]], vocab, "coarse")
        assert single == {"coarse": [cop_index]}

    def test_padding_functions(self):
        indexer = PosTagIndexer()
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        unpadded = {'key': [1, 2, 3, 4, 5]}
        desired_lengths = {'key': 10}
        padded_tokens = indexer.as_padded_tensor(unpadded, desired_lengths, {})
        assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]

    def test_blank_pos_tag(self):
        tokens = []
        for text in "allennlp is awesome .".split(" "):
            tokens.append(Token(text)._replace(pos_=""))

        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for tok in tokens:
            indexer.count_vocab_items(tok, counter)

        # spacy uses a empty string to indicate "no POS tag"
        # we convert it to "NONE"
        assert counter["pos_tokens"]["NONE"] == 4

        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index('NONE', 'pos_tokens')
        # should raise no exception
        indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
        assert {"pos": [none_index] * 4} == indices
class ConstituencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = 'en_core_web_sm') -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a constituency parse for the given sentence.

        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the constituency tree.
        """
        request = {"sentence" : sentence}
        return self.predict_json(request)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        sentence_text = []
        pos_tags = []
        # Collect the surface form and the ``tag_`` attribute of every token.
        for token in self._tokenizer.split_words(json_dict["sentence"]):
            sentence_text.append(token.text)
            pos_tags.append(token.tag_)
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    def _render_tree_outputs(self, output: JsonDict) -> None:
        """
        Replace the tree stored under ``"trees"`` in ``output`` with its
        single-line string form and add the matching ``"hierplane_tree"``.
        Mutates ``output`` in place.
        """
        tree = output.pop("trees")
        output["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        # A very large margin keeps the formatted tree on a single line.
        output["trees"] = tree.pformat(margin=1000000)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)
        self._render_tree_outputs(outputs)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for single_output in outputs:
            self._render_tree_outputs(single_output)
        return sanitize(outputs)

    def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        subtrees = []
        for element in tree:
            if not isinstance(element, Tree):
                # NLTK leaves are plain strings: advance the character
                # index by the length of the word.
                index += len(element)
                continue
            # A ``Tree`` child has children of its own, so recurse.
            subtrees.append(self._build_hierplane_tree(element, index, is_root=False))

        label = tree.label()
        span = " ".join(tree.leaves())
        node = {"word": span, "nodeType": label, "attributes": [label], "link": label}
        if subtrees:
            node["children"] = subtrees
        # TODO(Mark): Figure out how to span highlighting to the leaves.
        if not is_root:
            return node

        # Only the outermost call wraps the node in the rendering envelope.
        return {
                "linkNameToLabel": LINK_TO_LABEL,
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "text": span,
                "root": node
        }
def setUp(self):
    """Run the parent fixture setup, then build a splitter with parsing enabled."""
    # Zero-argument ``super()`` resolves the enclosing class automatically,
    # so the call no longer repeats the test class name.
    super().setUp()
    self.tokenizer = SpacyWordSplitter(parse=True)
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: str = "en_core_web_sm") -> None:
    """
    Set up the predictor and its spaCy word splitter.

    Parameters
    ----------
    model : ``Model``
        Forwarded to the parent initializer.
    dataset_reader : ``DatasetReader``
        Forwarded to the parent initializer.
    language : ``str``, optional (default = "en_core_web_sm")
        Passed to ``SpacyWordSplitter`` as its ``language`` argument. The
        default reproduces the previously hard-coded value, so existing
        two-argument callers are unaffected.
    """
    super().__init__(model, dataset_reader)
    self._tokenizer = SpacyWordSplitter(language=language)
def setUp(self):
    """Run the parent fixture setup, then build a POS-tagging word splitter."""
    # Zero-argument ``super()`` resolves the enclosing class automatically,
    # so the call no longer repeats the test class name.
    super().setUp()
    self.tokenizer = SpacyWordSplitter(pos_tags=True)
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: str = 'en_core_web_sm') -> None:
    """
    Set up the predictor and its POS-tagging spaCy word splitter.

    Parameters
    ----------
    model : ``Model``
        Forwarded to the parent initializer.
    dataset_reader : ``DatasetReader``
        Forwarded to the parent initializer.
    language : ``str``, optional (default = 'en_core_web_sm')
        Passed to ``SpacyWordSplitter`` as its ``language`` argument. The
        default reproduces the previously hard-coded value, so existing
        two-argument callers are unaffected.
    """
    super().__init__(model, dataset_reader)
    self._tokenizer = SpacyWordSplitter(language=language, pos_tags=True)
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
    """
    Set up the predictor and a ``WordTokenizer`` whose word splitter is a
    ``SpacyWordSplitter`` created with POS tagging enabled.

    ``model`` and ``dataset_reader`` are forwarded unchanged to the parent
    initializer.
    """
    super().__init__(model, dataset_reader)
    pos_splitter = SpacyWordSplitter(pos_tags=True)
    self._tokenizer = WordTokenizer(word_splitter=pos_splitter)