def _make_instance_from_text(self, sent_tokens, pred_index, annotations=None, sent_id=None):
    instance_dict = {}
    if isinstance(sent_tokens, str):
        sent_tokens = sent_tokens.split()
    sent_tokens = cleanse_sentence_text(sent_tokens)
    text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
    instance_dict['text'] = text_field
    instance_dict['predicate_indicator'] = SequenceLabelField(
        [1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)
    if annotations is not None:
        for i, slot_name in enumerate(self._slot_labels):
            span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s" % slot_name)
                                   for ann in annotations for span in ann.all_spans])
            instance_dict['span_slot_%s' % slot_name] = span_slot
        labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field)
                                        for ann in annotations for span in ann.all_spans])
        instance_dict['labeled_spans'] = labeled_span_field
        if self._bio_labels:
            bio_labels = ["O"] * len(sent_tokens)
            bio_labels[pred_index] = "B-V"
            for span in self._resolve_spans(annotations, pred_index):
                bio_labels[span.start()] = "B-ARG"
                for i in range(span.start() + 1, span.end() + 1):
                    bio_labels[i] = "I-ARG"
            instance_dict["bio_label"] = SequenceLabelField(bio_labels, text_field,
                                                            label_namespace="bio_labels")
        instance_dict['annotations'] = MetadataField({'annotations': annotations})
    metadata = {'pred_index': pred_index, 'sent_text': " ".join(sent_tokens)}
    if sent_id is not None:
        metadata['sent_id'] = sent_id
    instance_dict['metadata'] = MetadataField(metadata)
    return Instance(instance_dict)

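# Minimal sketch of the predicate-indicator / BIO pattern used above, with
# hypothetical inputs (assumes allennlp is installed; cleanse_sentence_text,
# the annotation objects, and the rest of the reader are omitted).
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.fields import TextField, SequenceLabelField

sent_tokens = ["The", "cat", "chased", "the", "mouse"]
pred_index = 2  # "chased"
text_field = TextField([Token(t) for t in sent_tokens], {"tokens": SingleIdTokenIndexer()})

bio_labels = ["O"] * len(sent_tokens)
bio_labels[pred_index] = "B-V"
# one answer span covering tokens 3..4 ("the mouse")
bio_labels[3], bio_labels[4] = "B-ARG", "I-ARG"
bio_field = SequenceLabelField(bio_labels, text_field, label_namespace="bio_labels")
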
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers, pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.

    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """
    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! That means it's invalid. {} {}"
                                     .format(tokenized_sent, old_det_to_new_ind))
                text_to_use = GENDER_NEUTRAL_NAMES[new_ind % len(GENDER_NEUTRAL_NAMES)] \
                    if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs, padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags

def text_to_instance(self,  # type: ignore
                     formalism: str,
                     position_in_corpus: int,
                     am_sentence: AMSentence) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    formalism : ``str``, required.
        The formalism of this instance (e.g. DM, PSD, ...).
    position_in_corpus : ``int``, required.
        The index of this sentence in the corpus.
    am_sentence : ``AMSentence``, required.
        The words in the sentence to be encoded.

    Returns
    -------
    An instance containing words, pos tags, dependency edge labels, head indices,
    supertags and lexical labels as fields.
    """
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(w) for w in am_sentence.get_tokens(shadow_art_root=True)],
                       self._token_indexers)
    fields["words"] = tokens
    fields["pos_tags"] = SequenceLabelField(am_sentence.get_pos(), tokens, label_namespace="pos")
    fields["ner_tags"] = SequenceLabelField(am_sentence.get_ner(), tokens, label_namespace="ner_labels")
    fields["lemmas"] = SequenceLabelField(am_sentence.get_lemmas(), tokens, label_namespace="lemmas")
    fields["supertags"] = SequenceLabelField(am_sentence.get_supertags(), tokens,
                                             label_namespace=formalism + "_supertag_labels")
    fields["lexlabels"] = SequenceLabelField(am_sentence.get_lexlabels(), tokens,
                                             label_namespace=formalism + "_lex_labels")
    fields["head_tags"] = SequenceLabelField(am_sentence.get_edge_labels(), tokens,
                                             label_namespace=formalism + "_head_tags")  # edge labels
    fields["head_indices"] = SequenceLabelField(am_sentence.get_heads(), tokens,
                                                label_namespace="head_index_tags")
    fields["metadata"] = MetadataField({"words": am_sentence.words,
                                        "attributes": am_sentence.attributes,
                                        "formalism": formalism,
                                        "position_in_corpus": position_in_corpus,
                                        "token_ranges": am_sentence.get_ranges(),
                                        "is_annotated": am_sentence.is_annotated()})
    return Instance(fields)

def text_to_instance(self,  # type: ignore
                     sentence: str = None,
                     tokens: List[Token] = None,
                     targets: List[str] = None) -> Instance:
    """
    Parameters
    ----------
    sentence : ``str``, optional
        A sentence containing [MASK] tokens that should be filled in by the model.
        This input is superseded and ignored if ``tokens`` is given.
    tokens : ``List[Token]``, optional
        An already-tokenized sentence containing some number of [MASK] tokens to be predicted.
    targets : ``List[str]``, optional
        Contains the target tokens to be predicted. The length of this list should be the
        same as the number of [MASK] tokens in the input.
    """
    if not tokens:
        tokens = self._tokenizer.tokenize(sentence)
    input_field = TextField(tokens, self._token_indexers)
    mask_positions = []
    for i, token in enumerate(tokens):
        if token.text == "[MASK]":
            mask_positions.append(i)
    if not mask_positions:
        raise ValueError("No [MASK] tokens found!")
    if targets and len(targets) != len(mask_positions):
        raise ValueError(f"Found {len(mask_positions)} mask tokens and {len(targets)} targets")
    mask_position_field = ListField([IndexField(i, input_field) for i in mask_positions])
    # TODO(mattg): there's a problem if the targets get split into multiple word pieces...
    fields: Dict[str, Field] = {"tokens": input_field, "mask_positions": mask_position_field}
    if targets is not None:
        target_field = TextField([Token(target) for target in targets], self._token_indexers)
        fields["target_ids"] = target_field
    return Instance(fields)

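# Hedged usage sketch for the masked-LM reader above: how [MASK] positions
# become IndexFields. Standalone and assumes only allennlp; the reader's real
# tokenizer is replaced by a whitespace split for illustration.
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.fields import TextField, IndexField, ListField

tokens = [Token(t) for t in "The [MASK] sat on the mat .".split()]
input_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
mask_positions = [i for i, tok in enumerate(tokens) if tok.text == "[MASK]"]
mask_position_field = ListField([IndexField(i, input_field) for i in mask_positions])
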
def read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        instances = []
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in tqdm.tqdm(data_file):
            line = line.strip("\n")
            # skip blank lines
            if not line:
                continue
            pred_id = int(line.split()[0])
            tokens_and_tags = line.split(maxsplit=1)[1].split(self._word_tag_delimiter)
            tokens = [Token(token) for token in tokens_and_tags[0].split()]
            tags = [tag for tag in tokens_and_tags[1].split()]
            pred_tags = [0 if i != pred_id else 1 for i in range(len(tokens))]
            sequence = TextField(tokens, self._token_indexers)
            sequence_tags = SequenceLabelField(tags, sequence)
            sequence_pred_tags = SequenceLabelField(pred_tags, sequence)
            instances.append(Instance({'tokens': sequence,
                                       'tags': sequence_tags,
                                       'verb_indicator': sequence_pred_tags}))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)

def text_to_instance(self,
                     words: List[str],
                     upos_tags: List[str],
                     dependencies: List[Tuple[str, int]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1-indexed, meaning
        an index of 0 corresponds to that word being the root of the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head indices as fields.
    """
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["words"] = tokens
    fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies], tokens,
                                                 label_namespace=self._task_type + "_head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies], tokens,
                                                    label_namespace="head_index_tags")
    fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
    return Instance(fields)

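# Illustrative inputs for the dependency reader above (values are made up):
# heads are 1-indexed and 0 marks the root, matching the docstring.
words = ["The", "dog", "barks"]
upos_tags = ["DET", "NOUN", "VERB"]
dependencies = [("det", 2), ("nsubj", 3), ("root", 0)]
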
def _read(self, file_dir: str):
    # file_dir should point to the conllu.tar.gz file plus train, dev, or test
    # example: file_dir="data/en/conllu.tar.gz/train"
    file, split = os.path.split(file_dir)
    tar = tarfile.open(file, "r:gz")
    file_names = [tarinfo for tarinfo in tar.getmembers()
                  if split in tarinfo.name and ".conllu" in tarinfo.name]
    if split == "train" and self._num_examples > -1:
        file_names = file_names[:self._num_examples]

    for fname in file_names:
        content = tar.extractfile(fname)
        language = content.readline().decode("utf8").rstrip("\n")[-2:]
        rating = content.readline().decode("utf8").rstrip("\n")[-1]
        doc_id = content.readline().decode("utf8").rstrip("\n").split()[-1]
        tokens = []
        num_sents = 0
        num_tokens = 0
        for line in content:
            line = line.decode("utf8")
            if line[0] == '#':
                continue
            if not line.rstrip("\n"):
                num_sents += 1
                continue
            else:
                tokens.append(Token(line.split("\t")[1]))
                num_tokens += 1
        yield self.text_to_instance(tokens, doc_id, rating, num_sents, num_tokens)

def text_to_instance(self,  # type: ignore
                     query: List[str],
                     prelinked_entities: Dict[str, Dict[str, str]] = None,
                     sql: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(t) for t in query], self._token_indexers)
    fields["tokens"] = tokens

    if sql is not None:
        try:
            action_sequence, all_actions = self._world.get_action_sequence_and_all_actions(
                sql, prelinked_entities)
        except ParseError:
            return None

    index_fields: List[Field] = []
    production_rule_fields: List[Field] = []
    for production_rule in all_actions:
        nonterminal, _ = production_rule.split(' ->')
        production_rule = ' '.join(production_rule.split(' '))
        field = ProductionRuleField(production_rule, self._world.is_global_rule(nonterminal))
        production_rule_fields.append(field)

    valid_actions_field = ListField(production_rule_fields)
    fields["valid_actions"] = valid_actions_field

    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(valid_actions_field.field_list)}

    for production_rule in action_sequence:
        index_fields.append(IndexField(action_map[production_rule], valid_actions_field))
    action_sequence_field = ListField(index_fields)
    fields["action_sequence"] = action_sequence_field
    return Instance(fields)

def _read(self, file_path: str):
    for sentence in open(cached_path(file_path), "r"):
        tokens = sentence.strip().split(" ")
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
        words = []
        for index, token in enumerate(tokens):
            # Coreference is annotated using [square brackets]
            # or (round brackets) around coreferent phrases.
            if "[" in token and "]" in token:
                clusters[0].append((index, index))
            elif "[" in token:
                clusters[0].append((index, index))
            elif "]" in token:
                old_span = clusters[0][-1]
                clusters[0][-1] = (old_span[0], index)

            if "(" in token and ")" in token:
                clusters[1].append((index, index))
            elif "(" in token:
                clusters[1].append((index, index))
            elif ")" in token:
                old_span = clusters[1][-1]
                clusters[1][-1] = (old_span[0], index)

            if token.endswith("."):
                # Winobias is tokenised, but not for full stops.
                # We'll just special case them here.
                token = token[:-1]
                words.append(token.strip("[]()"))
                words.append(".")
            else:
                words.append(token.strip("[]()"))

        yield self.text_to_instance([Token(x) for x in words], [x for x in clusters.values()])

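# Worked example of the Winobias bracket convention parsed above (made-up line):
#   "[The developer] argued with the designer because (he) did not like the design ."
# Square brackets populate clusters[0] and round brackets clusters[1]; the
# bracket-stripped words are what reach text_to_instance.
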
def text_to_instance(self, data: Dict[str, Any]) -> Instance:
    # pylint: disable=arguments-differ
    # Flatten and pad tokens
    tokens = data['tokens']
    tokens = [Token(x) for x in tokens]
    fields = {'tokens': TextField(tokens, self._token_indexers)}

    # If annotations are provided, process them into arrays.
    if 'annotations' in data:
        # Initialize arrays and book-keeping data structures.
        seen_entities: Set[str] = set()
        entity_types = np.zeros(shape=(len(tokens),))
        entity_ids = np.zeros(shape=(len(tokens),))
        mention_lengths = np.ones(shape=(len(tokens),))

        # Process annotations
        for annotation in data['annotations']:
            seen_entities.add(annotation['id'])
            start, end = annotation['span']
            length = end - start
            for i in range(*annotation['span']):
                # Note: +1 offset to account for start token.
                entity_types[i] = 1
                entity_ids[i] = len(seen_entities)
                mention_lengths[i] = length
                length -= 1

        fields['entity_types'] = SequentialArrayField(entity_types, dtype=np.uint8)
        fields['entity_ids'] = SequentialArrayField(entity_ids, dtype=np.int64)
        fields['mention_lengths'] = SequentialArrayField(mention_lengths, dtype=np.int64)

    return Instance(fields)

def text_to_instance(self,
                     annotation_id: str,
                     document: str,
                     query: str = None,
                     label: str = None,
                     rationale: List[int] = None,
                     tokens_existing: List[str] = None) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    fields = {}

    tokens = [Token(w) for w in tokens_existing]
    rationale_tokens = rationale
    # Compare against the token text: a Token object never equals the plain
    # string '[SEP]', so the original comparison always yielded 0.
    keep_tokens = [1 if t.text == '[SEP]' else 0 for t in tokens]

    fields["document"] = TextField(tokens, self._token_indexers)
    fields["rationale"] = SequenceLabelField(rationale_tokens, fields["document"], "rationale_labels")

    metadata = {
        "annotation_id": annotation_id,
        "tokens": tokens,
        "keep_tokens": keep_tokens,
        "token_rationale": rationale_tokens,
        "document": document,
        "query": query,
        "convert_tokens_to_instance": self.convert_tokens_to_instance,
        "label": label,
    }
    fields["metadata"] = MetadataField(metadata)

    if label is not None:
        fields["label"] = LabelField(label, label_namespace="labels")

    return Instance(fields)

def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """
    Finalize predictions.

    This method overrides ``Model.decode``, which gets called after ``Model.forward``, at test
    time, to finalize predictions. The logic for the decoder part of the encoder-decoder lives
    within the ``forward`` method.

    This method trims the output predictions to the first end symbol, replaces indices with
    corresponding tokens, and adds a field called ``predicted_tokens`` to the ``output_dict``.
    """
    predicted_indices = output_dict["predictions"]
    if not isinstance(predicted_indices, numpy.ndarray):
        predicted_indices = predicted_indices.detach().cpu().numpy()
    all_predicted_tokens = []
    for indices in predicted_indices:
        # Beam search gives us the top k results for each source sentence in the batch
        # but we just want the single best.
        if len(indices.shape) > 1:
            indices = indices[0]
        indices = list(indices)
        if self._end_index in indices:
            indices = indices[:indices.index(self._end_index)]
        predicted_tokens = []
        for x in indices:
            if x in [self._end_index, self._start_index, self._pad_index]:
                continue
            if x >= self._num_classes:
                index = x - self._num_classes
                predicted_tokens.append(Token("@entity_%d" % index))
            else:
                w = self.vocab.get_token_from_index(x, namespace=self._target_namespace)
                predicted_tokens.append(w)
        all_predicted_tokens.append(predicted_tokens)
    output_dict["predicted_tokens"] = all_predicted_tokens
    return output_dict

def text_to_instance(self,
                     tokens: List[str],
                     arc_indices: List[Tuple[int, int]] = None,
                     arc_tags: List[str] = None,
                     upos_tags: List[str] = None,
                     xpos_tags: List[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    if self.use_lowercase:
        tokens = list(map(str.lower, tokens))
    tokens = self.tokenizer.tokenize(' '.join(tokens)) \
        if self.tokenizer is not None else [Token(t) for t in tokens]

    text_field = TextField(tokens, self._token_indexers)
    fields['tokens'] = text_field
    if upos_tags is not None:
        fields['upos_tags'] = SequenceLabelField(upos_tags, text_field, label_namespace='upos')
    if xpos_tags is not None:
        fields['xpos_tags'] = SequenceLabelField(xpos_tags, text_field, label_namespace='xpos')
    if arc_indices is not None and arc_tags is not None:
        fields['adjacency_matrix'] = AdjacencyField(arc_indices, text_field, arc_tags,
                                                    label_namespace='dependency', padding_value=-1)
    return Instance(fields)

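# Sketch of the arc inputs consumed by the AdjacencyField above (hypothetical
# values): one (i, j) token-index pair per labelled arc, tags aligned by position.
tokens = ["John", "loves", "Mary"]
arc_indices = [(1, 0), (1, 2)]  # loves -> John, loves -> Mary
arc_tags = ["nsubj", "obj"]     # one tag per arc, namespace "dependency"
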
def test_get_valid_actions_in_world_without_comparable_columns(self):
    question_tokens = [Token(x) for x in ['what', 'was', 'the', 'first', 'title', '?']]
    table_file = self.FIXTURES_ROOT / 'data' / 'corenlp_processed_tables' / 'TEST-1.table'
    table_context = TableQuestionContext.read_from_file(table_file, question_tokens)
    # The table does not have date or number columns.
    assert "date" not in table_context.column_types.values()
    assert "number" not in table_context.column_types.values()
    world = WikiTablesVariableFreeWorld(table_context)
    actions = world.get_valid_actions()
    assert set(actions.keys()) == {
        "<r,<g,s>>",
        "<r,<g,r>>",
        "<r,<t,<s,r>>>",
        "<n,<n,<n,d>>>",
        "<r,r>",
        "<r,n>",
        "d",
        "n",
        "s",
        "t",
        "r",
        "@start@",
    }
    assert set([str(type_) for type_ in world.get_basic_types()]) == {'n', 'd', 's', 'r', 't', 'g'}

def _read(self, file_path: str) -> Iterator[Instance]:
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    names = []
    countries = []

    # Turn a Unicode string into plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
    def unicodeToAscii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn' and c in all_letters)

    # Read a file and split into lines
    def readLines(file_path):
        lines = open(file_path, encoding='utf-8').read().strip().split('\n')
        return [unicodeToAscii(line) for line in lines]

    lines = readLines(file_path)
    for pair in lines:
        yield self.name_to_instance([Token(pair.strip().split()[0])], pair.strip().split()[1])

def main():
    opts = options()

    # load zero pronoun detector
    with open(opts.tagger_param_file, mode='rb') as f:
        tagger_params = pickle.load(f)
        print(tagger_params)
    tagger_vocab = Vocabulary.from_files(opts.tagger_vocab_file)
    tagger_model = Tagger.build(tagger_params, tagger_vocab)
    tagger_model, tagger_indexer = load_model(tagger_model, tagger_params,
                                              opts.tagger_model_file, opts.gpuid)

    # prepare dataset readers
    tagger_reader = TaggerDatasetReader(token_indexers={"tokens": tagger_indexer})

    with codecs.open(opts.input_file, "r", encoding="utf8") as f_in, \
            codecs.open(opts.output_file, "w", encoding="utf8") as f_out:
        for line in f_in:
            line = line.strip()
            toks = [Token(tok) for tok in line.split(" ")]
            tagger_instance = tagger_reader.text_to_instance(toks)
            output = tagger_model.forward_on_instance(tagger_instance)
            f_out.write(" ".join(output["tags"]) + "\n")

def _process_sentence(self,
                      sentence_tokens: List[str],
                      verbal_predicates: List[int],
                      predicate_argument_labels: List[List[str]]) -> List[Instance]:
    """
    Parameters
    ----------
    sentence_tokens : ``List[str]``, required.
        The tokenised sentence.
    verbal_predicates : ``List[int]``, required.
        The indexes of the verbal predicates in the sentence which have an
        associated annotation.
    predicate_argument_labels : ``List[List[str]]``, required.
        A list of predicate argument labels, one for each verbal_predicate.
        The internal lists are of length: len(sentence).

    Returns
    -------
    A list of Instances.
    """
    tokens = [Token(t) for t in sentence_tokens]
    if not verbal_predicates:
        # Sentence contains no predicates.
        tags = ["O" for _ in sentence_tokens]
        verb_label = [0 for _ in sentence_tokens]
        return [self.text_to_instance(tokens, verb_label, tags)]
    else:
        instances = []
        for verb_index, annotation in zip(verbal_predicates, predicate_argument_labels):
            tags = annotation
            verb_label = [0 for _ in sentence_tokens]
            verb_label[verb_index] = 1
            instances.append(self.text_to_instance(tokens, verb_label, tags))
        return instances

def fill_token_indices(tokens, text, uncased):
    new_tokens = []
    text_idx = 0

    if uncased:
        text = text.lower()

    for token in tokens:
        # WordPiece continuation tokens are prefixed with "##", which does not
        # appear in the original text, so it must not count against the offset.
        first_char_idx = 2 if len(token.text) > 2 and token.text[:2] == "##" else 0

        while text[text_idx] == ' ' or text[text_idx] == '\xa0':
            text_idx += 1

        new_tokens.append(Token(text=token.text, idx=text_idx))

        token_len = len(token.text) - first_char_idx
        if token.text == '[UNK]':
            token_len = 1
        text_idx += token_len

    return new_tokens

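# Worked example of the "##" offset handling in fill_token_indices (values
# assume an uncased BERT WordPiece tokenization; Token.idx is a character offset):
#   text   = "playing chess"
#   tokens = ["play", "##ing", "chess"]
#   -> idx 0 for "play"; idx 4 for "##ing" (its 2-char "##" prefix is not
#      counted against the text); idx 8 for "chess" after skipping the space.
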
def clean(self, passage, question, answer, passage_tagging, question_tagging):
    passage_tokens = [Token(w) for w in passage_tagging['words']]
    spans = DropReader.find_valid_spans(passage_tokens, answer['spans'])

    new_answer_texts = []
    cleaned = False

    for answer_text in answer['spans']:
        valid = True
        for span in spans:
            span_text = ' '.join(passage_tagging['words'][span[0]:span[1] + 1]).lower()
            if answer_text.lower() != span_text:
                continue
            if any(tag != 'O' for tag in passage_tagging['tags'][span[0]:span[1] + 1]):
                valid = False
                cleaned = True
                break
        if valid:
            new_answer_texts.append(answer_text)

    if not cleaned:
        return None

    new_answer = answer.copy()
    new_answer['spans'] = new_answer_texts
    return {'answer': new_answer}

def _read(self, file_path: str) -> Iterator[Instance]:
    with open(file_path, 'r', encoding='utf-8') as f:
        columns = next(f).strip().split("\t")  # the first line holds the column names
        tokens, labels = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0:
                if tokens:
                    yield self.text_to_instance(tokens, labels)
                tokens, labels = [], []
                continue
            if len(columns) == 4:  # for train.csv
                _, _, word, gram = line.split('\t')
                pos, _ = gram.split('#')
                labels.append(pos)
            else:  # for test.csv
                _, _, word = line.split('\t')
            tokens.append(Token(word))
        if tokens:
            yield self.text_to_instance(tokens, labels)

def _read(self, file_path: str) -> Iterable[Instance]:
    if file_path[-8:] == 'test.txt':
        data = snips_reader('test.txt', valid_class=self.valid_class,
                            random_seed=self.random_seed, drop_empty=self.drop_empty)
    elif file_path[-9:] == 'train.txt':
        data = snips_reader('train.txt', valid_class=self.valid_class,
                            random_seed=self.random_seed, drop_empty=self.drop_empty)
    else:
        data = snips_reader('valid.txt', valid_class=self.valid_class,
                            random_seed=self.random_seed, drop_empty=self.drop_empty)

    for fields in data:
        # unzipping trick returns tuples, but our Fields need lists
        tokens, ner_tags = [list(field) for field in zip(*fields)]
        # TextField requires ``Token`` objects
        tokens = [Token(token) for token in tokens]
        sequence = TextField(tokens, self._token_indexers)

        instance_fields: Dict[str, Field] = {'tokens': sequence}

        # Add "feature labels" to instance
        if 'ner' in self.feature_labels:
            instance_fields['ner_tags'] = SequenceLabelField(ner_tags, sequence, "ner_tags")

        # Add "tag label" to instance
        instance_fields['tags'] = SequenceLabelField(ner_tags, sequence)
        yield Instance(instance_fields)

def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]

        # Build an adjacency structure from the predicted dependency tree.
        result = self.dependency_tree_predictor.predict(sentence=" ".join(sentence.words))
        root_dict = result['hierplane_tree']['root']
        adj = {}
        self.traverse_tree(adj, root_dict['word'], root_dict)

        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, adj, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                yield self.text_to_instance(tokens, verb_indicator, adj, tags)

def process_documents(self, content):
    documents_obj = {}
    curr_id = -1
    for is_divider, lines in tqdm.tqdm(itertools.groupby(content, _is_divider)):
        # Ignore the document divider chunks, so that `lines` corresponds to
        # a single sentence.
        for line in lines:
            line = line.rstrip('\n')
            tokens = line.split()
            if tokens[0] == 'ID' and is_divider:
                curr_id = tokens[1]
                if curr_id in documents_obj:
                    warnings.warn(f'duplicate {curr_id}')
                else:
                    documents_obj[curr_id] = Doc(curr_id, [])
            elif not (line.strip() == ''):
                tokens = [Token(self.vocab[int(idx)]) for idx in tokens]
                sent = SentLabel(tokens, ['O'] * len(tokens))
                documents_obj[curr_id].sentences.append(sent)
    return documents_obj

def _read(self, file_path):
    for filename in os.listdir(file_path):
        filename_splitted = filename.split('_')
        task_name = filename_splitted[-3]
        domain_name = filename_splitted[-2]
        if task_name not in self._tasks or domain_name not in self._domains:
            continue
        with open(os.path.join(file_path, filename), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", filename)
            for line in Tqdm.tqdm(data_file):
                line = line.strip("\n")
                # skip blank lines
                if not line:
                    continue
                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens_and_tags = ([['<<' + task_name + '>>', 'O'],
                                    ['<<' + domain_name + '>>', 'O']] + tokens_and_tags)
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                task_field = LabelField(task_name, label_namespace="task_labels")
                sequence = TextField(tokens, self._token_indexers)
                sequence_tags = SequenceLabelField(tags, sequence, label_namespace='labels')
                yield Instance({'task_token': task_field,
                                'tokens': sequence,
                                'tags': sequence_tags})

def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                if self.ignore_ner_tags:
                    tokens_, pos_tags, chunk_tags = fields[:3]
                    ner_tags = None
                else:
                    tokens_, pos_tags, chunk_tags, ner_tags = fields
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)

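# The CoNLL-2003 column layout the reader above unpacks, one token per line
# (word, POS, chunk, NER) with blank lines between sentences, e.g.:
#
#   U.N.     NNP  I-NP  I-ORG
#   official NN   I-NP  O
#   Ekeus    NNP  I-NP  I-PER
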
def _read(self, file_path: str) -> Iterable[Instance]:
    file_path = cached_path(file_path)

    with open(file_path, 'r') as conll_file:
        logger.info("Reading Target CONLL instances from CONLL dataset at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(conll_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if is_divider:
                continue
            fields = [line.strip().split() for line in lines]
            # unzipping trick returns tuples, but our Fields need lists
            fields = [list(field) for field in zip(*fields)]
            tokens_ = fields[0]
            tags = fields[1]
            # TextField requires ``Token`` objects
            tokens = [Token(token) for token in tokens_]

            yield self.text_to_instance(tokens, tags)

def make_instance(question: str, choices: List[str]) -> List[Instance]:
    """Given a question and a list of choice texts, convert to BERT NSP instances.

    Parameters
    ----------
    question : str
        Question
    choices : List[str]
        List of five choices

    Returns
    -------
    List[Instance]
        List of AllenNLP Instances
    """
    question_tokens = TOKENIZER.tokenize(question)
    instances = []
    for choice in choices:
        choice_tokens = TOKENIZER.tokenize(choice)
        tokens = question_tokens + [Token('[SEP]')] + choice_tokens
        instance = Instance({"tokens": TextField(tokens, {"bert": WORD_INDEXER})})
        instances.append(instance)
    return instances

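# Hedged usage sketch for make_instance (the question and choices are made up;
# TOKENIZER and WORD_INDEXER are module-level globals the function assumes):
instances = make_instance(
    "Where would you store a pillow?",
    ["bedroom", "kitchen", "garage", "office", "bathroom"],
)
assert len(instances) == 5  # one [question [SEP] choice] instance per choice
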
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                yield self.text_to_instance(tokens, verb_indicator, tags)

def _fix_visual_concept(visual_concept, visual_concept_num, h5fn, pad_ind):
    """
    Turn a list of visual concepts into a text field plus tags.

    :param visual_concept: List of visual concept tokens.
    :param visual_concept_num: Indices into the HDF5 file, one per concept.
    :param h5fn: Path to the HDF5 file holding precomputed BERT embeddings.
    :param pad_ind: Tag value assigned to every concept token.
    :return: (text_field, tags)
    """
    bert_embs = np.zeros([len(visual_concept), 768])
    new_tokenization_with_tags = []
    for i, tok in enumerate(visual_concept):
        new_tokenization_with_tags.append((tok, pad_ind))
        with h5py.File(h5fn, 'r') as h5:
            grp_items = {k: np.array(v) for k, v in h5[str(visual_concept_num[i])].items()}
            bert_embs[i, :] = grp_items['word']

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs, padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags

def clean(self, passage, question, answer, passage_tagging, question_tagging):
    passage_tokens = [Token(w) for w in passage_tagging['words']]
    spans = find_valid_spans(passage_tokens, answer['spans'])

    new_answer_texts = []
    cleaned = False

    for answer_text in answer['spans']:
        if self.should_remove_answer(answer_text):
            continue

        valid = True
        for span in spans:
            span_text = ' '.join(passage_tagging['words'][span[0]:span[1] + 1]).lower()
            span_text = span_text.replace(' - ', '-')
            if answer_text.lower() != span_text:
                continue
            if self.should_remove_span(passage_tagging['tags'][span[0]:span[1] + 1]):
                valid = False
                cleaned = True
                break
        if valid:
            new_answer_texts.append(answer_text)

    if not cleaned:
        return None

    new_answer = answer.copy()
    new_answer['spans'] = new_answer_texts
    return {'answer': new_answer}