def test_iob1_to_bioul(self):
    """Check that ``to_bioul`` maps IOB1 tag sequences onto the expected BIOUL tags."""
    # Each entry pairs an IOB1 input with the BIOUL sequence it must produce.
    cases = [
        (
            ["I-ORG", "O", "I-MISC", "O"],
            ["U-ORG", "O", "U-MISC", "O"],
        ),
        (
            ["O", "I-PER", "B-PER", "I-PER", "I-PER", "B-PER"],
            ["O", "U-PER", "B-PER", "I-PER", "L-PER", "U-PER"],
        ),
    ]
    for iob1_tags, expected_bioul in cases:
        assert span_utils.to_bioul(iob1_tags, encoding="IOB1") == expected_bioul
def test_iob1_to_bioul(self):
    """IOB1-encoded input must come back from ``to_bioul`` as the matching BIOUL tags."""
    # Single-token entities: each lone 'I-' tag is expected to become a 'U-' tag.
    expected = ['U-ORG', 'O', 'U-MISC', 'O']
    converted = span_utils.to_bioul(['I-ORG', 'O', 'I-MISC', 'O'], encoding="IOB1")
    assert converted == expected

    # Back-to-back PER entities: a one-token span, a three-token span, and a
    # trailing one-token span.
    expected = ['O', 'U-PER', 'B-PER', 'I-PER', 'L-PER', 'U-PER']
    converted = span_utils.to_bioul(
        ['O', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'B-PER'],
        encoding="IOB1",
    )
    assert converted == expected
def test_bio_to_bioul(self):
    """BIO input converts to BIOUL; IOB1-style input passed as BIO is rejected."""
    bio_tags = ['B-ORG', 'O', 'B-MISC', 'O', 'B-MISC', 'I-MISC', 'I-MISC']
    expected = ['U-ORG', 'O', 'U-MISC', 'O', 'B-MISC', 'I-MISC', 'L-MISC']
    assert span_utils.to_bioul(bio_tags, encoding="BIO") == expected

    # Encoding in IOB format should throw error with incorrect encoding.
    iob1_tags = ['O', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'B-PER']
    with self.assertRaises(span_utils.InvalidTagSequence):
        span_utils.to_bioul(iob1_tags, encoding="BIO")
def test_bio_to_bioul(self):
    """Verify BIO -> BIOUL conversion and the error raised for a mis-declared encoding."""

    def as_bioul(tags):
        # Every call in this test declares the input as BIO.
        return span_utils.to_bioul(tags, encoding="BIO")

    well_formed = ["B-ORG", "O", "B-MISC", "O", "B-MISC", "I-MISC", "I-MISC"]
    assert as_bioul(well_formed) == [
        "U-ORG",
        "O",
        "U-MISC",
        "O",
        "B-MISC",
        "I-MISC",
        "L-MISC",
    ]

    # Encoding in IOB format should throw error with incorrect encoding.
    iob1_style = ["O", "I-PER", "B-PER", "I-PER", "I-PER", "B-PER"]
    with pytest.raises(span_utils.InvalidTagSequence):
        as_bioul(iob1_style)
def test_iob1_to_bioul(self):
    """Exercise ``to_bioul`` on two IOB1 sequences and compare against known BIOUL output."""

    def from_iob1(tags):
        # Both conversions below declare their input as IOB1.
        return span_utils.to_bioul(tags, encoding=u"IOB1")

    # Isolated entities only.
    isolated = [u'I-ORG', u'O', u'I-MISC', u'O']
    assert from_iob1(isolated) == [u'U-ORG', u'O', u'U-MISC', u'O']

    # Adjacent entities of the same type, separated by 'B-' tags.
    adjacent = [u'O', u'I-PER', u'B-PER', u'I-PER', u'I-PER', u'B-PER']
    assert from_iob1(adjacent) == [
        u'O', u'U-PER', u'B-PER', u'I-PER', u'L-PER', u'U-PER',
    ]
def test_bio_to_bioul(self):
    """``to_bioul`` handles valid BIO input and raises on input that is not valid BIO."""
    # Valid BIO: two single-token entities followed by a three-token entity.
    source_tags = ['B-ORG', 'O', 'B-MISC', 'O', 'B-MISC', 'I-MISC', 'I-MISC']
    target_tags = ['U-ORG', 'O', 'U-MISC', 'O', 'B-MISC', 'I-MISC', 'L-MISC']
    result = span_utils.to_bioul(source_tags, encoding="BIO")
    assert result == target_tags

    # Encoding in IOB format should throw error with incorrect encoding.
    not_bio = ['O', 'I-PER', 'B-PER', 'I-PER', 'I-PER', 'B-PER']
    with self.assertRaises(span_utils.InvalidTagSequence):
        span_utils.to_bioul(not_bio, encoding="BIO")
def test_decode_runs_correctly(self):
    """Run the model end to end and sanity-check the human-readable tag output."""
    tensors = self.dataset.as_tensor_dict()
    readable = self.model.make_output_human_readable(self.model(**tensors))
    mask_lengths = get_lengths_from_binary_sequence_mask(readable["mask"])
    instance_lengths = mask_lengths.data.tolist()

    # Hard to check anything concrete which we haven't checked in the above
    # test, so we'll just check that the tags are equal to the lengths
    # of the individual instances, rather than the max length.
    for predicted_tags, expected_length in zip(readable["tags"], instance_lengths):
        assert len(predicted_tags) == expected_length
        # Checks that the output is a well formed BIO sequence,
        # as otherwise an exception is thrown.
        to_bioul(predicted_tags, encoding="BIO")
def test_decode_runs_correctly(self):
    """Decode a forward pass and check wordpiece-tag lengths and BIO well-formedness."""
    tensors = self.dataset.as_tensor_dict()
    forward_output = self.model(**tensors)
    readable = self.model.make_output_human_readable(forward_output)
    mask_lengths = get_lengths_from_binary_sequence_mask(readable["mask"])
    instance_lengths = mask_lengths.data.tolist()

    # Hard to check anything concrete which we haven't checked in the above
    # test, so we'll just check that the tags are equal to the lengths
    # of the individual instances, rather than the max length.
    for wordpiece_tags, expected_length in zip(readable["wordpiece_tags"], instance_lengths):
        assert len(wordpiece_tags) == expected_length

    # The pairing with the lengths is kept so the same number of sequences is visited.
    for word_tags, _ in zip(readable["tags"], instance_lengths):
        # to_bioul throws an exception if the tag sequence is not well formed,
        # so here we can easily check that the sequence we produce is good.
        to_bioul(word_tags, encoding="BIO")
def convert_conll2003_ner_to_bioul(filename: str, out_filename: str):
    """
    Converts the conll2003 file to bilou tagged strings and writes it to out_filename

    The out_filename will have the first column as word and the next three columns
    as the NER tags (the same BILOU tag is repeated three times).

    Sentences are separated by blank lines. ``-DOCSTART-`` marker lines are treated
    as sentence boundaries and are never emitted as tokens. A final sentence that is
    not followed by a blank line is still written.

    Parameters
    ----------
    filename: str
        Convert the file in conll2003 format to bioul tags.
        Every token line must have at least four whitespace-separated columns;
        the first is read as the word and the fourth as the tag.
    out_filename: str
        Writes the file to bioul format

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a token line has fewer than four columns.
    """
    msg_printer = wasabi.Printer()

    lines: List[List[str]] = []
    labels: List[List[str]] = []

    with open(filename) as fp:
        lines_: List[str] = []
        labels_: List[str] = []  # every list is a label for one namespace
        for text in fp:
            text_ = text.strip()
            # The marker must be tested before the generic "non-empty line" case,
            # otherwise it is consumed as an ordinary token line.
            is_boundary = not text_ or text_.startswith("-DOCSTART-")
            if not is_boundary:
                line_labels = text_.split()
                lines_.append(line_labels[0])
                labels_.append(line_labels[3])
                continue

            # Sentence boundary: store whatever has been accumulated so far.
            # Words and labels are always appended together, so one check suffices.
            if lines_:
                lines.append(lines_)
                labels.append(labels_)
                lines_ = []
                labels_ = []

        # The file may end without a trailing blank line; keep the last sentence.
        if lines_:
            lines.append(lines_)
            labels.append(labels_)

    bilou_tags = [to_bioul(tag_sequence=label, encoding="IOB1") for label in labels]

    with msg_printer.loading(f"writing BILOU tags for {filename}"):
        with open(out_filename, "w") as fp:
            for line, bilou_tags_ in zip(lines, bilou_tags):
                assert len(line) == len(bilou_tags_)
                for word, tag in zip(line, bilou_tags_):
                    fp.write(" ".join([word, tag, tag, tag]))
                    fp.write("\n")

                # blank line terminates the sentence
                fp.write("\n")
    msg_printer.good(f"Finished writing BILOU tags for {filename}")
def spans_to_bio_tags(spans, length):
    """
    Build a tag sequence of ``length`` positions from ``spans`` and return it as BIOUL.

    Each span is unpacked as ``(start, end, label)``; ``start`` receives ``'B-' + label``
    and positions ``start + 1`` up to (not including) ``end`` receive ``'I-' + label``.
    A span is skipped when, for some other span in ``spans``, ``is_x_in_y(span, other)``
    holds (presumably meaning the span is nested inside the other one -- confirm
    against ``is_x_in_y``). All remaining positions stay ``'O'``.

    The BIO sequence is finally converted with ``to_bioul(..., encoding='BIO')``.
    """
    bio_tags = ['O'] * length
    for candidate in spans:
        # NOTE(review): relies on is_same_span / is_x_in_y being side-effect free.
        is_inner_span = any(
            not is_same_span(candidate, other) and is_x_in_y(candidate, other)
            for other in spans
        )
        if is_inner_span:
            continue

        start, end, label = candidate
        bio_tags[start] = 'B-' + label
        for position in range(start + 1, end):
            bio_tags[position] = 'I-' + label
    return to_bioul(bio_tags, encoding='BIO')
def text_to_instance(
        self,  # type: ignore
        tokens: List[Token],
        verb_label: List[int],
        parseTree: Tree,
        tags: List[str] = None,
        fout=None) -> Instance:
    """
    We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.

    In addition to the token and verb-indicator fields, a square span matrix of size
    ``len(tokens) x len(tokens)`` is built from ``parseTree``. It is attached to the
    instance as ``span_matrix`` only when ``self.label_encoding == "BIOUL"``.

    Parameters
    ----------
    tokens : the pre-tokenized sentence.
    verb_label : binary indicator per token; the first ``1`` marks the verb.
    parseTree : tree from which the span matrix is derived.
    tags : optional gold tags. With ``label_encoding == "BIOUL"`` they are converted
        with ``to_bioul(..., encoding="BIO")`` before use.
    fout : optional writable binary file object; when given, a dict holding the
        parse tree and the span matrix is pickled to it.

    Returns
    -------
    An ``Instance`` with ``tokens``, ``verb_indicator`` and ``metadata`` fields, plus
    ``span_matrix`` (BIOUL only) and ``tags`` (when tags are given and non-empty).

    Raises
    ------
    Exception
        If the two index lists derived from ``parseTree`` differ in length.
    """
    # pylint: disable=arguments-differ
    # NOTE(review): the nesting of this method was reconstructed from a flattened
    # source. In particular, whether the span-matrix construction below lives
    # inside the BIOUL branch could not be determined -- confirm against the
    # original file.

    # Convert tags to BIOUL QUESTION - BIO or IOB1?
    if (self.label_encoding == "BIOUL"):
        if (tags is not None):
            # Keep the untouched tags so they can be shown if validation fails.
            old_tags = deepcopy(tags)
            tags = to_bioul(tags, encoding="BIO")
            try:
                # Called purely as a validity check; `spans` is not used afterwards.
                spans = bioul_tags_to_spans(tags)
            except InvalidTagSequence:
                # An invalid sequence is only reported, not re-raised: processing
                # continues with the converted tags.
                print(f"Old tags: {old_tags}")
                print(f"New tags: {tags}\n")

    # Create span matrix from parse tree
    leftLabelsTree = leftMost(parseTree)
    rightLabelsTree = rightMost(parseTree)

    # presumably leftList/rightList end up holding, per tree node, the token index
    # of its leftmost/rightmost leaf -- TODO confirm against leftMost/rightMost/addToList
    leftList = []
    rightList = []

    addToList(leftLabelsTree, leftList)
    addToList(rightLabelsTree, rightList)

    if len(leftList) != len(rightList):
        raise Exception(
            f"For tree {parseTree}, leftList and rightList lengths do not match"
        )

    span_matrix = np.zeros([len(tokens), len(tokens)])
    for idx in range(len(leftList)):
        leftLabel, rightLabel = leftList[idx], rightList[idx]
        # Pairs whose two indices coincide are not marked in the matrix.
        if (leftLabel == rightLabel):
            continue
        span_matrix[leftLabel, rightLabel] = 1

    fields: Dict[str, Field] = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields['tokens'] = text_field
    fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if (self.label_encoding == "BIOUL"):
        fields['span_matrix'] = ArrayField(span_matrix)

    # An all-zero indicator means there is no verb for this instance.
    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    metadata_dict = {"words": [x.text for x in tokens], "verb": verb}
    # Truthiness check: both `None` and an empty tag list skip the tags field.
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)
        metadata_dict["gold_tags"] = tags
    fields["metadata"] = MetadataField(metadata_dict)

    if (fout is not None):
        srl_dict = {"parse_tree": parseTree, "span_matrix": span_matrix}
        pickle.dump(srl_dict, fout)

    return Instance(fields)