def __init__(self, encoding_type: str = 'bio', target_pad_val=0, bigram=False):
    """Configure tag conversion, target padding value and the bigram flag.

    :param encoding_type: ``'bio'`` selects ``iob2`` as the tag converter;
        any other value selects ``iob2`` followed by ``iob2bioes``.
    :param target_pad_val: stored on the instance after coercion with
        ``int()``.
    :param bigram: stored unchanged as ``self.bigram``.
    """

    def _to_bioes(words):
        # Normalise with iob2 first, then re-encode with iob2bioes.
        return iob2bioes(iob2(words))

    # Only the exact string 'bio' keeps the plain iob2 conversion.
    self.convert_tag = iob2 if encoding_type == 'bio' else _to_bioes
    self.target_pad_val = int(target_pad_val)
    self.bigram = bigram
def test_iob2bioes(self):
    """Check ``iob2bioes`` against a hand-written expected tag sequence.

    The fixture has single-token spans (expected as ``S-``), two-token
    spans (``B-`` then ``E-``) and a three-token span (``B-``, ``I-``,
    ``E-``), with ``O`` tags left unchanged.
    """
    source_tags = [
        'B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O',
        'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O',
        'B-NP', 'I-NP', 'B-NP', 'O',
        'B-NP', 'I-NP', 'I-NP',
    ]
    expected_tags = [
        'S-NP', 'O', 'S-NP', 'S-VP', 'B-NP', 'E-NP', 'O',
        'S-NP', 'S-PP', 'B-NP', 'E-NP', 'O',
        'B-NP', 'E-NP', 'S-NP', 'O',
        'B-NP', 'I-NP', 'E-NP',
    ]

    actual_tags = iob2bioes(source_tags)
    self.assertSequenceEqual(expected_tags, actual_tags)
def prepare_ptb(args):
    """Load the POS, chunk and NER datasets and build one ``Task`` per label set.

    Steps, in order:

    1. Load each corpus with ``ConllLoader`` from ``args.pos``,
       ``args.chunk`` and ``args.ner``, reading the word column (index 0)
       plus the label column for that task (1, 2 and 3 respectively).
    2. Replace the chunk ``train`` split with the two results of
       ``split(0.1)``, stored as ``train`` and ``dev``.
    3. Convert chunk tags with ``iob2`` and NER tags with ``iob2`` followed
       by ``iob2bioes``.
    4. Run ``filter_docstart`` on every task, build a label vocabulary per
       task (no padding / unknown entries) and one shared word vocabulary.
    5. Index words and labels in place, add a ``seq_len`` field, and wrap
       each task's train/dev/test splits in a ``Task``.

    :param args: object exposing ``pos``, ``chunk`` and ``ner`` attributes
        that are passed to ``ConllLoader.load``.
    :return: ``(task_lst, vocabs)`` where ``task_lst`` holds the ``Task``
        objects in the order pos, chunk, ner, and ``vocabs`` maps each task
        name (plus ``"words"``) to its ``Vocabulary``.
    """
    task_names = ["pos", "chunk", "ner"]

    def _ner_to_bioes(tags):
        # NER labels are normalised with iob2 before the BIOES re-encoding.
        return iob2bioes(iob2(tags))

    # ---- loading -------------------------------------------------------
    pos_sets = ConllLoader(
        headers=["words", "pos"], indexes=[0, 1]
    ).load(args.pos).datasets

    chunk_sets = ConllLoader(
        headers=["words", "chunk"], indexes=[0, 2]
    ).load(args.chunk).datasets
    # The chunk data gets its dev split carved out of the training split.
    # NOTE(review): presumably 0.1 is the dev fraction - confirm against
    # DataSet.split.
    train_part, dev_part = chunk_sets['train'].split(0.1)
    chunk_sets['train'] = train_part
    chunk_sets['dev'] = dev_part

    ner_sets = ConllLoader(
        headers=["words", "ner"], indexes=[0, 3]
    ).load(args.ner).datasets

    datas = {"pos": pos_sets, "chunk": chunk_sets, "ner": ner_sets}

    # ---- tag scheme conversion -----------------------------------------
    for chunk_ds in datas['chunk'].values():
        chunk_ds.apply_field(iob2, 'chunk', 'chunk')
    for ner_ds in datas['ner'].values():
        ner_ds.apply_field(_ner_to_bioes, 'ner', 'ner')

    # ---- vocabularies --------------------------------------------------
    src_vocab = Vocabulary()
    vocabs = {}
    for task_name in task_names:
        splits = datas[task_name]
        filter_docstart(splits)
        split_sets = list(splits.values())

        label_vocab = Vocabulary(padding=None, unknown=None)
        label_vocab.from_dataset(*split_sets, field_name=task_name)
        # The word vocabulary accumulates over all three tasks.
        src_vocab.from_dataset(*split_sets, field_name="words")
        vocabs[task_name] = label_vocab

    # ---- indexing and task construction --------------------------------
    task_lst = []
    for task_id, task_name in enumerate(task_names):
        splits = datas[task_name]
        split_sets = list(splits.values())

        src_vocab.index_dataset(
            *split_sets, field_name="words", new_field_name="words"
        )
        vocabs[task_name].index_dataset(
            *split_sets, field_name=task_name, new_field_name=task_name
        )
        for ds in split_sets:
            ds.apply_field(len, 'words', 'seq_len')

        task_lst.append(
            Task(task_id, task_name,
                 splits["train"], splits["dev"], splits["test"])
        )

    vocabs["words"] = src_vocab
    return task_lst, vocabs