def read_txt(self, file: str, number: int = -1) -> List[Instance]:
    """Read a column-formatted text file into a list of ``Instance`` objects.

    Every non-blank line is split on whitespace; the first column is taken
    as the word and the last column as its label. A blank (or
    whitespace-only) line ends the current sentence. For each sentence the
    labels are passed through ``convert_iobes``, and every digit in a word
    is replaced by ``0`` (the untouched word is kept in ``ori_words``).

    Args:
        file: Path of the UTF-8 encoded file to read.
        number: Stop once this many sentences have been collected. With the
            default ``-1`` the count never matches, so the whole file is
            read.

    Returns:
        The sentences in file order, one ``Instance`` per sentence.
    """
    print(
        f"[Data Info] Reading file: {file}, labels will be converted to IOBES encoding"
    )
    print(
        "[Data Info] Modify src/data/ner_dataset.read_txt function if you have other requirements"
    )
    insts = []
    words, ori_words, labels = [], [], []
    with open(file, 'r', encoding='utf-8') as f:
        # readlines() is kept so tqdm receives a sized list and can show a total.
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line:
                ls = line.split()
                word, label = ls[0], ls[-1]
                ori_words.append(word)
                # Raw string: '\d' in a plain literal is an invalid escape.
                words.append(re.sub(r'\d', '0', word))
                labels.append(label)
                continue
            if not words:
                # Leading or repeated blank lines must not yield empty sentences.
                continue
            insts.append(
                Instance(words=words,
                         ori_words=ori_words,
                         labels=convert_iobes(labels)))
            words, ori_words, labels = [], [], []
            if len(insts) == number:
                break
    if words:
        # The file ended without a trailing blank line: keep the last sentence
        # instead of dropping it. (After an early break the buffers are empty,
        # so the `number` limit is still respected.)
        insts.append(
            Instance(words=words,
                     ori_words=ori_words,
                     labels=convert_iobes(labels)))
    print(f"number of sentences: {len(insts)}")
    return insts
def read_from_sentences(self, sents: List[List[str]]):
    """Wrap already-tokenised sentences in ``Instance`` objects.

    Example input::

        sents = [['word_a', 'word_b'], ['word_aaa', 'word_bccc', 'word_ccc']]

    Each inner list is handed to ``Instance`` as both ``words`` and
    ``ori_words`` (the same list object, no copy); no labels are supplied.

    Returns:
        A list with one ``Instance`` per input sentence, in input order.
    """
    return [Instance(words=tokens, ori_words=tokens) for tokens in sents]