def read_txt(self, file: str, number: int = -1) -> List[Instance]: print("Reading file: " + file) insts = [] with open(file, 'r', encoding='utf-8') as f: words = [] labels = [] for line in tqdm(f.readlines()): line = line.rstrip() if line == "": inst = Instance(Sentence(words), labels) inst.set_id(len(insts)) insts.append(inst) words = [] labels = [] if len(insts) == number: break continue word, label = line.split('<|>') if self.digit2zero: word = re.sub('\d', '0', word) # replace digit with 0. words.append(word) self.vocab.add(word) labels.append(label) print("number of sentences: {}".format(len(insts))) return insts
def read_txt( self, file: str, number: int = -1 ) -> List[Instance]: # expected type -> return type count_0 = 0 print("Reading file: " + file) insts = [] with open(file, 'r', encoding='utf-8') as f: words = [] labels = [] for line in tqdm(f.readlines()): line = line.rstrip() if line == "": assert len(words) == len(labels) inst = Instance(Sentence(words), labels) inst.set_id(len(insts)) insts.append(inst) words = [] labels = [] if len(insts) == number: break continue #for x = line.split() if len(x) == 1: word, label = '&', x[0] # '&' elif len(x) == 2: word, label = x[0], x[1] else: print(x) if self.digit2zero: word = re.sub( '\d', '0', word ) # replace all digits with 0. '\d' - unicode decimal digits [0-9] count_0 += 1 words.append(word) self.vocab.add(word) labels.append(label) print("numbers being replaced by zero:", count_0) print("number of sentences: {}".format(len(insts))) return insts