def read_txt(self, file: str, number: int = -1, is_train: bool = True) -> List[Instance]:
    """Read a column-formatted text file into a list of ``Instance`` objects.

    Sentences are separated by blank lines. When the substring
    ``"conll2003"`` occurs in ``file``, each token line must split into
    exactly three whitespace-separated fields (word, POS tag, label).
    Otherwise the word, POS tag and label are taken from fields 1, 3 and 10
    of the split line.

    Every word (after the optional digit replacement) is added to
    ``self.vocab``.

    Args:
        file: Path of the file to read; opened as UTF-8.
        number: Stop as soon as this many sentences have been collected.
            The default of -1 never matches, so the whole file is read.
        is_train: Not used by this method.

    Returns:
        The sentences in file order.

    Raises:
        ValueError: If a token line of a "conll2003" file does not have
            exactly three fields.
        IndexError: If a token line of any other file has fewer than
            eleven fields.
    """
    print("Reading file: " + file)
    insts = []
    # The path does not change while reading, so decide the layout once.
    is_conll2003 = "conll2003" in file
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        tags = []
        # readlines() keeps the line count available to the progress bar.
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                insts.append(Instance(Sentence(words, None, None, tags), labels))
                words = []
                labels = []
                tags = []
                if len(insts) == number:
                    break
                continue
            if is_conll2003:
                word, pos, label = line.split()
            else:
                vals = line.split()
                word = vals[1]
                pos = vals[3]
                label = vals[10]
            if self.digit2zero:
                # Raw string: '\d' in a normal literal is an invalid escape.
                word = re.sub(r'\d', '0', word)  # replace digit with 0.
            words.append(word)
            tags.append(pos)
            self.vocab.add(word)
            labels.append(label)
        if words:
            # The file ended without a terminating blank line; keep the
            # pending sentence instead of silently dropping it. After an
            # early break the buffers are empty, so `number` is never
            # exceeded.
            insts.append(Instance(Sentence(words, None, None, tags), labels))
    print("number of sentences: {}".format(len(insts)))
    return insts
def read_from_file(self, file, number=-1, is_train=True):
    """Read a three-column, blank-line-separated file into ``Instance`` objects.

    Each token line must split into exactly three whitespace-separated
    fields: the word, a middle field that is ignored, and the label.

    Every word (after the optional digit replacement) is recorded as a key
    with value 0 in ``self.train_vocab`` when ``is_train`` is true, and in
    ``self.test_vocab`` otherwise.

    Args:
        file: Path of the file to read; opened as UTF-8.
        number: Stop as soon as this many sentences have been collected.
            The default of -1 never matches, so the whole file is read.
        is_train: Selects which vocabulary mapping receives the words.

    Returns:
        The sentences in file order.

    Raises:
        ValueError: If a token line does not have exactly three fields.
    """
    print("Reading file: " + file)
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        # readlines() keeps the line count available to the progress bar.
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                insts.append(Instance(Sentence(words), labels))
                words = []
                labels = []
                if len(insts) == number:
                    break
                continue
            word, _, label = line.split()
            if self.digit2zero:
                # Raw string: '\d' in a normal literal is an invalid escape.
                word = re.sub(r'\d', '0', word)
            words.append(word)
            if is_train:
                self.train_vocab[word] = 0
            else:
                self.test_vocab[word] = 0
            labels.append(label)
        if words:
            # The file ended without a terminating blank line; keep the
            # pending sentence instead of silently dropping it. After an
            # early break the buffers are empty, so `number` is never
            # exceeded.
            insts.append(Instance(Sentence(words), labels))
    return insts
def read_from_file(self, file, number=-1, is_train=True):
    """Read a blank-line-separated token file into ``Instance`` objects.

    The line layout depends on ``self.dataset``:

    * ``"conll2003"``: exactly three whitespace-separated fields per line
      (word, an ignored middle field, label).
    * ``"conll2002"``, ``"ecommerce"``, ``"youku"``: the first two fields
      are the word and the label. A line with a single field is treated as
      a label whose word is ``","``.

    Each instance receives a sequential id, starting at 0, through
    ``set_id``. Every previously unseen word (after the optional digit
    replacement) is appended to ``self.train_vocab`` when ``is_train`` is
    true, and to ``self.test_vocab`` otherwise.

    Args:
        file: Path of the file to read; opened as UTF-8.
        number: Stop as soon as this many sentences have been collected.
            The default of -1 never matches, so the whole file is read.
        is_train: Selects which vocabulary list receives the words.

    Returns:
        The sentences in file order.

    Raises:
        Exception: If a token line is met while ``self.dataset`` is not one
            of the supported names.
        ValueError: If a "conll2003" token line does not have exactly three
            fields.
    """
    print("Reading file: " + file)
    insts = []
    next_id = 0  # renamed from `id`, which shadowed the builtin
    vocab = self.train_vocab if is_train else self.test_vocab
    # Set mirror of the vocabulary list: keeps the list's insertion order
    # while avoiding a linear scan of the list for every token.
    seen = set(vocab)
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        # readlines() keeps the line count available to the progress bar.
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                inst = Instance(Sentence(words), labels)
                inst.set_id(next_id)
                next_id += 1
                insts.append(inst)
                words = []
                labels = []
                if len(insts) == number:
                    break
                continue
            if self.dataset == "conll2003":
                word, _, label = line.split()
            elif self.dataset in ("conll2002", "ecommerce", "youku"):
                x = line.split()
                if len(x) == 1:
                    # Only one field survived the split: use it as the
                    # label and substitute "," for the word.
                    word = ","
                    label = x[0]
                else:
                    word = x[0]
                    label = x[1]
            else:
                raise Exception("unknown dataset: " + self.dataset + " during read data")
            if self.digit2zero:
                # Raw string: '\d' in a normal literal is an invalid escape.
                word = re.sub(r'\d', '0', word)
            words.append(word)
            if word not in seen:
                seen.add(word)
                vocab.append(word)
            labels.append(label)
        if words:
            # The file ended without a terminating blank line; keep the
            # pending sentence instead of silently dropping it. After an
            # early break the buffers are empty, so `number` is never
            # exceeded.
            inst = Instance(Sentence(words), labels)
            inst.set_id(next_id)
            insts.append(inst)
    return insts
def read_conll(res_file: str, number: int = -1) -> List[Instance]:
    """Read a result file with gold labels and predictions into ``Instance`` objects.

    Sentences are separated by blank lines. Each token line is split on
    whitespace and read by position: field 1 is the word, 2 the POS tag,
    3 the head index (parsed as ``int``), 4 the dependency label, 5 the
    label and 6 the predicted label. Field 0 is not used.

    The predicted labels of each sentence are stored on the instance's
    ``prediction`` attribute.

    Args:
        res_file: Path of the file to read; opened as UTF-8.
        number: Stop as soon as this many sentences have been collected.
            The default of -1 never matches, so the whole file is read.

    Returns:
        The sentences in file order.

    Raises:
        IndexError: If a token line has fewer than seven fields.
        ValueError: If the head field is not an integer.
    """
    print("Reading file: " + res_file)
    insts = []
    with open(res_file, 'r', encoding='utf-8') as f:
        words = []
        heads = []
        deps = []
        labels = []
        tags = []
        preds = []
        # readlines() keeps the line count available to the progress bar.
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                inst = Instance(Sentence(words, heads, deps, tags), labels)
                inst.prediction = preds
                insts.append(inst)
                words = []
                heads = []
                deps = []
                labels = []
                tags = []
                preds = []
                if len(insts) == number:
                    break
                continue
            vals = line.split()
            word = vals[1]
            pos = vals[2]
            head = int(vals[3])
            dep_label = vals[4]
            label = vals[5]
            pred_label = vals[6]
            words.append(word)
            heads.append(head)  # stored exactly as read; no index shift here
            deps.append(dep_label)
            tags.append(pos)
            labels.append(label)
            preds.append(pred_label)
        if words:
            # The file ended without a terminating blank line; keep the
            # pending sentence (and its predictions) instead of silently
            # dropping it. After an early break the buffers are empty, so
            # `number` is never exceeded.
            inst = Instance(Sentence(words, heads, deps, tags), labels)
            inst.prediction = preds
            insts.append(inst)
    print("number of sentences: {}".format(len(insts)))
    return insts
def read_conll(self, file: str, number: int = -1, is_train: bool = True) -> List[Instance]:
    """Read a dependency-annotated column file into ``Instance`` objects.

    Sentences are separated by blank lines. Each token line is split on
    whitespace and read by position: field 1 is the word, 3 the POS tag,
    6 the head index (parsed as ``int``), 7 the dependency label and 10
    the label.

    Head indices are stored minus one, so a head of 0 in the file becomes
    -1. Every word (after the optional digit replacement) is added to
    ``self.vocab``. Labels starting with ``"B-"`` are counted and the
    total is printed with the sentence count.

    Args:
        file: Path of the file to read; opened as UTF-8.
        number: Stop as soon as this many sentences have been collected.
            The default of -1 never matches, so the whole file is read.
        is_train: Not used by this method.

    Returns:
        The sentences in file order.

    Raises:
        IndexError: If a token line has fewer than eleven fields.
        ValueError: If the head field is not an integer.
    """
    print("Reading file: " + file)
    insts = []
    num_entity = 0
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        heads = []
        deps = []
        labels = []
        tags = []
        # readlines() keeps the line count available to the progress bar.
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                insts.append(Instance(Sentence(words, heads, deps, tags), labels))
                words = []
                heads = []
                deps = []
                labels = []
                tags = []
                if len(insts) == number:
                    break
                continue
            vals = line.split()
            word = vals[1]
            head = int(vals[6])
            dep_label = vals[7]
            pos = vals[3]
            label = vals[10]
            if self.digit2zero:
                # Raw string: '\d' in a normal literal is an invalid escape.
                word = re.sub(r'\d', '0', word)  # replace digit with 0.
            words.append(word)
            # NOTE(review): the previous version carried a `find_root` flag
            # and an "already have a root" check, but the flag was never
            # set to True, so the check could not fire. The unreachable
            # branch was removed; multiple roots per sentence are accepted,
            # exactly as before. Confirm whether a real single-root
            # validation is wanted.
            heads.append(head - 1)  # shift the file's head index down by one
            deps.append(dep_label)
            tags.append(pos)
            self.vocab.add(word)
            labels.append(label)
            if label.startswith("B-"):
                num_entity += 1
        if words:
            # The file ended without a terminating blank line; keep the
            # pending sentence instead of silently dropping it. After an
            # early break the buffers are empty, so `number` is never
            # exceeded.
            insts.append(Instance(Sentence(words, heads, deps, tags), labels))
    print("number of sentences: {}, number of entities: {}".format(len(insts), num_entity))
    return insts