import codecs
import logging

import pandas as pd
import torch
from tqdm import tqdm

# Project-local helpers (Reader, transformed_result, transformed_result_cls,
# flat_classification_report) are assumed to be importable from elsewhere in
# this repository.


def read_data(input_file):
    """Read a BIO-formatted data file into [labels, words] sentence pairs."""
    with codecs.open(input_file, "r", encoding="utf-8") as f:
        lines = []
        words = []
        labels = []
        f_lines = f.readlines()
        for line in tqdm(f_lines, total=len(f_lines),
                         desc="Process {}".format(input_file)):
            contends = line.strip()
            word = contends.split(' ')[0]
            label = contends.split(' ')[-1]
            if contends.startswith("-DOCSTART-"):
                words.append('')
                continue
            if len(contends) == 0 and not len(words):
                words.append("")
            # A blank line after a sentence-final "." closes the sentence.
            if len(contends) == 0 and words[-1] == '.':
                lbl = ' '.join([label for label in labels if len(label) > 0])
                w = ' '.join([word for word in words if len(word) > 0])
                lines.append([lbl, w])
                words = []
                labels = []
                continue
            words.append(word)
            labels.append(label.replace("-", "_"))
        return lines
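# Usage sketch for read_data; the file name below is hypothetical. Input is
# CoNLL-style "token label" lines, one token per line, with a sentence ending
# at a blank line after a final ".":
#
#     John B-PER
#     lives O
#     in O
#     London B-LOC
#     . O
#
# lines = read_data("data/train.txt")
# lines[0]  # -> ["B_PER O O B_LOC O", "John lives in London ."]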
def create_vocabs(df, tokenizer, idx2labels_path, markup="IO",
                  idx2cls_path=None, pad_idx=0, is_cls=False,
                  idx2labels=None, idx2cls=None):
    """Build label (and optional class) vocabularies from a labeled DataFrame."""
    if idx2labels is None:
        label2idx = {"[PAD]": pad_idx, '[CLS]': 1, '[SEP]': 2, "X": 3}
        idx2label = ["[PAD]", '[CLS]', '[SEP]', "X"]
    else:
        label2idx = {label: idx for idx, label in enumerate(idx2labels)}
        idx2label = idx2labels
    cls2idx = None
    if is_cls:
        # Start from the provided class vocab if there is one, instead of
        # silently discarding it.
        if idx2cls is None:
            idx2cls = []
        cls2idx = {label: idx for idx, label in enumerate(idx2cls)}
    for _, row in tqdm(df.iterrows(), total=len(df), leave=False,
                       desc="Creating labels vocabs"):
        labels = row.labels.split()
        origin_tokens = row.text.split()
        if is_cls and row.cls not in cls2idx:
            cls2idx[row.cls] = len(cls2idx)
            idx2cls.append(row.cls)
        prev_label = ""
        for origin_token, label in zip(origin_tokens, labels):
            prefix = "B_" if markup == "BIO" else "I_"
            if label != "O":
                label = label.split("_")[1]
                if label == prev_label:
                    prefix = "I_"
                prev_label = label
            cur_tokens = tokenizer.tokenize(origin_token)
            # The first subtoken keeps the label; continuation subtokens get "X".
            bert_label = [prefix + label] + ["X"] * (len(cur_tokens) - 1)
            for label_ in bert_label:
                if label_ not in label2idx:
                    label2idx[label_] = len(label2idx)
                    idx2label.append(label_)
    with open(idx2labels_path, "w", encoding="utf-8") as f:
        for label in idx2label:
            f.write("{}\n".format(label))
    if is_cls:
        with open(idx2cls_path, "w", encoding="utf-8") as f:
            for label in idx2cls:
                f.write("{}\n".format(label))
    return label2idx, idx2label, cls2idx, idx2cls
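# Usage sketch for create_vocabs, assuming `tokenizer` is any BERT-style
# tokenizer exposing .tokenize() and `df` has "labels"/"text" columns
# (all names and paths here are hypothetical):
#
# df = pd.DataFrame({"labels": ["B_PER O"], "text": ["Pushkin wrote"]})
# label2idx, idx2label, cls2idx, idx2cls = create_vocabs(
#     df, tokenizer, "idx2labels.txt", markup="BIO")
# idx2label  # -> ["[PAD]", "[CLS]", "[SEP]", "X", "B_PER", ...]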
def predict(dl, model, id2cls):
    """Run the classification model over a DataLoader and decode predictions."""
    model.eval()
    preds_cpu_cls = []
    for batch in tqdm(dl, total=len(dl), leave=False, desc="Predicting"):
        preds_cls = model.forward(batch)
        # No gold targets at prediction time, hence the False flag.
        preds_cpu_ = transformed_result_cls([preds_cls], [preds_cls], id2cls, False)
        preds_cpu_cls.extend(preds_cpu_)
    return preds_cpu_cls
def fact_ru_eval_preprocess(dev_dir, test_dir, dev_df_path, test_df_path):
    """Convert FactRuEval-2016 dev/test directories into tab-separated DataFrames."""

    def build_df(data_dir, desc):
        reader = Reader(data_dir)
        reader.read_dir()
        texts, tags = reader.split()
        res_tags = []
        res_tokens = []
        for tag, tokens in tqdm(zip(tags, texts), total=len(tags), desc=desc):
            # Keep only sentences that actually carry tags.
            if len(tag):
                res_tags.append(tag)
                res_tokens.append(tokens)
        df = pd.DataFrame({
            "labels": list(map(" ".join, res_tags)),
            "text": list(map(" ".join, res_tokens))
        })
        # True when the sentence contains no entity at all.
        df["clf"] = df["labels"].apply(
            lambda x: all(y.split("_")[0] == "O" for y in x.split()))
        return df

    dev = build_df(dev_dir, "Process FactRuEval2016 dev set.")
    dev.to_csv(dev_df_path, index=False, sep="\t")
    test = build_df(test_dir, "Process FactRuEval2016 test set.")
    test.to_csv(test_df_path, index=False, sep="\t")
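# Usage sketch (directory and output paths are hypothetical):
#
# fact_ru_eval_preprocess("factRuEval-2016/devset", "factRuEval-2016/testset",
#                         "dev.csv", "test.csv")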
def read_data_pos(input_file):
    """Read a "token label pos" data file into [labels, words, poses] triples."""
    with codecs.open(input_file, "r", encoding="utf-8") as f:
        lines = []
        words = []
        labels = []
        poses = []
        f_lines = f.readlines()
        for line in tqdm(f_lines, total=len(f_lines),
                         desc="Process {}".format(input_file)):
            contends = line.strip()
            if contends.startswith("-DOCSTART-"):
                words.append('')
                continue
            # Not used for Thai data.
            if len(contends) == 0 and not len(words):
                words.append("")
            # A blank line ends the sentence; requiring non-empty `labels`
            # makes consecutive blank lines harmless.
            if len(contends) == 0 and len(labels) > 0:
                lbl = ' '.join([label for label in labels if len(label) > 0])
                w = ' '.join([word for word in words if len(word) > 0])
                p = ' '.join([pos for pos in poses if len(pos) > 0])
                lines.append([lbl, w, p])
                words = []
                labels = []
                poses = []
                continue
            # Three fields are required because the POS tag sits at index 2.
            if len(contends.split(' ')) >= 3:
                word = contends.split(' ')[0]
                label = contends.split(' ')[1]
                pos = contends.split(' ')[2]
                words.append(word)
                labels.append(label.replace("-", "_"))
                poses.append(pos)
        return lines
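# read_data_pos expects three space-separated fields per line,
# "token label pos" (hypothetical sample):
#
#     John B-PER NNP
#     lives O VBZ
#
# Each returned item is [labels_str, words_str, pos_str].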
def validate_step(dl, model, id2cls):
    """Evaluate the classification model and return a classification report."""
    model.eval()
    preds_cpu_cls, targets_cpu_cls = [], []
    for batch in tqdm(dl, total=len(dl), leave=False, desc="Validation"):
        preds_cls = model.forward(batch)
        preds_cpu_, targets_cpu_ = transformed_result_cls(
            [preds_cls], [batch[-1]], id2cls)
        preds_cpu_cls.extend(preds_cpu_)
        targets_cpu_cls.extend(targets_cpu_)
    clf_report_cls = flat_classification_report(
        [targets_cpu_cls], [preds_cpu_cls], digits=4)
    return clf_report_cls
def train_step(dl, model, optimizer, num_epoch=1):
    """Run one training epoch and log the average loss."""
    model.train()
    epoch_loss = 0
    idx = 0
    pr = tqdm(dl, total=len(dl), leave=False)
    for batch in pr:
        idx += 1
        model.zero_grad()
        loss = model.score(batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # .item() detaches the scalar loss for accumulation on the CPU.
        epoch_loss += loss.item()
        pr.set_description("train loss: {}".format(epoch_loss / idx))
        torch.cuda.empty_cache()
    logging.info("\nepoch {}, average train epoch loss={:.5}\n".format(
        num_epoch, epoch_loss / idx))
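# A minimal training-loop sketch combining train_step with the NER
# validate_step below; model, optimizer, loaders, and vocabs are assumed to
# come from the rest of the project, and num_epochs is hypothetical:
#
# for epoch in range(1, num_epochs + 1):
#     train_step(train_dl, model, optimizer, num_epoch=epoch)
#     logging.info(validate_step(valid_dl, model, id2label, sup_labels))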
def predict(dl, model, id2label, id2cls=None):
    """Predict NER labels (and optionally sentence classes) for a DataLoader."""
    model.eval()
    preds_cpu = []
    preds_cpu_cls = []
    for batch in tqdm(dl, total=len(dl), leave=False, desc="Predicting"):
        labels_mask = batch[1]
        preds = model.forward(batch)
        if id2cls is not None:
            preds, preds_cls = preds
            preds_cpu_ = transformed_result_cls(
                [preds_cls], [preds_cls], id2cls, False)
            preds_cpu_cls.extend(preds_cpu_)
        preds_cpu_ = transformed_result([preds], [labels_mask], id2label)
        preds_cpu.extend(preds_cpu_)
    if id2cls is not None:
        return preds_cpu, preds_cpu_cls
    return preds_cpu
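# Usage sketch for predict (objects assumed to exist elsewhere):
#
# preds = predict(test_dl, model, id2label)                     # NER only
# preds, preds_cls = predict(test_dl, model, id2label, id2cls)  # joint model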
def create_vocabs(df, idx2cls_path, idx2cls=None):
    """Build the class vocabulary for the classification task."""
    cls2idx = {}
    if idx2cls is not None:
        cls2idx = {label: idx for idx, label in enumerate(idx2cls)}
    else:
        idx2cls = []
    for _, row in tqdm(df.iterrows(), total=len(df), leave=False,
                       desc="Creating labels vocabs"):
        if row.cls not in cls2idx:
            cls2idx[row.cls] = len(cls2idx)
            idx2cls.append(row.cls)
    with open(idx2cls_path, "w", encoding="utf-8") as f:
        for label in idx2cls:
            f.write("{}\n".format(label))
    return cls2idx, idx2cls
def validate_step(dl, model, id2label, sup_labels, id2cls=None):
    """Evaluate NER (and optionally classification) and return report(s)."""
    model.eval()
    preds_cpu, targets_cpu = [], []
    preds_cpu_cls, targets_cpu_cls = [], []
    for batch in tqdm(dl, total=len(dl), leave=False):
        labels_mask, labels_ids = batch[1], batch[3]
        preds = model.forward(batch)
        if id2cls is not None:
            preds, preds_cls = preds
            preds_cpu_, targets_cpu_ = transformed_result_cls(
                [preds_cls], [batch[-1]], id2cls)
            preds_cpu_cls.extend(preds_cpu_)
            targets_cpu_cls.extend(targets_cpu_)
        preds_cpu_, targets_cpu_ = transformed_result(
            [preds], [labels_mask], id2label, [labels_ids])
        preds_cpu.extend(preds_cpu_)
        targets_cpu.extend(targets_cpu_)
    clf_report = flat_classification_report(
        targets_cpu, preds_cpu, labels=sup_labels, digits=3)
    if id2cls is not None:
        clf_report_cls = flat_classification_report(
            [targets_cpu_cls], [preds_cpu_cls], digits=3)
        return clf_report, clf_report_cls
    return clf_report