def train(self, trainfile, devfile, multifolds=1, as_text=False):
    p = StdOutFilter()
    p.start()

    train_feats, _, _, _, _ = read_conll(trainfile, genre_pat=self.genre_pat, as_text=as_text)
    train_for_rnn, scalers = feats2rnn(train_feats)
    dev_feats, _, _, _, _ = read_conll(devfile, genre_pat=self.genre_pat, as_text=as_text)
    dev_for_rnn, scalers = feats2rnn(dev_feats)

    with io.open(self.corpus_dir + os.sep + "dev." + self.ext, 'w', encoding="utf8", newline="\n") as f:
        f.write(dev_for_rnn)

    # NCRFpp expects a test file; we reuse dev
    with io.open(self.corpus_dir + os.sep + "test." + self.ext, 'w', encoding="utf8", newline="\n") as f:
        f.write(dev_for_rnn)

    if multifolds > 1:
        train_chunks, test_chunks = self.split_dataset(train_for_rnn, multifolds=multifolds)
        all_preds = []
        all_labs = []
        for i in range(multifolds):
            with io.open(self.corpus_dir + os.sep + "train." + self.ext, 'w', encoding="utf8", newline="\n") as f:
                f.write(train_chunks[i])
            with io.open(self.corpus_dir + os.sep + "raw." + self.ext, 'w', encoding="utf8", newline="\n") as f:
                f.write(test_chunks[i])

            # TRAIN ON FOLD
            sys.stderr.write("\no Training on fold " + str(i + 1) + "/" + str(multifolds) + "\n")
            config = ncrf_dir + os.sep + self.segtype + ".train" + self.auto + ".config"
            ncrf(config)

            # PREDICT
            config = ncrf_dir + os.sep + self.segtype + ".decode" + self.auto + ".config"
            ncrf(config, status="decode")
            labs, scores = self.read_preds()
            all_labs += labs
            all_preds += scores

        # SERIALIZE MULTITRAIN PREDS
        with io.open(self.corpus_dir + os.sep + corpus + self.auto + "_multitrain.tab", 'w', newline='\n') as f:
            for j, pred in enumerate(all_preds):
                if self.conn:
                    lab = all_labs[j]
                else:
                    # Arbitrary threshold; we will only use the probability as a feature for the metalearner
                    lab = 1 if pred >= 0.15 else 0
                f.write(str(lab) + "\t" + str(pred) + "\n")

    # TRAIN MODEL ON ALL TRAIN
    sys.stderr.write("\no Training on full train set\n")
    with io.open(self.corpus_dir + os.sep + "train." + self.ext, 'w', encoding="utf8", newline="\n") as f:
        f.write(train_for_rnn)
    config = ncrf_dir + os.sep + self.segtype + ".train" + self.auto + ".config"
    ncrf(config)
    p.end()
def predict(self, testfile, as_text=True):
    test_feats, _, _, _, _ = read_conll(testfile, genre_pat=self.genre_pat, as_text=as_text)
    test_for_rnn, scalers = feats2rnn(test_feats)

    with io.open(self.corpus_dir + os.sep + "raw." + self.ext, 'w', encoding="utf8", newline="\n") as f:
        f.write(test_for_rnn)

    p = StdOutFilter()
    p.start()
    config = ncrf_dir + os.sep + self.segtype + ".decode" + self.auto + ".config"
    ncrf(config, status="decode")
    p.end()

    labs, probas = self.read_preds()
    return zip(labs, probas)
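# A minimal driver sketch for the train/predict methods above. It assumes they belong to the
# RNNSegmenter class referenced in tune() further below; the constructor call and the corpus/file
# names are illustrative assumptions, not taken from this code.
if __name__ == "__main__":
    segmenter = RNNSegmenter()  # assumed constructor
    segmenter.train("eng.rst.gum_train.conll", "eng.rst.gum_dev.conll", multifolds=5, as_text=False)
    for label, prob in segmenter.predict("eng.rst.gum_dev.conll", as_text=False):
        print(label, prob)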
udpipe = NLTKSentencer(lang=corpus[:3])
conllu = io.open(topred_file, encoding="utf8").read()

# Drop multiword-token range lines (IDs like "3-4") before prediction
filtered = []
for line in conllu.split("\n"):
    if "\t" in line:
        fields = line.split("\t")
        if "-" in fields[0]:
            continue
    filtered.append(line)
conllu = "\n".join(filtered) + "\n"

pred = [x[0] for x in udpipe.predict(conllu)]
# pred = udpipe.predict(conllu)

gold_feats, _, toks, _, _ = read_conll(gold_file, genre_pat="^(..)", mode="sent", as_text=False)
gold = [int(t['wid'] == 1) for t in gold_feats]

conf_mat = confusion_matrix(gold, pred)
sys.stderr.write("Evaluating on " + corpus + "\n")
sys.stderr.write(str(conf_mat) + "\n")
true_positive = conf_mat[1][1]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
prec = true_positive / (true_positive + false_positive)
rec = true_positive / (true_positive + false_negative)
f1 = 2 * prec * rec / (prec + rec)
sys.stderr.write("P: " + str(prec) + "\n")
sys.stderr.write("R: " + str(rec) + "\n")
sys.stderr.write("F1: " + str(f1) + "\n")
for corpus in corpora:
    corpus_start_time = time.time()

    train = glob(opts.data_dir + os.sep + corpus + os.sep + corpus + "_train.conll")[0]
    dev = glob(opts.data_dir + os.sep + corpus + os.sep + corpus + "_dev.conll")[0]
    test = glob(opts.data_dir + os.sep + corpus + os.sep + corpus + "_test.conll")[0]

    feats = DepFeatures()

    from conll_reader import read_conll
    outputs = read_conll(dev)
    train = outputs[0]

    # Extract features from file
    sys.stderr.write("\no Extracting features from training corpus " + corpus + "\n")
    train_feats = feats.extract_depfeatures(train)

    elapsed = time.time() - corpus_start_time
    sys.stderr.write("\nTime training on corpus:\n")
    sys.stderr.write(str(timedelta(seconds=elapsed)) + "\n\n")

sys.stderr.write("\nTotal time:\n")
elapsed = time.time() - start_time
sys.stderr.write(str(timedelta(seconds=elapsed)) + "\n\n")
if opts.mode == "train":
    # Get Connectives: raw frequency and ratio of the connectives found in the training data
    conn = connective.train(train_path)

if opts.eval_test:
    dev_path = test_path

with open(dev_path, 'r', encoding='utf-8') as f:
    lines = f.read()

# Prediction on the dev set
pred_labels_probs = connective.predict(lines)
resps = [tok[1] if float(tok[2]) > 0.5 else "_" for tok in pred_labels_probs]

train_feats, _, _, _, _ = read_conll(dev_path, as_text=False)
gold = io.open(dev_path, encoding="utf8").read()
lines = gold.split("\n")
processed = []
i = 0
for line in lines:
    if "\t" in line:
        fields = line.split('\t')
        if "-" in fields[0]:
            processed.append(line)
            continue
        else:
            fields[-1] = resps[i]
            processed.append("\t".join(fields))
            i += 1
    else:
        # Non-token lines (comments, sentence breaks) are kept unchanged
        processed.append(line)
def read_conll_sentbreak(self, infile, neighborwindowsize=5, as_text=True, cut=True, do_tag=True, multitrain=False):
    global TRAIN_LIMIT

    # read data from conll_reader
    vowels = "AEIOUaeiouéèàáíìúùòóаэыуояеёюи"
    numeric_entries = []
    nonnumeric_entries = []
    goldseg_entries = []

    if as_text:
        conllu_in = infile
    else:
        conllu_in = io.open(infile, 'r', encoding='utf8').read()

    # # Reduce data if too large
    # if not cut and conllu_in.count("\n") > 100000 and multitrain:
    #     sys.stderr.write("o Data too large; forcing cut and turning off multitraining\n")
    #     cut = True
    #     TRAIN_LIMIT = 100000
    # if cut:
    #     conllu_in = shuffle_cut_conllu(conllu_in, limit=TRAIN_LIMIT)

    train_feats, _, _, _, _ = read_conll(conllu_in, mode="seg", genre_pat=None, as_text=True, cap=None, char_bytes=False)

    featurekeys = ["label", "word", "pos", "cpos", "head", "head_dist", "deprel", "case", "tok_len",
                   "depchunk", "conj", "s_len", "s_type", "sent_doc_percentile", "parentclauses"]

    for lnum, line in enumerate(train_feats):
        lfeatures = [line[x] for x in featurekeys]
        lfeatures[0] = int(lfeatures[0] != "_")

        firstletter = str(lfeatures[1][0].encode("utf8")[0]) if self.lang == "zho" else lfeatures[1][0]
        firstisupper = int(firstletter.upper() == firstletter)
        firstisconsonant = len(re.findall('[^' + vowels + ']', firstletter))
        firstisvowel = len(re.findall('[' + vowels + ']', firstletter))
        firstisdigit = len(re.findall('[0-9]', firstletter))
        firstisspecial = len(re.findall('[^A-Za-z0-9]', firstletter))

        lastletter = str(lfeatures[1][-1].encode("utf8")[-1]) if self.lang == "zho" else lfeatures[1][-1]
        lastisupper = int(lastletter.upper() == lastletter)
        lastisconsonant = len(re.findall('[^' + vowels + ']', lastletter))
        lastisvowel = len(re.findall('[' + vowels + ']', lastletter))
        lastisdigit = len(re.findall('[0-9]', lastletter))
        lastisspecial = len(re.findall('[^A-Za-z0-9]', lastletter))

        numconsonants = len(re.findall('[^' + vowels + ']', lfeatures[1]))
        numvowels = len(re.findall('[' + vowels + ']', lfeatures[1]))
        numdigits = len(re.findall('[0-9]', lfeatures[1]))
        numspecials = len(re.findall('[^A-Za-z0-9]', lfeatures[1]))

        numeric_entries.append([numconsonants, numvowels, numdigits, numspecials,
                                firstisupper, firstisconsonant, firstisvowel, firstisdigit, firstisspecial,
                                lastisupper, lastisconsonant, lastisspecial,
                                lfeatures[4], lfeatures[5], lfeatures[8], lfeatures[11], lfeatures[13]])
        nonnumeric_entries.append([lfeatures[2], lfeatures[3], firstletter, lastletter,
                                   lfeatures[6], lfeatures[7], lfeatures[9], lfeatures[10], lfeatures[12],
                                   re.sub(r'^([^\|]*\|[^\|]*)\|.*', r'\1', lfeatures[14])])
        goldseg_entries.append(lfeatures[0])

    # featurekeys indices: 0 "label", 1 "word", 2 "pos", 3 "cpos", 4 "head", 5 "head_dist", 6 "deprel",
    # 7 "case", 8 "tok_len", 9 "depchunk", 10 "conj", 11 "s_len", 12 "s_type", 13 "sent_doc_percentile",
    # 14 "parentclauses"
    numeric_colnames = ['numconsonants', 'numvowels', 'numdigits', 'numspecials',
                        'firstisupper', 'firstisconsonant', 'firstisvowel', 'firstisdigit', 'firstisspecial',
                        'lastisupper', 'lastisconsonant', 'lastisspecial',
                        featurekeys[4], featurekeys[5], featurekeys[8], featurekeys[11], featurekeys[13]]
    nonnumeric_colnames = ['gold_pos', 'gold_cpos', 'firstletter', 'lastletter',
                           featurekeys[6], featurekeys[7], featurekeys[9], featurekeys[10], featurekeys[12],
                           featurekeys[14]]

    numeric_entries = np.array(numeric_entries, dtype=np.float32)
    nonnumeric_entries = np.array(nonnumeric_entries)

    # Dummy multi vectors - cattodummy is much better
    unigram_entries, unigram_colnames = self.cattodummy(nonnumeric_entries, nonnumeric_colnames,
                                                        numeric_entries, numeric_colnames)
    sys.stderr.write("o unigram dataframe ready\n")

    if neighborwindowsize >= 3:
        sys.stderr.write("o duplicating to %d-gram...\n" % neighborwindowsize)
        prev_entries, prev_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'prev')
        next_entries, next_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'next')
        ngram_entries, ngram_colnames = self.numpyconcat2df(prev_entries, prev_colnames, unigram_entries, unigram_colnames)
        ngram_entries, ngram_colnames = self.numpyconcat2df(ngram_entries, ngram_colnames, next_entries, next_colnames)
        del prev_entries, prev_colnames, next_colnames, next_entries

        if neighborwindowsize >= 5:
            prevprev_entries, prevprev_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'prevprev')
            nextnext_entries, nextnext_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'nextnext')
            ngram_entries, ngram_colnames = self.numpyconcat2df(prevprev_entries, prevprev_colnames, ngram_entries, ngram_colnames)
            ngram_entries, ngram_colnames = self.numpyconcat2df(ngram_entries, ngram_colnames, nextnext_entries, nextnext_colnames)
            del prevprev_entries, prevprev_colnames, nextnext_colnames, nextnext_entries

        if neighborwindowsize == 7:
            prevprevprev_entries, prevprevprev_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'prevprevprev')
            nextnextnext_entries, nextnextnext_colnames = self.copyforprevnext(unigram_entries, unigram_colnames, 'nextnextnext')
            ngram_entries, ngram_colnames = self.numpyconcat2df(prevprevprev_entries, prevprevprev_colnames, ngram_entries, ngram_colnames)
            ngram_entries, ngram_colnames = self.numpyconcat2df(ngram_entries, ngram_colnames, nextnextnext_entries, nextnextnext_colnames)
            del prevprevprev_entries, prevprevprev_colnames, nextnextnext_colnames, nextnextnext_entries
    else:
        ngram_entries, ngram_colnames = unigram_entries, unigram_colnames

    del unigram_entries, numeric_colnames, numeric_entries
    return ngram_entries, ngram_colnames, goldseg_entries, unigram_colnames
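# For orientation, a standalone sketch of the windowing idea implemented by copyforprevnext and
# numpyconcat2df above: each token row is concatenated with shifted copies of the unigram matrix
# so the classifier also sees its neighbors' features. This helper is illustrative only and does
# not reproduce the class's actual implementation.
import numpy as np

def shift_rows(mat, offset, prefix, colnames):
    """Return a copy of mat shifted by `offset` rows (zero-padded) and prefixed column names."""
    shifted = np.zeros_like(mat)
    if offset > 0:        # previous-token features: row i receives row i-offset
        shifted[offset:] = mat[:-offset]
    elif offset < 0:      # next-token features: row i receives a later row
        shifted[:offset] = mat[-offset:]
    else:
        shifted = mat.copy()
    return shifted, [prefix + "_" + c for c in colnames]

# Example: build a trigram feature matrix from a unigram matrix `uni` with columns `cols`:
#   prev, prev_cols = shift_rows(uni, 1, "prev", cols)
#   nxt, next_cols = shift_rows(uni, -1, "next", cols)
#   trigram = np.hstack([prev, uni, nxt]); trigram_cols = prev_cols + cols + next_cols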
# Run test
segmenter = LRSegmenter(lang=lang, model=corpusname, windowsize=args.windowsize)
if args.verbose:
    segmenter.verbose = True
if segmenter.verbose:
    sys.stderr.write("o Processing corpus " + corpusname + "\n")

if args.mode == "train":
    # When running from CLI, we always train (predict mode is done on imported class)
    segmenter.train(data_folder + os.sep + corpusname + os.sep + corpusname + "_train.conll",
                    as_text=False, standardization=args.standardization, multitrain=args.multitrain)

# Now evaluate model
predictions, probas = zip(*segmenter.predict(data_folder + os.sep + corpusname + os.sep + corpusname + "_dev.conll",
                                             as_text=False, standardization=args.standardization, do_tag=True))

# Get gold labels for comparison
conllu_in = io.open(data_folder + os.sep + corpusname + os.sep + corpusname + "_dev.conll", 'r', encoding='utf8').read()
devfeats, _, _, _, _ = read_conll(conllu_in, mode="seg", genre_pat=None, as_text=True, cap=None, char_bytes=False)
labels = [int(x["label"] != '_') for x in devfeats]

# give dev F1 score
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(labels, predictions, digits=6))
print(confusion_matrix(labels, predictions))

elapsed = time.time() - start_time
sys.stderr.write(str(timedelta(seconds=elapsed)) + "\n\n")
data_dir = os.path.abspath(lib + os.sep + ".." + os.sep + ".." + os.sep + "data_parsed")
corpora = os.listdir(data_dir)
corpora = [c for c in corpora if os.path.isdir(os.path.join(data_dir, c))]

log = io.open("baseline_parsed.log", 'w', encoding="utf8")
table_out = []
all_f = []

for corpus in corpora:
    if "pdtb" in corpus:
        continue

    dev = glob(data_dir + os.sep + corpus + os.sep + "*_test.conll")[0]
    train_feats, vocab, toks, firsts, lasts = read_conll(dev)
    labels = [int(t["label"] != "_") for t in train_feats]
    baseline_preds = [int(t["wid"] == 1) for t in train_feats]

    conf_mat = confusion_matrix(labels, baseline_preds)
    true_positive = conf_mat[1][1]
    false_positive = conf_mat[0][1]
    false_negative = conf_mat[1][0]
    prec = true_positive / (true_positive + false_positive)
    rec = true_positive / (true_positive + false_negative)
    f1 = 2 * prec * rec / (prec + rec)

    log.write("corpus: " + corpus + "\n")
    log.write("=" * 10 + "\n")
    log.write(str(conf_mat) + "\n")
    log.write("P: " + str(prec) + "\n")
    log.write("R: " + str(rec) + "\n")
    log.write("F1: " + str(f1) + "\n\n")
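# The precision/recall/F1 arithmetic above recurs in several evaluation blocks in this code; it
# can be read as the following standalone helper (a sketch for illustration, not a function that
# exists in the repository).
def prf_from_confusion(conf_mat):
    """Positive-class P/R/F1 from a 2x2 sklearn confusion matrix (rows = gold, columns = predicted)."""
    tp = conf_mat[1][1]
    fp = conf_mat[0][1]
    fn = conf_mat[1][0]
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return prec, rec, f1

# Example: prec, rec, f1 = prf_from_confusion(confusion_matrix(labels, baseline_preds))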
def read_data(self, infile, size, as_text, rare_thresh, chosen_feats=None):
    cap = 3 * size if size is not None else None
    train_feats, vocab, toks, firsts, lasts = read_conll(infile, genre_pat=self.genre_pat, mode="seg",
                                                         cap=cap, char_bytes=self.lang == "zho", as_text=as_text)
    vocab = Counter(vocab)
    top_n_words = vocab.most_common(rare_thresh)
    top_n_words, _ = zip(*top_n_words)
    for tok in train_feats:
        if tok["word"] not in top_n_words:
            tok["word"] = tok["pos"]

    tokens_by_abs_id = self.traverse_trees(train_feats)
    data, headers = self.n_gram(train_feats, tokens_by_abs_id)

    # Features to use for all n-gram tokens
    num_labels = ["head_dist", "left_span", "right_span", "samepar_left", "tok_len"]
    cat_labels = ["case", "closest_left", "closest_right", "deprel", "farthest_left", "farthest_right",
                  "pos", "word", "morph", "cpos", "depchunk"]

    pref_cat = []
    pref_num = []
    for pref in ["mn2", "mn1", "par", "par_par", "pl1", "pl2"]:
        pref_cat += [pref + "_" + h for h in cat_labels]
        pref_num += [pref + "_" + h for h in num_labels]

    # Features only needed for node token
    cat_labels += ["genre"] + pref_cat  # + ["heading_first","heading_last"] + ["s_type"]
    num_labels += ["dist2end", "sent_doc_percentile", "tok_id", "wid", "quote", "rank"] + pref_num  # + ["bracket"]
    num_labels += ["par_quote", "par_par_quote"]  # ,"par_bracket","par_par_bracket"]

    # Use specific feature subset
    if chosen_feats is not None:
        new_cat = []
        new_num = []
        for feat in chosen_feats:
            if feat in cat_labels:
                new_cat.append(feat)
            elif feat in num_labels:
                new_num.append(feat)
        cat_labels = new_cat
        num_labels = new_num

    data = pd.DataFrame(data, columns=headers)
    data_encoded, multicol_dict = self.multicol_fit_transform(data, pd.Index(cat_labels))

    data_x = data_encoded[cat_labels + num_labels].values
    data_y = np.where(data_encoded['label'] == "_", 0, 1)

    return data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words
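# For context, multicol_fit_transform above appears to label-encode each categorical column and
# keep the fitted encoders so they can be reapplied at prediction time (predict() below reads
# multicol_dict["columns"] and multicol_dict["encoder_dict"]). A minimal sketch of that idea with
# scikit-learn; illustrative only, not the class's actual helper.
from sklearn.preprocessing import LabelEncoder

def fit_transform_categoricals(df, cat_cols):
    """Label-encode each categorical column of a DataFrame and return the fitted encoders."""
    encoders = {}
    out = df.copy()
    for col in cat_cols:
        enc = LabelEncoder()
        out[col] = enc.fit_transform(out[col].astype(str))
        encoders[col] = enc
    return out, {"columns": list(cat_cols), "encoder_dict": encoders}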
def predict(self, infile, eval_gold=False, as_text=True):
    """
    Predict sentence splits using an existing model

    :param infile: File in DISRPT shared task *.tok or *.conll format (sentence breaks will be ignored in .conll)
    :param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
    :param genre_pat: A regex pattern identifying the document genre from document name comments
    :param as_text: Boolean, whether the input is a string, rather than a file name to read
    :return: tokenwise binary prediction vector if eval_gold is False, otherwise prints evaluation metrics and diff to gold
    """

    if self.model is None:  # Try default model location
        model_path = ".." + os.sep + ".." + os.sep + "models" + os.sep + "subtreeseg.pkl"
    else:
        model_path = self.model

    clf, num_labels, cat_labels, multicol_dict, top_n_words, firsts, lasts = joblib.load(model_path)

    feats, _, toks, _, _ = read_conll(infile, genre_pat=self.genre_pat, mode="seg", as_text=as_text,
                                      char_bytes=self.lang == "zho")
    tokens_by_abs_id = self.traverse_trees(feats)
    feats, headers = self.n_gram(feats, tokens_by_abs_id, dummies=False)

    temp = []
    headers_with_oov = ["first", "last", "deprel", "closest_left", "closest_right", "farthest_left",
                        "farthest_right", "pos", "cpos", "morph", "s_type", "depchunk"]
    for pref in ["mn2", "mn1", "par", "par_par", "pl1", "pl2"]:
        temp += [pref + "_" + h for h in headers_with_oov]
    headers_with_oov += temp

    genre_warning = False
    for i, header in enumerate(headers):
        if header in headers_with_oov and header in cat_labels:
            for item in feats:
                if item[i] not in multicol_dict["encoder_dict"][header].classes_:
                    item[i] = "_"
        elif header == "genre" and "genre" in cat_labels:
            for item in feats:
                if item[i] not in multicol_dict["encoder_dict"]["genre"].classes_:  # New genre not in training data
                    if not genre_warning:
                        sys.stderr.write("! WARN: Genre not in training data: " + item[i] + "; suppressing further warnings\n")
                        genre_warning = True
                    item[i] = "_"
        elif header.endswith("word") and header in cat_labels:
            for item in feats:
                # Replace rare words and words never seen before in this position with POS
                if item[i] not in top_n_words or item[i] not in multicol_dict["encoder_dict"][header].classes_:
                    pos_col = headers.index(header.replace("word", "pos"))
                    if item[pos_col] in multicol_dict["encoder_dict"][header].classes_:
                        item[i] = item[pos_col]
                    else:
                        item[i] = "_"

    data = feats
    data = pd.DataFrame(data, columns=headers)
    data_encoded = self.multicol_transform(data, columns=multicol_dict["columns"],
                                           all_encoders_=multicol_dict["all_encoders_"])

    data_x = data_encoded[cat_labels + num_labels].values
    probas = clf.predict_proba(data_x)
    probas = [p[1] for p in probas]
    preds = [int(p > 0.5) for p in probas]

    for i, p in enumerate(preds):
        if data["tok_id"].values[i] == 1:  # Ensure tok_id 1 is always a segment start
            preds[i] = 1

    if eval_gold:
        gold = np.where(data_encoded['label'] == "_", 0, 1)
        conf_mat = confusion_matrix(gold, preds)
        sys.stderr.write(str(conf_mat) + "\n")
        true_positive = conf_mat[1][1]
        false_positive = conf_mat[0][1]
        false_negative = conf_mat[1][0]
        prec = true_positive / (true_positive + false_positive)
        rec = true_positive / (true_positive + false_negative)
        f1 = 2 * prec * rec / (prec + rec)
        sys.stderr.write("P: " + str(prec) + "\n")
        sys.stderr.write("R: " + str(rec) + "\n")
        sys.stderr.write("F1: " + str(f1) + "\n")
        with io.open("diff.tab", 'w', encoding="utf8") as f:
            for i in range(len(gold)):
                f.write("\t".join([toks[i], str(gold[i]), str(preds[i])]) + "\n")
        return conf_mat, prec, rec, f1
    else:
        return zip(preds, probas)
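# A minimal usage sketch for the predict() method above. The class name SubtreeSegmenter is an
# assumption based on the default model file subtreeseg.pkl, and the constructor arguments are
# illustrative; only the predict() signature comes from the code above.
seg = SubtreeSegmenter(lang="eng", model=None)  # model=None falls back to ../../models/subtreeseg.pkl
conf_mat, prec, rec, f1 = seg.predict("eng.rst.gum_dev.conll", eval_gold=True, as_text=False)
for pred, proba in seg.predict("eng.rst.gum_test.conll", as_text=False):
    print(pred, proba)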
def tune(self, trainfile, devfile, max_evals=10, as_text=False):
    train_feats, _, _, _, _ = read_conll(trainfile, genre_pat=self.genre_pat, as_text=as_text)
    train_for_rnn, scalers = feats2rnn(train_feats)
    dev_feats, _, _, _, _ = read_conll(devfile, genre_pat=self.genre_pat, as_text=as_text)
    dev_for_rnn, scalers = feats2rnn(dev_feats)

    with io.open(self.corpus_dir + os.sep + "dev." + self.ext, 'w', encoding="utf8", newline="\n") as f:
        f.write(dev_for_rnn)

    # NCRFpp expects a test file; we reuse dev
    with io.open(self.corpus_dir + os.sep + "test." + self.ext, 'w', encoding="utf8", newline="\n") as f:
        f.write(dev_for_rnn)

    # TRAIN MODEL ON ALL TRAIN
    with io.open(self.corpus_dir + os.sep + "train." + self.ext, 'w', encoding="utf8", newline="\n") as f:
        f.write(train_for_rnn)

    config = ncrf_dir + os.sep + self.segtype + ".train" + self.auto + ".config"

    def objective(params):
        # sys.stderr.write(str(params))
        data = Data()
        data.read_config(config)
        data.HP_batch_size = int(params['batch_size'])
        data.HP_lr = float(params['lr'])
        # data.word_emb_dim = int(params['word_emb_dim'])
        data.char_emb_dim = int(params['char_emb_dim'])
        data.word_feature_extractor = params['word_seq_feature']
        data.char_feature_extractor = params['char_seq_feature']
        # data.optimizer = params['optimizer']
        data.HP_cnn_layer = int(params['cnn_layer'])
        # data.HP_char_hidden_dim = int(params['char_hidden_dim'])
        data.HP_hidden_dim = int(params['hidden_dim'])
        data.HP_dropout = float(params['dropout'])
        data.HP_lstm_layer = int(params['lstm_layer'])
        data.average_batch_loss = str2bool(params['ave_batch_loss'])

        p = StdOutFilter4Tune()
        p.start()
        ret, best_dev = ncrf(config=None, data=data)
        p.end()

        sys.stdout.write("F1 {:.3f} params {}".format(-best_dev, params))
        if ret == 1:
            return {'loss': -best_dev, 'status': STATUS_OK}
        else:
            return {'status': STATUS_FAIL}

    space = {
        'batch_size': scope.int(hp.quniform('batch_size', 10, 100, 10)),
        'lr': hp.quniform('lr', 0.003, 0.18, 0.001),
        # 'word_emb_dim': scope.int(hp.quniform('word_emb_dim', 100, 300, 10)),
        'char_emb_dim': scope.int(hp.quniform('char_emb_dim', 30, 70, 10)),
        'word_seq_feature': hp.choice('word_seq_feature', ["LSTM", "CNN"]),
        'char_seq_feature': hp.choice('char_seq_feature', ["LSTM", "CNN"]),
        # 'optimizer': hp.choice('optimizer', ["SGD","AdaGrad","AdaDelta","RMSProp","Adam"]),
        'optimizer': hp.choice('optimizer', ["AdaGrad"]),
        'cnn_layer': scope.int(hp.quniform('cnn_layer', 1, 8, 1)),
        'char_hidden_dim': scope.int(hp.quniform('char_hidden_dim', 50, 200, 10)),
        'hidden_dim': scope.int(hp.quniform('hidden_dim', 100, 300, 20)),
        'dropout': hp.quniform('dropout', 0.2, 0.8, 0.1),
        'lstm_layer': scope.int(hp.quniform('lstm_layer', 1, 5, 1)),
        'ave_batch_loss': hp.choice('ave_batch_loss', ["True", "False"])
    }

    best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals)
    best_params = space_eval(space, best_params)

    with io.open(script_dir + os.sep + "params" + os.sep + "RNNSegmenter" + self.auto + "_best_params.tab", 'a', encoding="utf8") as bp:
        corpus = os.path.basename(trainfile).split("_")[0]
        for k, v in best_params.items():
            bp.write("\t".join([corpus, 'RNNClassifier', k, str(v)]) + "\n")

    return best_params
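# A tiny self-contained illustration of the hyperopt pattern used in tune() above
# (objective function -> search space -> fmin -> space_eval). The toy objective and its
# search space are made up for demonstration and have nothing to do with NCRF++.
from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK
from hyperopt.pyll import scope

def toy_objective(params):
    # Pretend the loss is minimized near hidden_dim=200 with low dropout
    loss = abs(params['hidden_dim'] - 200) / 200.0 + params['dropout'] * 0.1
    return {'loss': loss, 'status': STATUS_OK}

toy_space = {
    'hidden_dim': scope.int(hp.quniform('hidden_dim', 100, 300, 20)),
    'dropout': hp.quniform('dropout', 0.2, 0.8, 0.1),
}
best = space_eval(toy_space, fmin(fn=toy_objective, space=toy_space, algo=tpe.suggest, max_evals=10))
print(best)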