def read_data(self, training_file, rare_thresh=100, as_text=True, no_cache=False):
    if as_text:
        train = training_file
    else:
        train = io.open(training_file, encoding="utf8").read().strip().replace("\r", "") + "\n"

    cat_labels = ["word", "genre", "deprel", "s_type", "morph"]  # excluded: "depchunk", "first", "last", "pos"
    num_labels = ["tok_len", "tok_id", "quote", "bracket", "sent_doc_percentile", "s_len"]

    train_feats, vocab, toks, firsts, lasts = read_conll(train, genre_pat=self.genre_pat, mode="seg", as_text=True, char_bytes=self.lang == "zho")
    gold_feats, _, _, _, _ = read_conll_conn(train, mode="seg", as_text=True)
    gold_feats = [{"wid": 0, "label": "_"}] + gold_feats + [{"wid": 0, "label": "_"}]  # Add dummies to gold

    # Ensure that "_" is in the possible values of first/last for OOV chars at test time
    oov_item = train_feats[-1]
    oov_item["first"] = "_"
    oov_item["last"] = "_"
    oov_item["lemma"] = "_"
    oov_item["word"] = "_"
    oov_item["deprel"] = "_"
    oov_item["pos"] = "_"
    oov_item["cpos"] = "_"
    oov_item["genre"] = "_"
    oov_item["depchunk"] = "_"
    train_feats.append(oov_item)
    train_feats = [oov_item] + train_feats
    toks.append("_")
    toks = ["_"] + toks

    vocab = Counter(vocab)
    top_n_words = vocab.most_common(rare_thresh)
    top_n_words, _ = zip(*top_n_words)

    headers = sorted(list(train_feats[0].keys()))
    data = []

    preds = {}
    for e in self.estimators:
        if self.multitrain and e.name in ["RNNSegmenter"] and not no_cache:
            pred = e.predict_cached(train)
            # _, preds[e.name + "_B_prob"], preds[e.name + "_I_prob"] = [list(x) for x in zip(*pred)]
        else:
            pred = e.predict(train)
        preds[e.name + "_B_prob"] = []
        preds[e.name + "_I_prob"] = []
        if "Freq" in e.name:
            preds[e.name + "_freq"] = []
        for tup in pred:
            if "RNN" in e.name:
                pred_label = tup[0]
                probas = tup[1]
                freqs = None
            else:
                pred_label = tup[1]
                probas = float(tup[2])
                freqs = float(tup[3])
            if "B-Conn" in pred_label:
                preds[e.name + "_B_prob"].append(probas)
                preds[e.name + "_I_prob"].append(0.0)
            elif "I-Conn" in pred_label:
                preds[e.name + "_B_prob"].append(0.0)
                preds[e.name + "_I_prob"].append(probas)
            else:
                preds[e.name + "_B_prob"].append(0.0)
                preds[e.name + "_I_prob"].append(0.0)
            if freqs is not None:
                preds[e.name + "_freq"].append(freqs)
        # _, preds[e.name + "_prob"], ratio, freq = [list(x) for x in zip(*pred)]
        preds[e.name + "_B_prob"] = [0.0] + preds[e.name + "_B_prob"] + [0.0]  # Add dummy wrap for items -1 and +1
        preds[e.name + "_I_prob"] = [0.0] + preds[e.name + "_I_prob"] + [0.0]  # Add dummy wrap for items -1 and +1
        if e.name == "FreqConnDetector":
            preds[e.name + "_freq"] = [0.0] + preds[e.name + "_freq"] + [0.0]  # Add dummy wrap for items -1 and +1
        headers.append(e.name + "_B_prob")
        headers.append(e.name + "_I_prob")
        num_labels.append(e.name + "_B_prob")
        num_labels.append(e.name + "_I_prob")
        if "Freq" in e.name:
            headers.append(e.name + "_freq")
            num_labels.append(e.name + "_freq")

    for i, item in enumerate(train_feats):
        if item["word"] not in top_n_words:
            item["word"] = item["pos"]
        for e in self.estimators:
            item[e.name + "_B_prob"] = preds[e.name + "_B_prob"][i]
            item[e.name + "_I_prob"] = preds[e.name + "_I_prob"][i]
            if e.name == "FreqConnDetector":
                item[e.name + "_freq"] = preds[e.name + "_freq"][i]
        feats = []
        for k in headers:
            feats.append(item[k])
        data.append(feats)

    data, headers, cat_labels, num_labels = self.n_gram(data, headers, cat_labels, num_labels)

    # No need for n_gram feats for the following:
    if "FreqConnDetector_B_prob_min1" in num_labels:
        num_labels.remove("FreqConnDetector_B_prob_min1")
        num_labels.remove("FreqConnDetector_B_prob_pls1")
    if "FreqConnDetector_I_prob_min1" in num_labels:
        num_labels.remove("FreqConnDetector_I_prob_min1")
        num_labels.remove("FreqConnDetector_I_prob_pls1")
    if "FreqConnDetector_freq_min1" in num_labels:
        num_labels.remove("FreqConnDetector_freq_min1")
        num_labels.remove("FreqConnDetector_freq_pls1")
    if "RNNSegmenter_B_prob_min1" in num_labels:
        num_labels.remove("RNNSegmenter_B_prob_min1")
        num_labels.remove("RNNSegmenter_B_prob_pls1")
    if "RNNSegmenter_I_prob_min1" in num_labels:
        num_labels.remove("RNNSegmenter_I_prob_min1")
        num_labels.remove("RNNSegmenter_I_prob_pls1")
    if "tok_id_min1" in num_labels:
        num_labels.remove("tok_id_min1")
        num_labels.remove("tok_id_pls1")
    if "genre_min1" in cat_labels:
        cat_labels.remove("genre_min1")
        cat_labels.remove("genre_pls1")
    if "s_type_min1" in cat_labels:
        cat_labels.remove("s_type_min1")
        cat_labels.remove("s_type_pls1")
    if "morph_min1" in cat_labels:
        cat_labels.remove("morph_min1")
        cat_labels.remove("morph_pls1")
    if "s_len_min1" in num_labels:
        num_labels.remove("s_len_min1")
        num_labels.remove("s_len_pls1")
    if "sent_doc_percentile_min1" in num_labels:
        num_labels.remove("sent_doc_percentile_min1")
        num_labels.remove("sent_doc_percentile_pls1")

    data = pd.DataFrame(data, columns=headers)
    data_encoded, multicol_dict = self.multicol_fit_transform(data, pd.Index(cat_labels))

    data_x = data_encoded[cat_labels + num_labels].values
    data_y = []
    for t in gold_feats:
        if "B-Conn" in t["label"]:
            data_y.append(1)
        elif "I-Conn" in t["label"]:
            data_y.append(2)
        else:
            data_y.append(0)
    # data_y = [int(t['label'] != "_") for t in gold_feats]

    return data_encoded, data_x, data_y, cat_labels, num_labels, multicol_dict, firsts, lasts, top_n_words
def predict(self, conllu, eval_gold=False, as_text=True, serialize=False):
    """
    Predict connective labels using an existing model

    :param conllu: File in DISRPT shared task *.conll format
    :param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
    :param as_text: Boolean, whether the input is a string, rather than a file name to read
    :param serialize: Whether to serialize prediction as a .conll file
    :return: tokenwise prediction vector if eval_gold is False, otherwise prints evaluation metrics and diff to gold
    """

    clf, num_labels, cat_labels, multicol_dict, vocab, firsts, lasts = joblib.load(self.model)
    self.clf = clf

    if not as_text:
        conllu = io.open(conllu, encoding="utf8").read()

    train_feats, _, toks, _, _ = read_conll(conllu, genre_pat=self.genre_pat, mode="seg", as_text=True)
    headers = sorted(list(train_feats[0].keys()))

    data = []

    preds = {}
    for e in self.estimators:
        pred = e.predict(conllu)
        # _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
        preds[e.name + "_B_prob"] = []
        preds[e.name + "_I_prob"] = []
        if "Freq" in e.name:
            preds[e.name + "_freq"] = []
            headers.append(e.name + "_freq")
        for tup in pred:
            if "RNN" in e.name:
                pred_label = tup[0]
                probas = tup[1]
                freqs = None
            else:
                pred_label = tup[1]
                probas = float(tup[2])
                freqs = float(tup[3])
            if "B-Conn" in pred_label:
                preds[e.name + "_B_prob"].append(probas)
                preds[e.name + "_I_prob"].append(0.0)
            elif "I-Conn" in pred_label:
                preds[e.name + "_B_prob"].append(0.0)
                preds[e.name + "_I_prob"].append(probas)
            else:
                preds[e.name + "_B_prob"].append(0.0)
                preds[e.name + "_I_prob"].append(0.0)
            if "Freq" in e.name:
                preds[e.name + "_freq"].append(freqs)
        headers.append(e.name + "_B_prob")
        headers.append(e.name + "_I_prob")

    temp = []
    headers_with_oov = ["deprel", "pos", "cpos", "morph", "s_type", "depchunk"]
    for pref in ["min1", "pls1"]:
        temp += [pref + "_" + h for h in headers_with_oov]
    headers_with_oov += temp

    genre_warning = False
    for i, header in enumerate(headers):
        if header in headers_with_oov and header in cat_labels:
            for item in train_feats:
                if item[header] not in multicol_dict["encoder_dict"][header].classes_:
                    item[header] = "_"

    for i, item in enumerate(train_feats):
        item["first"] = item["word"][0] if item["word"][0] in firsts else "_"
        item["last"] = item["word"][-1] if item["word"][-1] in lasts else "_"
        if "genre" in cat_labels:
            if item["genre"] not in multicol_dict["encoder_dict"]["genre"].classes_:  # New genre not in training data
                if not genre_warning:
                    sys.stderr.write("! WARN: Genre not in training data: " + item["genre"] + "; suppressing further warnings\n")
                    genre_warning = True
                item["genre"] = "_"
        if item["word"] not in vocab:
            if item["pos"] in multicol_dict["encoder_dict"]["word"].classes_:
                item["word"] = item["pos"]
            else:
                item["word"] = "_"
        for e in self.estimators:
            item[e.name + "_B_prob"] = preds[e.name + "_B_prob"][i]
            item[e.name + "_I_prob"] = preds[e.name + "_I_prob"][i]
            if e.name == "FreqConnDetector":
                item[e.name + "_freq"] = preds[e.name + "_freq"][i]
        feats = []
        for k in headers:
            feats.append(item[k])
        data.append(feats)

    data, headers, _, _ = self.n_gram(data, headers, [], [])

    data = pd.DataFrame(data, columns=headers)
    data_encoded = self.multicol_transform(data, columns=multicol_dict["columns"], all_encoders_=multicol_dict["all_encoders_"])

    data_x = data_encoded[cat_labels + num_labels].values
    preds = clf.predict(data_x)

    if eval_gold:
        gold_feats, _, _, _, _ = read_conll(conllu, genre_pat=self.genre_pat, mode="seg", as_text=True)
        # Array to keep labels for diff
        gold = []
        for t in gold_feats:
            if "B-Conn" in t["label"]:
                gold.append("Seg=B-Conn")
            elif "I-Conn" in t["label"]:
                gold.append("Seg=I-Conn")
            else:
                gold.append("_")
        gold = np.asarray(gold)

        # Generate response conllu
        lines = conllu.split("\n")
        processed = []
        pred_labs = []
        i = 0
        for line in lines:
            if "\t" in line:
                fields = line.split("\t")
                if "-" in fields[0]:
                    processed.append(line)
                    continue
                else:
                    if preds[i] == 0:
                        pred = "_"
                    elif preds[i] == 1:
                        pred = "Seg=B-Conn"
                    else:
                        pred = "Seg=I-Conn"
                    pred_labs.append(pred)
                    fields[-1] = pred
                    processed.append("\t".join(fields))
                    i += 1
            else:
                processed.append(line)
        processed = "\n".join(processed) + "\n"

        score_dict = get_scores(conllu, processed, string_input=True)

        print("o Total tokens: " + str(score_dict["tok_count"]))
        print("o Gold " + score_dict["seg_type"] + ": " + str(score_dict["gold_seg_count"]))
        print("o Predicted " + score_dict["seg_type"] + ": " + str(score_dict["pred_seg_count"]))
        print("o Precision: " + str(score_dict["prec"]))
        print("o Recall: " + str(score_dict["rec"]))
        print("o F-Score: " + str(score_dict["f_score"]))

        if serialize:
            self.serialize(conllu, pred_labs)

        with io.open("diff.tab", 'w', encoding="utf8") as f:
            for i in range(len(pred_labs)):
                f.write("\t".join([toks[i], str(gold[i]), str(pred_labs[i])]) + "\n")

        return score_dict["f_score"]
    else:
        return preds
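# Hedged usage sketch (illustrative only; the instance construction is assumed,
# not defined in this excerpt):
#
#   # With eval_gold=True the method prints precision/recall/F-score from
#   # get_scores, writes a token-level diff to diff.tab, and returns the F-score:
#   f_score = detector.predict(gold_conllu_string, eval_gold=True, as_text=True)
#
#   # Without gold evaluation it returns the raw class vector
#   # (0 = "_", 1 = "Seg=B-Conn", 2 = "Seg=I-Conn"):
#   class_preds = detector.predict(conllu_string, as_text=True)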
def predict(self, infile, model_path=None, eval_gold=False, as_text=False):
    """
    Predict sentence splits using an existing model

    :param infile: File in DISRPT shared task *.tok or *.conll format (sentence breaks will be ignored in .conll)
    :param model_path: Pickled model file, default: models/<corpus>_ensemble_sent.pkl
    :param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
    :param as_text: Boolean, whether the input is a string, rather than a file name to read
    :return: tokenwise binary prediction vector if eval_gold is False, otherwise prints evaluation metrics and diff to gold
    """

    if model_path is None:  # Try default model location
        model_path = script_dir + os.sep + "models" + os.sep + self.corpus + "_ensemble_sent.pkl"

    clf, num_labels, cat_labels, multicol_dict, vocab, firsts, lasts = joblib.load(model_path)

    if as_text:
        conllu = infile
    else:
        conllu = io.open(infile, encoding="utf8").read()

    # tagged = udpipe_tag(conllu, self.udpipe_model)
    tagged = tt_tag(conllu, self.lang)

    train_feats, _, toks, _, _ = read_conll(tagged, genre_pat=self.genre_pat, mode="sent", as_text=True, char_bytes=self.lang == "zho")
    headers = sorted(list(train_feats[0].keys()))

    data = []

    preds = {}
    for e in self.estimators:
        pred = e.predict(tagged)
        _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
        headers.append(e.name + "_prob")

    genre_warning = False
    for i, item in enumerate(train_feats):
        item["first"] = item["word"][0] if item["word"][0] in firsts else "_"
        item["last"] = item["word"][-1] if item["word"][-1] in lasts else "_"
        if "genre" in cat_labels:
            if item["genre"] not in multicol_dict["encoder_dict"]["genre"].classes_:  # New genre not in training data
                if not genre_warning:
                    sys.stderr.write("! WARN: Genre not in training data: " + item["genre"] + "; suppressing further warnings\n")
                    genre_warning = True
                item["genre"] = "_"
        if "pos" in cat_labels:
            if item["pos"] not in multicol_dict["encoder_dict"]["pos"].classes_:
                item["pos"] = "_"
        if "cpos" in cat_labels:
            if item["cpos"] not in multicol_dict["encoder_dict"]["cpos"].classes_:
                item["cpos"] = "_"
        if item["word"] not in vocab and "word" in multicol_dict["encoder_dict"]:
            if item["pos"] in multicol_dict["encoder_dict"]["word"].classes_:
                item["word"] = item["pos"]
            else:
                item["word"] = "_"
        for e in self.estimators:
            item[e.name + "_prob"] = preds[e.name + "_prob"][i]
        feats = []
        for k in headers:
            feats.append(item[k])
        data.append(feats)

    data, headers, _, _ = self.n_gram(data, headers, [], [])

    data = pd.DataFrame(data, columns=headers)
    data_encoded = self.multicol_transform(data, columns=multicol_dict["columns"], all_encoders_=multicol_dict["all_encoders_"])

    data_x = data_encoded[cat_labels + num_labels].values
    pred = clf.predict(data_x)

    # Ensure first token in document is always a sentence break
    for i, x in enumerate(data_encoded["tok_id"].values):
        if x == 1:
            pred[i] = 1

    if eval_gold:
        gold_feats, _, _, _, _ = read_conll(conllu, genre_pat=self.genre_pat, mode="sent", as_text=True)
        gold = [int(t["wid"] == 1) for t in gold_feats]
        conf_mat = confusion_matrix(gold, pred)
        sys.stderr.write(str(conf_mat) + "\n")
        true_positive = conf_mat[1][1]
        false_positive = conf_mat[0][1]
        false_negative = conf_mat[1][0]
        prec = true_positive / (true_positive + false_positive)
        rec = true_positive / (true_positive + false_negative)
        f1 = 2 * prec * rec / (prec + rec)
        sys.stderr.write("P: " + str(prec) + "\n")
        sys.stderr.write("R: " + str(rec) + "\n")
        sys.stderr.write("F1: " + str(f1) + "\n")
        with io.open("diff.tab", 'w', encoding="utf8") as f:
            for i in range(len(gold)):
                f.write("\t".join([toks[i], str(gold[i]), str(pred[i])]) + "\n")
        return conf_mat, prec, rec, f1
    else:
        return pred
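# Hedged usage sketch (constructor arguments and file names are placeholders):
#
#   sentencer = EnsembleSentencer(...)   # hypothetical instantiation
#   # Binary sentence-break predictions for a .tok/.conll file:
#   breaks = sentencer.predict("corpus_dev.tok", as_text=False)
#   # Against a gold .conll file, returns (conf_mat, precision, recall, f1)
#   # and writes a token-level diff to diff.tab:
#   conf_mat, prec, rec, f1 = sentencer.predict("corpus_dev.conll", eval_gold=True, as_text=False)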
def train(self, training_file, rare_thresh=100, clf_params=None, model_path=None, chosen_feats=None,
          tune_mode=None, size=None, as_text=False, multitrain=True, chosen_clf=None):
    """
    Train the EnsembleSentencer. Note that the underlying estimators are assumed to be pretrained already.

    :param training_file: File in DISRPT shared task .conll format
    :param model_path: Path to dump pickled model to
    :param rare_thresh: Rank of rarest word to include (rarer items are replaced with POS)
    :param genre_pat: Regex pattern with capturing group to extract genre from document names
    :param as_text: Boolean, whether the input is a string, rather than a file name to read
    :return:
    """

    if tune_mode is not None and size is None and tune_mode != "hyperopt":
        size = 5000
        sys.stderr.write("o No sample size set - setting size to 5000\n")

    if not as_text:
        train = io.open(training_file, encoding="utf8").read().strip().replace("\r", "") + "\n"
    else:
        train = training_file

    if size is not None:
        train = shuffle_cut_conllu(train, size)

    # tagged = udpipe_tag(train, self.udpipe_model)
    tagged = tt_tag(train, self.lang, preserve_sent=True)

    if model_path is None:  # Try default model location
        model_path = script_dir + os.sep + "models" + os.sep + self.corpus + "_ensemble_sent.pkl"

    if clf_params is None:
        # Default classifier parameters
        # clf_params = {"n_estimators": 125, "min_samples_leaf": 1, "max_depth": 15, "max_features": None, "n_jobs": 4, "random_state": 42, "oob_score": True, "bootstrap": True}
        clf_params = {"n_estimators": 100, "min_samples_leaf": 1, "min_samples_split": 5, "max_depth": 10,
                      "max_features": None, "n_jobs": 4, "random_state": 42, "oob_score": True, "bootstrap": True}

    if chosen_clf is None:
        chosen_clf = RandomForestClassifier(n_jobs=4, oob_score=True, bootstrap=True)
        chosen_clf.set_params(**clf_params)

    cat_labels = ["word", "first", "last", "genre", "pos", "cpos"]
    num_labels = ["tok_len", "tok_id"]

    train_feats, vocab, toks, firsts, lasts = read_conll(tagged, genre_pat=self.genre_pat, mode="sent", as_text=True, char_bytes=self.lang == "zho")
    gold_feats, _, _, _, _ = read_conll(train, mode="sent", as_text=True)
    gold_feats = [{"wid": 0}] + gold_feats + [{"wid": 0}]  # Add dummies to gold

    # Ensure that "_" is in the possible values of first/last for OOV chars at test time
    oov_item = train_feats[-1]
    oov_item["first"] = "_"
    oov_item["last"] = "_"
    oov_item["lemma"] = "_"
    oov_item["word"] = "_"
    oov_item["pos"] = "_"
    oov_item["cpos"] = "_"
    oov_item["genre"] = "_"
    train_feats.append(oov_item)
    train_feats = [oov_item] + train_feats
    toks.append("_")
    toks = ["_"] + toks

    vocab = Counter(vocab)
    top_n_words = vocab.most_common(rare_thresh)
    top_n_words, _ = zip(*top_n_words)

    headers = sorted(list(train_feats[0].keys()))
    data = []

    preds = {}
    for e in self.estimators:
        if multitrain and e.name in ["LRSentencer", "DNNSentencer"]:
            pred = e.predict_cached(tagged)
        else:
            pred = e.predict(tagged)
        _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
        preds[e.name + "_prob"] = [0.0] + preds[e.name + "_prob"] + [0.0]  # Add dummy wrap for items -1 and +1
        headers.append(e.name + "_prob")
        num_labels.append(e.name + "_prob")

    for i, item in enumerate(train_feats):
        if item["word"] not in top_n_words:
            item["word"] = item["pos"]
        for e in self.estimators:
            item[e.name + "_prob"] = preds[e.name + "_prob"][i]
        feats = []
        for k in headers:
            feats.append(item[k])
        data.append(feats)

    data, headers, cat_labels, num_labels = self.n_gram(data, headers, cat_labels, num_labels)

    # No need for n_gram feats for the following:
    if "NLTKSentencer_prob_min1" in num_labels:
        num_labels.remove("NLTKSentencer_prob_min1")
        num_labels.remove("NLTKSentencer_prob_pls1")
    if "UDPipeSentencer_prob_min1" in num_labels:
        num_labels.remove("UDPipeSentencer_prob_min1")
        num_labels.remove("UDPipeSentencer_prob_pls1")
    if "LRSentencer_prob_min1" in num_labels:
        num_labels.remove("LRSentencer_prob_min1")
        num_labels.remove("LRSentencer_prob_pls1")
    if "RuleBasedSplitter_prob_min1" in num_labels:
        num_labels.remove("RuleBasedSplitter_prob_min1")
        num_labels.remove("RuleBasedSplitter_prob_pls1")
    if "DNNSentencer_prob_min1" in num_labels:
        num_labels.remove("DNNSentencer_prob_min1")
        num_labels.remove("DNNSentencer_prob_pls1")
    if "tok_id_min1" in num_labels:
        num_labels.remove("tok_id_min1")
        num_labels.remove("tok_id_pls1")
    if "genre_min1" in cat_labels:
        cat_labels.remove("genre_min1")
        cat_labels.remove("genre_pls1")

    # Use specific feature subset
    if chosen_feats is not None:
        new_cat = []
        new_num = []
        for feat in chosen_feats:
            if feat in cat_labels:
                new_cat.append(feat)
            elif feat in num_labels:
                new_num.append(feat)
        cat_labels = new_cat
        num_labels = new_num

    data = pd.DataFrame(data, columns=headers)
    data_encoded, multicol_dict = self.multicol_fit_transform(data, pd.Index(cat_labels))

    data_x = data_encoded[cat_labels + num_labels].values
    data_y = [int(t["wid"] == 1) for t in gold_feats]

    sys.stderr.write("o Learning...\n")

    if tune_mode is not None:
        # Randomize samples for training
        data_x = data_encoded[cat_labels + num_labels + ["label"]].sample(frac=1, random_state=42)
        data_y = np.where(data_x["label"] == "_", 0, 1)
        data_x = data_x[cat_labels + num_labels]

        # Reserve 10% for validation
        val_x = data_x[int(len(data_y) / 9):]
        val_y = data_y[int(len(data_y) / 9):]
        data_x = data_x[:int(len(data_y) / 9)]
        data_y = data_y[:int(len(data_y) / 9)]

    if tune_mode == "importances":
        sys.stderr.write("o Measuring correlation of categorical variables\n")
        theil_implications = report_theils_u(val_x, cat_labels)
        for (var1, var2) in theil_implications:
            if var1 in cat_labels and var2 in cat_labels:
                drop_var = var2
                u = theil_implications[(var1, var2)]
                sys.stderr.write("o Removed feature " + drop_var + " due to Theil's U " + str(u)[:6] + " of " + var1 + "->" + var2 + "\n")
                cat_labels.remove(drop_var)

        sys.stderr.write("o Measuring correlation of numerical variables\n")
        cor_mat = report_correlations(val_x[num_labels], thresh=0.95)
        for (var1, var2) in cor_mat:
            if var1 in num_labels and var2 in num_labels:
                drop_var = var2
                corr_level = cor_mat[(var1, var2)]
                sys.stderr.write("o Removed feature " + drop_var + " due to correlation " + str(corr_level) + " of " + var1 + ":" + var2 + "\n")
                num_labels.remove(drop_var)

        return cat_labels, num_labels

    if tune_mode in ["paramwise", "full"]:
        best_params = {}
        # Tune individual params separately for speed, or do complete grid search if building final model
        params_list = [{"n_estimators": [100, 125, 150]},
                       {"max_depth": [10, 15, 20, None]},
                       {"min_samples_split": [5, 10, 15]},
                       {"min_samples_leaf": [1, 2, 3]},
                       {"max_features": [None, "sqrt", "log2"]}]
        if tune_mode == "full":
            # Flatten dictionary if doing full CV
            params_list = [{k: v for d in params_list for k, v in d.items()}]
        for params in params_list:
            base_params = copy.deepcopy(clf_params)  # Copy default params
            for p in params:
                if p in base_params:  # Ensure base_params don't conflict with grid search params
                    base_params.pop(p)
            grid = GridSearchCV(RandomForestClassifier(**base_params), params, cv=3, n_jobs=4, error_score="raise", refit=False)
            grid.fit(data_x, data_y)
            for param in params:
                best_params[param] = grid.best_params_[param]
        with io.open("best_params.tab", 'a', encoding="utf8") as bp:
            corpus = os.path.basename(training_file).split("_")[0]
            best_clf = RandomForestClassifier(**best_params)
            clf_name = best_clf.__class__.__name__
            for k, v in best_params.items():
                bp.write("\t".join([corpus, clf_name, k, str(v)]))
                bp.write("\n")
        return best_clf, best_params
    elif tune_mode == "hyperopt":
        from hyperopt import hp
        from hyperopt.pyll.base import scope
        space = {
            'n_estimators': scope.int(hp.quniform('n_estimators', 50, 150, 10)),
            'max_depth': scope.int(hp.quniform('max_depth', 5, 30, 1)),
            'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
            'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
            'max_features': hp.choice('max_features', ["sqrt", None, 0.5, 0.7, 0.9]),
            'clf': hp.choice('clf', ["rf", "et", "gbm"])
        }
        # space = {
        #     'n_estimators': scope.int(hp.quniform('n_estimators', 50, 150, 10)),
        #     'max_depth': scope.int(hp.quniform('max_depth', 3, 30, 1)),
        #     'eta': scope.float(hp.quniform('eta', 0.01, 0.2, 0.01)),
        #     'gamma': scope.float(hp.quniform('gamma', 0.01, 0.2, 0.01)),
        #     'colsample_bytree': hp.choice('colsample_bytree', [0.4, 0.5, 0.6, 0.7, 1.0]),
        #     'subsample': hp.choice('subsample', [0.5, 0.6, 0.7, 0.8, 1.0]),
        #     'clf': hp.choice('clf', ["xgb"])
        # }
        best_clf, best_params = hyper_optimize(data_x, data_y, cat_labels=cat_labels, space=space, max_evals=50)
        return best_clf, best_params
    else:
        clf = chosen_clf
        clf.set_params(**clf_params)
        if clf.__class__.__name__ in ["RandomForestClassifier", "ExtraTreesClassifier", "XGBClassifier"]:
            clf.set_params(**{"n_jobs": 3, "random_state": 42, "oob_score": True, "bootstrap": True})
        else:
            clf.set_params(**{"random_state": 42})
        clf.fit(data_x, data_y)

    feature_names = cat_labels + num_labels

    zipped = zip(feature_names, clf.feature_importances_)
    sorted_zip = sorted(zipped, key=lambda x: x[1], reverse=True)
    sys.stderr.write("o Feature importances:\n\n")
    for name, importance in sorted_zip:
        sys.stderr.write(name + "=" + str(importance) + "\n")

    if hasattr(clf, "oob_score_"):
        sys.stderr.write("\no OOB score: " + str(clf.oob_score_) + "\n")

    sys.stderr.write("\no Serializing model...\n")

    joblib.dump((clf, num_labels, cat_labels, multicol_dict, top_n_words, firsts, lasts), model_path, compress=3)
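# Hedged usage sketch for training (constructor arguments and paths are
# placeholders; the EnsembleSentencer name comes from the docstring above):
#
#   sentencer = EnsembleSentencer(...)   # hypothetical instantiation
#   # Default run: fits the chosen classifier and dumps it to
#   # models/<corpus>_ensemble_sent.pkl via joblib:
#   sentencer.train("eng.rst.gum_train.conll", as_text=False)
#   # With tune_mode="paramwise"/"full"/"hyperopt" the method instead returns
#   # (best_clf, best_params); with tune_mode="importances" it returns the
#   # pruned (cat_labels, num_labels) feature lists.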
def predict(self, conllu, eval_gold=False, as_text=True, serialize=False):
    """
    Predict segment boundaries using an existing model

    :param conllu: File in DISRPT shared task *.conll format
    :param eval_gold: Whether to score the prediction; only applicable if using a gold .conll file as input
    :param as_text: Boolean, whether the input is a string, rather than a file name to read
    :param serialize: Whether to serialize prediction as a .conll file
    :return: tokenwise binary prediction vector if eval_gold is False, otherwise prints evaluation metrics and diff to gold
    """

    clf, num_labels, cat_labels, multicol_dict, vocab, firsts, lasts = joblib.load(self.model)
    self.clf = clf

    if not as_text:
        conllu = io.open(conllu, encoding="utf8").read()

    train_feats, _, toks, _, _ = read_conll(conllu, genre_pat=self.genre_pat, mode="seg", as_text=True)
    headers = sorted(list(train_feats[0].keys()))

    data = []

    preds = {}
    for e in self.estimators:
        pred = e.predict(conllu)
        _, preds[e.name + "_prob"] = [list(x) for x in zip(*pred)]
        headers.append(e.name + "_prob")

    temp = []
    headers_with_oov = ["deprel", "pos", "cpos", "morph", "s_type", "depchunk"]
    for pref in ["min1", "pls1"]:
        temp += [pref + "_" + h for h in headers_with_oov]
    headers_with_oov += temp

    genre_warning = False
    for i, header in enumerate(headers):
        if header in headers_with_oov and header in cat_labels:
            for item in train_feats:
                if item[header] not in multicol_dict["encoder_dict"][header].classes_:
                    item[header] = "_"

    for i, item in enumerate(train_feats):
        item["first"] = item["word"][0] if item["word"][0] in firsts else "_"
        item["last"] = item["word"][-1] if item["word"][-1] in lasts else "_"
        if "genre" in cat_labels:
            if item["genre"] not in multicol_dict["encoder_dict"]["genre"].classes_:  # New genre not in training data
                if not genre_warning:
                    sys.stderr.write("! WARN: Genre not in training data: " + item["genre"] + "; suppressing further warnings\n")
                    genre_warning = True
                item["genre"] = "_"
        if item["word"] not in vocab:
            if item["pos"] in multicol_dict["encoder_dict"]["word"].classes_:
                item["word"] = item["pos"]
            else:
                item["word"] = "_"
        for e in self.estimators:
            item[e.name + "_prob"] = preds[e.name + "_prob"][i]
        feats = []
        for k in headers:
            feats.append(item[k])
        data.append(feats)

    data, headers, _, _ = self.n_gram(data, headers, [], [])

    data = pd.DataFrame(data, columns=headers)
    data_encoded = self.multicol_transform(data, columns=multicol_dict["columns"], all_encoders_=multicol_dict["all_encoders_"])

    data_x = data_encoded[cat_labels + num_labels].values
    pred = clf.predict(data_x)

    if serialize:
        self.serialize(conllu, pred)

    if eval_gold:
        gold_feats, _, _, _, _ = read_conll(conllu, genre_pat=self.genre_pat, mode="seg", as_text=True)
        gold = [int(t["label"] != "_") for t in gold_feats]
        conf_mat = confusion_matrix(gold, pred)
        sys.stderr.write(str(conf_mat) + "\n")
        true_positive = conf_mat[1][1]
        false_positive = conf_mat[0][1]
        false_negative = conf_mat[1][0]
        prec = true_positive / (true_positive + false_positive)
        rec = true_positive / (true_positive + false_negative)
        f1 = 2 * prec * rec / (prec + rec)
        sys.stderr.write("P: " + str(prec) + "\n")
        sys.stderr.write("R: " + str(rec) + "\n")
        sys.stderr.write("F1: " + str(f1) + "\n")
        with io.open("diff.tab", 'w', encoding="utf8") as f:
            for i in range(len(gold)):
                f.write("\t".join([toks[i], str(gold[i]), str(pred[i])]) + "\n")
        return conf_mat, prec, rec, f1
    else:
        return pred
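# Hedged usage sketch (instance construction assumed, not shown in this excerpt):
#
#   segs = segmenter.predict(conllu_string, as_text=True)   # binary 0/1 per token
#   # With a gold .conll file, returns (conf_mat, precision, recall, f1)
#   # and writes a token-level diff to diff.tab:
#   conf_mat, prec, rec, f1 = segmenter.predict(conllu_string, eval_gold=True, as_text=True)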