import getopt
import json
import sys

import eli5
import joblib
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

# Wordsegmenter, VectorModels, extract_features, load_vocab, load_ner_dict
# and save_fit_result are assumed to be defined elsewhere in this module.


def main(vectorModels):
    feature_dict = {}
    action = None
    save_in_path = ""
    train_in_path = ""
    opts, args = getopt.getopt(sys.argv[1:], "a:", ["save_in_path=", "train_in_path="])
    for op, value in opts:
        if op == "-a":
            action = value
        elif op == "--save_in_path":
            save_in_path = value
        elif op == "--train_in_path":
            train_in_path = value

    if action == "save" and save_in_path:
        with open(save_in_path, 'r') as f:
            texts = f.readlines()
        save_fit_result(texts)

    if action == "train" and train_in_path:
        wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
        in_path = "./data/yao_test_data.txt"
        data = pd.read_csv(in_path, sep="\t", dtype='str',
                           names=['qid', 'ql', 'qr', 'label'])
        vectorModels = VectorModels()  # re-created here; the parameter is unused
        count_tfidf_hash_features = data[['ql', 'qr']].apply(
            lambda row: extract_features(wordseg, row['ql'], row['qr'], vectorModels),
            axis=1)
        feature_dict.update(count_tfidf_hash_features)
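# Assumed CLI usage for main(), per the getopt spec above (the script and
# data file names are illustrative, not from the repo):
#
#   python paraphrase.py -a save  --save_in_path ./data/raw_texts.txt
#   python paraphrase.py -a train --train_in_path ./data/pred_input.txt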
def explain(model_path):
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"
    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path, binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)
    tfidf_count_hash_vectorModels = VectorModels()
    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    model = joblib.load(model_path)
    pd.set_option('display.max_rows', None)

    # Global view: per-feature weights of the trained model.
    explanation = eli5.explain_weights(model, top=None)
    print eli5.format_as_text(explanation)

    column_names = ["qid", "ql", "qr"]
    reader = pd.read_csv(sys.stdin, sep="\t", dtype="str", names=column_names, chunksize=1)
    feature_extractor = lambda row: extract_features(
        wordseg, row["ql"], row["qr"], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model, ner_dict, syn_dict)
    for data in reader:
        data.fillna("", inplace=True)
        X = data[["ql", "qr"]].apply(feature_extractor, axis=1)
        X_features = X.apply(pd.Series)
        y_preds = model.predict_proba(X_features, ntree_limit=model.best_ntree_limit)
        y_preds = map(lambda o: o[1], y_preds)  # probability of the positive class
        data = pd.concat([data, X_features], axis=1)
        data = data.assign(predict=y_preds)
        data.to_csv(sys.stdout, header=False, sep="\t")

        # Local view: explanation for the first example of this chunk.
        explanation = eli5.explain_prediction(model, X_features.iloc[0])
        print eli5.format_as_text(explanation)
        print X_features.iloc[0]
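# A minimal, self-contained sketch (not part of the original pipeline) of the
# two eli5 calls used in explain() above, on a toy scikit-learn model, so the
# global-vs-local distinction is easy to try in isolation; the feature names
# here are made up:
def _eli5_demo():
    from sklearn.linear_model import LogisticRegression
    toy_X = pd.DataFrame({"len_diff": [0, 3, 1, 4], "jaccard": [0.9, 0.1, 0.8, 0.2]})
    toy_y = [1, 0, 1, 0]
    clf = LogisticRegression().fit(toy_X, toy_y)
    print eli5.format_as_text(eli5.explain_weights(clf))                    # global weights
    print eli5.format_as_text(eli5.explain_prediction(clf, toy_X.iloc[0]))  # one example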
def extract():
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"
    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path, binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)
    tfidf_count_hash_vectorModels = VectorModels()
    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    for line in sys.stdin:
        line = line.strip("\r\n")
        parts = line.split("\t")
        ql = parts[1]
        qr = parts[2]
        feature_dict = extract_features(wordseg, ql, qr,
                                        tfidf_count_hash_vectorModels,
                                        sent_word2vec, sent_vocab_dict, sent_model,
                                        ner_dict, syn_dict)
        print "{}\t{}".format(line, json.dumps(feature_dict))
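# Assumed invocation for extract(): stream TSV rows of "qid<TAB>ql<TAB>qr"
# on stdin and get each row echoed back with a JSON feature dict appended
# (script and file names are illustrative):
#
#   cat ./data/pairs.tsv | python paraphrase.py > ./data/pairs.features.tsv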
def process():
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    for line in sys.stdin:
        line = line.strip('\r\n')
        parts = line.split('\t')
        ql = parts[1]
        qr = parts[2]
        # Note: unlike extract(), this older variant passes only the segmenter
        # and the two queries; the vector-model arguments are assumed optional.
        feature_dict = extract_features(wordseg, ql, qr)
        print "{}\t{}".format(line, json.dumps(feature_dict))
def train():
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    in_path = "./data/paraphrase_man_annotation.txt"
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"
    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path, binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)
    # Load the pre-fitted tfidf/count/hash vectorizer models.
    tfidf_count_hash_vectorModels = VectorModels()

    data = pd.read_csv(in_path, sep="\t", dtype='str', names=['qid', 'ql', 'qr', 'label'])
    X = data[['ql', 'qr']].apply(lambda row: extract_features(
        wordseg, row['ql'], row['qr'], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model), axis=1)
    print("extracted feature vectors for all rows")
    X = pd.DataFrame(list(X))
    y = data['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    model = LinearSVC()
    model.fit(X_train, y_train)

    model_path = "./model/paraphrase.svm_model"
    joblib.dump(model, model_path)

    y_preds = model.predict(X_test)
    mean_f1 = f1_score(y_test, y_preds, average='micro')
    print mean_f1
    print classification_report(y_test, y_preds, target_names=["paraphrase", "other"])

    # Per-feature weights of the linear model, one "name<TAB>weight" per line.
    feature_names = X.columns.values.tolist()
    for feature_name, coef in zip(feature_names, model.coef_.ravel()):
        print "%s\t%f" % (feature_name, coef)
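# Hedged sketch (not in the original file): load the LinearSVC dumped by
# train() and score a single new pair with the same feature pipeline.
# predict_pair and its argument order are hypothetical helpers.
def predict_pair(wordseg, ql, qr, vector_models,
                 sent_word2vec, sent_vocab_dict, sent_model):
    model = joblib.load("./model/paraphrase.svm_model")
    feats = extract_features(wordseg, ql, qr, vector_models,
                             sent_word2vec, sent_vocab_dict, sent_model)
    X = pd.DataFrame([feats])
    return model.predict(X)[0]  # predicted label for the pair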
if action == "save": if train_in_path: f = open(train_in_path,'r') texts = f.readlines() f.close() save_fit_result(texts) if action == "train": if pred_in_path: wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini") in_path = "./data/yao_test_data.txt" data = pd.read_csv(in_path, sep="\t", dtype='str', names=['qid', 'ql', 'qr', 'label']) vectorModels = VectorModels() count_tfidf_hash_features = data[['ql', 'qr']].apply(lambda row: extract_features(wordseg, row['ql'], row['qr'], vectorModels), axis=1) feature_dict.update(count_tfidf_hash_features) if __name__ == "__main__": #main() wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini") in_path = "./data/yao_test_data.txt" data = pd.read_csv(in_path, sep="\t", dtype='str', names=['qid', 'ql', 'qr', 'label']) vectorModels = VectorModels() count_tfidf_hash_features = data[['ql', 'qr']].apply(lambda row: extract_features(wordseg, row['ql'], row['qr'], vectorModels), axis=1) for k in count_tfidf_hash_features: print(k)