Example #1
import getopt
import sys

import pandas as pd


def main():
    feature_dict = {}

    action = None
    save_in_path = ""
    train_in_path = ""
    opts, args = getopt.getopt(sys.argv[1:], "a:",
                               ["save_in_path=", "train_in_path="])
    for op, value in opts:
        if op == "-a":
            action = value
        if op == "--save_in_path":
            save_in_path = value
        if op == "--train_in_path":
            train_in_path = value

    if action == "save":
        if save_in_path:
            # Read the training texts and hand them to save_fit_result.
            with open(save_in_path, 'r') as f:
                texts = f.readlines()
            save_fit_result(texts)

    if action == "train":
        if train_in_path:
            wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
            in_path = "./data/yao_test_data.txt"
            data = pd.read_csv(in_path, sep="\t", dtype='str',
                               names=['qid', 'ql', 'qr', 'label'])
            vectorModels = VectorModels()
            count_tfidf_hash_features = data[['ql', 'qr']].apply(
                lambda row: extract_features(wordseg, row['ql'], row['qr'],
                                             vectorModels),
                axis=1)
            feature_dict.update(count_tfidf_hash_features)
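A minimal sketch of driving this entry point in-process; only the -a, --save_in_path, and --train_in_path flags come from the getopt call above, while the script name and the input path are assumptions:

# Hypothetical driver, equivalent to running:
#   python <script>.py -a save --save_in_path ./data/train.txt
import sys

sys.argv = ["prog", "-a", "save", "--save_in_path", "./data/train.txt"]
main()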
Example #2
import sys

import eli5
import joblib
import pandas as pd
from gensim.models import KeyedVectors


def explain(model_path):
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"

    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path,
                                                      binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    tfidf_count_hash_vectorModels = VectorModels()

    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    model = joblib.load(model_path)

    pd.set_option('display.max_rows', None)

    # Global feature weights of the trained model.
    explanation = eli5.explain_weights(model, top=None)
    print(eli5.format_as_text(explanation))

    column_names = ["qid", "ql", "qr"]
    reader = pd.read_csv(sys.stdin,
                         sep="\t",
                         dtype="str",
                         names=column_names,
                         chunksize=1)
    feature_extractor = lambda row: extract_features(
        wordseg, row["ql"], row["qr"], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model, ner_dict, syn_dict)
    for data in reader:
        data.fillna("", inplace=True)

        X = data[["ql", "qr"]].apply(feature_extractor, axis=1)
        X_features = X.apply(pd.Series)
        feature_names = X_features.columns.values.tolist()
        X_features = X_features[feature_names]
        y_preds = model.predict_proba(X_features,
                                      ntree_limit=model.best_ntree_limit)
        # Keep the probability of the positive class.
        y_preds = [o[1] for o in y_preds]
        data = pd.concat([data, X_features], axis=1)
        data = data.assign(predict=y_preds)

        data.to_csv(sys.stdout, header=False, sep="\t")
        # Per-row explanation of the current prediction.
        explanation = eli5.explain_prediction(model, X_features.iloc[0])
        print(eli5.format_as_text(explanation))
        print(X_features.iloc[0])
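The two eli5 calls above (explain_weights for the model's global feature importances, explain_prediction for a single row) can be tried in isolation. A minimal, self-contained sketch on a toy LogisticRegression, since the real XGBoost model and its features are not reproducible here:

# Toy demonstration of eli5.explain_weights / eli5.explain_prediction;
# the data and the LogisticRegression model are illustrative only.
import eli5
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([1, 0, 1, 0])
clf = LogisticRegression().fit(X, y)

# Global weights, formatted as plain text (same pattern as above).
print(eli5.format_as_text(eli5.explain_weights(clf, top=None)))
# Explanation of a single prediction.
print(eli5.format_as_text(eli5.explain_prediction(clf, X[0])))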
Example #3
import json
import sys

import joblib
from gensim.models import KeyedVectors


def extract():
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")

    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"

    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path,
                                                      binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    tfidf_count_hash_vectorModels = VectorModels()

    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    for line in sys.stdin:
        line = line.strip("\r\n")
        parts = line.split("\t")
        ql = parts[1]
        qr = parts[2]
        feature_dict = extract_features(wordseg, ql, qr,
                                        tfidf_count_hash_vectorModels,
                                        sent_word2vec, sent_vocab_dict,
                                        sent_model, ner_dict, syn_dict)

        # Echo the input line with the features appended as JSON.
        print("{}\t{}".format(line, json.dumps(feature_dict)))
Example #4
import json
import sys


def process():
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    for line in sys.stdin:
        line = line.strip('\r\n')
        parts = line.split('\t')
        ql = parts[1]
        qr = parts[2]
        feature_dict = extract_features(ql, qr)
        # Echo the input line with the features appended as JSON.
        print("{}\t{}".format(line, json.dumps(feature_dict)))
Example #5
import joblib
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


def train():
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    in_path = "./data/paraphrase_man_annotation.txt"

    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"

    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path,
                                                      binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    # Load the pre-fitted tf-idf, count, and hash vectorizer models.
    tfidf_count_hash_vectorModels = VectorModels()

    data = pd.read_csv(in_path,
                       sep="\t",
                       dtype='str',
                       names=['qid', 'ql', 'qr', 'label'])

    X = data[['ql', 'qr']].apply(lambda row: extract_features(
        wordseg, row['ql'], row['qr'], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model),
                                 axis=1)
    print("extracted all feature vectors")
    X = pd.DataFrame(list(X))
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)
    model = LinearSVC()

    model.fit(X_train, y_train)

    for x in model.coef_[0]:
        print(x)

    model_path = "./model/paraphrase.svm_model"
    joblib.dump(model, model_path)
    y_preds = model.predict(X_test)
    mean_f1 = f1_score(y_test, y_preds, average='micro')
    print(mean_f1)
    print(classification_report(y_test,
                                y_preds,
                                target_names=["paraphrase", "other"]))

    feature_names = X.columns.values.tolist()
    for feature_name, coef in zip(feature_names, model.coef_.ravel()):
        print("%s\t%f" % (feature_name, coef))
Example #6
            
if __name__ == "__main__":
    #main()

    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    in_path = "./data/yao_test_data.txt"
    data = pd.read_csv(in_path, sep="\t", dtype='str',
                       names=['qid', 'ql', 'qr', 'label'])
    vectorModels = VectorModels()
    count_tfidf_hash_features = data[['ql', 'qr']].apply(
        lambda row: extract_features(wordseg, row['ql'], row['qr'],
                                     vectorModels),
        axis=1)

    # Print one feature dict per input row.
    for k in count_tfidf_hash_features:
        print(k)