Example #1
def main(path):
    train,test,wordVocab = loaddata(path)

    maxSentLen = 140
    train_sent = pad_sequences(train["sent"],maxlen=maxSentLen,padding="post")
    train_gen = pad_sequences(train["gen"],maxlen=5,padding="post")
    train_disease = pad_sequences(train["disease"],maxlen=5,padding="post")
    test_sent = pad_sequences(test["sent"],maxlen=maxSentLen,padding="post")
    test_gen = pad_sequences(test["gen"],maxlen=5,padding="post")
    test_disease = pad_sequences(test["disease"],maxlen=5,padding="post")
    train_rel = np.array(train["rel"])
    test_rel = np.array(test["rel"])

    print(train_rel.shape)
    print(test_rel.shape)

    # load pre-embedding
    print("loading embedding...")
    embedding_size = 200
    embeddingVocab_size = len(wordVocab)

    # w2v_dir_path = "/media/network/watching_dog/embedding/bio_nlp_vec/PubMed-shuffle-win-30.bin"
    w2v_dir_path = "/media/kazane/watching_dog/embedding/bio_nlp_vec/PubMed-shuffle-win-30.bin"

    word2vec = KeyedVectors.load_word2vec_format(w2v_dir_path, binary=True, unicode_errors='ignore')

    print("build embedding weights...")
    embedding_weights = np.zeros((embeddingVocab_size + 1, embedding_size))
    unknown_words = []
    known_words = []
    for word, index in wordVocab.items():
        try:
            embedding_weights[index, :] = word2vec[word.lower()]
            known_words.append(word)
        except KeyError:
            # the word has no pre-trained vector, so initialise it randomly
            unknown_words.append(word)
            embedding_weights[index, :] = np.random.uniform(-0.025, 0.025, embedding_size)
    print("unknown ratio: ", len(unknown_words) / embeddingVocab_size, " unknown words: ", len(unknown_words),
          " vocab_size: ", embeddingVocab_size)

    model = myModel(sent_lenth=maxSentLen,word_embedding=embedding_weights)
    model.attn(embedding=embedding_weights)

    model.train(inputs=[train_sent,train_gen,train_disease,],
                label=[train_rel,],
                save_path="./outputs/checkpoint",
                validation_split=0.1,
                batch_size=128,
                epochs=5,
                )
    y_ = model.predict([test_sent,test_gen,test_disease])
    # np.save(file=path+"/"+"outputs.npy",arr=y_)

    is_gate = False
    if is_gate is False:
        y_ = get_Result(y_,)
        get_F1Value(y_=y_,y=test_rel,path=path)
    else:
        get_ROC(y_,test_rel,path=path,gate=100)
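
The snippet above only builds the embedding matrix; how myModel consumes it is not shown. As a hedged illustration (assuming a Keras-style model, which the pad_sequences/train/predict calls suggest), a pre-trained matrix like embedding_weights is typically wired into an Embedding layer as follows; the vocabulary size here is a placeholder:

import numpy as np
import tensorflow as tf

embedding_size = 200
vocab_size = 10000                      # stand-in for embeddingVocab_size + 1
max_sent_len = 140                      # matches maxSentLen above

# stand-in matrix; in the example above this is the embedding_weights array
embedding_weights = np.random.uniform(-0.025, 0.025, (vocab_size, embedding_size))

sent_in = tf.keras.layers.Input(shape=(max_sent_len,), dtype="int32")
embedded = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_size,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_weights),
    trainable=False,                    # keep the pre-trained vectors frozen
)(sent_in)
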
Example #2
def testmodel(filepath):
    """
    Predict labels from the extracted image features and report, for each
    image, whether the true tag is among the top-5 predicted classes.
    """
    print("test model....")
    images, tags = loaddata(filepath)
    feature_info = extractfeature(images, tags)

    x = [info[0] for info in feature_info]
    y = [info[1] for info in feature_info]

    clf = joblib.load('train_model.m')
    samples_proba = clf.predict_proba(x)  # predict the test images probability
    top5_index = np.argsort(-samples_proba, axis=1)[:, :5].tolist()
    res = []
    for (i, tag) in enumerate(y):
        res.append(tag in top5_index[i])
    return res
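
One caveat, added here as a hedged note: predict_proba columns follow clf.classes_, so the argsorted column positions above equal the actual labels only when the classes happen to be 0..N-1. A small sketch of the label-safe variant:

import numpy as np

def top5_hits(clf, x, y_true):
    proba = clf.predict_proba(x)                  # (n_samples, n_classes)
    top5_pos = np.argsort(-proba, axis=1)[:, :5]  # column positions, best first
    top5_labels = clf.classes_[top5_pos]          # map positions to real labels
    return [tag in labels for tag, labels in zip(y_true, top5_labels)]
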
Example #3
def testmodel(path):
    def getFeatVec(features, clf):
        featVec = np.zeros((1, 1000))
        res = clf.predict(features)
        for i in res:
            featVec[0][i] += 1
        return featVec

    def result(predict_y, test_y, num):
        print("###############################")
        right = 0
        res = []
        for i, tag in enumerate(test_y):
            # predict_y rows are ascending argsorts over the 21 class columns,
            # so the last `num` positions hold the top-`num` predictions
            if tag in predict_y[i][21 - num:21]:
                right += 1
                res.append(True)
            else:
                res.append(False)
        print("%d/%d = %f" % (right, len(test_y), right * 1.0 / len(test_y)))
        print("###############################")
        return res

    print('load model')
    svm = joblib.load("./05svm.model")
    clf = joblib.load("./05vocab.pkl")
    data, tags = loaddata(path)
    features, tags = extractfeature(data, tags)

    print('predict')
    test_x = np.float32([]).reshape(0, 1000)
    for feature in features:
        featVec = getFeatVec(feature, clf)
        test_x = np.append(test_x, featVec, axis=0)
    p = svm.predict_proba(test_x)
    p = p.argsort()
    res = result(p, tags, 5)
    return res
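
The pickled 05vocab.pkl above is used as a 1000-word visual vocabulary for getFeatVec. A hedged sketch of how such a vocabulary is usually built on the training side (an assumption; the original trainmodel code is not shown here):

import numpy as np
import joblib
from sklearn.cluster import KMeans

def build_vocab(descriptor_lists, n_words=1000):
    # stack the local descriptors of all training images and cluster them;
    # each cluster centre becomes one "visual word" counted by getFeatVec
    stacked = np.vstack(descriptor_lists)
    vocab = KMeans(n_clusters=n_words).fit(stacked)
    joblib.dump(vocab, "./05vocab.pkl")
    return vocab
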
Example #4
def testmodel(path):
    print("[test] start")
    data, tags = loaddata(path)

    feats, types = extractfeature(data, tags)

    svm = joblib.load("svm.pkl")
    pca = joblib.load("pca.pkl")
    normalizer = joblib.load("normalizer.pkl")

    feats = pca.transform(feats)
    feats = normalizer.transform(feats)

    probability = svm.predict_proba(feats)
    args = probability.argsort(axis=1)
    re = []
    for i in range(args.shape[0]):
        if int(types[i]) in args[i][-5:]:
            re.append("True")
        else:
            re.append("False")

    print("[test] end")
    return re
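
A possible simplification, offered as a sketch rather than the original design: the three pickled objects could be fitted and saved as a single scikit-learn Pipeline so the PCA, Normalizer and SVM stages cannot get out of sync (the PCA size below is a placeholder):

import joblib
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC

pipeline = make_pipeline(PCA(n_components=128), Normalizer(), SVC(probability=True))
# pipeline.fit(train_feats, train_types)
# joblib.dump(pipeline, "svm_pipeline.pkl")
# probability = pipeline.predict_proba(feats)
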
Example #5
import pandas as pd
import numpy as np
from load import loaddata
from scipy import stats
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

data = loaddata('train.csv', 0.05)

X = data.loc[:, 'Feature_1':'Ret_120'].values
y = data.loc[:, 'Ret_121':'Ret_PlusTwo'].values

X_feature = data.loc[:, 'Feature_1':'Feature_25'].values
X_mins = data.loc[:, 'Ret_2':'Ret_120'].values
X_ret = data.loc[:, 'Ret_MinusTwo':'Ret_MinusOne'].values

daily_weights = data['Weight_Daily'].values
intraday_weights = data['Weight_Intraday'].values

#print(X.shape, y.shape)

imp = Imputer(missing_values='NaN', strategy="mean", axis=0)
X_imp = imp.fit_transform(X)
X_featureimp = imp.fit_transform(X_feature)
X_minsimp = imp.fit_transform(X_mins)
X_retimp = imp.fit_transform(X_ret)
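
Note that sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; a minimal equivalent of the column-wise mean imputation above with the current API looks like this:

import numpy as np
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy="mean")
# X_imp = imp.fit_transform(X)            # same effect as the axis=0 Imputer
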
Example #6
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import random
import tensorflow as tf
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from load import loaddata

#-------------------------------------------------------------------------------
# read data
#-------------------------------------------------------------------------------
data = loaddata('train.csv', 1.0)
features = data.loc[:, 'Feature_1':'Feature_25'].values
rets = data.loc[:, 'Ret_MinusTwo':'Ret_PlusTwo'].values
daily_weights = data['Weight_Daily'].values
intraday_weights = data['Weight_Intraday'].values

test_data = loaddata('test_2.csv', 3.0)
test_features = test_data.loc[:, 'Feature_1':'Feature_25'].values
test_rets = test_data.loc[:, 'Ret_MinusTwo':'Ret_120'].values
print("test rets shape ", test_rets.shape)

#-------------------------------------------------------------------------------
# preprocess
#-------------------------------------------------------------------------------
imp_axis0 = Imputer(missing_values='NaN', strategy="mean", axis=0)
imp_axis1 = Imputer(missing_values='NaN', strategy="mean", axis=1)
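
The axis=1 imputer above fills gaps with the row mean. SimpleImputer works per column only, so the same effect can be sketched by transposing before and after:

import numpy as np
from sklearn.impute import SimpleImputer

def impute_row_mean(X):
    # impute each row of X with its own mean by imputing the columns of X.T
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    return imp.fit_transform(X.T).T
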
Example #7
def main():

    # only read raw data if so required (cleaned files do not exist yet)
    if READ_RAW_DATA:

        # train set
        dataset = '../../data/training_set_VU_DM.csv'

        # test set (turn off relevance score in this case!)
        #dataset = '../../data/test_set_VU_DM.csv'

        # take the first 1000 lines of the dataset only - use this for testing
        # to make the code less slow! Comment it out for finalizing
        # dataset = '../data/testfile.csv'

        # loading in the right file
        data = load.loaddata(dataset)

        # # create competitor features
        data = features.create_competitor_features(data)

        # # create other features
        data = features.other_features(data)

        # # add relevance grades
        data = features.relevance_score(data)

        # remove outliers
        data = eda.remove_outliers(data)

        # # handling missing values
        data = eda.missing_values(data)

        if PLOT:

            # take a sample of the data to make plotting feasible
            sample_data = data.sample(n=500000)

            # plot distributions
            eda.plot_distributions(sample_data)

            # plot correlations between sets of variables
            eda.plot_correlations(sample_data)

            # plot impact of price of competitor on booking
            eda.plot_competitor_price_impact(sample_data)

            # get correlations of the features
            correlations.show_correlations(sample_data)

        # divide data into train and test set (and save these)
        train_data, test_data = process.split_train_test(data)

        # downsample data to create class balance (and save)
        downsampled_train_data = process.downsample(train_data)

        # upsample data to create class balance (and save it)
        upsampled_train_data = process.upsample(train_data)


    # data is already loaded - only need to load it from file
    # test for the best set of hyperparameters
    if HYPERPARAM:

        # get the appropriate training set
        if SAMPLING_METHOD == "downsample":

            traindataset = '../data/downsampled_crossvalidation_set.csv'

        elif SAMPLING_METHOD == "upsample":

            traindataset = "../data/upsampled_crossvalidation_set.csv"

        elif SAMPLING_METHOD == "none":

            traindataset = "../data/full_crossvalidation_set.csv"

        # loading in the data
        train_data = load.loaddata(traindataset)

        # remove columns not in test dataset
        keep_cols = [col for col in train_data.columns if col not in ['booking_bool', 'click_bool']]

        # sample a smaller subset to make this all feasible
        train_data = train_data[keep_cols].sample(n=4000)
        print(train_data.columns)

        # Train lambdamart for different hyperparam values and evaluate on validation set
        trees = [5, 10, 50, 100, 150, 300, 400]
        lrs = [0.15, 0.10, 0.8, 0.05, 0.01]

        indices = []
        for i in range(train_data.shape[0]):
            items = [0, 1]
            indices.append(items)

        indices = np.array(indices)

        # K-fold cross validation for different parameter combinations
        for tree in trees:
            for lr in lrs:
                # indices = np.array(train_data.shape[0])
                kf = KFold(n_splits = 5)

                ndcgs = []
                for train_index, test_index in kf.split(indices):

                    train_index = train_index.tolist()
                    test_index = test_index.tolist()

                    # Split up data
                    X_train, X_validation = train_data.iloc[train_index], train_data.iloc[test_index]

                    # Run lambdamart on training data and evaluate on validation data
                    ndcg = models.lambdamart(X_train, X_validation, tree, lr, SAMPLING_METHOD)
                    print(ndcg)
                    ndcgs.append(ndcg)

                average_ndcg = np.mean(ndcgs)

                # Save NDCG
                file = '../results/hyperparams/crossvalidation_' + SAMPLING_METHOD + '.txt'
                with open(file, 'a') as f:
                    line = 'trees: ' + str(tree) + ', lr: ' + str(lr) + ', average_ndcg: ' + str(average_ndcg) + '\n'
                    print(line)
                    f.write(line)

        # run the full model
        if LAMBDAMART:

            # test data is always the same
            testdataset = '../data/testing_set_only.csv'

            # get the appropriate training set
            if SAMPLING_METHOD == "downsample":

                traindataset = '../data/downsampled_training_set_only.csv'

            elif SAMPLING_METHOD == "upsample":

                traindataset = "../data/upsampled_training_set_only.csv"

            elif SAMPLING_METHOD == "none":

                traindataset = "../data/full_training_set_only.csv"

            # loading in the data
            train_data = load.loaddata(traindataset)

            # loading in final test set
            test_data = load.loaddata(testdataset)

            # hyperparameters
            trees = 2
            lrs = 0.10

            # train lambdamart and evaluate on test set
            ndcg = models.lambdamart(train_data, test_data, trees, lrs, SAMPLING_METHOD)
            print(ndcg)
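
The indices array of [0, 1] pairs above only exists to give KFold something with the right number of rows. A sketch of the simpler equivalent, splitting the row positions directly (with a stand-in DataFrame):

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

train_data = pd.DataFrame(np.random.rand(20, 3), columns=["a", "b", "c"])  # stand-in
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(np.arange(train_data.shape[0])):
    X_train, X_validation = train_data.iloc[train_index], train_data.iloc[test_index]
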
Example #8
from load import loaddata
from feature import extractfeature
from classify import trainmodel
from predict import testmodel
from datetime import datetime


start = datetime.now()
images, tags = loaddata("../data/images/train")
feature_info = extractfeature(images, tags)
trainmodel(feature_info)
result = testmodel("../data/images/test")
print("The accuracy is {}.".format(sum(result) / len(result)))
print("The whole proess cost {}.".format(datetime.now() - start))

Example #9
from load import loaddata
from feature import extractfeature
from classify import trainmodel
from predict import testmodel


data, tags = loaddata("/home/acytoo/train_dataset/")

features = extractfeature(data, tags)

trainmodel(features)

res = testmodel("/home/acytoo/test_dataset/")

print "precise: ", res.count("True")*1.0/len(res)
Example #10
from load import loaddata
from feature import extractfeature
from classify import trainmodel
from predict import testmodel

if __name__ == '__main__':
    data,tags = loaddata('./train')
    features = extractfeature(data,tags)
    trainmodel(features)
    res = testmodel('./test')
Example #11
    dtest = xgb.DMatrix(X_test, label=y_test)

    # hyperparameters
    param = {'max_depth': 5, 'eta': 0.01, 'objective': 'rank:ndcg'}
    num_round = 20

    # specify model
    model = xgb.train(param, dtrain, num_round)

    # predict
    preds = model.predict(dtest)
    return preds, y_test


if __name__ == '__main__':

    traindataset = '../data/downsampled_training_set.csv'
    train_data = load.loaddata(traindataset)
    train_data = train_data.drop('date_time', axis=1)
    train_data = features.relevance_score(train_data)

    testdataset = '../data/test_subset.csv'
    test_data = load.loaddata(testdataset)
    test_data = test_data.drop('date_time', axis=1)
    test_data = features.relevance_score(test_data)

    y_pred, y_test = lambda_mart(train_data, test_data)
    print(len(y_test))
    print(len(y_pred))
    #print('Accuracy: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
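
One detail worth noting as a hedged addition (it is not shown in the truncated snippet above): XGBoost's ranking objectives such as rank:ndcg expect query-group sizes on the DMatrix, set via set_group. A minimal sketch with stand-in data:

import numpy as np
import xgboost as xgb

X = np.random.rand(10, 4)
y = np.random.randint(0, 5, size=10)          # graded relevance labels
dtrain = xgb.DMatrix(X, label=y)
dtrain.set_group([5, 5])                      # two queries, five documents each
model = xgb.train({'max_depth': 5, 'eta': 0.01, 'objective': 'rank:ndcg'}, dtrain, 20)
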
Example #12
from gensim import corpora, models, similarities
from load import loaddata
from nltk.corpus import brown
from nltk.corpus import stopwords
import jieba
import logging
import nltk
import load

#Query
target = '房貸要如何申請?最近的利率優惠如何?'
#1. tokenise the query with jieba and drop Chinese stopwords
jieba_target = list(jieba.cut(target, cut_all=False))
chinese_stopwords = loaddata().chinese_stopwords
jieba_target_stopword = [tok for tok in jieba_target if tok not in chinese_stopwords]
#2. convert the filtered tokens into a bag-of-words vector
vec_target = dictionary.doc2bow(jieba_target_stopword)
#3. score the query with the LSI model
score_target = lsi[vec_target]

# sims holds the LSI similarity scores against bank_questions_corpus
index = similarities.MatrixSimilarity(lsi[bank_questions_corpus])
sims = index[score_target]


### build the weighting scores
# build the weight arrays
tier1=[]
tier2=[]
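
The query snippet above references dictionary, lsi and bank_questions_corpus without defining them. As a hedged sketch (an assumption about the earlier steps, not the original code), they are typically built with gensim like this:

from gensim import corpora, models

tokenized_questions = [["房貸", "申請"], ["利率", "優惠"]]   # stand-in tokenised questions
dictionary = corpora.Dictionary(tokenized_questions)
bank_questions_corpus = [dictionary.doc2bow(doc) for doc in tokenized_questions]
lsi = models.LsiModel(bank_questions_corpus, id2word=dictionary, num_topics=2)
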