def main(path):
    train, test, wordVocab = loaddata(path)
    maxSentLen = 140

    # pad sentence and entity sequences to fixed lengths
    train_sent = pad_sequences(train["sent"], maxlen=maxSentLen, padding="post")
    train_gen = pad_sequences(train["gen"], maxlen=5, padding="post")
    train_disease = pad_sequences(train["disease"], maxlen=5, padding="post")
    test_sent = pad_sequences(test["sent"], maxlen=maxSentLen, padding="post")
    test_gen = pad_sequences(test["gen"], maxlen=5, padding="post")
    test_disease = pad_sequences(test["disease"], maxlen=5, padding="post")

    train_rel = np.array(train["rel"])
    test_rel = np.array(test["rel"])
    print(train_rel.shape)
    print(test_rel.shape)

    # load pre-trained embedding
    print("loading embedding...")
    embedding_size = 200
    embeddingVocab_size = len(wordVocab)
    # w2v_dir_path = "/media/network/watching_dog/embedding/bio_nlp_vec/PubMed-shuffle-win-30.bin"
    w2v_dir_path = "/media/kazane/watching_dog/embedding/bio_nlp_vec/PubMed-shuffle-win-30.bin"
    word2vec = KeyedVectors.load_word2vec_format(w2v_dir_path, binary=True, unicode_errors='ignore')

    print("build embedding weights...")
    embedding_weights = np.zeros((embeddingVocab_size + 1, embedding_size))
    unknown_words = []
    known_words = []
    for word, index in wordVocab.items():
        try:
            embedding_weights[index, :] = word2vec[word.lower()]
            known_words.append(word)
        except KeyError:
            # word is not in the pre-trained vocabulary: initialise it randomly
            unknown_words.append(word)
            embedding_weights[index, :] = np.random.uniform(-0.025, 0.025, embedding_size)
    print("unknown_ratio: ", len(unknown_words) / embeddingVocab_size,
          " unknown_words: ", len(unknown_words),
          " vocab_size: ", embeddingVocab_size)

    model = myModel(sent_lenth=maxSentLen, word_embedding=embedding_weights)
    model.attn(embedding=embedding_weights)
    model.train(inputs=[train_sent, train_gen, train_disease],
                label=[train_rel],
                save_path="./outputs/checkpoint",
                validation_split=0.1,
                batch_size=128,
                epochs=5)

    y_ = model.predict([test_sent, test_gen, test_disease])
    # np.save(file=path + "/" + "outputs.npy", arr=y_)
    is_gate = False
    if is_gate is False:
        y_ = get_Result(y_)
        get_F1Value(y_=y_, y=test_rel, path=path)
    else:
        get_ROC(y_, test_rel, path=path, gate=100)
def testmodel(filepath):
    """According to the image features, predict the image labels."""
    print("test model....")
    images, tags = loaddata(filepath)
    feature_info = extractfeature(images, tags)
    x = [info[0] for info in feature_info]
    y = [info[1] for info in feature_info]
    clf = joblib.load('train_model.m')
    # predict class probabilities for the test images
    samples_proba = clf.predict_proba(x)
    # indices of the five most probable classes per sample
    top5_index = np.argsort(-samples_proba, axis=1)[:, :5].tolist()
    res = []
    for i, tag in enumerate(y):
        # assumes the class labels coincide with the probability column indices
        res.append(tag in top5_index[i])
    return res
def testmodel(path):
    def getFeatVec(features, clf):
        # build a 1000-bin bag-of-visual-words histogram from the predicted cluster ids
        featVec = np.zeros((1, 1000))
        res = clf.predict(features)
        for i in res:
            featVec[0][i] += 1
        return featVec

    def result(predict_y, test_y, num):
        print("###############################")
        right = 0
        res = []
        for i, tag in enumerate(test_y):
            # predict_y rows are argsorted class indices (21 classes assumed);
            # the last `num` entries are the top-`num` predictions
            if tag in predict_y[i][21 - num:21]:
                right += 1
                res.append(True)
            else:
                res.append(False)
        print("%d/%d = %f" % (right, len(test_y), right * 1.0 / len(test_y)))
        print("###############################")
        return res

    print('load model')
    svm = joblib.load("./05svm.model")
    clf = joblib.load("./05vocab.pkl")
    data, tags = loaddata(path)
    features, tags = extractfeature(data, tags)

    print('predict')
    test_x = np.float32([]).reshape(0, 1000)
    for feature in features:
        featVec = getFeatVec(feature, clf)
        test_x = np.append(test_x, featVec, axis=0)
    p = svm.predict_proba(test_x)
    p = p.argsort()
    res = result(p, tags, 5)
    return res
def testmodel(path):
    print("[test] start")
    data, tags = loaddata(path)
    feats, types = extractfeature(data, tags)

    svm = joblib.load("svm.pkl")
    pca = joblib.load("pca.pkl")
    normalizer = joblib.load("normalizer.pkl")

    # apply the same PCA and normalisation used at training time
    feats = pca.transform(feats)
    feats = normalizer.transform(feats)

    probability = svm.predict_proba(feats)
    # argsort ascending: the last five columns hold the top-5 most probable classes
    args = probability.argsort(axis=1)
    re = []
    for i in range(args.shape[0]):
        if int(types[i]) in args[i][-5:]:
            re.append("True")
        else:
            re.append("False")
    print("[test] end")
    return re
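# A label-safe variant of the top-5 check used by the testmodel() functions
# above; this is a minimal sketch, not code from the original projects. The
# columns of predict_proba follow clf.classes_, so raw argsort indices only
# equal the label values when the labels are exactly 0..n_classes-1; mapping
# through clf.classes_ removes that assumption. `clf`, `test_x` and `tags` are
# assumed to be a fitted scikit-learn classifier, a feature matrix and the
# ground-truth labels.
import numpy as np

def top5_hits(clf, test_x, tags):
    proba = clf.predict_proba(test_x)                        # (n_samples, n_classes)
    top5 = clf.classes_[np.argsort(-proba, axis=1)[:, :5]]   # map column indices to labels
    return [tag in top5[i] for i, tag in enumerate(tags)]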
import pandas as pd
import numpy as np
from load import loaddata
from scipy import stats
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# load the training data and slice out the feature and return blocks
data = loaddata('train.csv', 0.05)
X = data.loc[:, 'Feature_1':'Ret_120'].values
y = data.loc[:, 'Ret_121':'Ret_PlusTwo'].values
X_feature = data.loc[:, 'Feature_1':'Feature_25'].values
X_mins = data.loc[:, 'Ret_2':'Ret_120'].values
X_ret = data.loc[:, 'Ret_MinusTwo':'Ret_MinusOne'].values
daily_weights = data['Weight_Daily'].values
intraday_weights = data['Weight_Intraday'].values
# print(X.shape, y.shape)

# impute missing values with the column mean
imp = Imputer(missing_values='NaN', strategy="mean", axis=0)
X_imp = imp.fit_transform(X)
X_featureimp = imp.fit_transform(X_feature)
X_minsimp = imp.fit_transform(X_mins)
X_retimp = imp.fit_transform(X_ret)
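# The imports above (KFold, SVR, RandomForestRegressor, r2_score,
# mean_squared_error) suggest that a cross-validated regression on the imputed
# features follows. The continuation is not shown, so this is only a minimal
# sketch, assuming the first return column of y is used as the target; it
# reuses X_imp and y from the code above.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in kf.split(X_imp):
    reg = RandomForestRegressor(n_estimators=100)
    reg.fit(X_imp[train_idx], y[train_idx, 0])
    pred = reg.predict(X_imp[test_idx])
    print("R2:", r2_score(y[test_idx, 0], pred),
          "MSE:", mean_squared_error(y[test_idx, 0], pred))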
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import random
import tensorflow as tf
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from load import loaddata

# -------------------------------------------------------------------------------
# read data
# -------------------------------------------------------------------------------
data = loaddata('train.csv', 1.0)
features = data.loc[:, 'Feature_1':'Feature_25'].values
rets = data.loc[:, 'Ret_MinusTwo':'Ret_PlusTwo'].values
daily_weights = data['Weight_Daily'].values
intraday_weights = data['Weight_Intraday'].values

test_data = loaddata('test_2.csv', 3.0)
test_features = test_data.loc[:, 'Feature_1':'Feature_25'].values
test_rets = test_data.loc[:, 'Ret_MinusTwo':'Ret_120'].values
print("test rets shape ", test_rets.shape)

# -------------------------------------------------------------------------------
# preprocess
# -------------------------------------------------------------------------------
# mean imputation along columns (axis=0) and along rows (axis=1)
imp_axis0 = Imputer(missing_values='NaN', strategy="mean", axis=0)
imp_axis1 = Imputer(missing_values='NaN', strategy="mean", axis=1)
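# The snippet above stops after constructing the two imputers; the PCA import
# suggests dimensionality reduction comes next. The actual continuation is not
# shown, so this is only a minimal sketch: impute the training features
# column-wise, reuse the fitted imputer on the test features, then project both
# onto a small number of principal components (10 is an arbitrary choice).
features_imp = imp_axis0.fit_transform(features)
test_features_imp = imp_axis0.transform(test_features)

pca = PCA(n_components=10)
features_pca = pca.fit_transform(features_imp)
test_features_pca = pca.transform(test_features_imp)
print("explained variance:", pca.explained_variance_ratio_.sum())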
def main():
    # only read raw data if so required (cleaned files do not exist yet)
    if READ_RAW_DATA:
        # train set
        dataset = '../../data/training_set_VU_DM.csv'
        # test set (turn off relevance score in this case!)
        # dataset = '../../data/test_set_VU_DM.csv'

        # take the first 1000 lines of the dataset only - use this for testing
        # to make the code less slow! Comment it out for finalizing
        # dataset = '../data/testfile.csv'

        # loading in the right file
        data = load.loaddata(dataset)

        # create competitor features
        data = features.create_competitor_features(data)
        # create other features
        data = features.other_features(data)
        # add relevance grades
        data = features.relevance_score(data)
        # remove outliers
        data = eda.remove_outliers(data)
        # handle missing values
        data = eda.missing_values(data)

        if PLOT:
            # take a sample of the data to make plotting feasible
            sample_data = data.sample(n=500000)
            # plot distributions
            eda.plot_distributions(sample_data)
            # plot correlations between sets of variables
            eda.plot_correlations(sample_data)
            # plot impact of competitor price on booking
            eda.plot_competitor_price_impact(sample_data)
            # get correlations of the features
            correlations.show_correlations(sample_data)

        # divide data into train and test set (and save these)
        train_data, test_data = process.split_train_test(data)
        # downsample data to create class balance (and save)
        downsampled_train_data = process.downsample(train_data)
        # upsample data to create class balance (and save it)
        upsampled_train_data = process.upsample(train_data)

    # data is already loaded - only need to load it from file
    # test for the best set of hyperparameters
    if HYPERPARAM:
        # get the appropriate training set
        if SAMPLING_METHOD == "downsample":
            traindataset = '../data/downsampled_crossvalidation_set.csv'
        elif SAMPLING_METHOD == "upsample":
            traindataset = '../data/upsampled_crossvalidation_set.csv'
        elif SAMPLING_METHOD == "none":
            traindataset = '../data/full_crossvalidation_set.csv'

        # loading in the data
        train_data = load.loaddata(traindataset)

        # remove columns not in the test dataset
        keep_cols = [col for col in train_data.columns if col not in ['booking_bool', 'click_bool']]

        # sample a smaller subset to make this all feasible
        train_data = train_data[keep_cols].sample(n=4000)
        print(train_data.columns)

        # train LambdaMART for different hyperparameter values and evaluate on a validation set
        trees = [5, 10, 50, 100, 150, 300, 400]
        lrs = [0.15, 0.10, 0.8, 0.05, 0.01]

        # KFold only needs the number of rows, so build a placeholder array
        # with one row per training example to split on
        indices = np.array([[0, 1] for _ in range(train_data.shape[0])])

        # K-fold cross validation for different parameter combinations
        for tree in trees:
            for lr in lrs:
                kf = KFold(n_splits=5)
                ndcgs = []
                for train_index, test_index in kf.split(indices):
                    train_index = train_index.tolist()
                    test_index = test_index.tolist()
                    # split up the data
                    X_train, X_validation = train_data.iloc[train_index], train_data.iloc[test_index]
                    # run LambdaMART on training data and evaluate on validation data
                    ndcg = models.lambdamart(X_train, X_validation, tree, lr, SAMPLING_METHOD)
                    print(ndcg)
                    ndcgs.append(ndcg)
                average_ndcg = np.mean(ndcgs)

                # save the NDCG
                file = '../results/hyperparams/crossvalidation_' + SAMPLING_METHOD + '.txt'
                with open(file, 'a') as f:
                    line = 'trees: ' + str(tree) + ', lr: ' + str(lr) + ', average_ndcg: ' + str(average_ndcg) + '\n'
                    print(line)
                    f.write(line)

    # run the full model
    if LAMBDAMART:
        # test data is always the same
        testdataset = '../data/testing_set_only.csv'

        # get the appropriate training set
        if SAMPLING_METHOD == "downsample":
            traindataset = '../data/downsampled_training_set_only.csv'
        elif SAMPLING_METHOD == "upsample":
            traindataset = '../data/upsampled_training_set_only.csv'
        elif SAMPLING_METHOD == "none":
            traindataset = '../data/full_training_set_only.csv'

        # loading in the data
        train_data = load.loaddata(traindataset)
        # loading in the final test set
        test_data = load.loaddata(testdataset)

        # hyperparameters
        trees = 2
        lrs = 0.10

        # train LambdaMART and evaluate on the test set
        ndcg = models.lambdamart(train_data, test_data, trees, lrs, SAMPLING_METHOD)
        print(ndcg)
from load import loaddata
from feature import extractfeature
from classify import trainmodel
from predict import testmodel
from datetime import datetime

start = datetime.now()

# full pipeline: load images, extract features, train, then test
images, tags = loaddata("../data/images/train")
feature_info = extractfeature(images, tags)
trainmodel(feature_info)
result = testmodel("../data/images/test")

print("The accuracy is {}.".format(sum(result) / len(result)))
print("The whole process cost {}.".format(datetime.now() - start))
from load import loaddata
from feature import extractfeature
from classify import trainmodel
from predict import testmodel

data, tags = loaddata("/home/acytoo/train_dataset/")
features = extractfeature(data, tags)
trainmodel(features)
res = testmodel("/home/acytoo/test_dataset/")
# testmodel returns the strings "True"/"False", so count the "True" entries
print("precision: ", res.count("True") * 1.0 / len(res))
from load import loaddata
from feature import extractfeature
from classify import trainmodel
from predict import testmodel

if __name__ == '__main__':
    data, tags = loaddata('./train')
    features = extractfeature(data, tags)
    trainmodel(features)
    res = testmodel('./test')
# The original start of this function is not included in the snippet; the
# signature below is inferred from the call site, and dtrain, X_test and y_test
# are assumed to be built in the omitted part of the body.
def lambda_mart(train_data, test_data):
    # ... (omitted code that builds dtrain, X_test and y_test) ...

    dtest = xgb.DMatrix(X_test, label=y_test)

    # hyperparameters
    param = {'max_depth': 5, 'eta': 0.01, 'objective': 'rank:ndcg'}
    num_round = 20

    # train the ranking model
    model = xgb.train(param, dtrain, num_round)

    # predict on the test set
    preds = model.predict(dtest)
    return preds, y_test


if __name__ == '__main__':
    traindataset = '../data/downsampled_training_set.csv'
    train_data = load.loaddata(traindataset)
    train_data = train_data.drop('date_time', axis=1)
    train_data = features.relevance_score(train_data)

    testdataset = '../data/test_subset.csv'
    test_data = load.loaddata(testdataset)
    test_data = test_data.drop('date_time', axis=1)
    test_data = features.relevance_score(test_data)

    y_pred, y_test = lambda_mart(train_data, test_data)
    print(len(y_test))
    print(len(y_pred))
    # print('Accuracy: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
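# A minimal sketch (not the original code) of how the omitted part of
# lambda_mart() might assemble dtrain, X_test and y_test, assuming xgboost is
# imported as xgb as above and that the label column produced by
# features.relevance_score() is called 'relevance' (the column name and this
# helper are assumptions):
def prepare_matrices(train_data, test_data, label_col='relevance'):
    X_train = train_data.drop(label_col, axis=1)
    y_train = train_data[label_col]
    X_test = test_data.drop(label_col, axis=1)
    y_test = test_data[label_col]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    return dtrain, X_test, y_test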
from gensim import corpora, models, similarities
from load import loaddata
from nltk.corpus import brown
from nltk.corpus import stopwords
import jieba
import logging
import nltk
import load

# Query: "How do I apply for a mortgage? What are the current preferential interest rates?"
target = '房貸要如何申請?最近的利率優惠如何?'

# 1. segment the query with jieba and drop Chinese stopwords
jieba_target = list(jieba.cut(target, cut_all=False))
jieba_target_stopword = []
for token in jieba_target:
    if token not in loaddata().chinese_stopwords:
        jieba_target_stopword.append(token)

# 2. convert the query to a bag-of-words vector
vec_target = dictionary.doc2bow(jieba_target_stopword)

# 3. project the query into the LSI space
score_target = lsi[vec_target]

# sims holds the LSI similarities between the query and every bank question
index = similarities.MatrixSimilarity(lsi[bank_questions_corpus])
sims = index[score_target]

# build weighted scores
# weight tiers
tier1 = []
tier2 = []
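# The snippet above uses `dictionary`, `lsi` and `bank_questions_corpus`
# without defining them; they would have to exist before the query is scored.
# A minimal sketch of how they are typically built with gensim, assuming
# `bank_questions` is a list of already-segmented, stopword-filtered token
# lists (the variable names and num_topics are assumptions, not the original code):
bank_questions = [['房貸', '申請'], ['利率', '優惠']]  # placeholder tokenised questions
dictionary = corpora.Dictionary(bank_questions)
bank_questions_corpus = [dictionary.doc2bow(tokens) for tokens in bank_questions]
lsi = models.LsiModel(bank_questions_corpus, id2word=dictionary, num_topics=100)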