import os
import pickle

import tools_svm
from loader import loadData  # assumed import; loadData is called unqualified below


def main(argv):
    # Check that the right number of arguments was passed
    if len(argv) != 1:
        print("Usage: python3 main.py folder_with_datasets")
        exit(-1)

    print("Opening Datasets:")
    data = []
    # Read and convert the training data
    try:
        for x in os.listdir(argv[0]):
            print(x)
            if argv[0][-1] == "/":
                data.append([x, loadData(argv[0] + x)])
            else:
                data.append([x, loadData(argv[0] + '/' + x)])
    except FileNotFoundError:
        print("That is not a valid directory")
        exit(-1)

    # Train SVM
    print("\nTraining SVM")
    trained = tools_svm.train_svm(data)

    # Pickle it for later
    with open("trained.obj", "wb") as pickle_file:
        pickle.dump(trained, pickle_file)
    print("\nSVM Stored")
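# A minimal entry-point sketch, assuming this function lives in the main.py
# named by the usage string; main() then receives sys.argv without the script
# name, so exactly one argument (the dataset folder) is expected.
if __name__ == "__main__":
    import sys
    main(sys.argv[1:])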
import pickle

import loader
import preProcessing


def processFile(file):
    # Output pickles are named after the input file's basename
    file_name = file.split('/')[-1].split('.')[0]

    df = loader.loadData(file)
    df = preProcessing.targetToNum(df)
    df = preProcessing.createLabel(df)

    # Split into a text frame (abstract) and a numeric frame
    df_text = df[['AwardedAmountToDate', 'Abstract']]
    df_num = df.drop('Abstract', axis=1)

    df_text = preProcessing.htmlTagRemover(df_text)
    df_text = preProcessing.characterRemover(df_text)
    df_text = preProcessing.tokenizer(df_text)
    df_text = preProcessing.stemAndLemma(df_text)
    df_text = preProcessing.stopwordsRemover(df_text)
    # df_text_untagged = df_text.drop('AwardedAmountToDate', axis=1)

    df_num = preProcessing.nonPredictiveFeatureRemover(df_num)
    df_num = preProcessing.processDateFeatures(df_num)
    df_num = preProcessing.processCategoricalFeatures(df_num)
    # df_num_untagged = df_num.drop('AwardedAmountToDate', axis=1)

    text_file_name = file_name + '_text.pkl'
    num_file_name = file_name + '_num.pkl'

    # `addresses` is a module-level dict assumed to be defined elsewhere in the repo
    with open(addresses['processed'] + text_file_name, 'wb') as f:
        pickle.dump(df_text, f)
    with open(addresses['processed'] + num_file_name, 'wb') as f:
        pickle.dump(df_num, f)
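# Hypothetical usage sketch; the path and the `addresses` configuration below
# are illustrative assumptions, not part of the original module.
# addresses = {'processed': './processed/'}  # assumed output directory
# processFile('./data/awards.csv')           # -> awards_text.pkl, awards_num.pkl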
def analyzeBestFitUser():
    """Take a closer look at the user whose predictions on the test set were best."""
    movies, movieTagMat, userRankMat, testCases = loadData()
    user2userPredictor = user2user(userRankMat, topK=105)
    item2itemPredictor = item2item(userRankMat, movieTagMat, topK=20)

    # do test: rank users by per-user SSE on the test set to find the best fit
    # _, results = predictTest(user2userPredictor, testCases, "")
    # _, results = predictTest(item2itemPredictor, testCases, "")
    # userAvgSSE = defaultdict(float)
    # for res in results:
    #     userAvgSSE[res[0]] += (res[2] - res[1]) ** 2
    # sse = list(userAvgSSE.items())
    # sse.sort(key=lambda x: x[1])
    # # best-fit user
    # uid, minSSE = sse[0]
    # print("(uid, smallest SSE): ({}, {})".format(uid, minSSE))
    uid = 480

    # do recommend: compare the results of different recommenders
    # recommender = Recommender(movieTagMat, userRankMat, movies, user2userPredictor)
    recommender = Recommender(movieTagMat, userRankMat, movies, item2itemPredictor)
    recommendMovies = recommender.doRecommend(uid, 50)["recommended_movies"]

    print("recommended movies:")
    recommendedCategory = defaultdict(int)
    for m, r in recommendMovies.items():
        for genre in movies[r[0]].genres:
            recommendedCategory[genre] += 1
    for k, v in sorted(recommendedCategory.items(), key=lambda d: d[0], reverse=True):
        print("{}: {}".format(k, v))
    print("")

    # compare against the user's own top-rated movies
    print("His or her favorite movies:")
    userRank = userRankMat[uid]
    idx = np.argsort(-userRank)[:50]
    userLikeCategory = defaultdict(int)
    for i in idx:
        for genre in movies[i].genres:
            userLikeCategory[genre] += 1
    for k, v in sorted(userLikeCategory.items(), key=lambda d: d[0], reverse=True):
        print("{}: {}".format(k, v))
    print("")

    # CSV-style genre counts: genre, recommended count, liked count
    for k, v in recommendedCategory.items():
        if k in userLikeCategory:
            print("{},{},{}".format(k, v, userLikeCategory[k]))
        else:
            print("{},{},0".format(k, v))
    for k, v in userLikeCategory.items():
        if k not in recommendedCategory:
            print("{},0,{}".format(k, v))
def drawTopK_u2u() -> None:
    _, _, userRankMat, testSet = loadData()
    topKLst = list(range(1, 335))
    sseLst = []
    for topK in topKLst:
        sse, _ = predictTest(user2user(userRankMat, topK), testSet, "")
        sseLst.append(sse)
    draw(topKLst, sseLst, "Top K", "SSE",
         "Line chart of Top K vs. sum of squared errors (1,335)", 1, 1)
    draw(topKLst[50:], sseLst[50:], "Top K", "SSE",
         "Line chart of Top K vs. sum of squared errors (50,335)", 2, 1)
def drawTopK_i2i() -> None:
    _, movieTagMat, userRankMat, testSet = loadData()
    # topKLst = list(range(1, 101))
    topKLst = list(range(1, 1301, 20))
    sseLst = []
    for topK in topKLst:
        sse, _ = predictTest(item2item(userRankMat, movieTagMat, topK=topK), testSet, "")
        print(sse)
        sseLst.append(sse)
    draw(topKLst, sseLst, "Top K", "SSE",
         "Line chart of Top K vs. sum of squared errors (1,1300)", 1, 1)
def drawHashNumber_i2i() -> None:
    _, movieTagMat, userRankMat, testSet = loadData()
    sseLst = []
    for hashFuncNum in range(1, 21):
        # 4294967311 is the smallest prime above 2**32, a common MinHash modulus
        sse, _ = predictTest(
            item2item(userRankMat, movieTagMat, topK=20000,
                      minHashParas=(hashFuncNum, 0, 2**32 - 1, 4294967311)),
            testSet, "")
        print(sse)
        sseLst.append(sse)
    draw(list(range(1, 21)), sseLst, "Number of hash functions", "SSE",
         "Line chart of hash function count vs. sum of squared errors", 1, 10)
def drawHashNumber_u2u() -> None:
    _, _, userRankMat, testSet = loadData()
    threshold = 2.5
    hashFuncNumber = range(100, 2001, 50)
    sseLst = []
    for hashFuncNum in hashFuncNumber:
        sse, _ = predictTest(
            user2user(userRankMat, topK=105, threshold=threshold,
                      minHashParas=(hashFuncNum, 0, 2**32 - 1, 4294967311)),
            testSet, "")
        print(sse)
        sseLst.append(sse)
    draw(hashFuncNumber, sseLst, "Number of hash functions", "SSE",
         "Line chart of hash function count vs. sum of squared errors", 1, 10)
def doRecommender() -> None:
    """Recommend for every user, writing one JSON result file per user
    into the recommend folder."""
    begin = time()
    movies, movieTagMat, userRankMat, _ = loadData()
    predictor = item2item(userRankMat, movieTagMat, topK=100)
    recommender = Recommender(movieTagMat, userRankMat, movies, predictor)
    resFilePrefix: str = "./recommend/user"
    for i in range(userRankMat.shape[0]):
        with open(resFilePrefix + str(i + 1) + ".json", "w") as f:
            f.write(json.dumps(recommender.doRecommend(i, 50),
                               indent=4, separators=(',', ':')))
    end = time()
    print("total time usage: {}".format(end - begin))
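# Reading a result back, assuming doRecommend() returns a JSON-serializable
# dict with the "recommended_movies" key used in analyzeBestFitUser() above:
# import json
# with open("./recommend/user1.json") as f:
#     print(json.load(f)["recommended_movies"])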
import loader
#from collections import Counter
#import model
#import pregex as pre

data, group_idxs, test_data = loader.loadData("./data/csv.p",
                                              n_examples=1000,
                                              n_tasks=50,
                                              max_length=15)

#M = loader.load('./models/task38.pt')
#net = M['net']
#trace = M['trace']
#concepts = trace.baseConcepts
#r = pre.create("(NA)|(NA)")
#print(trace.model.scoreregex(r, trace))

# for concept in concepts:
#     print(str(concept))
#     # c = Counter(concept.sample(trace) for _ in range(1000))
#     # samples = sorted(c, key=c.get, reverse=True)
#     # print(samples)
#     # print()

for i in range(len(test_data)):
    print(i, list(set(test_data[i]))[:5])

print(len(data), "train +", len(test_data), "test =",
      len(data) + len(test_data), "total")
import argparse

import loader

parser = argparse.ArgumentParser()
parser.add_argument('--data_file', type=str, default="./data/csv.p")
parser.add_argument('--n_tasks', type=int, default=40)  # per max_length
parser.add_argument('--n_examples', type=int, default=500)
parser.add_argument('--max_length', type=int, default=15)  # maximum length of inputs or targets
args = parser.parse_args()

print("Loading data...")
data, group_idxs, test_data = loader.loadData(args.data_file, args.n_examples,
                                              args.n_tasks, args.max_length)

print("\nTraining Data:")
for X in data:
    print(X[:5])

print("\nTest Data:")
for X in test_data:
    print(X[:5])
import os
import math
import argparse

import torch

import loader
import util
import pregex as pre
from propose import Proposal, evalProposal, getProposals, networkCache
from trace import RegexWrapper

parser = argparse.ArgumentParser()
# Default to the most recently modified model in results/
parser.add_argument('--model', type=str,
                    default=max(('results/%s' % x for x in os.listdir('results')
                                 if x[-3:] == ".pt"),
                                key=os.path.getmtime))
args = parser.parse_args()

print("Loading", args.model)
M = loader.load(args.model)
if 'net' in M and M['net'] is not None:
    if torch.cuda.is_available():
        M['net'].cuda()
    net = M['net']

data, group_idxs, test_data = loader.loadData(M['args'].data_file,
                                              M['args'].n_examples,
                                              M['args'].n_tasks,
                                              M['args'].max_length)
trace = M['trace']
model = trace.model
import numpy as np

import loader


def load_data():
    matrix, y_vector = loader.loadData("../smartphone.txt")
    matrix = loader.tune_matrix(matrix)
    y_vector = np.transpose(y_vector)
    return matrix, y_vector
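# Illustrative usage sketch; the returned shapes depend entirely on what
# loader.loadData() and loader.tune_matrix() produce for smartphone.txt:
# X, y = load_data()
# print(X.shape, y.shape)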
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 24 10:59:50 2019

@author: xinning.w
"""
import loader
import preProcessing

PATH_TO_DATA = '/Users/mengjie/Documents/Courses/SpringII/6450NLP/Awards_data/'

df = loader.loadData(PATH_TO_DATA)

# Text pipeline
df = preProcessing.htmlTagRemover(df)
df = preProcessing.characterRemover(df)
df = preProcessing.tokenizer(df)
df = preProcessing.stemAndLemma(df)
df = preProcessing.stopwordsRemover(df)
df = preProcessing.extractVectorMatrix(df)

# Numeric/categorical pipeline
df = preProcessing.nonPredictiveFeatureRemover(df)
df = preProcessing.processDateFeatures(df)
df = preProcessing.processCategoricalFeatures(df)

df.to_csv('processed.csv', index=False)
# IO
name = '.'.join(sys.argv[1].split(os.sep)[-1].split('.')[:-1]) if 'name' not in param else param['name']
err_output_folder = param['errOutputFolder']
model_saved_folder = param['modelSavedFolder']
gradient_saved_folder = param['gradientSavedFolder'] if 'gradientSavedFolder' in param else None
out_file = param['outFile']

dictionary = loader.loadDict(dict_file)
dictionary[-1] = np.random.randn(vector_dim) * 0.5

if not os.path.exists(model_saved_folder):
    os.makedirs(model_saved_folder)
if gradient_saved_folder is not None:
    if not os.path.exists(gradient_saved_folder):
        os.makedirs(gradient_saved_folder)

train_index = loader.loadData(trainXFile)
train_label = loader.loadData(trainYFile)
if not train_only:
    test_index = loader.loadData(testXFile)
    test_label = loader.loadData(testYFile)

rnn = RNNs.RNNs(neurons=neurons, nonlinearity=nonlinearity)
if model2load is not None:
    print('load weights from file: %s' % model2load)
    rnn.load(model2load, testOnly=True)

results = '''mode=%s,U_lr=%s,W_lr=%s,V_lr=%s,s_lr=%s,config file=%s\n''' % (
    mode, str(learn_rate['U']), str(learn_rate['W']), str(learn_rate['V']),
    str(learn_rate['s']), sys.argv[1])
print(results, end='')

# Preprocess the data
loss = lasagne.objectives.aggregate(
    lasagne.objectives.categorical_crossentropy(
        lasagne.layers.get_output(output, x), y),
    mode='mean')
updates = lasagne.updates.adagrad(loss, dcnnParams, learning_rate=0.1)

# ACCURACY FOR PREDICTIONS
prediction = T.argmax(lasagne.layers.get_output(output, x, deterministic=True), axis=1)
score = T.eq(prediction, y).mean()

# SYMBOLIC FUNCTIONS
trainDCNN = theano.function([x, y], outputs=loss, updates=updates)
validateDCNN = theano.function([x, y], outputs=score)
testDCNN = theano.function([x, y], outputs=score)

# LOAD THE DATA
trainingSentences = loader.loadData('myDataset/train.txt')
trainingLabels = loader.loadData('myDataset/train_label.txt')
validationSentences = loader.loadData('myDataset/dev.txt')
validationLabels = loader.loadData('myDataset/dev_label.txt')
testSentences = loader.loadData('myDataset/test.txt')
testLabels = loader.loadData('myDataset/test_label.txt')

# TRAIN THE MODEL
print('...training the DCNN')
for epoch in range(NUMOFEPOCHS):
    for i in range(len(trainingSentences)):
        trainDCNN(np.asarray(trainingSentences[i:i+1], dtype=np.int32),
                  np.asarray(trainingLabels[i], dtype=np.int32))
        print('Sentence', i, 'complete.')

# SAVE THE TRAINED MODEL