# -*- coding: utf-8 -*-
"""
Created on Tue Nov 10 22:04:42 2015

@author: ssc317
"""
import gzip
from collections import defaultdict
import mylib


def readGz(f):
    # Each line of the gzipped file is a Python-literal dict.
    for l in gzip.open(f):
        yield eval(l)

# Load the full 1M-review training set, then split off the last
# 500k reviews as a validation set.
data_ = []
for l in readGz("../train.json.gz"):
    data_.append(l)

train_data = data_[:500000]
valid_data = data_[500000:]
mylib.saveData('../train_valid_1M', [train_data, valid_data])
mylib.saveData('../train_1M', [data_])
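# --- Sketch: the mylib helpers used above are defined elsewhere. ---
# A minimal reconstruction, assuming saveData/loadData are thin pickle
# wrappers; the implementation below is inferred from the call sites,
# not taken from the author's actual module.
import pickle

def saveData(path, obj):
    # Pickle obj to path.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def loadData(path):
    # Inverse of saveData: unpickle and return the stored object.
    with open(path, 'rb') as f:
        return pickle.load(f)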
        # Convergence measure: total absolute change of every parameter
        # since the last iteration (named "covariance" by the author,
        # but it is a parameter-change magnitude, not a covariance).
        new_covariance = [(alpha - new_alpha)**2] \
            + [(b - nb)**2 for b, nb in zip(beta_i.values(), new_beta_i.values())] \
            + [(b - nb)**2 for b, nb in zip(beta_u.values(), new_beta_u.values())]
        new_covariance = sum(numpy.sqrt(new_covariance))
        print "covariance is " + str(new_covariance)
        #if(alpha == new_alpha and beta_i == new_beta_i and beta_u == new_beta_u):
        #if(covariance < 1e-10):
        if(covariance == new_covariance):
            break
        else:
            alpha, beta_i, beta_u, iterNum, covariance = \
                new_alpha, new_beta_i, new_beta_u, iterNum + 1, new_covariance

    print "Finish iter " + str(iterNum) + " with lambda " + str(lamda)

    # Score this lambda on the validation set.
    rating_valid_parameters = []
    for d in valid_data:
        rating_valid_parameters.append([alpha, beta_u[d['reviewerID']], beta_i[d['itemID']]])
    rating_valid_predict = [sum(para) for para in rating_valid_parameters]
    rating_valid_MSE = mse(rating_valid_predict, rating_valid_y)
    MSEs.append(rating_valid_MSE)
    thetas.append([alpha, beta_u, beta_i])
    print "MSE of validation set is " + str(rating_valid_MSE)

print MSEs

# In[] validation
# Keep the lambda with the lowest validation MSE and re-evaluate it.
index = MSEs.index(min(MSEs))
[alpha, beta_u, beta_i] = thetas[index]
rating_valid_parameters = []
for d in valid_data:
    rating_valid_parameters.append([alpha, beta_u[d['reviewerID']], beta_i[d['itemID']]])
rating_valid_predict = [sum(para) for para in rating_valid_parameters]
rating_valid_MSE = mse(rating_valid_predict, rating_valid_y)
print "MSE of validation set is " + str(rating_valid_MSE) + " with lambda " + str(index + 1)
mylib.saveData('1M_train_rating', [alpha, beta_u, beta_i])
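# --- Sketch: helpers referenced above but defined elsewhere. ---
# mse() and the per-iteration updates that produce new_alpha / new_beta_u /
# new_beta_i are not in this fragment. Below is a guess at both, assuming
# the standard regularized bias model rating ~ alpha + beta_u + beta_i
# and that ratings live under d['rating']; the key name, function names,
# and update form are assumptions, not the author's confirmed code.
from collections import defaultdict

def mse(predictions, labels):
    # Mean squared error between two equal-length sequences.
    return sum((p - y)**2 for p, y in zip(predictions, labels)) / float(len(labels))

def update_step(train_data, alpha, beta_u, beta_i, lamda):
    # One round of closed-form coordinate updates with L2 penalty lamda.
    new_alpha = sum(d['rating'] - beta_u[d['reviewerID']] - beta_i[d['itemID']]
                    for d in train_data) / float(len(train_data))
    per_user, per_item = defaultdict(list), defaultdict(list)
    for d in train_data:
        per_user[d['reviewerID']].append(d)
        per_item[d['itemID']].append(d)
    new_beta_u, new_beta_i = {}, {}
    for u, ds in per_user.items():
        new_beta_u[u] = sum(d['rating'] - new_alpha - beta_i[d['itemID']]
                            for d in ds) / (lamda + len(ds))
    for i, ds in per_item.items():
        new_beta_i[i] = sum(d['rating'] - new_alpha - new_beta_u[d['reviewerID']]
                            for d in ds) / (lamda + len(ds))
    return new_alpha, new_beta_u, new_beta_i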
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 12 21:07:04 2015

@author: ssc317
"""
import gzip
import mylib
from collections import defaultdict
import string


def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

data = []
for l in readGz("../train.json.gz"):
    data.append(l)
mylib.saveData('../1Mtrain', [data])
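# --- Sketch: a safer alternative to eval() for parsing the lines. ---
# eval() works here because each line is a Python-literal dict, but it
# executes arbitrary code if a line is malformed. ast.literal_eval only
# accepts literals, so it fails loudly instead. This variant (with the
# hypothetical name readGz_safe) is a suggestion, not the author's code.
import ast
import gzip

def readGz_safe(f):
    for l in gzip.open(f):
        yield ast.literal_eval(l)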
# One "reviewerID-itemID" / prediction row per test pair.
rating_test_result = [[pr[0] + '-' + pr[1], str(ptr)]
                      for pr, ptr in zip(pairs_Rating, rating_test_predict)]
saveCSV('rating_test_result.csv', rating_test_result)

# In[]================================================================================
# In[] main
[data_] = mylib.loadData('./assignment1/data/1Mtrain')
train_data = data_[:900000]
valid_data = data_[900000:]
del data_

# Grid search over the three "dirty" heuristic hyperparameters.
dirty_limits = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
sds = [0.3, 0.5, 0.7, 0.9, 1.0, 2.0]
dirty_bound = [[2.6, 4.4], [2.8, 4.2], [3, 4], [3.5, 3.5]]
MSEs = [[[0 for dbi in range(len(dirty_bound))]
         for si in range(len(sds))]
        for dli in range(len(dirty_limits))]
thetas = [[[0 for dbi in range(len(dirty_bound))]
           for si in range(len(sds))]
          for dli in range(len(dirty_limits))]

for dli in range(len(dirty_limits)):
    dl = dirty_limits[dli]
    for si in range(len(sds)):
        s = sds[si]
        for dbi in range(len(dirty_bound)):
            db = dirty_bound[dbi]
            dirty_feature = [dl, s, db]
            [alpha, beta_i, beta_u, dirty_u, dirty_i] = convergence_dirty(train_data, 3, dirty_feature)
            MSE = validate_dirty(valid_data, dirty_u, dirty_i, alpha, beta_u, beta_i)
            MSEs[dli][si][dbi] = MSE
            thetas[dli][si][dbi] = [alpha, beta_i, beta_u, dirty_u, dirty_i]
            # str(MSE) added: MSE is numeric, so concatenating it to a
            # string directly would raise a TypeError.
            print "dirty_limit: " + str(dl) + " std: " + str(s) + " dirty bound: " + str(db) + " ==> MSE: " + str(MSE)

mylib.saveData('dirty_features_theta', [MSEs, thetas])
print MSEs
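# --- Sketch: saveCSV is used above but defined elsewhere. ---
# A minimal reconstruction, assuming it takes a filename and a list of
# rows and writes them comma-separated; the signature is inferred from
# the single call site, not taken from the author's module.
import csv

def saveCSV(path, rows):
    with open(path, 'wb') as f:  # 'wb' for the csv module under Python 2
        writer = csv.writer(f)
        writer.writerows(rows)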
             'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
             'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
             'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
             'just', 'don', 'should', 'now']

# Per-word counters over the training reviews:
#   dict_all      - occurrences of the word in any review
#   dict_count    - occurrences in reviews that received helpfulness votes
#   dict_nHelpful - total "helpful" votes on reviews containing the word
#   dict_helpful  - summed helpfulness ratio (nHelpful / outOf)
dict_all, dict_count, dict_nHelpful, dict_helpful = \
    defaultdict(float), defaultdict(int), defaultdict(float), defaultdict(float)
punc = string.punctuation

i = 0
print "start for loop"
for l in readGz("../train.json.gz"):
    outOf = l['helpful']['outOf']
    # Strip punctuation, then tokenize on whitespace.
    review = ''.join([o for o in list(l['reviewText']) if o not in punc]).split()
    for word in review:
        word = word.lower()
        if word not in stopwords:
            if outOf != 0:
                nHelpful = l['helpful']['nHelpful'] * 1.0
                dict_count[word] += 1
                dict_nHelpful[word] += nHelpful
                dict_helpful[word] += nHelpful / outOf
            dict_all[word] += 1
    i += 1
    # Checkpoint the dictionaries at increasing review counts.
    if i == 500:
        mylib.saveData('../dicts_500', [dict_all, dict_count, dict_nHelpful, dict_helpful])
    elif i == 5000:
        mylib.saveData('../dicts_5000', [dict_all, dict_count, dict_nHelpful, dict_helpful])
    elif i == 50000:
        mylib.saveData('../dicts_50000', [dict_all, dict_count, dict_nHelpful, dict_helpful])
    elif i == 500000:
        mylib.saveData('../dicts_500000', [dict_all, dict_count, dict_nHelpful, dict_helpful])
        break
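# --- Sketch: one plausible downstream use of these dictionaries. ---
# Illustrative only, not taken from the source: rank words by their
# average helpfulness ratio, keeping only words seen in at least 50
# voted-on reviews to damp noise (the 50 cutoff is an arbitrary choice
# for this sketch).
avg_helpful = {w: dict_helpful[w] / dict_count[w]
               for w in dict_count if dict_count[w] >= 50}
top_words = sorted(avg_helpful, key=avg_helpful.get, reverse=True)[:20]
print top_words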