# -*- coding: utf-8 -*-
"""
Created on Tue Nov 10 22:04:42 2015

@author: ssc317
"""
import gzip
from collections import defaultdict
import mylib
def readGz(f):
  for l in gzip.open(f):
    yield eval(l)
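
# readGz() above eval()s every line; because each line of train.json.gz is a
# plain Python-literal record (which is why eval works at all),
# ast.literal_eval is a safer drop-in.  A hedged alternative, given a new name
# so the original readGz is left untouched:
import ast

def readGz_safe(f):
  for l in gzip.open(f):
    yield ast.literal_eval(l)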

# load the 1,000,000 training reviews, split them 500k/500k into training and
# validation sets, and cache both splits with mylib
data_ = []
for l in readGz("../train.json.gz"):
    data_.append(l)
train_data = data_[:500000]
valid_data = data_[500000:]
mylib.saveData('../train_valid_1M',[train_data, valid_data])
mylib.saveData('../train_1M',[data_])
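
# mylib is not part of this listing; only its saveData/loadData calls are
# visible.  A minimal sketch, assuming it is a thin pickle wrapper (the
# implementation below is a guess, not the author's module):
import cPickle as pickle

def saveData(path, obj):
    # serialize obj (here a list of datasets) to path
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def loadData(path):
    # load back whatever saveData wrote
    with open(path, 'rb') as f:
        return pickle.load(f)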
        # total change between the old and new parameters (named "covariance" in
        # the original, but it is really a sum of absolute parameter changes;
        # numpy is assumed to be imported in the part of the script not shown here)
        new_covariance = ([(alpha - new_alpha)**2]
                          + [(b - nb)**2 for b, nb in zip(beta_i.values(), new_beta_i.values())]
                          + [(b - nb)**2 for b, nb in zip(beta_u.values(), new_beta_u.values())])
        new_covariance = sum(numpy.sqrt(new_covariance))
        print "covariance is "+str(new_covariance)
        #if(alpha==new_alpha and beta_i == new_beta_i and beta_u == new_beta_u):
        #if(covariance < 1e-10):
        if(covariance == new_covariance):
            break
        else:
            alpha, beta_i, beta_u, iterNum, covariance = new_alpha, new_beta_i, new_beta_u, iterNum+1, new_covariance
            print "Finish iter " + str(iterNum) + " with lamda " + str(lamda)
    rating_valid_parameters = []
    for d in valid_data:
        rating_valid_parameters.append([alpha, beta_u[d['reviewerID']], beta_i[d['itemID']]])
    rating_valid_predict = [sum(para) for para in rating_valid_parameters]
    rating_valid_MSE = mse(rating_valid_predict, rating_valid_y)
    MSEs.append(rating_valid_MSE)
    thetas.append([alpha,beta_u,beta_i])
    print "MSE of validation set is "+ str(rating_valid_MSE)
print MSEs
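
# mse() is called above but not defined in this fragment; a minimal sketch of
# the usual mean-squared-error helper (assumption: the original helper is not
# shown and may differ):
def mse(predictions, labels):
    # mean of the squared prediction errors
    return sum((p - y) ** 2 for p, y in zip(predictions, labels)) / float(len(labels))

# The loop above only shows the convergence test; the updates that produce
# new_alpha / new_beta_u / new_beta_i are not in this fragment.  A hedged
# sketch of the standard regularized coordinate updates for the bias-only
# model  rating(u,i) ~ alpha + beta_u[u] + beta_i[i]
# (update_biases is a name introduced here, and the numeric rating is assumed
# to be stored under d['rating']):
from collections import defaultdict

def update_biases(train_data, alpha, beta_u, beta_i, lamda):
    # global offset: mean residual after removing the current user/item biases
    new_alpha = sum(d['rating'] - beta_u[d['reviewerID']] - beta_i[d['itemID']]
                    for d in train_data) / float(len(train_data))
    # per-user and per-item biases: mean residual, shrunk toward 0 by lamda
    sum_u, num_u = defaultdict(float), defaultdict(float)
    sum_i, num_i = defaultdict(float), defaultdict(float)
    for d in train_data:
        u, i = d['reviewerID'], d['itemID']
        sum_u[u] += d['rating'] - new_alpha - beta_i[i]
        num_u[u] += 1
        sum_i[i] += d['rating'] - new_alpha - beta_u[u]
        num_i[i] += 1
    new_beta_u = dict((u, sum_u[u] / (lamda + num_u[u])) for u in sum_u)
    new_beta_i = dict((i, sum_i[i] / (lamda + num_i[i])) for i in sum_i)
    return new_alpha, new_beta_u, new_beta_i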
# In[] validation
# pick the regularization strength with the lowest validation MSE
# (lamda was swept over 1, 2, ..., so the best lamda is index+1)
index = MSEs.index(min(MSEs))
[alpha,beta_u,beta_i] = thetas[index]
rating_valid_parameters = []
for d in valid_data:
    rating_valid_parameters.append([alpha, beta_u[d['reviewerID']], beta_i[d['itemID']]])

rating_valid_predict = [sum(para) for para in rating_valid_parameters]
rating_valid_MSE = mse(rating_valid_predict, rating_valid_y)
print "MSE of validation set is "+ str(rating_valid_MSE) + " with lamda "+ str(index+1)
mylib.saveData('1M_train_rating',[alpha, beta_u, beta_i])
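
# Note on the lookups above: beta_u[d['reviewerID']] / beta_i[d['itemID']]
# raise a KeyError for reviewers or items that never appear in the training
# split (or silently return 0 if the betas are defaultdicts).  A hedged
# variant that makes the cold-start fallback explicit (predict_rating is a
# name introduced here, not the author's):
def predict_rating(d, alpha, beta_u, beta_i):
    # fall back to the global offset alpha when the user or item is unseen
    return alpha + beta_u.get(d['reviewerID'], 0.0) + beta_i.get(d['itemID'], 0.0)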
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 12 21:07:04 2015

@author: ssc317
"""

import gzip
import mylib
from collections import defaultdict
import string
def readGz(f):
  for l in gzip.open(f):
    yield eval(l)
data = []
for l in readGz("../train.json.gz"):
    data.append(l)
mylib.saveData('../1Mtrain', [data])
    # write "userID-itemID,prediction" rows for the test pairs
    rating_test_result = [[pr[0]+'-'+pr[1], str(ptr)] for pr,ptr in zip(pairs_Rating,rating_test_predict)]

    saveCSV('rating_test_result.csv',rating_test_result)
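
# saveCSV() is used above but not defined in this fragment; a minimal sketch
# using the csv module (assumption: the real helper's header row and exact
# formatting are unknown):
import csv

def saveCSV(path, rows):
    # write each [userID-itemID, prediction] pair as one CSV line
    with open(path, 'wb') as f:   # 'wb' for the Python 2 csv module
        writer = csv.writer(f)
        writer.writerows(rows)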
    
# In[]================================================================================
# In[] main
[data_] = mylib.loadData('./assignment1/data/1Mtrain')
train_data = data_[:900000]
valid_data = data_[900000:]
del data_

# hyper-parameter grid for the "dirty" user/item heuristic
dirty_limits = [10,11,12,13,14,15,16,17,18,19,20]
sds = [0.3,0.5,0.7,0.9,1.0,2.0]
dirty_bound = [[2.6,4.4],[2.8,4.2],[3,4],[3.5,3.5]]
# 3-D grids holding the validation MSE and fitted parameters for every combination
MSEs = [[[0 for dbi in range(len(dirty_bound))] for si in range(len(sds))] for dli in range(len(dirty_limits))]
thetas = [[[0 for dbi in range(len(dirty_bound))] for si in range(len(sds))] for dli in range(len(dirty_limits))]
for dli in range(len(dirty_limits)):
    dl = dirty_limits[dli]
    for si in range(len(sds)):
        s = sds[si]
        for dbi in range(len(dirty_bound)):
            db = dirty_bound[dbi]
            dirty_feature = [dl, s, db]
            [alpha, beta_i, beta_u, dirty_u, dirty_i] = convergence_dirty(train_data, 3, dirty_feature)
            MSE = validate_dirty(valid_data, dirty_u, dirty_i, alpha, beta_u, beta_i)
            MSEs[dli][si][dbi] = MSE
            thetas[dli][si][dbi] = [alpha, beta_i, beta_u, dirty_u, dirty_i]
            print "dirty_limit: " + str(dl) + "  std: "+str(s)+ "  dirty bound: "+ str(db) + "==> MSE: " + MSE 
            mylib.saveData('dirty_features_theta', [MSEs,thetas])
print MSEs
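
# Selecting the best cell of the 3-D MSE grid afterwards; the original
# selection code is not part of this fragment, so this is a small hedged
# helper (argmin_grid is a name introduced here):
def argmin_grid(MSEs):
    best = None
    for dli, by_sd in enumerate(MSEs):
        for si, by_db in enumerate(by_sd):
            for dbi, m in enumerate(by_db):
                if best is None or m < best[0]:
                    best = (m, dli, si, dbi)
    return best   # (min MSE, dirty_limit index, sd index, dirty_bound index)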
# tail of the English stopword list used below; the earlier entries of the
# list (and its opening "stopwords = [") are missing from this fragment
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']

# word -> total count, count in voted reviews, summed nHelpful, summed helpfulness ratio
dict_all, dict_count, dict_nHelpful, dict_helpful = defaultdict(float),defaultdict(int),defaultdict(float),defaultdict(float)
punc = string.punctuation
i = 0
print "start for loop"
for l in readGz("../train.json.gz"):
    outOf = l['helpful']['outOf']
    # strip punctuation from the review text and split it into words
    review = ''.join([o for o in list(l['reviewText']) if not o in punc]).split()
    for word in review:
        word = word.lower()
        if word not in stopwords:
            if outOf != 0:
                # only reviews that actually received votes contribute to the helpfulness stats
                nHelpful = l['helpful']['nHelpful'] * 1.0
                dict_count[word] += 1
                dict_nHelpful[word] += nHelpful
                dict_helpful[word] += nHelpful / outOf
            dict_all[word] += 1
    i += 1
    # checkpoint the dictionaries after 500 / 5,000 / 50,000 / 500,000 reviews
    if i == 500:
        mylib.saveData('../dicts_500',[dict_all, dict_count, dict_nHelpful, dict_helpful])
    elif i == 5000:
        mylib.saveData('../dicts_5000',[dict_all, dict_count, dict_nHelpful, dict_helpful])
    elif i == 50000:
        mylib.saveData('../dicts_50000',[dict_all, dict_count, dict_nHelpful, dict_helpful])
    elif i == 500000:
        mylib.saveData('../dicts_500000',[dict_all, dict_count, dict_nHelpful, dict_helpful])
    if i == 500000:
        break
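
# A small hedged example of how the dictionaries above might be queried once
# the loop finishes: average helpfulness ratio per word, restricted to words
# seen often enough to be meaningful (the threshold 50 and the helper name
# top_helpful_words are choices made here, not the author's):
def top_helpful_words(dict_count, dict_helpful, min_count=50, n=20):
    ratios = [(dict_helpful[w] / dict_count[w], w)
              for w in dict_count if dict_count[w] >= min_count]
    ratios.sort(reverse=True)
    return ratios[:n]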