# Tools
from utils import disp, data
# ML
from collections import Counter
from math import log

# Script: load the per-review token values and compute a normalised term
# frequency (TF) for every review: each token's value divided by the largest
# value found in that same review.

print("> Loading data")
root = data.getParent(__file__)
# NOTE(review): alltoken is loaded here but not used in the visible part of
# the script; presumably the IDF step below needs it -- confirm.
alltoken = data.loadFile(root + '/computed/alltoken.pkl')
# reviews_feature maps review -> {token: numeric value}
reviews_feature = data.loadFile(root + '/computed/reviews_feature.pkl')
n = len(reviews_feature)
print("Total reviews: %s" % n)

# TF-IDF
print("> Computing TF")
TF = dict()
for i, review in enumerate(reviews_feature, 1):
    # progress display: running count of processed reviews
    disp.tempPrint(str(i))
    counts = reviews_feature[review]
    TF[review] = Counter()
    if not counts:
        # a review without tokens keeps an empty Counter (and max() of an
        # empty sequence would raise)
        continue
    # The largest value is the same for every token of the review, so it is
    # computed once per review instead of once per token.
    max_count = float(max(counts.values()))
    for token in counts:
        TF[review][token] = float(counts[token]) / max_count

print("> Computing IDF")
IDF = dict()
from tokenizer import Tokenizer
# storing data
from collections import Counter

# Script: read a review file (one JSON document per line, path given as the
# first command-line argument) and tokenise every review into 1- to 3-grams.

root = data.getParent(__file__)
#filename = root + "/dataset/yelp_academic_dataset_review_training_small.json"
filename = sys.argv[1]

# Variables
# number of reviews a token has to appear to be kept
hardthreshold = 2

print("> Loading data")
alltoken = data.loadFile(root + '/computed/alltoken.pkl')

print("> Scanning data")
print("%s %s" % ("Loading file", filename))
reviews_feature = {}
reviews_score = {}
tok = Tokenizer(preserve_case=True)

# extracting tokens
# NOTE(review): the loop body appears to continue beyond the last statement
# visible here -- confirm against the full script.
for line in data.generateLine(filename):
    # each line is parsed as one JSON-encoded review
    review = json.loads(line)
    reviewid = review['review_id']
    # n-grams of the review text, with n from 1 to 3
    text = tok.ngrams(review['text'], 1, 3)
    score = int(review['stars'])
# Tools
from utils import data
from linear_regression_sgd import sgd

# Script: fit the linear regression with SGD on the stored TFIDF matrix and
# review scores, then save the resulting (weights, bias) pair.

# Hyper-parameters handed to sgd(). They are named here so the summary line
# below can report alpha; the original printed an undefined `alpha` and
# crashed with NameError before the weights were saved.
alpha = 0.001
epsilon = 0.001

print("> Loading")
root = data.getParent(__file__)
print("loading review scores")
target = data.loadFile(root + '/computed/reviews_score.pkl')
print("loading TFIDF matrix")
TFIDF = data.loadFile(root + '/computed/TFIDF.pkl')
nReviews = len(target)

print("> Optimizing with SGD")
# sgd() returns three values, unpacked here as (RMSE, weights, bias)
RMSE, weights, bias = sgd(TFIDF, target, alpha=alpha, epsilon=epsilon)
# same text the Python 2 statement `print "Alpha = ", alpha, ...` would emit
print("Alpha =  %s  -- RMSE =  %s" % (alpha, RMSE))
data.saveFile((weights, bias), root + "/computed/linear_regression_weights.pkl")
# Tools
from utils import data
from collections import Counter


def dot(csparse, c):
    """Return the sum of csparse[key] * c[key] over every key of csparse.

    Only the keys of ``csparse`` are visited; ``c`` is indexed with each of
    them. The result is 0 when ``csparse`` is empty.
    """
    return sum(csparse[key] * c[key] for key in csparse)


# Script: apply the stored linear model to every review of the TFIDF matrix
# and save the predictions keyed by review.
print("> Loading")
root = data.getParent(__file__)
print("loading TFIDF matrix")
TFIDF = data.loadFile(root + '/computed/TFIDF.pkl')
print("loading weights")
weights, bias = data.loadFile(root + "/computed/linear_regression_weights.pkl")

predict = {}
for review in TFIDF:
    # prediction = bias + <features of the review, weights>
    features = TFIDF[review]
    predict[review] = bias + dot(features, weights)

data.saveFile(predict, root + "/computed/linear_regression_predict.pkl")
# Tools
from math import sqrt

from utils import data
from evaluation import plot

# Script: compare the stored predictions with the true review scores, print
# the root-mean-square error and hand both mappings to the plotting helper.

print("> Loading")
root = data.getParent(__file__)
print("loading review scores")
target = data.loadFile(root + "/computed/reviews_score.pkl")
print("loading predicted scores")
predict = data.loadFile(root + "/computed/linear_regression_predict.pkl")

# Float accumulator so the division below is never an integer division,
# even when every score and prediction is an int (Python 2).
squared_error = 0.0
for review in target:
    squared_error += (target[review] - predict[review]) ** 2

# The original printed the mean of the squared errors under the "RMSE"
# label; the square root was missing.
RMSE = sqrt(squared_error / len(target))
print("RMSE: %s" % RMSE)

plot.error_boxplot(target, predict)