Beispiel #1
0
# Tools
from utils import disp, data

# ML
from collections import Counter
from math import log

print "> Loading data"

root = data.getParent(__file__)

alltoken = data.loadFile(root + '/computed/alltoken.pkl')
reviews_feature = data.loadFile(root + '/computed/reviews_feature.pkl')

n = len(reviews_feature)

print "Total reviews:", n

# TF-IDF
print "> Computing TF"
TF = dict()
i = 0
for review in reviews_feature:
  i += 1
  disp.tempPrint(str(i))
  TF[review] = Counter()
  for token in reviews_feature[review]:
    TF[review][token] = float(reviews_feature[review][token]) / float(max(reviews_feature[review].values()))

print "> Computing IDF"
IDF = dict()
from tokenizer import Tokenizer

# storing data
from collections import Counter

# Scan a review file, one JSON document per line, and pull out the fields
# needed downstream.
# NOTE(review): `sys`, `json` and `data` are used below but their imports are
# not visible in this part of the file -- confirm they are imported above.
root = data.getParent(__file__)
#filename = root + "/dataset/yelp_academic_dataset_review_training_small.json"
# Path of the review file, taken from the first command-line argument.
filename = sys.argv[1]

# Variables

# Minimum number of reviews a token has to appear in to be kept.
# NOTE(review): not referenced anywhere in the visible part of this script.
hardthreshold = 2

print "> Loading data"
alltoken = data.loadFile(root + '/computed/alltoken.pkl')

print "> Scanning data"
print "Loading file", filename

# NOTE(review): both dicts are created here but the visible loop body never
# writes to them -- the code that fills them appears to be cut off.
reviews_feature = dict()
reviews_score = dict()

tok = Tokenizer(preserve_case=True)
# Extract tokens: every line yielded by data.generateLine is parsed as JSON.
for line in data.generateLine(filename):
  review = json.loads(line)
  reviewid = review['review_id']
  # presumably n-grams of length 1 through 3 -- confirm against Tokenizer.ngrams
  text = tok.ngrams(review['text'], 1, 3)
  score = int(review['stars'])
 
# Tools 
from utils import data
from linear_regression_sgd import sgd

print "> Loading"
root = data.getParent(__file__)
print "loading review scores"
target = data.loadFile(root + '/computed/reviews_score.pkl')
print "loading TFIDF matrix"
TFIDF = data.loadFile(root + '/computed/TFIDF.pkl')

nReviews = len(target)

print "> Optimizing with SGD"
RMSE, weights, bias = sgd(TFIDF, target, alpha=0.001, epsilon=0.001)
print "Alpha = ", alpha, " -- RMSE = ", RMSE
data.saveFile((weights, bias), root + "/computed/linear_regression_weights.pkl")
# Tools 
from utils import data
from collections import Counter

def dot(csparse, c):
  """Return the dot product of the sparse mapping csparse with c.

  Only keys present in csparse contribute; each is looked up as c[key].
  An empty csparse gives 0.
  """
  total = 0
  for key, value in csparse.items():
    total += value * c[key]
  return total

print "> Loading"
root = data.getParent(__file__)
print "loading TFIDF matrix"
TFIDF = data.loadFile(root + '/computed/TFIDF.pkl')
print "loading weights"
weights, bias = data.loadFile(root + "/computed/linear_regression_weights.pkl")

predict = dict()
for review in TFIDF:
  predict[review] = bias + dot(TFIDF[review], weights)

data.saveFile(predict, root + "/computed/linear_regression_predict.pkl")
# Tools
from utils import data
from evaluation import plot

print "> Loading"
root = data.getParent(__file__)
print "loading review scores"
target = data.loadFile(root + "/computed/reviews_score.pkl")
print "loading predicted scores"
predict = data.loadFile(root + "/computed/linear_regression_predict.pkl")

RMSE = 0
for review in target:
    RMSE += (target[review] - predict[review]) ** 2
RMSE /= len(target)

print "RMSE:", RMSE

plot.error_boxplot(target, predict)