Esempio n. 1
0
# Term frequency: for every review, each token count divided by the count of
# the most frequent token in that same review.
TF = dict()
i = 0
for review in reviews_feature:
  i += 1
  disp.tempPrint(str(i))  # presumably a transient progress display -- confirm in disp
  counts = reviews_feature[review]
  TF[review] = Counter()
  if not counts:
    # A review with no tokens keeps an empty Counter; max() of an empty
    # sequence would raise ValueError.
    continue
  # The maximum is identical for every token of a review, so compute it once
  # per review instead of once per token.
  max_count = float(max(counts.values()))
  for token in counts:
    TF[review][token] = float(counts[token]) / max_count

print "> Computing IDF"
IDF = dict()
i = 0
for token in alltoken:
  i += 1
  disp.tempPrint(str(i))
  IDF[token] = log(float(n) / float(len(alltoken[token])))

print "> Computing TFIDF"
TFIDF = dict()
i = 0
for review in reviews_feature:
  i += 1
  disp.tempPrint(str(i))
  TFIDF[review] = Counter()
  for token in reviews_feature[review]:
    TFIDF[review][token] = TF[review][token] * IDF[token]

# SGD linear regression

data.saveFile(TFIDF, root + '/computed/TFIDF.pkl')
# Tools 
from utils import data
from collections import Counter

def dot(csparse, c):
  """Return the dot product of two key-indexed vectors.

  Only the keys of `csparse` are visited; each of its values is multiplied
  by the value stored under the same key in `c`, and the products are
  accumulated starting from the integer 0. Whether a key missing from `c`
  raises or counts as zero depends on the type of `c` (a plain dict raises
  KeyError, a Counter yields 0).
  """
  total = 0
  for feature in csparse:
    value = csparse[feature]
    total = total + value * c[feature]
  return total

print "> Loading"
root = data.getParent(__file__)
print "loading TFIDF matrix"
TFIDF = data.loadFile(root + '/computed/TFIDF.pkl')
print "loading weights"
weights, bias = data.loadFile(root + "/computed/linear_regression_weights.pkl")

predict = dict()
for review in TFIDF:
  predict[review] = bias + dot(TFIDF[review], weights)

data.saveFile(predict, root + "/computed/linear_regression_predict.pkl")
# number of reviews a token has to appear to be kept
hardthreshold = 2

print "> Loading data"
alltoken = data.loadFile(root + '/computed/alltoken.pkl')

print "> Scanning data"
print "Loading file", filename

reviews_feature = dict()
reviews_score = dict()

tok = Tokenizer(preserve_case=True)
# extracting tokens
for line in data.generateLine(filename):
  review = json.loads(line)
  reviewid = review['review_id']
  text = tok.ngrams(review['text'], 1, 3)
  score = int(review['stars'])
 
  # filtering tokens by the ones in the model
  text = filter(lambda k: k in alltoken, text)
  reviews_feature[reviewid] = Counter(text)
  reviews_score[reviewid] = score

print "> End of full scan"

print "> Saving"
data.saveFile(reviews_feature, root + "/computed/reviews_feature.pkl")
data.saveFile(reviews_score, root + "/computed/reviews_score.pkl")
# Tools 
from utils import data
from linear_regression_sgd import sgd

print "> Loading"
root = data.getParent(__file__)
print "loading review scores"
target = data.loadFile(root + '/computed/reviews_score.pkl')
print "loading TFIDF matrix"
TFIDF = data.loadFile(root + '/computed/TFIDF.pkl')

nReviews = len(target)

print "> Optimizing with SGD"
RMSE, weights, bias = sgd(TFIDF, target, alpha=0.001, epsilon=0.001)
print "Alpha = ", alpha, " -- RMSE = ", RMSE
data.saveFile((weights, bias), root + "/computed/linear_regression_weights.pkl")