def applyAggregationModel(testReviews, featureAvgSent, model, busImportantFeatures, userImportantFeatures):
    """Attach a business-level aggregated rating prediction to each review.

    For every review, the sentiments stored for its business in
    ``busImportantFeatures`` are restricted to the aspects the review is
    predicted to contain, encoded with ``encodeAspects1features`` and fed to
    ``model.predict``.  The prediction is stored under
    ``review['rating_prediction']['aggregBUSavg']``.

    Args:
        testReviews: sequence of review dicts.  Each must provide
            ``'exPredFeatures'`` and ``'business_id'``.  The dicts are
            modified in place.
        featureAvgSent: forwarded unchanged to ``encodeAspects1features``.
        model: object exposing ``predict(features)``.
        busImportantFeatures: mapping ``business_id -> {'sentiment': {...}}``
            where each sentiment entry is indexable; item 0 is used as the
            aspect value and item 1 must be greater than 1 for the aspect to
            be used (presumably a support count -- TODO confirm).
        userImportantFeatures: not used by this function; kept so existing
            callers keep working.

    Returns:
        The same ``testReviews`` object that was passed in.
    """
    logger = logging.getLogger('signature.aAM.applyAggregationModel')
    logger.info('starting applyAggregationModel from %d reviews', len(testReviews))
    fsw = featureStructureWorker()

    for r, review in enumerate(testReviews):
        predictedFeatures = review['exPredFeatures']

        # Businesses without an entry contribute no aspects at all.
        busID = review['business_id']
        if busID in busImportantFeatures:
            busSents = busImportantFeatures[busID]['sentiment']
        else:
            busSents = {}

        # Keep only aspects that are predicted for this review (flag == 1)
        # and whose business-level entry passes the item-1 threshold.
        testData = {
            aspect: busSents[aspect][0]
            for aspect in busSents
            if aspect in predictedFeatures
            and predictedFeatures[aspect][1] == 1
            and busSents[aspect][1] > 1
        }

        # Only the business-level sentiments are encoded; nothing else is
        # fed to the model here.
        features = encodeAspects1features(fsw, testData, featureAvgSent)
        aggregationBUS = model.predict(features)

        # Preserve predictions that other stages may already have stored.
        review.setdefault('rating_prediction', {})['aggregBUSavg'] = aggregationBUS

        if r % 10000 == 0:
            logger.debug('%d reviews processed', r)

    return testReviews
def learnAggregationModelsCV(trainReviews, featureAvgSent, busImportantFeatures, path):
    """Fit a Ridge rating model, choosing ``alpha`` by 10-fold cross-validation.

    Every aspect of a training review is replaced by the sentiment recorded
    for the review's business in ``busImportantFeatures`` when that entry's
    item 1 is greater than 5 (presumably a support count -- TODO confirm);
    otherwise the value from ``featureAvgSent`` is used.  The resulting
    aspect mapping is encoded with ``encodeAspects1features`` and paired with
    the review's ``'stars'`` value as the regression target.

    NOTE(review): a later definition in this file with the same name (taking
    ``trainReviews, featureAvgSent, path``) rebinds ``learnAggregationModelsCV``
    at import time, so this variant cannot be reached by name -- confirm
    which one callers are meant to use.

    Args:
        trainReviews: sequence of review dicts providing ``'features'``,
            ``'stars'`` and ``'business_id'``.
        featureAvgSent: mapping aspect -> fallback sentiment; also forwarded
            to ``encodeAspects1features``.
        busImportantFeatures: mapping ``business_id -> {'sentiment': {...}}``
            whose entries are indexable (item 0 value, item 1 threshold).
        path: not used by this function; kept so existing callers keep
            working.

    Returns:
        A ``linear_model.Ridge`` instance fitted on all training data with
        the selected regularization strength.
    """
    logger = logging.getLogger("signature.lAMCV.learnAggregationModelsCV")
    logger.info("starting learnAggregationModel from %d reviews", len(trainReviews))
    fsw = featureStructureWorker()

    learnData = []
    learnLabels = []
    for review in trainReviews:
        reviewFeatures = fsw.getReviewFeaturesSentiment(review["features"])

        # Resolve the business entry once per review instead of per aspect.
        busID = review["business_id"]
        if busID in busImportantFeatures:
            busSents = busImportantFeatures[busID]["sentiment"]
        else:
            busSents = {}

        # Values of existing keys are overwritten in place; no keys are added
        # or removed, so iterating the mapping while assigning is safe.
        for aspect in reviewFeatures:
            if aspect in busSents and busSents[aspect][1] > 5:
                reviewFeatures[aspect] = busSents[aspect][0]
            else:
                reviewFeatures[aspect] = featureAvgSent[aspect]

        # features = encodeAspects2features(fsw, reviewFeatures)
        learnData.append(encodeAspects1features(fsw, reviewFeatures, featureAvgSent))
        learnLabels.append(review["stars"])

    learnData = np.array(learnData)
    learnLabels = np.array(learnLabels)

    regCandidates = [0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 5.0, 10, 15, 50, 100, 200, 500]

    # The splits do not depend on alpha, so build them once and reuse them.
    folds = list(cross_validation.KFold(len(learnLabels), n_folds=10))

    # Ridge.score is R^2, which may be zero or negative.  Start from -inf and
    # the first candidate so a grid value is always selected; starting from
    # 0.0 would silently fall back to alpha=0.0 when no score is positive.
    bestRes = float("-inf")
    bestReg = regCandidates[0]
    for reg in regCandidates:
        results = []
        for train_index, test_index in folds:
            clf = linear_model.Ridge(alpha=reg)
            clf.fit(learnData[train_index], learnLabels[train_index])
            results.append(clf.score(learnData[test_index], learnLabels[test_index]))
        meanScore = np.average(results)
        if meanScore > bestRes:
            bestRes = meanScore
            bestReg = reg

    logger.info("Best score %f with regularization = %.2f", bestRes, bestReg)

    # Refit on the complete training set with the selected strength.
    clf = linear_model.Ridge(alpha=bestReg)
    clf.fit(learnData, learnLabels)
    return clf
def learnAggregationModelsCV(trainReviews, featureAvgSent, path):
    """Fit a Ridge rating model, choosing ``alpha`` by 10-fold cross-validation.

    Each training review's aspect sentiments (obtained through
    ``featureStructureWorker.getReviewFeaturesSentiment``) are encoded with
    ``encodeAspects1features`` and paired with the review's ``'stars'`` value
    as the regression target.

    NOTE(review): this file also contains an earlier definition with the same
    name that takes an extra ``busImportantFeatures`` argument; this
    definition comes later and therefore is the one bound to the name --
    confirm that is intended.

    Args:
        trainReviews: sequence of review dicts providing ``'features'`` and
            ``'stars'``.
        featureAvgSent: forwarded unchanged to ``encodeAspects1features``.
        path: not used by this function; kept so existing callers keep
            working.

    Returns:
        A ``linear_model.Ridge`` instance fitted on all training data with
        the selected regularization strength.
    """
    logger = logging.getLogger('signature.lAMCV.learnAggregationModelsCV')
    logger.info('starting learnAggregationModel from %d reviews', len(trainReviews))
    fsw = featureStructureWorker()

    learnData = []
    learnLabels = []
    for review in trainReviews:
        reviewFeatures = fsw.getReviewFeaturesSentiment(review['features'])
        #features = encodeAspects2features(fsw, reviewFeatures)
        learnData.append(encodeAspects1features(fsw, reviewFeatures, featureAvgSent))
        learnLabels.append(review['stars'])

    learnData = np.array(learnData)
    learnLabels = np.array(learnLabels)

    regCandidates = [0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 5.0, 10, 15, 50, 100, 200, 500]

    # The splits do not depend on alpha, so build them once and reuse them.
    folds = list(cross_validation.KFold(len(learnLabels), n_folds=10))

    # Ridge.score is R^2, which may be zero or negative.  Start from -inf and
    # the first candidate so a grid value is always selected; starting from
    # 0.0 would silently fall back to alpha=0.0 when no score is positive.
    bestRes = float('-inf')
    bestReg = regCandidates[0]
    for reg in regCandidates:
        results = []
        for train_index, test_index in folds:
            clf = linear_model.Ridge(alpha=reg)
            clf.fit(learnData[train_index], learnLabels[train_index])
            results.append(clf.score(learnData[test_index], learnLabels[test_index]))
        meanScore = np.average(results)
        if meanScore > bestRes:
            bestRes = meanScore
            bestReg = reg

    logger.info('Best score %f with regularization = %.2f', bestRes, bestReg)

    # Refit on the complete training set with the selected strength.
    clf = linear_model.Ridge(alpha=bestReg)
    clf.fit(learnData, learnLabels)
    return clf