def _hasEnoughSamples(total, positives, minSamples, minPerClass):
    """Return True when a label set is big and balanced enough to train on.

    ``total`` is the number of labels and ``positives`` is their sum; the
    remainder (``total - positives``) is treated as the other class.
    """
    return (total >= minSamples
            and positives >= minPerClass
            and total - positives >= minPerClass)


def learnFeatureExistance(busImportantFeatures, userImportantFeatures, trainReviews, path,
                          minSamples=100, minPerClass=50):
    """Fit two models per enabled feature and collect them in a dict.

    For every feature whose flag in ``featureStructureWorker().featureIdicator``
    is truthy, training data is obtained from ``getFeatures`` and two models
    are fitted with ``getLogModel``: the first on ``(X1, Y1)`` and the second
    on ``(X2, Y2)`` (the second is announced in the log as the sentiment
    model).  A feature is skipped unless both label sets pass the size gate.

    Args:
        busImportantFeatures: forwarded unchanged to ``getFeatures``.
        userImportantFeatures: forwarded unchanged to ``getFeatures``.
        trainReviews: sized collection of reviews forwarded to ``getFeatures``.
        path: forwarded unchanged to ``getLogModel``.
        minSamples: minimum number of labels required in each of Y1 and Y2.
        minPerClass: minimum ``sum(Y)`` and minimum ``len(Y) - sum(Y)``
            required in each label set (labels are presumably 0/1 --
            TODO confirm against getFeatures).

    Returns:
        dict mapping feature -> ``[bestThres, logmodel, bestThres_2,
        logmodel_2, feat_info]`` where ``feat_info`` is
        ``[len(Y1), sum(Y1), len(Y1) - sum(Y1)] + bestQ
        + [len(Y2), sum(Y2), len(Y2) - sum(Y2)] + bestQ_2``.
    """
    logger = logging.getLogger('signature.learnFE')
    logger.info('starting learnFeatureExistance from %d reviews', len(trainReviews))

    fsw = featureStructureWorker()
    modelDict = dict()

    for f, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue

        logger.info('Start working with (%d) %s', f, feature)

        # The fifth value returned by getFeatures is not needed for training.
        X1, Y1, X2, Y2, _missed = getFeatures(logger, feature, trainReviews,
                                              busImportantFeatures, userImportantFeatures)

        # Count once; the same numbers feed the log line, the gate and feat_info.
        total1, positives1 = len(Y1), sum(Y1)
        total2, positives2 = len(Y2), sum(Y2)
        logger.debug('Got features for %d (%d/%d)reviews (%d of them pos(%d)/neg(%d))',
                     total1, positives1, total1 - positives1,
                     total2, positives2, total2 - positives2)

        # Both label sets must be large and balanced enough, otherwise no
        # model is stored for this feature at all.
        if not _hasEnoughSamples(total1, positives1, minSamples, minPerClass):
            continue
        if not _hasEnoughSamples(total2, positives2, minSamples, minPerClass):
            continue

        # Model on (X1, Y1).
        bestThres, bestQ, logmodel = getLogModel(logger, feature, X1, Y1, path)

        # Model on (X2, Y2).
        logger.info('Sentiment prediction for (%d) %s', f, feature)
        bestThres_2, bestQ_2, logmodel_2 = getLogModel(logger, feature, X2, Y2, path)

        # bestQ / bestQ_2 are concatenated with lists, so getLogModel is
        # expected to return them as lists.
        feat_info = ([total1, positives1, total1 - positives1] + bestQ
                     + [total2, positives2, total2 - positives2] + bestQ_2)

        modelDict[feature] = [bestThres, logmodel, bestThres_2, logmodel_2, feat_info]

    return modelDict
def _perClassQuality(yTrue, yPred):
    """Return per-class F1, precision and recall scores as one flat list.

    The three metric functions are called with ``average=None`` and their
    results are concatenated in the order F1, precision, recall.
    """
    scores = list(f1_score(yTrue, yPred, average=None))
    scores += list(precision_score(yTrue, yPred, average=None))
    scores += list(recall_score(yTrue, yPred, average=None))
    return scores


def applyFeatureExistance(busImportantFeatures, userImportantFeatures, testReviews, modelDict, path):
    """Evaluate the stored models on test reviews and annotate each review.

    For every enabled feature that has an entry in ``modelDict`` this:

    1. fetches test samples for all reviews with ``getFeatures`` and records
       aggregate quality figures for the model at ``modelDict[feature][1]``
       (existence) and the model at ``modelDict[feature][3]`` (sentiment);
    2. calls ``getFeatures`` again for each single review and stores the
       per-review predictions on the review itself.

    Args:
        busImportantFeatures: mapping indexed by ``review['business_id']``;
            each entry is read via ``['featureFreq'][feature]`` and
            ``['sentiment'].get(feature, [0.0, 0])``.  Also forwarded to
            ``getFeatures``.
        userImportantFeatures: forwarded unchanged to ``getFeatures``.
        testReviews: sized, iterable collection of dict-like reviews.  The
            reviews are modified in place.
        modelDict: feature -> list whose items 1 and 3 are fitted models
            offering ``predict`` and ``predict_proba``.
        path: unused; kept so existing callers keep working.

    Returns:
        ``(testReviews, featureWeights, featureQuality)`` where

        * ``featureWeights[feature]`` is ``sum(Y1) / len(Y1)`` over the
          whole test set;
        * ``featureQuality[feature]`` is ``[round(weight, 2), len(Y1)]``
          followed by the rounded existence scores, then
          ``[round(sum(Y2) / len(Y2), 2), len(Y2)]`` followed by the
          rounded sentiment scores.

        Each review with existence samples gains
        ``review['exPredFeatures'][feature] = [actual, probability,
        random baseline, business-frequency baseline, 1, 0]``; each review
        with sentiment samples gains the analogous
        ``review['sentPredFeatures'][feature]``.

    Features for which the test set produces no existence or no sentiment
    samples are skipped with a warning, because neither the weights nor the
    quality scores can be computed for them.
    """
    logger = logging.getLogger('signature.applyFE.aFE')
    logger.info('starting applyFeatureExistance from %d reviews', len(testReviews))

    fsw = featureStructureWorker()
    featureWeights = dict()
    featureQuality = dict()

    for k, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        if feature not in modelDict:
            continue

        logger.debug('Start working with (%d) %s', k, feature)

        # Items 0 and 2 of the entry (the thresholds) are intentionally not
        # applied here: the raw class probability is what gets stored.
        existenceModel = modelDict[feature][1]
        sentimentModel = modelDict[feature][3]

        X1, Y1, X2, Y2, _missed = getFeatures(logger, feature, testReviews,
                                              busImportantFeatures, userImportantFeatures)

        # Guard against dividing by zero (and scoring empty arrays) below.
        if not len(Y1) or not len(Y2):
            logger.warning('Skipping %s: no test samples (existence: %d, sentiment: %d)',
                           feature, len(Y1), len(Y2))
            continue

        # weight = frequency
        existenceWeight = float(sum(Y1)) / len(Y1)
        featureWeights[feature] = existenceWeight
        # weight = sentiment
        sentimentWeight = float(sum(Y2)) / len(Y2)

        # Existence quality over the whole test set.
        quality = _perClassQuality(np.array(Y1), existenceModel.predict(np.array(X1)))
        # Sentiment quality over the whole test set.
        qualityS = _perClassQuality(np.array(Y2), sentimentModel.predict(np.array(X2)))

        featureQuality[feature] = (
            [round(existenceWeight, 2), len(Y1)]
            + [round(x, 2) for x in quality]
            + [round(sentimentWeight, 2), len(Y2)]
            + [round(x, 2) for x in qualityS]
        )

        # NOTE(review): getFeatures is invoked once per review per feature.
        # That is costly, but it is the only visible way to know which
        # samples belong to which review, so it is kept as is.
        for r, review in enumerate(testReviews):
            rX1, rY1, rX2, rY2, _missed = getFeatures(logger, feature, [review],
                                                      busImportantFeatures,
                                                      userImportantFeatures)

            # Existence: only when getFeatures produced a sample for the review.
            if len(rY1):
                existence = rY1[0]
                # Column 1 of predict_proba is the probability of the second class.
                predictedExistence = existenceModel.predict_proba(np.array(rX1))[0][1]
                randomPrediction = random.random()
                # featureFreq is divided by 100, so it is presumably a
                # percentage -- TODO confirm.
                simplePrediction = (
                    busImportantFeatures[review['business_id']]['featureFreq'][feature] / 100.0
                )
                review.setdefault('exPredFeatures', {})[feature] = [
                    existence, predictedExistence, randomPrediction,
                    simplePrediction, 1, 0,
                ]

            # Sentiment: only when getFeatures produced a sample for the review.
            if len(rY2):
                sentiment = rY2[0]
                predictedSentiment = sentimentModel.predict_proba(np.array(rX2))[0][1]
                randomSPrediction = random.random()
                # (value + 1) / 2 maps a value in [-1, 1] onto [0, 1]; a
                # missing feature defaults to 0.0 and therefore to 0.5.
                businessSentiment = busImportantFeatures[review['business_id']]['sentiment']
                simpleSPrediction = (businessSentiment.get(feature, [0.0, 0])[0] + 1) / 2.0
                review.setdefault('sentPredFeatures', {})[feature] = [
                    sentiment, predictedSentiment, randomSPrediction,
                    simpleSPrediction, 1, 0,
                ]

            if not r % 5000:
                logger.debug('%d reviews processed', r)

    return testReviews, featureWeights, featureQuality