コード例 #1
0
def learnFeatureExistance(busImportantFeatures, userImportantFeatures, trainReviews, path,
                          minSamples=100, minPerClass=50):
    """Fit an existence model and a sentiment model for every enabled feature.

    For each feature flagged in ``featureStructureWorker().featureIdicator``
    the training data is fetched with ``getFeatures`` and two models are fitted
    with ``getLogModel``: the first on ``(X1, Y1)``, the second on
    ``(X2, Y2)``.  A feature is skipped unless both label sets have at least
    ``minSamples`` entries and at least ``minPerClass`` entries in each class.
    ``sum(Y)`` is used as the positive count, so labels are presumably 0/1 --
    TODO confirm against ``getFeatures``.

    Args:
        busImportantFeatures: Passed through to ``getFeatures``.
        userImportantFeatures: Passed through to ``getFeatures``.
        trainReviews: Sized collection of training reviews.
        path: Passed through to ``getLogModel``.
        minSamples: Minimum number of labels required in each data set.
        minPerClass: Minimum number of positive and of negative labels
            required in each data set.

    Returns:
        dict mapping feature -> ``[bestThres, logmodel, bestThres_2,
        logmodel_2, feat_info]`` where ``feat_info`` is
        ``[n1, pos1, neg1] + bestQ + [n2, pos2, neg2] + bestQ_2``.
    """
    logger = logging.getLogger('signature.learnFE')
    logger.info('starting learnFeatureExistance from %d reviews', len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()

    for f, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.info('Start working with (%d) %s', f, feature)

        # The fifth return value ("missed") is not needed for training.
        X1, Y1, X2, Y2, _missed = getFeatures(logger, feature, trainReviews,
                                              busImportantFeatures, userImportantFeatures)

        # Class statistics are reused for logging, filtering and feat_info.
        total1, pos1 = len(Y1), sum(Y1)
        neg1 = total1 - pos1
        total2, pos2 = len(Y2), sum(Y2)
        neg2 = total2 - pos2

        logger.debug('Got features for %d (%d/%d)reviews (%d of them pos(%d)/neg(%d))',
                     total1, pos1, neg1, total2, pos2, neg2)

        # Skip features whose data is too small or too unbalanced to train on.
        if total1 < minSamples or pos1 < minPerClass or neg1 < minPerClass:
            continue
        if total2 < minSamples or pos2 < minPerClass or neg2 < minPerClass:
            continue

        # Model fitted on the first data set (X1, Y1).
        bestThres, bestQ, logmodel = getLogModel(logger, feature, X1, Y1, path)

        logger.info('Sentiment prediction for (%d) %s', f, feature)
        # Model fitted on the second data set (X2, Y2).
        bestThres_2, bestQ_2, logmodel_2 = getLogModel(logger, feature, X2, Y2, path)

        feat_info = [total1, pos1, neg1] + bestQ + [total2, pos2, neg2] + bestQ_2

        modelDict[feature] = [bestThres, logmodel, bestThres_2, logmodel_2, feat_info]

    return modelDict
コード例 #2
0
def _perClassQuality(yTrue, yPred):
    """Return per-class F1, precision and recall scores as one flat list."""
    scores = list(f1_score(yTrue, yPred, average=None))
    scores += list(precision_score(yTrue, yPred, average=None))
    scores += list(recall_score(yTrue, yPred, average=None))
    return scores


def applyFeatureExistance(busImportantFeatures, userImportantFeatures, testReviews, modelDict, path):
    """Apply the trained per-feature models to test reviews and score them.

    For every enabled feature that has an entry in ``modelDict`` the function

    1. evaluates the existence model (``modelDict[feature][1]``) and the
       sentiment model (``modelDict[feature][3]``) on the whole test set and
       records per-class F1 / precision / recall, and
    2. annotates each review in place with ``review['exPredFeatures'][feature]``
       and ``review['sentPredFeatures'][feature]``, each a list
       ``[actual, model probability, random baseline, business-history
       baseline, 1, 0]``.

    The model value stored per review is the raw probability of the second
    class; the thresholds held in ``modelDict`` are not applied here.

    Features for which ``getFeatures`` yields no existence or no sentiment
    samples are skipped with a warning instead of raising
    ``ZeroDivisionError``.

    Args:
        busImportantFeatures: Mapping indexed by ``review['business_id']``;
            the entries' ``'featureFreq'`` and ``'sentiment'`` items are read
            for the baselines.  Also passed through to ``getFeatures``.
        userImportantFeatures: Passed through to ``getFeatures``.
        testReviews: Sequence of dict-like reviews; mutated in place.
        modelDict: Mapping feature -> list whose items 1 and 3 are the fitted
            existence and sentiment models.
        path: Unused in this function.

    Returns:
        Tuple ``(testReviews, featureWeights, featureQuality)``.
        ``featureWeights[feature]`` is the share of positive existence labels.
        ``featureQuality[feature]`` is ``[existence share, n1]`` + existence
        scores + ``[sentiment share, n2]`` + sentiment scores, all rounded to
        two decimals.
    """
    logger = logging.getLogger('signature.applyFE.aFE')
    logger.info('starting applyFeatureExistance from %d reviews', len(testReviews))
    fsw = featureStructureWorker()
    featureWeights = dict()
    featureQuality = dict()

    for k, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        if feature not in modelDict:
            continue

        existenceModel = modelDict[feature][1]
        sentimentModel = modelDict[feature][3]

        logger.debug('Start working with (%d) %s', k, feature)
        X1, Y1, X2, Y2, _missed = getFeatures(logger, feature, testReviews,
                                              busImportantFeatures, userImportantFeatures)

        # Without samples the shares below would divide by zero.
        if not len(Y1) or not len(Y2):
            logger.warning('No test samples for (%d) %s (existence: %d, sentiment: %d); skipping',
                           k, feature, len(Y1), len(Y2))
            continue

        # weight = frequency of the feature among the test samples
        featureWeights[feature] = float(sum(Y1)) / len(Y1)
        # weight = share of positive sentiment labels
        sentimentWeight = float(sum(Y2)) / len(Y2)

        # Existence: quality of the hard predictions over the whole test set.
        quality = _perClassQuality(np.array(Y1), existenceModel.predict(np.array(X1)))

        # Sentiment: quality of the hard predictions over the whole test set.
        qualityS = _perClassQuality(np.array(Y2), sentimentModel.predict(np.array(X2)))

        featureQuality[feature] = [round(featureWeights[feature], 2), len(Y1)]
        featureQuality[feature] += [round(x, 2) for x in quality]
        featureQuality[feature] += [round(sentimentWeight, 2), len(Y2)]
        featureQuality[feature] += [round(x, 2) for x in qualityS]

        for r, review in enumerate(testReviews):
            # Features of this single review; empty lists mean it yielded no sample.
            rX1, rY1, rX2, rY2, _rMissed = getFeatures(logger, feature, [review],
                                                       busImportantFeatures, userImportantFeatures)

            # Existence
            if len(rY1):  # check if the review has enough history
                exPred = review.setdefault('exPredFeatures', {})

                # probability of the second class
                predictedExistence = existenceModel.predict_proba(np.array(rX1))[0][1]
                randomPrediction = random.random()
                simplePrediction = busImportantFeatures[review['business_id']]['featureFreq'][feature] / 100.0

                # Trailing 1 and 0 are the constant positive / negative baselines.
                exPred[feature] = [rY1[0], predictedExistence,
                                   randomPrediction, simplePrediction,
                                   1, 0]

            # Sentiment
            if len(rY2):
                sentPred = review.setdefault('sentPredFeatures', {})

                # probability of the second class
                predictedSentiment = sentimentModel.predict_proba(np.array(rX2))[0][1]
                randomSPrediction = random.random()
                # Business sentiment is rescaled with (s + 1) / 2; 0.0 is used when absent.
                busSentiment = busImportantFeatures[review['business_id']]['sentiment'].get(feature, [0.0, 0])[0]
                simpleSPrediction = (busSentiment + 1) / 2.0

                # Trailing 1 and 0 are the constant positive / negative baselines.
                sentPred[feature] = [rY2[0], predictedSentiment,
                                     randomSPrediction, simpleSPrediction,
                                     1, 0]

            if r % 5000 == 0:
                logger.debug('%d reviews processed', r)

    return testReviews, featureWeights, featureQuality