def calculateChisquare():
    """Score every lexicon word with a chi-square statistic for class 0.

    Reads the experiment reviews, groups them with ``classfiedReviews`` and
    walks the table returned by ``loadLexiconTable``.  For each word the
    2x2 contingency counts are taken as:

    * ``A`` -- class-0 reviews whose whitespace-split text contains the word
    * ``B`` -- the value stored for the word in the lexicon table minus ``A``
      (presumably the word's overall document frequency; confirm against
      ``loadLexiconTable``)
    * ``C`` -- ``len(cReviews[4]) - A``
    * ``D`` -- ``5000 - len(cReviews[4]) - B``

    The score is ``5000 * (A*D - C*B)**2 / ((A+C)*(B+D)*(A+B)*(C+D))``.  When
    the denominator is zero the statistic is undefined and 0.0 is stored
    instead of raising ``ZeroDivisionError``.

    The ``{word: score}`` mapping is written to ``chi_0.json`` through
    ``saveJson``.  Progress is printed every 50 words.  Returns nothing.
    """
    print('Calculating chi...')
    reviews = yelpReview.readExperimentData()
    cReviews = classfiedReviews(reviews)
    lexicons = loadLexiconTable()

    # Tokenize each class-0 review once instead of once per lexicon word;
    # sets make the membership test below O(1) and give the same answer as
    # the list membership test they replace.
    class0_words = [set(review['text'].split()) for review in cReviews[0]]

    # NOTE(review): A is counted over cReviews[0] but the class size is read
    # from cReviews[4].  That is only equivalent when both classes hold the
    # same number of reviews -- confirm whether cReviews[0] was intended.
    class_size = len(cReviews[4])
    # Hard-coded corpus size used by the original formula.
    total = 5000

    chiTable = {}
    chiTable[0] = {}
    for i, (key, value) in enumerate(lexicons.items(), 1):
        A = sum(1 for words in class0_words if key in words)
        B = value - A
        C = class_size - A
        D = total - class_size - B

        denominator = (A + C) * (B + D) * (A + B) * (C + D)
        if denominator:
            cross = A * D - C * B
            x = float(total) * cross * cross / denominator
        else:
            # A zero margin means the word or the class carries no
            # information; treat the undefined statistic as "no association".
            x = 0.0
        chiTable[0][key] = x

        if i % 50 == 0:
            print(i)

    saveJson(chiTable[0], 'chi_0.json')
    print('Done.')
def svmTrainFor2(lexicons):
    """Fit an ``svm.SVR`` on a two-way (low vs. high) view of the reviews.

    The zero-based rating ``review['stars'] - 1`` is mapped to a label:
    3 and 4 become 1, 0 and 1 become 0, and reviews rated 2 are skipped.
    Any other rating value is used as the label unchanged.

    Each kept review contributes one single-element sample: the cosine
    distance between the outer-scope ``vectors[0]`` and the vector that
    ``getReviewVector`` builds from the combined FEATURE0 + FEATURE4 list.

    Args:
        lexicons: passed straight through to ``getReviewVector``.

    Returns:
        The fitted ``svm.SVR`` instance.
    """
    print('Training SVM...')
    reviews = yelpReview.readExperimentData()
    feature = loadFeature(FEATURE0) + loadFeature(FEATURE4)

    samples = []
    labels = []
    for review in reviews:
        rating = review['stars'] - 1
        if rating == 2:
            # Middle ratings take no part in the two-class problem.
            continue
        if rating in (3, 4):
            label = 1
        elif rating in (0, 1):
            label = 0
        else:
            label = rating

        review_vector = getReviewVector(review, feature, lexicons)
        dist = distance.cosine(vectors[0], review_vector)
        labels.append(label)
        samples.append([dist])

    print('%d %d' % (len(samples), len(labels)))
    clf = svm.SVR()
    clf.fit(samples, labels)
    print('Done.')
    return clf
def svmTrain(lexicons):
    """Fit an ``svm.SVR`` that predicts the zero-based star rating.

    Every review from ``yelpReview.readExperimentData`` yields one training
    pair: the sample is a single-element list holding the cosine distance
    between the outer-scope ``vectors[0]`` and the review's vector (built by
    ``getReviewVector`` over the FEATURE0 + FEATURE4 list), and the target
    is ``review['stars'] - 1``.

    Args:
        lexicons: passed straight through to ``getReviewVector``.

    Returns:
        The fitted ``svm.SVR`` instance.
    """
    print('Training SVM...')
    reviews = yelpReview.readExperimentData()
    feature = loadFeature(FEATURE0) + loadFeature(FEATURE4)

    samples, targets = [], []
    for review in reviews:
        review_vector = getReviewVector(review, feature, lexicons)
        target = review['stars'] - 1
        dist = distance.cosine(vectors[0], review_vector)
        targets.append(target)
        samples.append([dist])

    print('%d %d' % (len(samples), len(targets)))
    clf = svm.SVR()
    clf.fit(samples, targets)
    print('Done.')
    return clf
def buildVectors(lexicons):
    """Sum review vectors per class and plot each class's distance to class 0.

    For class indices 0 through 4 of the grouping returned by
    ``classfiedReviews``, every review's vector (from ``getReviewVector``
    over the FEATURE0 + FEATURE4 list) is added into the outer-scope
    ``vectors[i]`` with ``numpy.add``.  The cosine distances from
    ``vectors[0]`` to ``vectors[1]`` .. ``vectors[4]`` are then printed on
    one line and plotted, preceded by a leading 0, with ``plt``.

    Args:
        lexicons: passed straight through to ``getReviewVector``.

    Side effects:
        Mutates ``vectors`` in place and opens a plot window via
        ``plt.show()``.  Returns nothing.
    """
    print('Building vectors...')
    reviews = yelpReview.readExperimentData()
    grouped = classfiedReviews(reviews)
    feature = loadFeature(FEATURE0) + loadFeature(FEATURE4)

    for label in range(5):
        print(label)
        for review in grouped[label]:
            review_vector = getReviewVector(review, feature, lexicons)
            vectors[label] = numpy.add(vectors[label], review_vector)

    # Distance of every other class's summed vector from class 0's.
    dists = [distance.cosine(vectors[0], vectors[other])
             for other in range(1, 5)]
    print(' '.join(str(d) for d in dists))
    plt.plot([0] + dists)
    plt.show()
    print('Done.')
def buildLexiconTable():
    """Build and save a table of adjective document frequencies.

    Each review's text is tokenized with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``.  Tokens tagged ``JJ``, ``JJS`` or ``JJR`` are
    counted at most once per review, so each table value is the number of
    reviews in which that token appeared with one of those tags.

    The resulting ``{token: count}`` dict is written through ``saveJson`` to
    the location named by ``LEXICONTABLE``.  Progress is printed every 100
    reviews.  Returns nothing.
    """
    print('Generating Lexicon Tables...')
    adjective_tags = ('JJ', 'JJS', 'JJR')
    lexiconTable = {}
    reviews = yelpReview.readExperimentData()

    for i, review in enumerate(reviews, 1):
        tokens = nltk.word_tokenize(review['text'])
        # Tokens already counted for this review; a set replaces the
        # dict-used-as-set and the removed-in-Python-3 ``has_key`` calls.
        seen = set()
        for word, tag in nltk.pos_tag(tokens):
            if tag not in adjective_tags or word in seen:
                continue
            seen.add(word)
            lexiconTable[word] = lexiconTable.get(word, 0) + 1

        if i % 100 == 0:
            print(i)

    saveJson(lexiconTable, LEXICONTABLE)
    print('Done.')