def idf():
    """Compute inverse-document-frequency scores over every review in the database.

    Each review (its Pros, Cons, and Comments sections combined) counts as one
    document.  For every stemmed token t seen in at least one document,
    idf(t) = log(numDocs / docFreq(t)).

    Side effects:
        Writes the resulting mapping to ../data/idf.pickle (pickling saves a
        lot of recompute time on later runs).

    Returns:
        dict mapping stemmed token -> idf score (float).
    """
    IDF = {}
    numDocs = 0
    # Get all the products from the database, keyed by table/subcategory.
    dat = data.getDatabase()
    for table in dat:
        # Go through each product in each table.
        for product in dat[table]:
            item = product[3]  # item number is field [3] of the product tuple
            # Get the product's reviews.
            revs = GetReviews.readReview(item)["Reviews"]
            try:
                for r in revs:
                    # Tokenize and stem each section of the review.
                    con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
                    pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
                    comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
                    # Count each distinct token once per document (set union
                    # de-duplicates across the three sections).
                    for token in set(con) | set(pro) | set(comment):
                        IDF[token] = IDF.get(token, 0) + 1
                    numDocs = numDocs + 1  # one review == one document
            except Exception:
                # Was a bare "except:"; narrowed so KeyboardInterrupt and
                # SystemExit are no longer swallowed.  Still best-effort:
                # a malformed review skips the rest of that product's reviews.
                pass
    # Convert raw document frequencies into idf scores.
    for term in IDF:
        IDF[term] = math.log(float(numDocs) / float(IDF[term]))
    # "with" guarantees the pickle is flushed and the file closed, even on
    # error (the original leaked the file handle).
    with open('../data/idf.pickle', 'wb') as f:
        pickle.dump(dict(IDF), f)
    return IDF
def tf_idf(): TF_IDF = {} # Load the inverse document frequencies #IDF = idf() IDF = dict(pickle.load(open('../data/idf.pickle','rb'))) dat = data.getDatabase() # get all of the products for table in dat: print '.' # progress marker for product in dat[table]: # For each product in each table item = product[3] # Item number is [3] in the tuple revs = GetReviews.readReview(item)["Reviews"] # we want to read the actual reviews product_review = [] try: for r in revs: # for each review tf = {} # Tokenize and stem the entire review con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons'])) pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros'])) comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments'])) # combine pros, cons, and comments sections for token in list(con+pro+comment): # calculate the term frequencies if token in tf: tf[token] = tf[token] + 1 else: tf[token] = 1 for t in tf: tf[t] = float(1+math.log(tf[t]))*IDF[t] # calculate tf-idf score product_review.append(tf) # add to list of reviews except: pass TF_IDF[item] = product_review # add list of reviews to the dictionary return TF_IDF
def BuildTrainingSet(): # initialize variables database = data.getDatabase() IDF = dict(pickle.load(open('../data/idf.pickle','rb'))) numReviews = 0 posReview = {} numPos = 0 negReview = {} numNeg = 0 # For each product in each subcategory for table in database: for product in database[table]: item = product[3] revs = GetReviews.readReview(item)["Reviews"] # get the review try: for r in revs: if (numReviews%37)==0: # Analyze every 37th review tf = {} # Get reviews for you to read con = r['Cons'] pro = r['Pros'] comment = r['Comments'] # Read the reviews print pro,' :: ',con,' :: ',comment # set up to add to training set con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons'])) pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros'])) comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments'])) # Treat all parts as one review for token in list(con+pro+comment): if token in tf: tf[token] = tf[token] + 1 else: tf[token] = 1 for t in tf: # tf-idf formula tf[t] = float(1+math.log(tf[t]))*IDF[t] # hopefully you have had time to read, now decide Q = int(raw_input('\n1 for good.... 0 for bad.....\n').rstrip('\n')) if Q==1: # Good posReview[numPos] = tf # add to training set numPos = numPos + 1 elif Q==0: # Bad negReview[numNeg] = tf # add to training set numNeg = numNeg + 1 else: print 'FAIL!!!!!!' numReviews = numReviews + 1 # increase number of reviews except: pass saveSet(posReview,negReview) # Save the training sets return (numPos, numNeg)