# Example 1
def idf():
  """Compute inverse document frequencies over all product reviews.

  Each review (its Pros, Cons, and Comments sections combined) counts as
  one document.  Returns a dict mapping stemmed token -> idf score, where
  idf = log(numDocs / documentFrequency).  The result is also pickled to
  ../data/idf.pickle so later runs can load it instead of recomputing.
  """
  IDF = {}      # token -> number of documents containing the token
  numDocs = 0   # total number of reviews (documents) seen
  # Get all the products from the database
  dat = data.getDatabase()
  for table in dat:
    # Go through each product in each table
    for product in dat[table]:
      item = product[3]   # item number is field [3] of the product tuple
      # Get their reviews
      revs = GetReviews.readReview(item)["Reviews"]
      try:
        for r in revs:
          # Tokenize and stem each section of the review
          con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
          pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
          comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
          # BUG FIX: count each review as ONE document.  The original
          # incremented numDocs once per unique token, inflating the idf
          # numerator by roughly the average document length.
          numDocs = numDocs + 1
          # Count each unique token at most once per document
          for token in set(con) | set(pro) | set(comment):
            IDF[token] = IDF.get(token, 0) + 1
      except Exception:
        # Best effort: skip products whose reviews are missing/malformed
        pass
  # Calculate and return the idf score
  for term in IDF:
    IDF[term] = math.log(float(numDocs) / float(IDF[term]))
  # Pickling saves SOOO much time; close the file deterministically
  with open('../data/idf.pickle', 'wb') as f:
    pickle.dump(dict(IDF), f)
  return IDF
# Example 2
def tf_idf():
  TF_IDF = {}
  # Load the inverse document frequencies
  #IDF = idf()
  IDF = dict(pickle.load(open('../data/idf.pickle','rb')))
  dat = data.getDatabase()	# get all of the products
  for table in dat:
    print '.'	# progress marker
    for product in dat[table]:	# For each product in each table
	item = product[3]	# Item number is [3] in the tuple
	revs = GetReviews.readReview(item)["Reviews"]	# we want to read the actual reviews
	product_review = []
	try:
	  for r in revs:	# for each review
	    tf = {}
		# Tokenize and stem the entire review
	    con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
	    pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
	    comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
		# combine pros, cons, and comments sections
	    for token in list(con+pro+comment):		# calculate the term frequencies
		if token in tf: tf[token] = tf[token] + 1
		else: tf[token] = 1
	    for t in tf:
	    	tf[t] = float(1+math.log(tf[t]))*IDF[t] # calculate tf-idf score
	    product_review.append(tf)	# add to list of reviews
	except: pass
	TF_IDF[item] = product_review	# add list of reviews to the dictionary
  return TF_IDF
# Example 3
def BuildTrainingSet():
	# initialize variables
  database = data.getDatabase()
  IDF = dict(pickle.load(open('../data/idf.pickle','rb')))
  numReviews = 0
  posReview = {}
  numPos = 0
  negReview = {}
  numNeg = 0
		# For each product in each subcategory
  for table in database:
    for product in database[table]:
	item = product[3]
	revs = GetReviews.readReview(item)["Reviews"]
		# get the review
	try:
	  for r in revs:
	    if (numReviews%37)==0:	# Analyze every 37th review
	  	tf = {}
		  # Get reviews for you to read
	  	con = r['Cons']
	  	pro = r['Pros']
	  	comment = r['Comments']
		  # Read the reviews
	  	print pro,' :: ',con,' :: ',comment
		  # set up to add to training set
	  	con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
	  	pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
	  	comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
		  # Treat all parts as one review
	  	for token in list(con+pro+comment):
		  if token in tf: tf[token] = tf[token] + 1
		  else: tf[token] = 1
	  	for t in tf:
		    # tf-idf formula
	    	  tf[t] = float(1+math.log(tf[t]))*IDF[t]
		    # hopefully you have had time to read, now decide
	    	Q = int(raw_input('\n1 for good.... 0 for bad.....\n').rstrip('\n'))
	    	if Q==1:	# Good
		  posReview[numPos] = tf	# add to training set
		  numPos = numPos + 1
		elif Q==0:	# Bad
		  negReview[numNeg] = tf	# add to training set
		  numNeg = numNeg + 1
		else: print 'FAIL!!!!!!'

	    numReviews = numReviews + 1		# increase number of reviews
	except: pass
  saveSet(posReview,negReview)	# Save the training sets
  return (numPos, numNeg)