Ejemplo n.º 1
def ex1(reviews, tests, predictionInfo):
   """Classify each category paragraph of every test review with a TwoClassifier.

   The training data comes from extractReviews(reviews) and
   extractNames(reviews).  The classifier is first cross-validated (the
   averaged RMS error is printed), then rebuilt on the full training data
   and applied to the '<category>Review' text of every test review.

   Args:
      reviews: training reviews, handed unchanged to extractReviews and
         extractNames.
      tests: iterable of dicts providing 'file', 'position' and a
         '<category>Review' entry for food, service, venue and overall.
      predictionInfo: mapping keyed by '<file>::<position>' whose values
         are dicts; updated in place with one prediction per category.
         Every key must already exist (or the mapping must create it).
   """
   trainingDocs = extractReviews(reviews)
   trainingNames = extractNames(reviews)

   def buildModel(trainSet):
      return classifiers.TwoClassifier(trainSet)

   # NOTE(review): the literal 4 is presumably the number of folds -- confirm
   # against classifiers.crossValidate.
   avgError = classifiers.crossValidate(trainingDocs, buildModel, 4, trainingNames)
   print('Average RMS error rate on all validation sets: {0}\n'.format(avgError))

   model = classifiers.TwoClassifier(trainingDocs)
   sections = ('food', 'service', 'venue', 'overall')
   for entry in tests:
      slot = '{0}::{1}'.format(entry['file'], entry['position'])
      for section in sections:
         predictionInfo[slot][section] = model.classifyDocument(entry[section + 'Review'])
Ejemplo n.º 2
def ex2(reviews, tests, predictionInfo):
   """Predict an overall score from the per-category scores.

   Each training review becomes a (features, label) pair: the features map
   every category name to its integer '<category>Score', and the label is
   the integer 'overallScore'.  An OverallClassifier is cross-validated on
   those pairs (the averaged RMS error is printed), rebuilt on all of them,
   and then fed the food/service/venue predictions already stored in
   predictionInfo for each test review.

   Args:
      reviews: iterable of dicts providing '<category>Score' for food,
         service, venue and overall, plus 'file' and 'position'.
      tests: iterable of dicts providing 'file' and 'position'.
      predictionInfo: mapping keyed by '<file>::<position>' whose values
         are dicts that already hold 'food', 'service' and 'venue'
         predictions; updated in place with a 'loneOverall' entry.
   """
   # NOTE(review): the training features include 'overall' (identical to the
   # label) while the prediction features below do not -- confirm that
   # OverallClassifier expects this.
   trainingCats = ('food', 'service', 'venue', 'overall')
   predictedCats = ('food', 'service', 'venue')

   samples = []
   sampleNames = []
   for entry in reviews:
      scores = {cat: int(entry[cat + 'Score']) for cat in trainingCats}
      samples.append((scores, int(entry['overallScore'])))
      sampleNames.append('{0}::{1}'.format(entry['file'], entry['position']))

   def buildModel(trainSet):
      return classifiers.OverallClassifier(trainSet)

   avgError = classifiers.crossValidate(samples, buildModel, 4, sampleNames)
   print('Average RMS error rate on all validation sets: {0}\n'.format(avgError))

   model = classifiers.OverallClassifier(samples)
   for entry in tests:
      slot = '{0}::{1}'.format(entry['file'], entry['position'])
      # Reuse the per-category predictions recorded earlier for this review.
      features = {cat: predictionInfo[slot][cat] for cat in predictedCats}
      predictionInfo[slot]['loneOverall'] = model.classifyDocument(features)
Ejemplo n.º 3
def ex3(reviews, tests, predictionInfo):
   """Predict the reviewer of each test review by majority vote.

   Training documents are (text, reviewer) pairs: first every individual
   category paragraph of every review, then one concatenation of all four
   paragraphs per review.  An NBClassifier is cross-validated on them (the
   averaged result is printed), rebuilt on the full set, and used to
   classify each of the four paragraphs of every test review.  The label
   predicted most often wins; when all four paragraphs disagree there is
   no majority, so one of the predicted labels is picked at random.

   Args:
      reviews: re-iterable collection (it is traversed twice) of dicts
         providing '<category>Review' for food, service, venue and
         overall, plus 'reviewer', 'file' and 'position'.
      tests: iterable of dicts providing 'file', 'position' and the same
         four '<category>Review' entries.
      predictionInfo: mapping keyed by '<file>::<position>' whose values
         are dicts; updated in place with a 'reviewer' entry.
   """
   categories = ['food', 'service', 'venue', 'overall']

   # Transforms the documents some
   transReviews = []
   names = []
   for review in reviews:
      for cat in categories:
         transReviews.append((review[cat + 'Review'], review['reviewer']))
         names.append('{0}::{1}'.format(review['file'], review['position']))
   for review in reviews:
      doc = ''.join(' ' + review[cat + 'Review'] for cat in categories)
      transReviews.append((doc, review['reviewer']))
      names.append('{0}::{1}'.format(review['file'], review['position']))

   rmse = classifiers.crossValidate(transReviews, lambda trainSet: classifiers.NBClassifier(trainSet), 4, names, True)
   print('Average RMS error rate on all validation sets: {0}\n'.format(rmse))

   classy = classifiers.NBClassifier(transReviews)
   for test in tests:
      key = '{0}::{1}'.format(test['file'], test['position'])
      votes = {}
      for cat in categories:
         # Classify the paragraph of the review under test (not a leftover
         # training review).
         prediction = classy.classifyDocument(test[cat + 'Review'])
         votes[prediction] = votes.get(prediction, 0) + 1
      if len(votes) == len(categories):
         # Every paragraph voted for a different reviewer: no majority, so
         # choose one of the candidates at random.
         finalPrediction = random.choice(list(votes))
      else:
         finalPrediction = max(votes, key=votes.get)
      predictionInfo[key]['reviewer'] = finalPrediction
Ejemplo n.º 4
   # Compares several classifiers on the same training data by printing the
   # value classifiers.crossValidate returns for each of them.
   # NOTE(review): the enclosing `def` line is not visible above this body;
   # `reviews` is presumably its parameter -- confirm.
   allTrainingReviews = extractReviews(reviews)

   # Disabled parameter sweep: tries WordWeightClassifier(trainSet, i, j) for
   # i in 1..9 and j in 0.1..0.9, keeps the lowest cross-validation result,
   # prints everything and exits.
#   res = {}
#   bestRes = (10, '')
#   for i in range(1, 10):
#      for j in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
#         res['{0}-{1}'.format(i, j)] = classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.WordWeightClassifier(trainSet, i, j))
#         if res['{0}-{1}'.format(i, j)] < bestRes[0]:
#            bestRes = (res['{0}-{1}'.format(i, j)], '{0}-{1}'.format(i, j))
#   print res
#   print bestRes
#   sys.exit()

   # One line per classifier, each cross-validated with crossValidate's
   # default remaining arguments.
   print 'RMSE: {0}'.format(classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.TwoClassifier(trainSet)))

   print 'RMSE: {0}'.format(classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.WordWeightClassifier(trainSet)))

   print 'RMSE: {0}'.format(classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.SentiClassifier(trainSet)))

   # Second round, collected into variables for a one-line summary.
   # NOTE(review): WordWeightClassifier is cross-validated again here with the
   # same arguments as the print above -- the earlier result could be reused.
   setDistRmse = classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.SetDistClassifier(trainSet))
   wwRmse = classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.WordWeightClassifier(trainSet))
   binaryRmse = classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.BinaryClassSplitClassifier(trainSet))
   # NOTE(review): only this call passes the extra positional arguments 4 and 0;
   # other callers in this file pass 4 followed by a list of names in that
   # position -- confirm what 0 means to crossValidate.
   nbRmse = classifiers.crossValidate(allTrainingReviews, lambda trainSet: classifiers.NBClassifier(trainSet), 4, 0)

   print 'SetDist: {0}, WW: {1}, Binary: {2}, NB: {3}'.format(setDistRmse, wwRmse, binaryRmse, nbRmse)