Example #1
   def classifyDocument(self, document, realScore = -1):
      """Pick the class whose training documents have the smallest median
      Jaccard distance to this document's stemmed unigram set."""
      featureSet = set(features.batchStem(features.toUnigrams(document)))

      scores = {}

      #TEST uniques
      for (classVal, classGrams) in self.grams.items():
         scores[classVal] = []
         for classGram in classGrams:
            scores[classVal].append(nltk.metrics.distance.jaccard_distance(featureSet, classGram))
      #TEST all
      #for i in range(1, 6):
      #   scores[i] = []
      #   scores[i].append(nltk.metrics.distance.jaccard_distance(featureSet, self.uniqueFeatures[i]))

      finalDists = {}
      minDist = float('inf')
      bestClass = 4  # middle-of-the-road fallback if no class wins
      for (classVal, dists) in scores.items():
         #dist = sum(dists) / float(len(dists))

         # Median distance; index the sorted copy, not the raw list.
         ordered = sorted(dists)
         mid = len(ordered) // 2
         if len(ordered) % 2 == 0 and len(ordered) > 0:
            dist = (ordered[mid] + ordered[mid - 1]) / 2.0
         else:
            dist = ordered[mid]

         finalDists[classVal] = dist
         if dist < minDist:
            minDist = dist
            bestClass = classVal

      print('{0} -- {1} - r{2}'.format(finalDists, bestClass, realScore))
      return bestClass
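
A minimal standalone sketch of the same idea, using toy data in place of the project's features module (the grams and class labels here are illustrative): pick the class whose training documents sit at the smallest median Jaccard distance from the document.

   from statistics import median

   def jaccard_distance(a, b):
      # Same formula as nltk.metrics.distance.jaccard_distance.
      union = a | b
      return (len(union) - len(a & b)) / len(union) if union else 0.0

   def nearest_class(docGrams, classSets):
      # classSets maps a class label to one unigram set per training document.
      medians = {c: median(jaccard_distance(docGrams, s) for s in sets)
                 for (c, sets) in classSets.items()}
      return min(medians, key=medians.get)

   classes = {1: [{'awful', 'bad'}, {'bad', 'slow'}],
              5: [{'great', 'fast'}, {'great', 'tasty'}]}
   print(nearest_class({'bad', 'service'}, classes))  # -> 1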
Example #2
   def __init__(self, trainingSet, probabilityDist = nltk.probability.LaplaceProbDist):
      """Collect the unigrams unique to each score class, then train one
      binary 'in'/'out' Naive Bayes classifier per class."""
      rawClassFeatures = {}

      #TEST
      allFreqs = {}
      classCounts = {}

      for i in range(1, 6):
         rawClassFeatures[i] = set()

      for trainDocument in trainingSet:
         unigrams = set(features.batchStem(features.toUnigrams(trainDocument[0])))

         classCounts[trainDocument[1]] = classCounts.get(trainDocument[1], 0) + 1

         for gram in unigrams:
            rawClassFeatures[int(trainDocument[1])].add(gram)

            allFreqs[gram] = allFreqs.get(gram, 0) + 1

      self.uniqueFeatures = {}
      for i in range(1, 6):
         uniques = set(rawClassFeatures[i])

         for j in range(1, 6):
            if i != j:
               uniques -= rawClassFeatures[j]

         #print '{0} -- {1}'.format(i, uniques)
         self.uniqueFeatures[i] = uniques

#      for (score, grams) in self.uniqueFeatures.items():
#         print '{0} ({1})'.format(score, classCounts[score])
#         for gram in grams:
#            print '   {0} ({1})'.format(gram, allFreqs[gram])


      #sys.exit()
      #TEST
      #return

      # TODO: try splitting up into multiple docs
      # TRY: all in one doc, with false and true; invert that for false
      # Two docs, only pos

      self.binClassifiers = {}
      for i in range(1, 6):
         allFeatures = [({gram: True}, 'in') for gram in self.uniqueFeatures[i]]

         for j in range(1, 6):
            if j == i:
               continue  # this class's own grams are already labeled 'in'
            for gram in self.uniqueFeatures[j]:
               allFeatures.append(({gram: True}, 'out'))

         self.binClassifiers[i] = nltk.NaiveBayesClassifier.train(allFeatures, probabilityDist)
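
A toy illustration of the 'in'/'out' scheme above, with made-up grams and NLTK's default estimator: each class gets a binary Naive Bayes classifier trained on its own unique grams against everyone else's.

   import nltk

   unique = {1: {'awful', 'refund'}, 5: {'superb', 'delicious'}}
   classifiers = {}
   for label, grams in unique.items():
      data = [({g: True}, 'in') for g in grams]
      for other, otherGrams in unique.items():
         if other != label:
            data += [({g: True}, 'out') for g in otherGrams]
      classifiers[label] = nltk.NaiveBayesClassifier.train(data)

   dist = classifiers[5].prob_classify({'superb': True})
   print(dist.prob('in'))  # well above 0.5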
Example #3
   def __init__(self, trainingSet, minOccur = 6, maxOccurPercent = 0.40, stem = True):
      """Weight each unigram by the median score of the training documents
      it appears in, skipping very rare and very common grams."""
      self.weights = {}
      self.medianScore = median([doc[1] for doc in trainingSet])
      self.meanScore = 0
      self.stem = stem

      maxOccur = int(len(trainingSet) * maxOccurPercent)

      rawScores = {}
      for trainDocument in trainingSet:
         if self.stem:
            unigrams = set(features.batchStem(features.toUnigrams(trainDocument[0])))
         else:
            unigrams = set(features.toUnigrams(trainDocument[0]))

         self.meanScore += int(trainDocument[1])

         for gram in unigrams:
            rawScores.setdefault(gram, []).append(int(trainDocument[1]))

      self.meanScore /= float(len(trainingSet))
      # TEST
      #print 'Prior Prob: {0}, Median: {1}'.format(self.meanScore, self.medianScore)

      #compressedScores = {}
      for (gram, scores) in rawScores.items():
         if len(scores) >= minOccur and len(scores) < maxOccur:
            #compressedScores[gram] = scores
            #TEST median
            self.weights[gram] = median(scores)
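
The same weighting step on hypothetical data: each surviving gram is weighted by the median score of the documents that contain it, and grams that are too rare or too common are dropped.

   from statistics import median

   docs = [({'great', 'fast'}, 5), ({'great', 'slow'}, 4),
           ({'bad', 'slow'}, 2), ({'bad', 'awful'}, 1)]
   minOccur, maxOccur = 2, int(len(docs) * 0.90)

   rawScores = {}
   for grams, score in docs:
      for gram in grams:
         rawScores.setdefault(gram, []).append(score)

   weights = {g: median(s) for (g, s) in rawScores.items()
              if minOccur <= len(s) < maxOccur}
   print(sorted(weights.items()))  # [('bad', 1.5), ('great', 4.5), ('slow', 3.0)]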
Example #4
   def classifyDocument(self, document, realScore = -1):
      """Score the document by each binary classifier's P('in'),
      weighted by its class label."""
      order = [1, 2, 3, 5, 4]

      grams = set(features.batchStem(features.toUnigrams(document)))

      #TEST
#      intersectionSizes = {}
#      for i in order:
#         if len(self.uniqueFeatures[i]) == 0:
#            intersectionSizes[i] = 0
#         else:
#            #intersectionSizes[i] = len(self.uniqueFeatures[i].intersection(grams)) / float(len(self.uniqueFeatures[i]))
#            intersectionSizes[i] = len(self.uniqueFeatures[i].intersection(grams))
#
#      score = 4
#      maxOccur = 0
#      for (key, occur) in intersectionSizes.items():
#         if occur > maxOccur:
#            score = key
#            maxOccur = occur
#
#      print '{0} -- {1} - {2}'.format(intersectionSizes, score, realScore)
#      return score
#
#      return random.randint(3, 4)

      featureSet = {gram: True for gram in grams}
      probs = {}
      weights = []
      for i in order:
         prob = self.binClassifiers[i].prob_classify(featureSet)
         probs[i] = prob.prob('in')
         weights.append(prob.prob('in') * i)
         #probs[i] = prob.prob('out')
         #weights.append(prob.prob('out') * i)


#         if self.binClassifiers[i].classify(featureSet) == 'in':
#            rtn = i
#            print 'Found: {0}'.format(i)
#            break

      score = sum(weights) / 5.0
      print('{0} -- {1} - {2}'.format(probs, score, realScore))
      return score
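
A worked toy version of the combination step, with hypothetical P('in') values: each class label votes with its probability, and the label-weighted sum is divided by the number of classes exactly as in the snippet.

   probs = {1: 0.05, 2: 0.10, 3: 0.20, 4: 0.40, 5: 0.60}  # hypothetical P('in') values
   score = sum(label * p for (label, p) in probs.items()) / 5.0
   print(score)  # (0.05 + 0.20 + 0.60 + 1.60 + 3.00) / 5 = 1.09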
Example #5
   def __init__(self, trainingSet):
      """Group each training document's unigram set by score class and keep
      the grams unique to each class."""
      self.grams = {}

      #TEST
      rawClassFeatures = {}
      for i in range(1, 6):
         rawClassFeatures[i] = set()

      for trainingDocument in trainingSet:
         grams = set(features.batchStem(features.toUnigrams(trainingDocument[0])))
         self.grams.setdefault(trainingDocument[1], []).append(grams)
         rawClassFeatures[trainingDocument[1]] |= grams

      self.uniqueFeatures = {}
      for i in range(1, 6):
         self.uniqueFeatures[i] = set(rawClassFeatures[i])
         for j in range(1, 6):
            if i != j:
               self.uniqueFeatures[i] -= rawClassFeatures[j]
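
A standalone sketch of the unique-feature step with made-up grams: a gram counts as unique to a class only if no other class's documents contain it.

   raw = {1: {'bad', 'slow', 'food'}, 5: {'great', 'fast', 'food'}}
   unique = {c: grams - set.union(*(raw[o] for o in raw if o != c))
             for (c, grams) in raw.items()}
   print({c: sorted(g) for (c, g) in unique.items()})
   # {1: ['bad', 'slow'], 5: ['fast', 'great']} -- shared 'food' drops out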
Example #6
   def classifyDocument(self, document, realScore = -1):
      """Nudge the training-set mean score up or down according to how the
      document's gram weights fall around that mean."""
      if self.stem:
         unigrams = features.batchStem(features.toUnigrams(document))
      else:
         unigrams = features.toUnigrams(document)

      scores = []
      mean = 0

      for gram in unigrams:
         if gram in self.weights:
            scores.append(self.weights[gram])
            mean += self.weights[gram]

      # No weighted grams in this document; fall back to the training mean.
      if len(scores) == 0:
         return self.meanScore
      mean /= float(len(scores))

      positives = []
      negatives = []

      # Split each weight into residuals above and below the global mean,
      # tracking signed squared residuals along the way.
      squareResiduals = []
      for score in scores:
         #if score < mean:
         if score < self.meanScore:
            negatives.append(self.meanScore - score)
            #negatives.append(mean - score)
            #negatives.append(score)
         else:
            positives.append(score - self.meanScore)
            #positives.append(score - mean)
            #positives.append(score)

         sign = 1
         if score < mean:
            sign = -1
         squareResiduals.append(sign * math.pow(mean - score, 2))

      squareSum = sum(squareResiduals)

      sign = 1
      if squareSum < 0:
         sign = -1

      crazyScore = mean + sign * math.sqrt(abs(squareSum))

      #positiveScore = sum(positives)
      #negativeScore = sum(negatives)
      if len(positives) == 0:
         positiveScore = 0.0005
      else:
         positiveScore = sum(positives)
         #positiveScore = sum(positives) * len(scores) / len(positives)

      if len(negatives) == 0:
         negativeScore = 0.0005
      else:
         negativeScore = sum(negatives)
         #negativeScore = sum(negatives) * len(scores) / len(negatives)

      evenScore = positiveScore / negativeScore * mean

      sortedScores = sorted(scores)
      median = sortedScores[len(sortedScores) // 2]

      #print 'Even -- Positive: {0}, Negative: {1}, Even Score: {2}'.format(positiveScore, negativeScore, evenScore)
      #print 'Mean: {0}, Median: {1}, CrazyScore: {2}, Real Score: {3}'.format(mean, median, crazyScore, realScore)

      rtn = 4
      if positiveScore > negativeScore:
         #rtn = int(0.5 + median)
         #rtn = 0.5 + mean
         rtn = 0.5 + self.meanScore
         #rtn = 1 + self.meanScore
         #rtn = 0.5 + self.medianScore
      else:
         #rtn = int(median - 0.5)
         #rtn = int(mean - 0.5)
         #rtn = mean - 0.5
         rtn = self.meanScore - 0.5
         #rtn = self.meanScore - 1
         #rtn = self.medianScore - 0.5
      weight = ((positiveScore - negativeScore) / (negativeScore + positiveScore))
      #weight = (weight / abs(weight)) * pow(weight, 2)
      #weight = ((positiveScore - negativeScore) / (negativeScore + positiveScore)) * 2
      globalRtn = self.meanScore + weight * 0.5
      localRtn = mean + weight * 0.5
      #rtn = int(self.meanScore + (weight * 0.5) + 0.5)

      # Adjust by personal offset.

      #print 'Final Score: {0}, Int Score: {1}, Weight: {2}'.format(rtn, int(rtn + 0.5), weight)
      #print 'LocalRtn: {0}, GlobalRtn: {1}, Weight: {2} -- r{3}\n'.format(localRtn, globalRtn, weight, realScore)

      # Experiments have shown that we almost always over-predict.

      return int(globalRtn + 0.5)
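
A worked toy run of the final scoring rule, assuming a hypothetical training mean of 3.4: gram weights above and below the mean vote with their distances, and the net vote nudges the prior mean by at most half a star before rounding.

   meanScore = 3.4        # hypothetical training-set mean
   scores = [5, 4, 2, 5]  # weights of the grams found in one document

   positive = sum(s - meanScore for s in scores if s >= meanScore)  # ~3.8
   negative = sum(meanScore - s for s in scores if s < meanScore)   # ~1.4
   weight = (positive - negative) / (positive + negative)           # ~0.46
   print(int(meanScore + weight * 0.5 + 0.5))                       # 4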