Ejemplo n.º 1
0
class ProjectVectorBuilder(object):
    """Classify projects from their readme/description text.

    Attributes:
        project_data: Mapping of project name to a dict providing the
            ``'readme'`` and ``'description'`` keys.
        nb: ``NaiveBayesClassifier`` instance, trained on construction.
        projects: Per-instance mapping of project name to the record built
            by ``build_projects_vector``.
    """

    def __init__(self, project_data):
        """Store *project_data* and train the classifier.

        Args:
            project_data: Mapping of project name to project dict.
        """
        self.project_data = project_data
        # Per-instance store: a class-level dict would be shared by (and
        # silently merged across) every builder.
        self.projects = {}

        self.nb = NaiveBayesClassifier(rm.TRAINDATA_VOCAB, rm.TRAINDATA_DATASET)
        self.nb.train()

    @staticmethod
    def _to_text(value):
        """Return *value* as text, leniently decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            # Undecodable bytes are dropped instead of raising.
            return value.decode('utf-8', 'ignore')
        return value

    def build_projects_vector(self):
        """Classify every project that has a readme and/or a description.

        The readme and the description are concatenated and passed to
        ``self.nb.classify``; the first element of its result is used as a
        mapping of category to score. Projects with no text are skipped.

        Each stored record holds ``'class_prob'`` (the score mapping) and
        ``'description'`` (the original value); when the mapping is not
        empty it also holds ``'category'`` and ``'prob'`` for the
        highest-scoring entry.

        Returns:
            dict: ``self.projects``, keyed by project name.
        """
        print("In build projects")
        for name, project in self.project_data.items():
            readme = project['readme']
            description = project['description']

            # Bad case: when the readme is not found it arrives as a list.
            if isinstance(readme, list):
                text = ""
            else:
                text = self._to_text(readme)

            if description is not None:
                text += self._to_text(description)

            if not text:
                continue

            prob_data = self.nb.classify(text)[0]
            # Build the record first so a failing classify() call never
            # leaves a half-filled entry behind.
            entry = {'class_prob': prob_data, 'description': description}
            if prob_data:
                category, prob = max(prob_data.items(),
                                     key=operator.itemgetter(1))
                entry['category'] = category
                entry['prob'] = prob
            self.projects[name] = entry
        return self.projects
Ejemplo n.º 2
0
    def k_fold(self, k):
        """Run interactive k-fold cross-validation and print the hit rates.

        Every sequence in ``self.__data`` is cut into ``k`` folds of
        ``len(docs) // k`` items; documents left over past the last fold are
        never part of a test fold. For each run a fresh ``NBC`` is trained on
        the documents outside the current fold and its predictions for the
        held-out documents are compared with the matching entry of
        ``self.__label``. A ``(hit_rate, miss_rate)`` tuple per run is
        appended to ``self.__precision``; every stored hit rate and their
        average are printed at the end.

        Args:
            k: Number of folds; must be greater than 1.

        Raises:
            AssertionError: If ``k`` is not greater than 1.
            ValueError: If no sequence in ``self.__data`` has at least ``k``
                documents, so every test fold would be empty.
        """
        # NOTE: assert is stripped under ``python -O``; it is kept so that
        # existing callers still see AssertionError.
        assert k > 1
        # Fail before prompting or training: with only empty test folds the
        # rate computation below would divide by zero.
        if all(len(docs) < k for docs in self.__data):
            raise ValueError(
                'k_fold needs at least one class with k or more documents')

        print('Starting ' + str(k) + '-fold cross-validation.')
        input('Press Enter to continue...')
        for run in range(k):
            print('Run ' + str(run + 1))
            nb = NBC()
            testing_data = []
            training_data = []
            for docs in self.__data:
                subset_size = len(docs) // k
                start = run * subset_size
                stop = start + subset_size
                testing_data.append(docs[start:stop])
                training_data.append(docs[:start] + docs[stop:])

            nb.train(training_data, self.__label)

            hits = 0
            misses = 0
            for label, fold in zip(self.__label, testing_data):
                for doc in fold:
                    if nb.predict(doc) == label:
                        hits += 1
                    else:
                        misses += 1

            total_length = sum(len(fold) for fold in testing_data)

            self.__precision.append(
                (hits / total_length, misses / total_length))

            if self.__verbose:
                nb.info(self.__level)
                input('Press Enter to continue...')

        total = 0
        for item in self.__precision:
            # %.2f keeps the fractional part; the former %.2d truncated the
            # percentage to a whole number.
            print('Precision: %.2f' % (item[0] * 100))
            total += item[0]
        total /= len(self.__precision)
        print('Average precision: %.2f' % (total * 100))
Ejemplo n.º 3
0
 def __init__(self, project_data):
     """Keep *project_data* and set up the classifier used by this object.

     A ``NaiveBayesClassifier`` is created from ``rm.TRAINDATA_VOCAB`` and
     ``rm.TRAINDATA_DATASET``, stored as ``self.nb`` and its ``train()``
     method is called immediately.

     Args:
         project_data: Project data to keep on the instance.
     """
     self.project_data = project_data

     classifier = NaiveBayesClassifier(
         rm.TRAINDATA_VOCAB,
         rm.TRAINDATA_DATASET,
     )
     # Attach before training so the attribute exists even if train() fails.
     self.nb = classifier
     classifier.train()