Esempio n. 1
0
class NaiveBayesClassifier:
    """Naive Bayes tweet classifier built on ``nltk.NaiveBayesClassifier``.

    On construction a model is trained from a CSV file of labelled tweets
    and pickled to ``classifierDumpFile``. Text clean-up and feature
    extraction are delegated to a ``ClassifierHelper`` instance.
    """

    def __init__(self, trainingDataFile, classifierDumpFile, datadir):
        """Create the helper and obtain a trained classifier.

        Args:
            trainingDataFile: Path of the CSV training file; column 0 is the
                category label and column 1 the tweet text.
            classifierDumpFile: Path the trained classifier is pickled to.
            datadir: Directory containing ``feature_list.txt`` and
                ``stop_words.txt``; both paths are handed to
                ``ClassifierHelper``.
        """
        # Instantiate classifier helper
        self.helper = ClassifierHelper('%s/%s' % (datadir, 'feature_list.txt'),
                                       '%s/%s' % (datadir, 'stop_words.txt'))
        self.trainingDataFile = trainingDataFile
        self.classifierPickled = classifierDumpFile
        self.last_trained = None
        self.classifier = self._getClassifier()

    def _getClassifier(self, reload_existing=False):
        """Return a classifier, reusing the pickled dump when asked to.

        Args:
            reload_existing: When true and ``self.classifierPickled`` exists
                on disk, unpickle it instead of training again.

        Returns:
            The unpickled classifier (which is also stored on
            ``self.classifier``), or a freshly trained classifier when no
            dump is reused.
        """
        import os.path
        # Record time.
        self.time = datetime.now()
        if reload_existing and os.path.exists(self.classifierPickled):
            # NOTE: unpickling can execute arbitrary code; only point
            # classifierPickled at dump files this application produced.
            # Pickle data is binary, so the file must be opened with 'rb'.
            with open(self.classifierPickled, 'rb') as dump:
                self.classifier = pickle.load(dump)
            # Return the loaded object so that callers assigning the result
            # (as __init__ does) do not end up with None.
            return self.classifier
        return self._getNBTrainedClassifer(self.trainingDataFile,
                                           self.classifierPickled)

    def _getUniqData(self, data):
        """Return a copy of ``data`` with duplicates removed from each value.

        Args:
            data: Mapping from a key to a sequence of elements.

        Returns:
            A dict with the same keys whose values are lists holding the
            first occurrence of every element, in original order.
        """
        uniq_data = {}
        for key, elements in data.items():
            seen = []
            # A list (not a set) is used so unhashable elements still work.
            for element in elements:
                if element not in seen:
                    seen.append(element)
            uniq_data[key] = seen
        return uniq_data

    def _getProcessedTweets(self, data):
        """Run every tweet in ``data`` through ``helper.process_tweet``.

        Args:
            data: Mapping from a key to a sequence of raw tweets.

        Returns:
            A dict with the same keys whose values are lists of processed
            tweets, in the same order as the input.
        """
        process = self.helper.process_tweet
        return {key: [process(tweet) for tweet in raw]
                for key, raw in data.items()}

    def _getNBTrainedClassifer(self, trainingDataFile, classifierDumpFile):
        """Train a classifier and pickle it to ``classifierDumpFile``.

        Args:
            trainingDataFile: Path of the CSV file with labelled tweets.
            classifierDumpFile: Path the trained classifier is written to.

        Returns:
            The trained ``nltk.NaiveBayesClassifier``.
        """
        # read all tweets and labels
        tweets = self._getFilteredTrainingData(trainingDataFile)
        training_set = nltk.classify.apply_features(
            self.helper.extract_features, tweets)
        classifier = nltk.NaiveBayesClassifier.train(training_set)
        # Write the classifier back to a file; the context manager closes
        # the handle even if pickling fails.
        with open(classifierDumpFile, 'wb') as outfile:
            pickle.dump(classifier, outfile)
        return classifier

    def _getFilteredTrainingData(self, _file):
        """Read the training CSV and build ``(featureVector, category)`` pairs.

        The file is comma-delimited with ``|`` as the quote character. Rows
        with fewer than two columns are skipped.

        Args:
            _file: Path of the CSV training file.

        Returns:
            A list of ``(featureVector, category)`` tuples, one per usable
            row, in file order.
        """
        import sys
        # csv.reader needs a binary file on Python 2 but a text file opened
        # with newline='' on Python 3.
        if sys.version_info[0] >= 3:
            handle = open(_file, 'r', newline='')
        else:
            handle = open(_file, 'rb')
        tweets = []
        with handle:
            for row in csv.reader(handle, delimiter=',', quotechar='|'):
                if len(row) < 2:
                    continue
                category, tweet = row[0], row[1]
                processedTweet = self.helper.process_tweet(tweet)
                featureVector = self.helper.getFeatureVector(processedTweet)
                tweets.append((featureVector, category))
        return tweets

    # classify words
    def classify(self, message):
        """Classify a single message.

        Args:
            message: Raw text to classify.

        Returns:
            Whatever ``self.classifier.classify`` returns for the features
            extracted from the processed message.
        """
        processedTestTweet = self.helper.process_tweet(message)
        featureVector = self.helper.getFeatureVector(processedTestTweet)
        features = self.helper.extract_features(featureVector)
        return self.classifier.classify(features)