Example #1
import codecs
import csv
import pickle
import re

import nltk

# ClassifierHelper is assumed to be defined elsewhere in this project.


class MaxEntClassifier:
    def extract_features(self, document):
        # Bag-of-words encoding: one boolean feature per known word,
        # set to True when that word occurs in the document.
        document_words = set(document)
        features = {}
        for word in self.feature_list:
            features['contains(%s)' % word] = (word in document_words)
        return features

    def get_feature_vector(self, tweet):
        words = tweet.split()
        features = []
        # Unigram features: strip surrounding punctuation, then drop stop
        # words and tokens that are not purely alphanumeric.
        for word in words:
            word = word.strip('\'"?!,.')
            valid = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", word)
            if word in self.stop_words or valid is None:
                continue
            features.append(word.lower())
        # Bigram features over the raw (unstripped) tokens; pairs containing
        # a stop word or a non-alphanumeric token are skipped.
        for x, y in nltk.bigrams(words):
            valid_x = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", x)
            valid_y = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", y)
            if (x in self.stop_words or y in self.stop_words
                    or valid_x is None or valid_y is None):
                continue
            features.append(x + " " + y)
        return features

    def __init__(self,
                 stop_words_file,
                 related_training_data_file,
                 awareness_training_data_file,
                 needs_training,
                 related_classifier_dump_file,
                 awareness_classifier_dump_file,
                 feature_list_file,
                 classifier_type='nb'):
        self.helper = ClassifierHelper()
        self.stop_words = self.init_stop_words(stop_words_file)
        self.feature_list = []
        if needs_training:
            self.related_classifier = self.train_classifier(
                related_training_data_file, related_classifier_dump_file,
                feature_list_file, classifier_type)
            self.awareness_classifier = self.train_classifier(
                awareness_training_data_file, awareness_classifier_dump_file,
                feature_list_file, classifier_type)
        else:
            with open(related_classifier_dump_file, 'rb') as f:
                self.related_classifier = pickle.load(f)
            with open(awareness_classifier_dump_file, 'rb') as f:
                self.awareness_classifier = pickle.load(f)
            with open(feature_list_file, 'r') as f:
                for token in f:
                    self.feature_list.append(token.strip())

    def classify_awareness(self, tweet):
        processed_tweet = self.helper.process_tweet(tweet)
        return self.awareness_classifier.classify(
            self.extract_features(self.get_feature_vector(processed_tweet)))

    def classify_related(self, tweet):
        processed_tweet = self.helper.process_tweet(tweet)
        return self.related_classifier.classify(
            self.extract_features(self.get_feature_vector(processed_tweet)))

    def show_informative_features(self, n):
        # Note: the show='pos' keyword is only accepted by MaxentClassifier;
        # NaiveBayesClassifier.show_most_informative_features() takes only n,
        # so this call assumes the related classifier was trained as maxent.
        return self.related_classifier.show_most_informative_features(
            n, show='pos'
        ), self.awareness_classifier.show_most_informative_features(n)

    def train_classifier(self, training_data_file, classifier_dump_file,
                         feature_list_file, classifier_type):
        tweets = []
        with codecs.open(training_data_file, 'r', encoding='UTF-8') as f:
            training_data = csv.reader(f, delimiter=',', quotechar='|')
            for row in training_data:
                # Skip malformed rows that lack a (label, tweet) pair.
                if len(row) < 2:
                    continue
                sentiment = row[0]
                tweet = row[1]
                processed_tweet = self.helper.process_tweet(tweet)
                feature_vector = self.get_feature_vector(processed_tweet)
                self.feature_list.extend(feature_vector)
                tweets.append((feature_vector, sentiment))
        self.feature_list = list(set(self.feature_list))
        training_set = nltk.classify.apply_features(self.extract_features,
                                                    tweets)

        if classifier_type == 'nb':
            out_classifier = nltk.classify.NaiveBayesClassifier.train(
                training_set)
        elif classifier_type == 'maxent':
            out_classifier = nltk.classify.maxent.MaxentClassifier.train(
                training_set,
                'GIS',
                trace=3,
                labels=None,
                gaussian_prior_sigma=0,
                max_iter=10)
        else:
            raise ValueError('unknown classifier_type: %r' % classifier_type)
        with open(classifier_dump_file, 'wb') as f:
            pickle.dump(out_classifier, f)

        with open(feature_list_file, 'w') as f:
            for token in self.feature_list:
                f.write(token + '\n')
        return out_classifier

    def init_stop_words(self, stop_words_file):
        # 'AT_USER' and 'URL' are placeholder tokens treated as stop words.
        stop_words = ['AT_USER', 'URL']
        with open(stop_words_file, 'r') as f:
            for word in f:
                stop_words.append(word.strip())
        return stop_words
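
For reference, here is a minimal usage sketch for the class above. Every file path is a hypothetical placeholder, and it assumes ClassifierHelper and two training CSVs with (label, tweet) rows are available:

# Hypothetical paths; assumes ClassifierHelper and the training CSVs exist.
classifier = MaxEntClassifier(
    stop_words_file='stop_words.txt',
    related_training_data_file='related_training.csv',
    awareness_training_data_file='awareness_training.csv',
    needs_training=True,
    related_classifier_dump_file='related.pickle',
    awareness_classifier_dump_file='awareness.pickle',
    feature_list_file='feature_list.txt',
    classifier_type='maxent')
print(classifier.classify_related("Just got my flu shot at the clinic"))
print(classifier.classify_awareness("Flu season is here, wash your hands"))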
Example #2
import csv
import os.path
import pickle
from datetime import datetime

import nltk

# ClassifierHelper is assumed to be defined elsewhere in this project.


class NaiveBayesClassifier:
    """ Naive Bayes Classifier """
    def __init__(self, trainingDataFile, classifierDumpFile, datadir):
        # Instantiate classifier helper
        self.helper = ClassifierHelper('%s/%s' % (datadir, 'feature_list.txt'),
                                       '%s/%s' % (datadir, 'stop_words.txt'))
        self.trainingDataFile = trainingDataFile
        self.classifierPickled = classifierDumpFile
        self.last_trained = None
        self.classifier = self._getClassifier()

    def _getClassifier(self, reload_existing=False):
        # Record the time at which the classifier was (re)built.
        self.time = datetime.now()
        if reload_existing and os.path.exists(self.classifierPickled):
            # Reload the previously pickled classifier instead of retraining.
            # Pickle files must be opened in binary mode, and the loaded
            # classifier must be returned (a bare return would leave the
            # caller's self.classifier set to None).
            with open(self.classifierPickled, 'rb') as f:
                return pickle.load(f)
        return self._getNBTrainedClassifer(self.trainingDataFile,
                                           self.classifierPickled)

    def _getUniqData(self, data):
        uniq_data = {}
        for i in data:
            d = data[i]
            u = []
            for element in d:
                if element not in u:
                    u.append(element)
            # end inner loop
            uniq_data[i] = u
        # end outer loop
        return uniq_data

    def _getProcessedTweets(self, data):
        tweets = {}
        for i in data:
            d = data[i]
            tw = []
            for t in d:
                tw.append(self.helper.process_tweet(t))
            tweets[i] = tw
        # end loop
        return tweets

    def _getNBTrainedClassifer(self, trainingDataFile, classifierDumpFile):
        # Read all tweets and labels, train the classifier, and pickle it.
        tweets = self._getFilteredTrainingData(trainingDataFile)
        training_set = nltk.classify.apply_features(
            self.helper.extract_features, tweets)
        classifier = nltk.NaiveBayesClassifier.train(training_set)
        with open(classifierDumpFile, 'wb') as f:
            pickle.dump(classifier, f)
        return classifier

    def _getFilteredTrainingData(self, _file):
        # CSV files must be opened in text mode for csv.reader on Python 3.
        tweets = []
        with open(_file, 'r', encoding='UTF-8') as f:
            inpTweets = csv.reader(f, delimiter=',', quotechar='|')
            for row in inpTweets:
                # Skip malformed rows that lack a (label, tweet) pair.
                if len(row) < 2:
                    continue
                category = row[0]
                tweet = row[1]
                processedTweet = self.helper.process_tweet(tweet)
                featureVector = self.helper.getFeatureVector(processedTweet)
                tweets.append((featureVector, category))
        return tweets

    # classify words
    def classify(self, message):
        processedTestTweet = self.helper.process_tweet(message)
        classification = self.classifier.classify(
            self.helper.extract_features(
                self.helper.getFeatureVector(processedTestTweet)))
        return classification
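
As with the first example, a minimal usage sketch; the data directory layout and file names are assumptions, and ClassifierHelper must provide extract_features, getFeatureVector, and process_tweet:

# Hypothetical paths; assumes data/ holds feature_list.txt, stop_words.txt,
# and a training CSV with (label, tweet) rows.
nb = NaiveBayesClassifier(trainingDataFile='data/training.csv',
                          classifierDumpFile='data/nb_classifier.pickle',
                          datadir='data')
print(nb.classify("Feeling feverish, staying home today"))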