Example #1

import csv

import nltk  # nltk.word_tokenize requires the 'punkt' tokenizer data

class SentimentAnalyzer:
    threshold = 1  # a token must appear more than this many times to become a feature

    def __init__(self, file_location):
        self.features = set()
        raw_data = []
        training_data = []
        word_freq = {}
        # the training CSV has a 'headline' column plus six emotion-intensity
        # columns scored 0-100 (note the leading space in the column names)
        with open(file_location, newline='') as data:
            data_reader = csv.DictReader(data)
            for row in data_reader:
                h_tokens = nltk.word_tokenize(row['headline'].lower())

                # count corpus-wide token frequencies
                for token in h_tokens:
                    word_freq[token] = word_freq.get(token, 0) + 1

                # one training triple per (headline, emotion) pair, with each
                # 0-100 intensity rescaled to [0, 1]
                for label, column in enumerate((' anger', ' disgust', ' fear',
                                                ' joy', ' sadness',
                                                ' surprise')):
                    raw_data.append(
                        (h_tokens, label, float(row[column]) / 100))

        # keep only tokens seen more than `threshold` times as features
        for token, freq in word_freq.items():
            if freq > self.threshold:
                self.features.add(token)

        print "F-vec size: " + str(len(self.features))

        # encode each example as a binary bag-of-words vector over the feature
        # set (set iteration order is stable within one run, so training and
        # prediction vectors line up)
        for tokens, label, intensity in raw_data:
            f_vector = [1 if f in tokens else 0 for f in self.features]
            training_data.append((f_vector, label, intensity))

        # NaiveBayesClassifier (assumed defined elsewhere) takes the number of
        # classes (six emotions) and the feature-vector length
        self.classifier = NaiveBayesClassifier(6, len(self.features))
        self.classifier.train(training_data)

    def _vectorize(self, text):
        # same binary bag-of-words encoding used for the training data
        token_set = set(nltk.word_tokenize(text.lower()))
        return [1 if f in token_set else 0 for f in self.features]

    def predict(self, text):
        return self.classifier.predict(self._vectorize(text))

    def predict_all(self, text):
        return self.classifier.predict_all(self._vectorize(text))

    def test(self, test_file_location):
        with open(test_file_location, newline='') as test_data:
            test_reader = list(csv.DictReader(test_data))
        total = 0
        correct = 0

        for row in test_reader:
            total += 1
            emotions = [float(row[c]) for c in (' anger', ' disgust', ' fear',
                                                ' joy', ' sadness',
                                                ' surprise')]
            # candidate labels: emotions scoring above 1 on the 0-100 scale,
            # keeping at most the three strongest
            acceptable_emotions = [i for i in range(len(emotions))
                                   if emotions[i] > 1]
            acceptable_emotions = sorted(acceptable_emotions,
                                         reverse=True,
                                         key=lambda i: emotions[i])[:3]

            prediction = self.predict(row['headline'])[0]

            if prediction in acceptable_emotions:
                correct += 1

        return float(correct) / total
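

# Usage sketch (an addition, not part of the original example): it assumes a
# SemEval-2007 "Affective Text"-style CSV with a 'headline' column plus the
# six emotion columns used above; both file paths are hypothetical.
analyzer = SentimentAnalyzer('affective_text.train.csv')
print(analyzer.predict('markets tumble as recession fears grow'))
print('top-3 accuracy: %.3f' % analyzer.test('affective_text.test.csv'))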


Example #2

import glob
import random
import re
from collections import Counter

# The original snippet begins mid-loop, so the corpus-reading loop below is a
# reconstruction. It assumes a directory of message files at a hypothetical
# path, where "ham" in a file name marks a non-spam message, and it assumes a
# NaiveBayesClassifier defined elsewhere.
data = []
for fn in glob.glob('spam_corpus/*/*'):
    is_spam = 'ham' not in fn
    with open(fn, 'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject:", "", line).strip()
                data.append((subject, is_spam))


def split_data(data, p):
    """Split data into a fraction p for training and the rest for testing."""
    cut = int(len(data) * p)
    return data[:cut], data[cut:]


def in_random_order(data):
    """Return a shuffled copy of data, leaving the original list untouched."""
    result = data[:]
    random.shuffle(result)
    return result
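
# Aside (an addition, not in the original): in_random_order(data) behaves like
# random.sample(data, len(data)), which also returns a shuffled copy.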


random.seed(0)  # fix the seed so the train/test split is reproducible
train_data, test_data = split_data(in_random_order(data), 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

# (message, actual label, predicted spam probability) for each test message
classified = [(message, is_spam, classifier.classify(message))
              for message, is_spam in test_data]

# count (actual is_spam, predicted is_spam) pairs at a 0.5 threshold
counts = Counter(
    (is_spam, spam_prob > 0.5) for (_, is_spam, spam_prob) in classified)
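
# Follow-up sketch (an addition): precision and recall at the 0.5 threshold,
# computed from the (actual, predicted) counts above. Counter returns 0 for
# missing keys, so absent combinations are handled.
tp = counts[(True, True)]    # spam correctly flagged as spam
fp = counts[(False, True)]   # ham incorrectly flagged as spam
fn = counts[(True, False)]   # spam that slipped through as ham
precision = tp / (tp + fp) if tp + fp else 0.0
recall = tp / (tp + fn) if tp + fn else 0.0
print('precision: %.3f, recall: %.3f' % (precision, recall))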