Code example #1
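A class method that maps a tweet to its unigram and bigram features: it splits the text into tokens and keeps those that appear in the top-N vocabularies loaded through the utils module. The same method appears inside the NaivebayesDPH class in example #4.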
def get_feature_vector(self, tweet):
    """Return (unigram_features, bigram_features) found in the tweet."""
    USE_BIGRAMS = True
    # Top-N vocabularies mapping tokens to indices. Loading the pickles on
    # every call is wasteful; they could be cached on the instance instead.
    unigrams = utils.top_n_words(self.FREQ_DIST_FILE, self.UNIGRAM_SIZE)
    bigrams = utils.top_n_bigrams(self.BI_FREQ_DIST_FILE, self.BIGRAM_SIZE)
    uni_feature_vector = []
    bi_feature_vector = []
    words = tweet.split()
    for i in range(len(words) - 1):
        word = words[i]
        next_word = words[i + 1]
        # Membership tests; unigrams.get(word) would silently drop the
        # token mapped to index 0.
        if word in unigrams:
            uni_feature_vector.append(word)
        if USE_BIGRAMS and (word, next_word) in bigrams:
            bi_feature_vector.append((word, next_word))
    # The loop stops one short of the end, so check the last word separately.
    if words and words[-1] in unigrams:
        uni_feature_vector.append(words[-1])
    return uni_feature_vector, bi_feature_vector
Code example #2
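dtmain trains a scikit-learn DecisionTreeClassifier on the processed training tweets and either reports validation accuracy or writes predictions to decisiontree.csv. It relies on module-level constants and helpers (extract_features, process_tweets, apply_tf_idf; see example #3) defined elsewhere in the same module. A plausible import block, inferred from the calls below (utils is a project-local helper module, not a standard package):

import random
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import utils  # project-local helper module (assumed)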
def dtmain(dataset):
    """Train a decision tree on the processed tweets and report the results."""
    results = {}
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    # DecisionTreeClassifier has no partial_fit, so train on a single batch
    # that contains the whole training set.
    clf = DecisionTreeClassifier(max_depth=25)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(
            train_tweets,
            test_file=False,
            feat_type=FEAT_TYPE,
            batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.fit(training_set_X, training_set_y)
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(val_tweets,
                                                     test_file=False,
                                                     feat_type=FEAT_TYPE,
                                                     batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        results['dataset'] = dataset
        results['correct'] = correct
        results['total'] = total
        results['result'] = round(correct * 100. / total, 2)
        print('\nCorrect: %d/%d = %.4f %%' %
              (correct, total, correct * 100. / total))
    else:
        del train_tweets
        test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print('Predicting batches')
        i = 1
        for test_set_X, _ in extract_features(test_tweets,
                                              test_file=True,
                                              feat_type=FEAT_TYPE,
                                              batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'decisiontree.csv')
        print('\nSaved to decisiontree.csv')
    return results
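A minimal way to invoke it; the argument is only recorded under the 'dataset' key of the returned dictionary, and the string below is an arbitrary example:

if __name__ == '__main__':
    print(dtmain('training-processed'))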
Code example #3
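Module-level configuration (file paths, vocabulary sizes, feature type) and the feature-extraction helper of the decision-tree module that dtmain in example #2 builds on.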
# Performs classification using Decision Tree.

FREQ_DIST_FILE = 'training-processed-freqdist.pkl'
BI_FREQ_DIST_FILE = 'training-processed-freqdist-bi.pkl'
TRAIN_PROCESSED_FILE = settings.MEDIA_ROOT + "\\" + 'training-processed.csv'
TEST_PROCESSED_FILE = 'testigdataset-processed.csv'

# True while training.
TRAIN = True
UNIGRAM_SIZE = 15000
VOCAB_SIZE = UNIGRAM_SIZE
BIGRAM_SIZE = 10000
# If using bigrams.
USE_BIGRAMS = False
unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)

if USE_BIGRAMS:
    VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
FEAT_TYPE = 'frequency'


def get_feature_vector(tweet):
    """Return (unigram_features, bigram_features) found in the tweet."""
    uni_feature_vector = []
    bi_feature_vector = []
    words = tweet.split()
    for i in range(len(words) - 1):
        word = words[i]
        next_word = words[i + 1]
        if word in unigrams:
            uni_feature_vector.append(word)
        if USE_BIGRAMS and (word, next_word) in bigrams:
            bi_feature_vector.append((word, next_word))
    # The loop stops one short of the end, so check the last word separately.
    if words and words[-1] in unigrams:
        uni_feature_vector.append(words[-1])
    return uni_feature_vector, bi_feature_vector
Code example #4
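The full NaivebayesDPH class: feature extraction over the top-N unigram and bigram vocabularies, optional tf-idf weighting, and training/evaluation of a MultinomialNB classifier. A plausible import block, inferred from the calls below (utils is a project-local helper module, and settings is taken to be a Django-style settings object because of MEDIA_ROOT):

import random
import numpy as np
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from django.conf import settings  # assumed: MEDIA_ROOT suggests a Django project
import utils  # project-local helper module (assumed)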
class NaivebayesDPH:
    FREQ_DIST_FILE = 'training-processed-freqdist.pkl'
    BI_FREQ_DIST_FILE = 'training-processed-freqdist-bi.pkl'
    TRAIN_PROCESSED_FILE = settings.MEDIA_ROOT + "\\" + 'training-processed.csv'
    TEST_PROCESSED_FILE = ''
    TRAIN = True
    UNIGRAM_SIZE = 15000
    VOCAB_SIZE = UNIGRAM_SIZE
    USE_BIGRAMS = True
    BIGRAM_SIZE = 10000
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)

    def __init__(self, testdataset):
        self.TEST_PROCESSED_FILE = testdataset

    # Evaluated once, when the class body is executed.
    if USE_BIGRAMS:
        VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
    FEAT_TYPE = 'frequency'

    def get_feature_vector(self, tweet):
        """Return (unigram_features, bigram_features) found in the tweet."""
        uni_feature_vector = []
        bi_feature_vector = []
        words = tweet.split()
        for i in range(len(words) - 1):
            word = words[i]
            next_word = words[i + 1]
            # Membership tests against the class-level vocabularies; reloading
            # the pickles on every call is unnecessary.
            if word in self.unigrams:
                uni_feature_vector.append(word)
            if self.USE_BIGRAMS and (word, next_word) in self.bigrams:
                bi_feature_vector.append((word, next_word))
        # The loop stops one short of the end, so check the last word separately.
        if words and words[-1] in self.unigrams:
            uni_feature_vector.append(words[-1])
        return uni_feature_vector, bi_feature_vector

    def extract_features(self,
                         tweets,
                         batch_size=500,
                         test_file=True,
                         feat_type='presence'):
        num_batches = int(np.ceil(len(tweets) / float(batch_size)))
        for i in range(num_batches):
            batch = tweets[i * batch_size:(i + 1) * batch_size]
            # Size the matrix by the actual batch length so the final,
            # possibly smaller, batch is handled correctly.
            features = lil_matrix((len(batch), self.VOCAB_SIZE))
            labels = np.zeros(len(batch))
            for j, tweet in enumerate(batch):
                if test_file:
                    tweet_words = tweet[1][0]
                    tweet_bigrams = tweet[1][1]
                else:
                    tweet_words = tweet[2][0]
                    tweet_bigrams = tweet[2][1]
                    labels[j] = tweet[1]
                if feat_type == 'presence':
                    tweet_words = set(tweet_words)
                    tweet_bigrams = set(tweet_bigrams)
                for word in tweet_words:
                    idx = self.unigrams.get(word)
                    # Compare against None so that index 0 (the most frequent
                    # word) is not dropped.
                    if idx is not None:
                        features[j, idx] += 1
                if self.USE_BIGRAMS:
                    for bigram in tweet_bigrams:
                        idx = self.bigrams.get(bigram)
                        if idx is not None:
                            features[j, self.UNIGRAM_SIZE + idx] += 1
            yield features, labels

    def apply_tf_idf(self, X):
        transformer = TfidfTransformer(smooth_idf=True,
                                       sublinear_tf=True,
                                       use_idf=True)
        transformer.fit(X)
        return transformer

    def process_tweets(self, csv_file, test_file=True):
        """Returns a list of tuples of type (tweet_id, feature_vector)
                or (tweet_id, sentiment, feature_vector)

        Args:
            csv_file (str): Name of processed csv file generated by preprocess.py
            test_file (bool, optional): If processing test file

        Returns:
            list: Of tuples
        """
        tweets = []
        print('Generating feature vectors')
        with open(csv_file, 'r') as f:
            lines = f.readlines()
            total = len(lines)
            for i, line in enumerate(lines):
                if test_file:
                    tweet_id, tweet = line.split(',')
                else:
                    tweet_id, sentiment, tweet = line.split(',')
                feature_vector = self.get_feature_vector(tweet)
                if test_file:
                    tweets.append((tweet_id, feature_vector))
                else:
                    tweets.append((tweet_id, int(sentiment), feature_vector))
                utils.write_status(i + 1, total)
        print('\n')
        return tweets

    def mainmethod(self):
        """Train the Naive Bayes model, then evaluate on the validation split
        (TRAIN = True) or predict on the test set (TRAIN = False)."""
        results = {}
        np.random.seed(1337)
        tweets = self.process_tweets(self.TRAIN_PROCESSED_FILE,
                                     test_file=False)
        if self.TRAIN:
            train_tweets, val_tweets = utils.split_data(tweets)
        else:
            random.shuffle(tweets)
            train_tweets = tweets
        del tweets
        print('Extracting features & training batches')
        clf = MultinomialNB()
        batch_size = len(train_tweets)
        i = 1
        n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
        for training_set_X, training_set_y in self.extract_features(
                train_tweets,
                test_file=False,
                feat_type=self.FEAT_TYPE,
                batch_size=batch_size):
            utils.write_status(i, n_train_batches)
            i += 1
            if self.FEAT_TYPE == 'frequency':
                tfidf = self.apply_tf_idf(training_set_X)
                training_set_X = tfidf.transform(training_set_X)
            clf.partial_fit(training_set_X, training_set_y, classes=[0, 1])
        print('\n')
        print('Testing')
        if self.TRAIN:
            correct, total = 0, len(val_tweets)
            i = 1
            batch_size = len(val_tweets)
            n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
            for val_set_X, val_set_y in self.extract_features(
                    val_tweets,
                    test_file=False,
                    feat_type=self.FEAT_TYPE,
                    batch_size=batch_size):
                if self.FEAT_TYPE == 'frequency':
                    val_set_X = tfidf.transform(val_set_X)
                prediction = clf.predict(val_set_X)
                correct += np.sum(prediction == val_set_y)
                utils.write_status(i, n_val_batches)
                i += 1
            # Record the final tallies once all validation batches are done.
            results['dataset'] = self.TEST_PROCESSED_FILE
            results['correct'] = correct
            results['total'] = total
            results['result'] = round(correct * 100. / total, 2)
            print('\nCorrect: %d/%d = %.4f %%' %
                  (correct, total, correct * 100. / total))
        else:
            del train_tweets
            test_tweets = self.process_tweets(self.TEST_PROCESSED_FILE,
                                              test_file=True)
            n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
            predictions = np.array([])
            print('Predicting batches')
            i = 1
            for test_set_X, _ in self.extract_features(
                    test_tweets,
                    test_file=True,
                    feat_type=self.FEAT_TYPE,
                    batch_size=batch_size):
                if self.FEAT_TYPE == 'frequency':
                    test_set_X = tfidf.transform(test_set_X)
                prediction = clf.predict(test_set_X)
                predictions = np.concatenate((predictions, prediction))
                utils.write_status(i, n_test_batches)
                i += 1
            predictions = [(str(j), int(predictions[j]))
                           for j in range(len(test_tweets))]
            utils.save_results_to_csv(predictions, 'naivebayes.csv')
            print('\nSaved to naivebayes.csv')
        return results
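A minimal usage sketch, assuming the pickled frequency-distribution files are in place; the test CSV path passed to the constructor is an arbitrary example:

nb = NaivebayesDPH(settings.MEDIA_ROOT + "\\" + 'testdataset-processed.csv')
scores = nb.mainmethod()  # with TRAIN = True this returns the validation tallies
print(scores)             # {'dataset': ..., 'correct': ..., 'total': ..., 'result': ...}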