Example #1
def process_tweets(csv_file, test_file=True):
    """Returns a list of tuples of type (tweet_id, feature_vector)
            or (tweet_id, sentiment, feature_vector)

    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file

    Returns:
        list: Of tuples
    """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as f:  # 'f' rather than 'csv', to avoid shadowing the stdlib module name
        lines = f.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            # split on the leading comma(s) only, in case the tweet text
            # itself contains commas
            if test_file:
                tweet_id, tweet = line.strip().split(',', 1)
            else:
                tweet_id, sentiment, tweet = line.strip().split(',', 2)
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
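
The helper get_feature_vector is used above but not shown. A minimal sketch of what it might look like, assuming it tokenizes the preprocessed tweet on whitespace and keeps only words found in a module-level unigrams vocabulary (both the tokenization rule and the unigrams lookup are assumptions, not part of the original):

def get_feature_vector(tweet):
    """Hypothetical helper: split a preprocessed tweet into tokens and
    keep those present in the assumed unigrams word -> index map."""
    words = tweet.split()
    return [word for word in words if word in unigrams]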
Example #2
def dtmain(dataset):
    results = {}  # renamed from 'dict' to avoid shadowing the builtin
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = DecisionTreeClassifier(max_depth=25)
    # DecisionTreeClassifier has no partial_fit, so train on one
    # full-size batch; repeated fit() calls would discard earlier ones
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(
            train_tweets,
            test_file=False,
            feat_type=FEAT_TYPE,
            batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.fit(training_set_X, training_set_y)
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(val_tweets,
                                                     test_file=False,
                                                     feat_type=FEAT_TYPE,
                                                     batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        results['dataset'] = dataset
        results['correct'] = correct
        results['total'] = total
        accuracy = correct * 100. / total
        results['result'] = round(accuracy, 2)
        print('\nCorrect: %d/%d = %.4f %%' % (correct, total, accuracy))
    else:
        del train_tweets
        test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print('Predicting batches')
        i = 1
        # pass batch_size explicitly so n_test_batches matches the
        # generator's actual batching
        for test_set_X, _ in extract_features(test_tweets,
                                              test_file=True,
                                              feat_type=FEAT_TYPE,
                                              batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'decisiontree.csv')
        print('\nSaved to decisiontree.csv')
    return results
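
Example #2 obtains a tfidf object from apply_tf_idf and reuses its transform method on every batch. A plausible sketch using scikit-learn's TfidfTransformer (an assumption; the original helper is not shown):

from sklearn.feature_extraction.text import TfidfTransformer

def apply_tf_idf(X):
    """Hypothetical helper: fit a TF-IDF transformer on the raw
    term-frequency matrix X and return it for reuse on later batches."""
    transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True)
    transformer.fit(X)
    return transformer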
Example #3
 def mainmethod(self):
     USE_BIGRAMS = True
     results = {}  # renamed from 'dict' to avoid shadowing the builtin
     np.random.seed(1337)
     # unigrams = utils.top_n_words(self.FREQ_DIST_FILE, self.UNIGRAM_SIZE)
     # if USE_BIGRAMS:
     #     bigrams = utils.top_n_bigrams(self.BI_FREQ_DIST_FILE, self.BIGRAM_SIZE)
     tweets = self.process_tweets(self.TRAIN_PROCESSED_FILE,
                                  test_file=False)
     if self.TRAIN:
         train_tweets, val_tweets = utils.split_data(tweets)
     else:
         random.shuffle(tweets)
         train_tweets = tweets
     del tweets
     print('Extracting features & training batches')
     clf = MultinomialNB()
     # one full-size batch here, though MultinomialNB.partial_fit would
     # equally support smaller incremental batches
     batch_size = len(train_tweets)
     i = 1
     n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
     for training_set_X, training_set_y in self.extract_features(
             train_tweets,
             test_file=False,
             feat_type=self.FEAT_TYPE,
             batch_size=batch_size):
         utils.write_status(i, n_train_batches)
         i += 1
         if self.FEAT_TYPE == 'frequency':
             tfidf = self.apply_tf_idf(training_set_X)
             training_set_X = tfidf.transform(training_set_X)
         clf.partial_fit(training_set_X, training_set_y, classes=[0, 1])
     print('\n')
     print('Testing')
     if self.TRAIN:
         correct, total = 0, len(val_tweets)
         i = 1
         batch_size = len(val_tweets)
         n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
         for val_set_X, val_set_y in self.extract_features(
                 val_tweets,
                 test_file=False,
                 feat_type=self.FEAT_TYPE,
                 batch_size=batch_size):
             if self.FEAT_TYPE == 'frequency':
                 val_set_X = tfidf.transform(val_set_X)
             prediction = clf.predict(val_set_X)
             correct += np.sum(prediction == val_set_y)
             utils.write_status(i, n_val_batches)
             i += 1
         results['dataset'] = self.TEST_PROCESSED_FILE
         results['correct'] = correct
         results['total'] = total
         accuracy = correct * 100. / total
         results['result'] = round(accuracy, 2)
         print('\nCorrect: %d/%d = %.4f %%' % (correct, total, accuracy))
     else:
         del train_tweets
         test_tweets = self.process_tweets(self.TEST_PROCESSED_FILE,
                                           test_file=True)
         n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
         predictions = np.array([])
         print('Predicting batches')
         i = 1
         # pass batch_size explicitly so n_test_batches matches the
         # generator's actual batching
         for test_set_X, _ in self.extract_features(
                 test_tweets, test_file=True, feat_type=self.FEAT_TYPE,
                 batch_size=batch_size):
             if self.FEAT_TYPE == 'frequency':
                 test_set_X = tfidf.transform(test_set_X)
             prediction = clf.predict(test_set_X)
             predictions = np.concatenate((predictions, prediction))
             utils.write_status(i, n_test_batches)
             i += 1
         predictions = [(str(j), int(predictions[j]))
                        for j in range(len(test_tweets))]
         utils.save_results_to_csv(predictions, 'naivebayes.csv')
         print('\nSaved to naivebayes.csv')
     return results
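
All three examples iterate over an extract_features generator that is never shown. A minimal sketch consistent with how it is called, assuming a module-level unigrams word-to-index map and a VOCAB_SIZE constant (both hypothetical names):

import numpy as np
from scipy.sparse import lil_matrix

def extract_features(tweets, batch_size=500, test_file=True,
                     feat_type='presence'):
    """Hypothetical generator: yield (X, y) batches, X a sparse
    tweets-by-vocabulary matrix, y the sentiment labels."""
    num_batches = int(np.ceil(len(tweets) / float(batch_size)))
    for b in range(num_batches):
        batch = tweets[b * batch_size:(b + 1) * batch_size]
        features = lil_matrix((len(batch), VOCAB_SIZE))
        labels = np.zeros(len(batch))
        for j, tweet in enumerate(batch):
            if test_file:
                tweet_words = tweet[1]   # (tweet_id, feature_vector)
            else:
                tweet_words = tweet[2]   # (tweet_id, sentiment, feature_vector)
                labels[j] = tweet[1]
            for word in tweet_words:
                idx = unigrams.get(word)  # assumed word -> column index map
                if idx is not None:
                    if feat_type == 'presence':
                        features[j, idx] = 1
                    else:  # 'frequency' counts occurrences
                        features[j, idx] += 1
        yield features, labels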