            else:
                tweet_id, sentiment, tweet = line.split(',')
            feature_vector = getFeatureVector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            # utils.write_status(i + 1, total)
    print('\n')
    return tweets
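
# A minimal sketch of the getFeatureVector helper used above -- an assumption,
# not the original code: it keeps only the tokens that appear in the top-n
# unigram vocabulary loaded in __main__ (the bigram case is omitted for brevity).
def getFeatureVector(tweet):
    return [word for word in tweet.split() if word in unigrams]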

if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FreqDistFile, UnigramSize)
    if UseBigrams:
        bigrams = utils.top_n_bigrams(BiFreqDistFile, BigramSize)
    tweets = process_tweets(TrainProcessedFile, test_file=False)
    # if TRAIN:
    train_tweets, val_tweets = utils.split_data(tweets)  # validation split
    # else:
    #     random.shuffle(tweets)
    #     train_tweets = tweets
    del tweets
    print('Extracting features and training batches')
    clf = svm.LinearSVC(C=0.1)
    batch_size = len(train_tweets)  # a single batch spanning the whole training set
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))  # == 1 with this batch size
    
    for training_set_x, training_set_y in extractFeatures(train_tweets, test_file=False, feat_type=FeatType, batch_size=batch_size):
        # utils.write_status(i, n_train_batches)
        i += 1
        # Assumed completion of the truncated loop body: fit the classifier on the batch.
        clf.fit(training_set_x, training_set_y)
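
# A minimal sketch of an extractFeatures generator compatible with the loop
# above. Assumptions, not the original code: `unigrams` maps token -> column
# index, and feat_type is either 'presence' or 'frequency'.
from scipy.sparse import lil_matrix

def extractFeatures(tweets, test_file=False, feat_type='presence', batch_size=500):
    num_batches = int(np.ceil(len(tweets) / float(batch_size)))
    for k in range(num_batches):
        batch = tweets[k * batch_size:(k + 1) * batch_size]
        features = lil_matrix((len(batch), UnigramSize))
        labels = np.zeros(len(batch))
        for j, item in enumerate(batch):
            words = item[1] if test_file else item[2]
            if not test_file:
                labels[j] = item[1]
            for word in words:
                idx = unigrams.get(word)
                if idx is None:
                    continue
                if feat_type == 'presence':
                    features[j, idx] = 1
                else:  # 'frequency'
                    features[j, idx] += 1
        yield features, labels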
Example #2

            else:
                tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
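
# A sketch of what utils.top_n_words (and, analogously, utils.top_n_bigrams)
# presumably does: load a pickled nltk FreqDist and map the N most common
# tokens to column indices. An assumption, not the project's actual helper.
import pickle

def top_n_words(pkl_file_name, N):
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    return {word: i for i, (word, _) in enumerate(freq_dist.most_common(N))}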


if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = DecisionTreeClassifier(max_depth=25)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    # Assumed completion of the truncated call and loop body (mirrors the
    # example above); FEAT_TYPE and the fit call are reconstructions.
    for training_set_X, training_set_y in extract_features(
            train_tweets,
            test_file=False,
            feat_type=FEAT_TYPE,
            batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        clf.fit(training_set_X, training_set_y)
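
# For reference, a utils.split_data along these lines would produce the
# train/validation pair used above -- a sketch assuming a simple 90/10
# shuffle-and-slice, not necessarily the project's actual helper:
def split_data(tweets, validation_split=0.1):
    random.shuffle(tweets)
    index = int((1 - validation_split) * len(tweets))
    return tweets[:index], tweets[index:]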
Example #3
    print(wordsList[:3])  # sanity check: peek at the first few entries

    # Save the processed data into CSV files (disjoint train/test slices).
    save_processed_file(wordsList[:31962],
                        './datasets/TRAIN_PROCESSED_FILE.csv',
                        './datasets/train_tweets.csv', False)
    save_processed_file(wordsList[31962:],
                        './datasets/TEST_PROCESSED_FILE.csv',
                        './datasets/test_tweets.csv', True)

    np.random.seed(1337)
    unigrams = utils.top_n_words('./datasets/train_tweets-freqdist.pkl',
                                 UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(
            './datasets/train_tweets-freqdist-bi.pkl', BIGRAM_SIZE)

    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    # Tree construction
    print('Extracting features & training batches')
    clf = DecisionTreeClassifier(max_depth=25)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    # Assumed completion of the truncated loop, as in the example above.
    for training_set_X, training_set_y in extract_features(
            train_tweets, test_file=False,
            feat_type=FEAT_TYPE, batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        clf.fit(training_set_X, training_set_y)