Example No. 1
def transform_20ng_dataset(ouput_file):
    """
    Preprocess the 20 Newsgroups dataset and pickle the result.

    Fits a WordVectorizer on the training split, transforms both splits
    into vectorized form, and writes everything (data, labels, vocabulary
    maps, and the GloVe embedding matrix) to a single pickle file.

    :param ouput_file: path of the pickle file to write (name kept as-is
        for backward compatibility; note the typo)
    :return: None
    """

    x_train, y_train = get_20ng(split='train')
    x_test, y_test = get_20ng(split='test')

    # Fit the vocabulary on the training split only, then grab the
    # matching GloVe embedding matrix before transforming either split.
    vectorizer = WordVectorizer()
    vectorizer.fit(x_train)
    embedding_matrix = vectorizer.get_glove_embedding()

    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)

    data = {
        'train_data': x_train,
        'test_data': x_test,
        'train_labels': y_train,
        'test_labels': y_test,
        'word2id': vectorizer.word2id,
        'id2word': vectorizer.id2word,
        'embedding': embedding_matrix,
    }

    with open(ouput_file, "wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
import os
import gensim
from tweet import Tweet
from word_vectorizer import WordVectorizer
import csv

# Load the pretrained word2vec embedding model from a plain-text
# (non-binary) vector file; per the path, presumably 200-dimensional
# vectors trained on 200M German tweets — confirm against the data.
word2vecModule = gensim.models.KeyedVectors.load_word2vec_format(
    './embed_tweets_de_200M_200D/embedding_file', binary=False)

# Initialize the WordVectorizer model with the loaded embeddings.
word_vectorizer = WordVectorizer(word2vecModule)

# Load the train data file to get the tweets.
with open('../tweetsCrawler/train.csv', 'r') as train_data:
    # Write each tweet and its vector matrix to vectors.csv.
    with open('./vectors.csv', 'w') as vectors_data:

        writer = csv.writer(vectors_data)
        # Write the header row to vectors.csv.
        writer.writerow([
            "politician_name", "party", "tweet", "matrix",
            "percentageOfMissingWords"
        ])
        # Rows to be written to vectors.csv (buffered here first;
        # presumably flushed after the loop — continues past this view).
        out_rows = []

        reader = csv.DictReader(train_data)
        for row in reader:
            current_tweet = Tweet(row['tweet'])
            # Preprocess the tweet and get its list of tokens.