Esempio n. 1
0
def _usage_exit():
    """Print the command-line usage line and terminate with exit status 2."""
    print('test.py -i <inputfile> -o <outputfile>')
    sys.exit(2)


def _parse_options(argv):
    """Parse the command line of main().

    Returns a tuple (in_file, out_file, out_reduced, max_tweets, vocab_path):
    the gzip input path, the two pickle file names, the tweet cap (np.inf when
    -m is absent) and the embeddings path (None when -f is absent).

    Exits with status 2 when getopt rejects the arguments or when either the
    input (-i) or the output (-o) option is missing.
    """
    try:
        # Every option consumes a value, so each long name needs a trailing
        # '=' -- without it getopt treats the option as a bare flag.
        opts, _ = getopt.getopt(
            argv, "i:o:f:m:", ["ifile=", "ofile=", "wfilter=", "maxTweets="])
    except getopt.GetoptError:
        _usage_exit()

    in_file = ''
    out_file = ''
    out_reduced = ''
    max_tweets = np.inf
    vocab_path = None
    for opt, arg in opts:
        if opt in ("-o", "--ofile"):
            out_file = '{}.pickle'.format(arg)
            out_reduced = '{}_reduced.pickle'.format(arg)
        elif opt in ("-i", "--ifile"):
            in_file = 'semeval/{}.gz'.format(arg)
        elif opt in ('-f', '--wfilter'):
            vocab_path = 'embeddings/{}'.format(arg)
        elif opt in ('-m', '--maxTweets'):
            max_tweets = int(arg)

    # Fail fast: without these the run would only break later on an empty
    # path (for -o, after the whole corpus has already been processed).
    if not in_file or not out_file:
        _usage_exit()

    return in_file, out_file, out_reduced, max_tweets, vocab_path


def main(argv):
    """Build a token alphabet from a gzipped tweet file and pickle it.

    Command-line options (argv excludes the program name):
      -i / --ifile NAME      read tweets from semeval/NAME.gz (required)
      -o / --ofile NAME      write preprocessed_data/NAME.pickle and
                             preprocessed_data/NAME_reduced.pickle (required)
      -f / --wfilter NAME    load embeddings/NAME with load_glove_vocabulary
                             and only add tokens contained in it
      -m / --maxTweets N     process at most N tweets (default: no limit)

    Each line of the input is passed through preprocess_tweet and the
    TweetTokenizer, and every resulting token is added to an Alphabet.
    The alphabet is pickled once after the scan and once more after the
    second ("purge") pass.

    Exits with status 2 on invalid or incomplete options. A non-integer
    value for -m raises ValueError.

    print is always called with one pre-formatted string so the output is
    the same whether it is the Python 2 statement or the print function.
    """
    in_file, out_file, out_reduced, max_tweets, vocab_path = \
        _parse_options(argv)

    fwemb_vocabulary = None
    if vocab_path is not None:
        fwemb_vocabulary = load_glove_vocabulary(vocab_path, ' ')

    outdir = "preprocessed_data"
    print(outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # unsupervised data
    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')

    tknzr = TweetTokenizer(reduce_len=True)

    counter = 0
    with gzip.open(in_file, 'r') as f:
        for tweet in tqdm(f):
            # Check the cap before doing any work so that exactly max_tweets
            # tweets are processed (including the max_tweets == 0 case).
            if counter >= max_tweets:
                break
            for token in tknzr.tokenize(preprocess_tweet(tweet)):
                # With a (non-empty) filter vocabulary only known tokens are
                # kept; otherwise every token is added.
                if not fwemb_vocabulary or token in fwemb_vocabulary:
                    alphabet.add(token)
            counter += 1
            if counter % 1000000 == 0:
                print('Processed tweets: {}'.format(counter))
                print('Alphabet Lenght: {}'.format(len(alphabet)))
    print(len(alphabet))

    print('Alphabet before purge: {}'.format(len(alphabet)))
    with open(os.path.join(outdir, out_file), 'wb') as handle:
        cPickle.dump(alphabet, handle)

    # NOTE(review): this pass re-adds frequent words to the *same* alphabet;
    # nothing visible here removes entries, so it presumably does not shrink
    # the alphabet as the "purge" wording suggests -- confirm against the
    # Alphabet implementation before relying on the reduced file.
    # Iterating over a snapshot keeps the loop independent of those add calls.
    for word, (_, freq) in tqdm(list(alphabet.items())):
        if freq > 10:
            alphabet.add(word)

    # NOTE(review): the trailing double quote inside this literal looks like a
    # typo, but it is kept because it determines the key stored in the pickle.
    alphabet.add('DUMMY_WORD_IDX"')
    print('Alphabet after purge: {}'.format(len(alphabet)))
    with open(os.path.join(outdir, out_reduced), 'wb') as handle:
        cPickle.dump(alphabet, handle)