Example no. 1
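All of the examples below assume the same Python 2 environment, but the listing never shows the import block. A plausible set of imports is sketched here; the module paths of the project-local helpers load_glove_vec and Alphabet are not shown in the source, so they are left as commented placeholders.

import os
import sys
import gzip
import getopt
import operator
import cPickle

import numpy as np
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import TweetTokenizer
from nltk.tag.perceptron import PerceptronTagger
from nltk.tag import _pos_tag  # private NLTK helper used by the POS-tag example (older NLTK releases)

# project-local helpers, module path not shown in the source:
# from <project_module> import load_glove_vec, Alphabet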
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'

    data_dir = HOME_DIR + '_' + input_fname

    #get vocabulary
    fname_vocab = os.path.join(data_dir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    print "Vocab size", len(alphabet)

    #get embeddings
    fname, delimiter, ndim = ('embeddings/smiley_tweets_embedding_final', ' ',
                              52)
    word2vec = load_glove_vec(fname, words, delimiter, ndim)

    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
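    # fill the matrix row by row; words without a pre-trained vector
    # get a random uniform vector in [-0.25, 0.25]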
    for word, idx in alphabet.iteritems():
        word_vec = word2vec.get(word, None)
        if word_vec is None:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec
    print "Using zero vector as random"
    print 'random_words_count', random_words_count
    print vocab_emb.shape
    outfile = os.path.join(data_dir, 'emb_smiley_tweets_embedding_final.npy')
    print outfile
    np.save(outfile, vocab_emb)
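Every example calls load_glove_vec to read a text file of pre-trained vectors into a word-to-vector dict, but the helper itself is not part of the listing. The following is only a minimal sketch of what it plausibly does; the body, including the vocabulary-filter rule, is an assumption, and only the signature is taken from the calls above.

def load_glove_vec(fname, vocab, delimiter, ndim):
    """Load GloVe/word2vec-style text vectors; keep only words in `vocab`
    (an empty vocab means keep everything)."""
    word2vec = {}
    with open(fname) as fin:
        for line in fin:
            parts = line.rstrip().split(delimiter)
            word, vals = parts[0], parts[1:1 + ndim]
            if vocab and word not in vocab:
                continue
            word2vec[word] = np.array(vals, dtype='float32')
    return word2vec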
Example no. 2
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52

    data_dir = HOME_DIR + '_' + input_fname
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))

    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    tok_words = {}
    words = []
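    # tokenize each vocabulary entry (a topic can span several words)
    # and collect the individual tokens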
    for word, idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)

    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)
    fname, delimiter, ndim = ('embeddings/updated_embeddings_custom_200M',
                              ' ', ndim)

    word2vec = load_glove_vec(fname, words, delimiter, ndim)

    print 'len', len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')

    # average the vectors of a topic's tokens; if none of the tokens has a
    # pre-trained vector, fall back to a random uniform vector
    for idx, tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec:
                word_vec += word2vec[tok]
                isrand = 0

        if isrand:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32) / max(len(tok_word), 1)
    print "Using zero vector as random"
    print 'random_words_count', random_words_count

    # compress the topic embeddings to 5 dimensions with truncated SVD
    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape
    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir,
                           'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
Example no. 4
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()

    fout = (
        'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname))
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)
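    # the empty dict presumably disables the vocabulary filter, so every
    # vector in the file is loaded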

    tagdict = tagger.tagdict
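    # tagdict maps unambiguous words to a single POS tag; its size fixes the
    # number of indicator columns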
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)

    print nRows, ':', nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    # one binary POS-tag indicator vector per word in the embedding vocabulary
    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    # POS-tag every tweet and mark, for each known word, which tags it occurs with
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb and tag in tagidx:
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print 'Progress:', it
            it += 1

    # write the expanded embedding: one line per word, followed by its tag indicators
    f = open(fout, 'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
    f.close()
Example no. 7
def main(argv):
    np.random.seed(123)
    data_dir = 'preprocessed_data'
    emb_path = 'embeddings/smiley_tweets_embedding_multilingual300M'
    emb_name = 'smiley_tweets_embedding_mixed2M_words'
    fname_vocab = os.path.join(data_dir, 'vocab_reduced.pickle')
    multi_emb_path = [
        'embeddings/smiley_tweets_embedding_netherlands_300M',
        'embeddings/smiley_tweets_embedding_german_300M',
        'embeddings/smiley_tweets_embedding_italian_300M',
        'embeddings/smiley_tweets_embedding_english_590M',
    ]

    try:
        opts, args = getopt.getopt(argv, "v:e:", ["vocab=", "embedding="])
    except getopt.GetoptError:
        print 'usage: -v/--vocab <vocab> -e/--embedding <embedding>'
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-v", "--vocab"):
            fname_vocab = os.path.join(data_dir, '{}.pickle'.format(arg))
        elif opt in ("-e", "--embedding"):
            emb_path = 'embeddings/{}'.format(arg)
            emb_name = arg

    #get vocabulary
    print(fname_vocab)
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    print "Vocab size", len(alphabet)

    word2vec = {}
    #get embeddings
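    # merge the per-language embedding files into one lookup; later files
    # overwrite earlier entries for words that appear in both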
    for p in multi_emb_path:
        fname, delimiter, ndim = (p, ' ', 52)
        word2vec.update(load_glove_vec(fname, words, delimiter, ndim))

    print len(word2vec.keys())
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
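    # alphabet entries are (index, frequency) pairs in this vocabulary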
    for word, (idx, freq) in alphabet.iteritems():
        word_vec = word2vec.get(word, None)
        if word_vec is None or word_vec.shape[0] != ndim:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec
    print 'random_words_count', random_words_count
    print vocab_emb.shape
    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(emb_name))
    print outfile
    np.save(outfile, vocab_emb)
Example no. 8
def main():
    outdir = "preprocessed_data"
    out_file = 'vocal_wembext.pickle'
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_multilingual300M', ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    alphabet.add('DUMMY_WORD_IDX')
    dummy_word_idx = alphabet.get('DUMMY_WORD_IDX')

    # add every token from the pre-trained embedding file to the alphabet
    for token in word2vec.keys():
        alphabet.add(token)

    print 'Alphabet before purge:', len(alphabet)
    cPickle.dump(alphabet, open(os.path.join(outdir, out_file), 'wb'))
Example no. 9
    def purge_dict(self, input_fname, min_freq=5):
        # remove words that occur fewer than min_freq times and have no
        # pre-trained embedding vector
        emb_fname, delimiter, ndim = ('embeddings/smiley_tweets_embedding_final',
                                      ' ', 52)

        word2vec = load_glove_vec(emb_fname, {}, delimiter, ndim)
        for k in self.keys():
            idx, freq = self[k]
            if freq < min_freq and k not in word2vec:
                del self[k]
            else:
                self[k] = idx

        #reset fid after deletion
        self['UNK'] = 0
        counter = self.first
        for k, idx in sorted(self.items(), key=operator.itemgetter(1)):
            self[k] = counter
            counter += 1
        self.fid = counter
def main():
    data_dir = "parsed_tweets"
    wemb_dir = 'embeddings/smiley_tweets_embedding_final'
    wemb_delimiter = ' '
    wemb_ndim = 52

    vocabs = [('parsed_tweets/vocab_words.pickle', 'final'),
              ('parsed_tweets/vocab_hashtags.pickle', 'topn')]
    for fname_vocab, name in vocabs:
        #get vocabulary
        alphabet = cPickle.load(open(fname_vocab))
        words = alphabet.keys()
        print "Vocab size", len(alphabet)

        #get embeddings
        fname, delimiter, ndim = (wemb_dir, wemb_delimiter, wemb_ndim)
        word2vec = load_glove_vec(fname, words, delimiter, ndim)

        ndim = len(word2vec[word2vec.keys()[0]])
        print 'ndim', ndim

        random_words_count = 0
        vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
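        # a single shared random vector is reused for every OOV word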
        rand_vec = np.random.uniform(-0.25, 0.25, ndim)

        for word, idx in alphabet.iteritems():
            word_vec = word2vec.get(word, None)
            if word_vec is None:
                word_vec = rand_vec
                random_words_count += 1
            vocab_emb[idx] = word_vec
        print "Using zero vector as random"
        print 'random_words_count', random_words_count
        print vocab_emb.shape
        outfile = os.path.join(
            data_dir, 'emb_smiley_tweets_embedding_{}.npy'.format(name))
        print outfile
        np.save(outfile, vocab_emb)