def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    data_dir = HOME_DIR + '_' + input_fname

    # get vocabulary
    fname_vocab = os.path.join(data_dir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    print "Vocab size", len(alphabet)

    # get embeddings
    fname, delimiter, ndim = ('embeddings/smiley_tweets_embedding_final', ' ', 52)
    word2vec = load_glove_vec(fname, words, delimiter, ndim)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    # build the embedding matrix: one row per vocabulary id plus one extra row;
    # words without a pretrained vector get a small uniform random vector
    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
    for word, idx in alphabet.iteritems():
        word_vec = word2vec.get(word, None)
        if word_vec is None:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec
    print "Using uniform random vectors for words missing from the embeddings"
    print 'random_words_count', random_words_count
    print vocab_emb.shape

    outfile = os.path.join(data_dir, 'emb_smiley_tweets_embedding_final.npy')
    print outfile
    np.save(outfile, vocab_emb)
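# Note: every script in this section relies on a load_glove_vec helper that is
# defined elsewhere in the repository. A minimal sketch of its assumed behaviour
# (reading a delimiter-separated text file of word vectors and keeping only the
# requested words) could look like the following; the real implementation may differ.
import numpy as np


def load_glove_vec(fname, vocab, delimiter, ndim):
    """Hypothetical reader for '<word><delimiter><v1> ... <vN>' lines."""
    vocab = set(vocab)
    word2vec = {}
    with open(fname) as f:
        for line in f:
            parts = line.rstrip().split(delimiter)
            word, values = parts[0], parts[1:]
            if len(values) != ndim:
                continue  # skip malformed lines
            if vocab and word not in vocab:
                continue  # keep only words we actually need
            word2vec[word] = np.asarray(values, dtype='float32')
    return word2vec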
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52
    data_dir = HOME_DIR + '_' + input_fname

    # get the topic vocabulary; each entry may consist of several tokens
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))
    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    tok_words = {}
    words = []
    for word, idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)
    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)

    # get embeddings
    fname, delimiter, ndim = ('embeddings/updated_embeddings_custom_200M', ' ', ndim)
    word2vec = load_glove_vec(fname, words, delimiter, ndim)
    print 'len', len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    # a topic vector is the average of its token vectors; topics whose tokens
    # are all unknown get a uniform random vector
    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
    for idx, tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec:
                word_vec += word2vec[tok]
                isrand = 0
        if isrand:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32) / len(tok_word)
    print "Using uniform random vectors for topics with no known tokens"
    print 'random_words_count', random_words_count

    # reduce the topic vectors to 5 dimensions
    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape

    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
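# The matrices written above are meant to be used as lookup tables indexed by the
# ids stored in the vocabulary pickle. A small, hypothetical usage example
# (the path matches the output of the script above; the id is a placeholder):
import numpy as np

topic_emb = np.load('semeval_parsed_200M/emb_smiley_tweets_embedding_topic.npy')
print topic_emb.shape        # (len(alphabet) + 1, 5) after the TruncatedSVD step
topic_idx = 3                # hypothetical topic id from vocab_topic.pickle
print topic_emb[topic_idx]   # 5-dimensional vector for that topic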
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()
    fout = 'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname)
    fname, delimiter, ndim = ('embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    # assign a column index to every entry of the tagger's tag dictionary
    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)
    print nRows, ':', nCols
    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    # one indicator vector per embedding word
    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)
    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    # POS-tag every tweet and mark which columns each word occurs with
    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb and tag in tagidx:
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print 'Progress:', it
            it += 1

    # write the expanded embedding: the word followed by its indicator columns
    f = open(fout, 'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
    f.close()
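# The expanded file written above holds one line per word: the word itself
# followed by one 0/1 indicator per tag column. A hypothetical reader for that
# format (the helper name and exact parsing are assumptions, not part of the repo):
import numpy as np


def load_expanded_embedding(path, delimiter=' '):
    """Parse 'word i1 i2 ... iK' lines into a word -> indicator-vector dict."""
    expanded = {}
    with open(path) as f:
        for line in f:
            parts = line.rstrip().split(delimiter)
            expanded[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return expanded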
def main(argv):
    np.random.seed(123)
    data_dir = 'preprocessed_data'
    emb_path = 'embeddings/smiley_tweets_embedding_multilingual300M'
    emb_name = 'smiley_tweets_embedding_mixed2M_words'
    fname_vocab = os.path.join(data_dir, 'vocab_reduced.pickle')
    multi_emb_path = [
        'embeddings/smiley_tweets_embedding_netherlands_300M',
        'embeddings/smiley_tweets_embedding_german_300M',
        'embeddings/smiley_tweets_embedding_italian_300M',
        'embeddings/smiley_tweets_embedding_english_590M',
    ]

    try:
        opts, args = getopt.getopt(argv, "v:e:", ["vocab=", "embedding="])
    except getopt.GetoptError:
        print 'usage: {} -v <vocab> -e <embedding>'.format(sys.argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-v", "--vocab"):
            fname_vocab = os.path.join(data_dir, '{}.pickle'.format(arg))
        elif opt in ("-e", "--embedding"):
            emb_path = 'embeddings/{}'.format(arg)
            emb_name = arg

    # get vocabulary
    print fname_vocab
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    print "Vocab size", len(alphabet)

    # get embeddings: merge the per-language embedding files into one lookup
    word2vec = {}
    for p in multi_emb_path:
        fname, delimiter, ndim = (p, ' ', 52)
        word2vec.update(load_glove_vec(fname, words, delimiter, ndim))
    print len(word2vec.keys())
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    # words without a valid 52-dimensional pretrained vector get a uniform random vector
    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
    for word, (idx, freq) in alphabet.iteritems():
        word_vec = word2vec.get(word, None)
        if word_vec is None or word_vec.shape[0] != 52:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec
    print 'random_words_count', random_words_count
    print vocab_emb.shape

    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(emb_name))
    print outfile
    np.save(outfile, vocab_emb)
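# Hypothetical entry point: the repository presumably invokes main with the
# command-line arguments, so the vocabulary and embedding can be overridden, e.g.
#   python <script>.py -v vocab_reduced -e smiley_tweets_embedding_multilingual300M
# (the script name is a placeholder).
if __name__ == '__main__':
    main(sys.argv[1:])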
def main():
    outdir = "preprocessed_data"
    out_file = 'vocal_wembext.pickle'
    fname, delimiter, ndim = ('embeddings/smiley_tweets_embedding_multilingual300M', ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    # build an alphabet (token -> id) covering every word in the embedding file,
    # plus the special unknown and dummy entries
    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    alphabet.add('DUMMY_WORD_IDX')
    dummy_word_idx = alphabet.get('DUMMY_WORD_IDX')
    for token in word2vec.keys():
        alphabet.add(token)
    print 'Alphabet before purge:', len(alphabet)

    cPickle.dump(alphabet, open(os.path.join(outdir, out_file), 'wb'))
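# The Alphabet class is defined elsewhere in the repository. For context, a
# minimal dict-based sketch consistent with the calls above (add, get,
# start_feature_id) might look like this; the real class also tracks word
# frequencies, since purge_dict below unpacks (idx, freq) values.
class Alphabet(dict):
    """Hypothetical minimal Alphabet: maps tokens to auto-incremented ids."""

    def __init__(self, start_feature_id=1):
        self.first = start_feature_id
        self.fid = start_feature_id

    def add(self, item):
        idx = self.get(item, None)
        if idx is None:
            idx = self.fid
            self[item] = idx
            self.fid += 1
        return idx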
def purge_dict(self, input_fname, min_freq=5):
    # remove every word that occurs fewer than min_freq times AND is not
    # contained in the word embeddings; frequent or known words are kept
    # (input_fname is unused here; the embedding path is hard-coded)
    emb_fname, delimiter, ndim = ('embeddings/smiley_tweets_embedding_final', ' ', 52)
    word2vec = load_glove_vec(emb_fname, {}, delimiter, ndim)
    for k in self.keys():
        idx, freq = self[k]
        if freq < min_freq and word2vec.get(k, None) is None:
            del self[k]
        else:
            self[k] = idx

    # reset the feature ids after deletion
    self['UNK'] = 0
    counter = self.first
    for k, idx in sorted(self.items(), key=operator.itemgetter(1)):
        self[k] = counter
        counter += 1
    self.fid = counter
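# Hypothetical call site for purge_dict, assuming the alphabet was built with
# (idx, freq) values; the pickle path below is only an example:
alphabet = cPickle.load(open('parsed_tweets/vocab_words.pickle'))
print 'before purge:', len(alphabet)
alphabet.purge_dict(input_fname='200M', min_freq=5)
print 'after purge:', len(alphabet)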
def main():
    data_dir = "parsed_tweets"
    wemb_dir = 'embeddings/smiley_tweets_embedding_final'
    wemb_delimiter = ' '
    wemb_ndim = 52

    vocabs = [
        ('parsed_tweets/vocab_words.pickle', 'final'),
        ('parsed_tweets/vocab_hashtags.pickle', 'topn'),
    ]

    for fname_vocab, name in vocabs:
        # get vocabulary
        alphabet = cPickle.load(open(fname_vocab))
        words = alphabet.keys()
        print "Vocab size", len(alphabet)

        # get embeddings
        fname, delimiter, ndim = (wemb_dir, wemb_delimiter, wemb_ndim)
        word2vec = load_glove_vec(fname, words, delimiter, ndim)
        ndim = len(word2vec[word2vec.keys()[0]])
        print 'ndim', ndim

        # all words without a pretrained vector share one random vector
        random_words_count = 0
        vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
        rand_vec = np.random.uniform(-0.25, 0.25, ndim)
        for word, idx in alphabet.iteritems():
            word_vec = word2vec.get(word, None)
            if word_vec is None:
                word_vec = rand_vec
                random_words_count += 1
            vocab_emb[idx] = word_vec
        print "Using a single shared random vector for missing words"
        print 'random_words_count', random_words_count
        print vocab_emb.shape

        outfile = os.path.join(data_dir, 'emb_smiley_tweets_embedding_{}.npy'.format(name))
        print outfile
        np.save(outfile, vocab_emb)