import gensim

# NOTE: filter_eachTok_rmLinks, filter_eachToken and write_file are helper
# functions defined elsewhere in this module; the gensim calls below use the
# old (pre-1.0) gensim API, matching the Python 2 style of this code.


def wordVec_facebook(sents, path_w, name_w, win_size):
    """Train a Word2Vec model on cleaned Facebook sentences and write the
    L2-normalised vectors to a file, one tab-separated token line each.

    Caution: win_size is passed as `size` (the embedding dimensionality),
    not as the context window, which is fixed at 5.
    """
    list_all = list()
    for i, sent in enumerate(sents):
        tokens = list()
        for token in sent.split():
            token_filter = filter_eachTok_rmLinks(token, 'model')
            if len(token_filter) > 0:
                tokens.append(token_filter.lower())
        print i  # progress indicator
        list_all.append(tokens)

    model = gensim.models.Word2Vec(list_all, size=win_size, window=5,
                                   min_count=1, workers=5)
    # Sanity check; this call also runs init_sims(), which fills syn0norm.
    print model.most_similar(['bus'])

    list_write = list()
    for i in range(len(model.index2word)):
        line = model.index2word[i]
        for value in model.syn0norm[i]:
            line += '\t' + str(value)
        list_write.append(line)
        print line
    write_file(path_w, name_w + '_%i' % win_size, list_write)
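# A minimal usage sketch (the file name and output path here are
# hypothetical, not taken from this repository):
#
#   sents = [line.strip() for line in open('facebook_posts.txt')]
#   wordVec_facebook(sents, 'models/', 'fb_word2vec', 100)
#
# which would presumably write models/fb_word2vec_100 containing one
# "token<TAB>v1<TAB>...<TAB>v100" line per vocabulary entry.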
def load_facebook(texts):
    """Extract the message body (third tab-separated field) from each raw
    Facebook line, clean it token by token, and lowercase it."""
    posts = list()
    for t in texts:
        split_t = t.split("\t")
        post = ""
        for w in split_t[2].split():
            post += filter_eachTok_rmLinks(w, "model") + " "
        posts.append(post.strip().lower())
    return posts
def load_tweets(texts):
    """Same as load_facebook, but cleans each token with the 'twitter'
    filter command instead of 'model'."""
    tweets = list()
    for t in texts:
        split_t = t.split("\t")
        tweet = ""
        for w in split_t[2].split():
            tweet += filter_eachTok_rmLinks(w, "twitter") + " "
        tweets.append(tweet.strip().lower())
    return tweets
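# Both loaders assume tab-separated input with the message text in the third
# column. A minimal sketch (file name and line layout are illustrative):
#
#   lines = open('tweets.tsv').read().splitlines()
#   # e.g. "12345\t2015-01-01\tso crowded on the bus http://t.co/x"
#   tweets = load_tweets(lines)
#
# yielding lowercased messages with links removed by filter_eachTok_rmLinks.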
def check_token(token, command):
    """Return True if `token` filters down to the empty string under the
    source-specific filter, i.e. it carries no usable text.

    Note: an unrecognised command falls through and returns None.
    """
    if command in ('twitter', 'facebook'):
        text = filter_eachTok_rmLinks(token, command)
        return len(text.strip()) == 0
    if command == 'sgforums':
        text = filter_eachToken(token, command)
        return len(text.strip()) == 0
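# A minimal sketch (assuming, as its name suggests, that
# filter_eachTok_rmLinks strips URLs; the example tokens are illustrative):
#
#   check_token('http://t.co/abc', 'twitter')  # True: nothing survives
#   check_token('hello!!!', 'twitter')         # False: 'hello' survives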
def filtering_text_demo(list_line, command):
    """Clean each line for the demo output: keep tokens that survive
    filter_eachTok_rmLinks, tab-joined, with two blank entries appended
    after every cleaned line."""
    list_demo = list()
    for line in list_line:
        text = ''
        for token in line.split():
            token_filter = filter_eachTok_rmLinks(token, command)
            if len(token_filter) != 0:
                text += token_filter + '\t'
        list_demo.append(text.strip())
        list_demo.append('\n')
        list_demo.append('\n')
    return list_demo
def filterText_demo(list_line, command, command_data):
    """Clean every line with one of two filters: 'removePunc' strips
    punctuation only; 'removeLink' strips punctuation and links."""
    list_convert = list()
    for line in list_line:
        text = ''
        for token_ in line.strip().split():
            if command == 'removePunc':
                token_filter = filter_eachToken(token_, command_data)
            elif command == 'removeLink':
                token_filter = filter_eachTok_rmLinks(token_, command_data)
            else:
                print 'You need to give the correct command: removePunc or removeLink'
                quit()
            if len(token_filter) != 0:
                text += token_filter + '\t'
        list_convert.append(text.strip())
    return list_convert
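# A minimal sketch (the sample lines are illustrative):
#
#   lines = ['Great ride!! http://bus.example/1', 'so crowded...']
#   filterText_demo(lines, 'removeLink', 'facebook')
#
# should return one tab-joined string of cleaned tokens per input line,
# assuming the helper filters behave as their names suggest.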