# --- Keyword loading --------------------------------------------------------
# Each line of keywords.txt is "term|type"; terms are bucketed by type into
# entity / location / action lists, then compiled into regexes via the util
# helpers. Unknown types are reported and skipped.
entities = []
locations = []  # BUG FIX: was never initialized; first 'location' term raised NameError
actions = []
with codecs.open("keywords.txt", encoding='utf-8') as keyword_fil:
    for line in keyword_fil:
        spl = line.strip().split("|")
        type_of_term = spl[1]
        if type_of_term == 'entity':
            entities.append(spl[0])
        elif type_of_term == 'location':
            locations.append(spl[0])
        elif type_of_term == 'action':
            actions.append(spl[0])
        else:
            # single-arg print() is behavior-identical under Py2 and Py3
            print('TYPE WRONG!!')

loc_regex = get_regexes(locations)
entities_regex = get_regexes(entities)
actions_regex = get_regexes(actions)

# Terms to be ignored outright during matching.
with open("ignore.txt") as ignore_fil:
    to_ignore = [ig.strip() for ig in ignore_fil]
ignore_regex = get_regex_from_array(to_ignore)

# Vieweg censor list — presumably terms excluded per the Vieweg coding
# scheme; TODO confirm with the file's author.
with open("vieweg_censor.txt") as censor_fil:
    vieweg_censor = [c.strip() for c in censor_fil]
censor_regex = get_regex_from_array(vieweg_censor)

ush_counter = Counter()
found_tweets = 0
from util import get_tweet, EARTHQUAKE_TWEET_TIME, get_regexes, get_from_regexes
from collections import Counter
import codecs
import re
import sys
# NOTE(review): get_regex_from_array is called elsewhere in this file but is
# not imported here — confirm it exists in util and add it to the import.

# --- Tweet-likelihood streaming ---------------------------------------------
# Streams a tab-separated tweet dump and periodically flushes per-term counts
# to rolling numbered CSV files ("old_tweet_likelihood_<n>.csv").
tweet_file = codecs.open(
    "/Users/kjoseph/eclipse_workspace/InfoSocial/ordered_w_user.tab",
    'r', 'utf-8')
tweet_out_fil = "/Users/kjoseph/eclipse_workspace/"\
                "InfoSocial/old_train_out/old_tweet_likelihood_"

# Load the keyword set used to match "old" tweets.
in_fil = codecs.open("keywords_for_old.txt", encoding='utf-8')
ush_terms = set([line.strip() for line in in_fil])
in_fil.close()  # BUG FIX: was `in_fil.close` — attribute access, never actually called
regexes = get_regexes(ush_terms)

ush_counter = Counter()
found_tweets = 1
last_dt = ""
output_file = codecs.open(tweet_out_fil + "1.csv", "w", encoding='utf-8')
n_outfil = 2
i = 0
for line in tweet_file:
    i += 1
    # Every 1M matched tweets, flush term counts to the current CSV and
    # rotate to a new numbered output file.
    # NOTE(review): found_tweets is never incremented in the code visible
    # here — presumably updated further down in the loop body; confirm.
    if found_tweets % 1000000 == 0 and len(ush_counter) > 0:
        # single-arg print() is behavior-identical under Py2 and Py3
        print(last_dt)
        for u, v in ush_counter.most_common():
            output_file.write(u + "," + str(v) + "," + str(found_tweets) + "\n")
        output_file.close()
        ush_counter = Counter()
        output_file = codecs.open(tweet_out_fil + str(n_outfil) + ".csv",
                                  "w", encoding='utf-8')
        n_outfil += 1