# Example #1
entities = []
actions = []
for line in codecs.open("keywords.txt",encoding='utf-8'):
	spl = line.strip().split("|")
	type_of_term = spl[1]

	if type_of_term == 'entity':
		entities.append(spl[0])
	elif type_of_term == 'location':
		locations.append(spl[0])
	elif type_of_term == 'action':
		actions.append(spl[0])
	else:
		print 'TYPE WRONG!!'

loc_regex = get_regexes(locations)
entities_regex = get_regexes(entities)
actions_regex = get_regexes(actions)

# Terms to skip during matching, one per line in ignore.txt.
# FIX: use `with` so the handle is closed (original leaked the open file).
with open("ignore.txt") as ignore_fil:
	to_ignore = [line.strip() for line in ignore_fil]
ignore_regex = get_regex_from_array(to_ignore)

# Censor terms loaded from vieweg_censor.txt, one per line.
# FIX: use `with` so the handle is closed (original leaked the open file).
with open("vieweg_censor.txt") as censor_fil:
	vieweg_censor = [line.strip() for line in censor_fil]
censor_regex = get_regex_from_array(vieweg_censor)

# (Removed dead initializations of ush_counter / found_tweets: they
# executed before `Counter` was imported below — a NameError at module
# load — and both variables are re-initialized further down before any
# read, so deleting them fixes the error without changing behavior.)
from util import get_tweet, EARTHQUAKE_TWEET_TIME,get_regexes,get_from_regexes
from collections import Counter
import codecs, re,sys

# Input: tab-separated tweet records, one per line (absolute local path).
tweet_file = codecs.open("/Users/kjoseph/eclipse_workspace/InfoSocial/ordered_w_user.tab",'r','utf-8')
# Prefix for the numbered per-chunk output CSVs ("<prefix>N.csv").
tweet_out_fil = "/Users/kjoseph/eclipse_workspace/"\
				"InfoSocial/old_train_out/old_tweet_likelihood_"

# Search terms, one per line (presumably Ushahidi terms given the
# "ush_" naming — TODO confirm).
in_fil = codecs.open("keywords_for_old.txt",encoding='utf-8')
ush_terms = set([line.strip() for line in in_fil])
in_fil.close()  # BUG FIX: was `in_fil.close` (no parens) — the handle was never closed

regexes = get_regexes(ush_terms)

# Running state for the scan over tweet_file below.
found_tweets = 1          # tweet counter used for chunk rotation
i = 0                     # raw input-line counter
last_dt = ""              # last seen datetime string, printed at each flush
ush_counter = Counter()   # per-chunk term frequencies

# Open the first output chunk; later chunks are numbered via n_outfil.
n_outfil = 2
output_file = codecs.open(tweet_out_fil + "1.csv", "w", encoding='utf-8')
# Stream over the tweet file, periodically flushing term counts to a new
# numbered CSV chunk.  NOTE(review): the loop body continues past this
# excerpt — found_tweets and ush_counter are presumably updated there;
# confirm against the full file.
for line in tweet_file:
	i+=1
	# Rotate output: once per million tweets (and only if any terms were
	# counted), dump "term,count,found_tweets" rows and open the next chunk.
	if found_tweets % 1000000 == 0 and len(ush_counter) >0:
		print last_dt
		for u,v in ush_counter.most_common():
			output_file.write(u + "," + str(v) + ","+ str(found_tweets) + "\n")
		output_file.close()
		ush_counter=Counter()
		output_file = codecs.open(tweet_out_fil+str(n_outfil)+".csv",
								  "w",encoding='utf-8')
		n_outfil+=1