Example #1
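The blocks below are loop bodies over a file of raw tweet lines; the setup they rely on is not shown. A minimal sketch of the assumed state, with hypothetical names (tweet_in_fil, the Mongo database/collection) inferred from how the code uses them; out_fil, tweet_out_fil, the cutoff times, and the regex lists are assumed to be defined earlier in the script:

import codecs
from collections import Counter
from pymongo import MongoClient

# Hypothetical setup, inferred from usage -- not from the original source
collection = MongoClient()["disaster"]["tweets"]   # assumed DB/collection names
found_tweets = 0    # tweets that matched at least one term
i = 0               # running tweet index, reused as the Mongo _id
ush_counter = Counter()
n_outfil = 0        # numeric suffix for the rotating per-term CSVs
output_file = codecs.open(tweet_out_fil + str(n_outfil) + ".csv",
                          "w", encoding="utf-8")
n_outfil += 1

for line in codecs.open(tweet_in_fil, encoding="utf-8"):  # hypothetical input file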
	# Parse the raw line into its components (see the get_tweet sketch below)
	lowercase_content, time_in_minutes, date_time, tweet_json = get_tweet(line)

	# Skip tweets that failed to parse
	if lowercase_content is None:
		continue

	# Append a trailing newline to make the regexes easier to write
	lowercase_content += "\n"

	# Only consider tweets up to the cutoff (the week after the disaster);
	# the input is time-ordered, so stop at the first tweet past it
	last_dt = date_time
	if date_time > time_to_break:
		break

	## Find all the terms using regexes
	ignore_int = get_from_single_regex(ignore_regex, lowercase_content)
	ins_loc = get_from_regexes(loc_regex, lowercase_content)
	ins_ent = get_from_regexes(entities_regex, lowercase_content)
	ins_act = get_from_regexes(actions_regex, lowercase_content)

	## Each list gets iterated again below, which is redundant but harmless
	if ins_loc or ins_ent or ins_act or ignore_int:
		# If any term matched, store the tweet in Mongo
		found_tweets += 1
		tweet_json['_id'] = i
		collection.insert_one(tweet_json)

		## For the results: write out which terms were found to a simple CSV
		for to_ig in ignore_int:
			write_out_tweet(out_fil, i, to_ig, "ignore")
		for z in ins_loc:
			write_out_tweet(out_fil, i, z, "location")
	i += 1
	## Every millionth matched tweet, dump the per-term counts and rotate
	## to a fresh output CSV
	if found_tweets % 1000000 == 0 and len(ush_counter) > 0:
		print(last_dt)
		for u, v in ush_counter.most_common():
			output_file.write(u + "," + str(v) + "," + str(found_tweets) + "\n")
		output_file.close()
		ush_counter = Counter()
		output_file = codecs.open(tweet_out_fil + str(n_outfil) + ".csv",
								  "w", encoding="utf-8")
		n_outfil += 1
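write_out_tweet is not defined in the snippet. A minimal sketch of what it presumably does, inferred from the call sites (file handle, tweet index, matched term, term category); the signature is an assumption:

# Hypothetical helper: append one CSV row per matched term
def write_out_tweet(out_fil, tweet_id, term, term_type):
	out_fil.write(str(tweet_id) + "," + term + "," + term_type + "\n")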

A second, simpler loop body follows: it only counts which terms matched, against a single regex list.

for line in codecs.open(tweet_in_fil, encoding="utf-8"):  # hypothetical input, as above
	# Parse the raw line; skip tweets that failed to parse
	lowercase_content, time_in_minutes, date_time, tweet_json = get_tweet(line)
	if lowercase_content is None:
		continue

	# Remember the timestamp and stop once past the earthquake cutoff
	last_dt = date_time
	if date_time > EARTHQUAKE_TWEET_TIME:
		break

	ins = get_from_regexes(regexes, lowercase_content)

	if ins:
		found_tweets += 1
		for int_term in ins:
			ush_counter[int_term] += 1
# Close the last rotated CSV and report totals
output_file.close()
print(found_tweets)
print(i)
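get_tweet, get_from_regexes, and get_from_single_regex are likewise defined elsewhere. A plausible sketch, assuming one JSON tweet per input line, Twitter's classic created_at timestamp format, and regex lists of (term, compiled pattern) pairs; all of this is inferred from usage rather than taken from the original:

import json
from datetime import datetime

def get_tweet(line):
	# Parse one JSON-encoded tweet; return Nones for lines that fail to parse
	try:
		tweet_json = json.loads(line)
		lowercase_content = tweet_json["text"].lower()
		date_time = datetime.strptime(tweet_json["created_at"],
									  "%a %b %d %H:%M:%S +0000 %Y")
		time_in_minutes = date_time.hour * 60 + date_time.minute
		return lowercase_content, time_in_minutes, date_time, tweet_json
	except (ValueError, KeyError):
		return None, None, None, None

def get_from_regexes(regexes, text):
	# Return every term whose pattern matches the text
	return [term for term, pattern in regexes if pattern.search(text)]

def get_from_single_regex(regex, text):
	# Return all non-overlapping matches of one combined pattern
	return regex.findall(text)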