#Bad tweet
if lowercase_content is None:
    continue

#To make the regexes easier to write
lowercase_content += "\n"

#Only considering tweets through the week after the disaster
last_dt = date_time
if date_time > time_to_break:
    break

##Find all the terms using regexes
ignore_int = get_from_single_regex(ignore_regex, lowercase_content)
ins_loc = get_from_regexes(loc_regex, lowercase_content)
ins_ent = get_from_regexes(entities_regex, lowercase_content)
ins_act = get_from_regexes(actions_regex, lowercase_content)

##This is kind of ugly, I'm going to check each one twice, but it's okay
if len(ins_loc) or len(ins_ent) or len(ins_act) or len(ignore_int):
    #If we found it, insert the tweet into mongo
    found_tweets += 1
    tweet_json['_id'] = i
    collection.insert(tweet_json)

    ##We'll use this for results...write out which terms were found to a simple csv
    for to_ig in ignore_int:
        write_out_tweet(out_fil, i, to_ig, "ignore")
    for z in ins_loc:
        write_out_tweet(out_fil, i, z, "location")
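##The regex helpers and csv writer above aren't defined in this excerpt; a
##minimal sketch of what they might look like, assuming each *_regex is a
##list of compiled patterns (all names and signatures here are assumptions,
##not the actual code)
import re

def get_from_single_regex(regex, text):
    #All non-overlapping matches for one compiled pattern
    return regex.findall(text)

def get_from_regexes(regexes, text):
    #Every term matched by any pattern in the list
    found = []
    for regex in regexes:
        found.extend(regex.findall(text))
    return found

def write_out_tweet(out_fil, tweet_id, term, term_type):
    #One csv row per (tweet id, matched term, category)
    out_fil.write(str(tweet_id) + "," + term + "," + term_type + "\n")

##e.g. a (hypothetical) location list might be
##loc_regex = [re.compile(r"\bport-au-prince\b"), re.compile(r"\bjacmel\b")]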
i += 1

#Periodically flush the term counts to a fresh csv and start a new counter
if found_tweets % 1000000 == 0 and len(ush_counter) > 0:
    print last_dt
    for u, v in ush_counter.most_common():
        output_file.write(u + "," + str(v) + "," + str(found_tweets) + "\n")
    output_file.close()
    ush_counter = Counter()
    output_file = codecs.open(tweet_out_fil + str(n_outfil) + ".csv", "w", encoding='utf-8')
    n_outfil += 1

lowercase_content, time_in_minutes, date_time, tweet_json = get_tweet(line)

#Bad tweet
if lowercase_content is None:
    continue

last_dt = date_time

#Stop once we pass the time of the earthquake
if date_time > EARTHQUAKE_TWEET_TIME:
    break

#Count each term matched in this tweet
ins = get_from_regexes(regexes, lowercase_content)
if len(ins) > 0:
    found_tweets += 1
    for int_term in ins:
        ush_counter[int_term] += 1

#After the loop, report totals
found_tweets = str(found_tweets)
output_file.close()
print found_tweets
print i
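##get_tweet isn't shown either; a rough sketch, assuming each input line is a
##tweet in Twitter's json format -- created_at and text are Twitter's field
##names, the rest (including what time_in_minutes means) is guesswork
import json
from datetime import datetime

def get_tweet(line):
    try:
        tweet_json = json.loads(line)
        #Twitter timestamps look like "Tue Jan 12 21:53:09 +0000 2010"
        date_time = datetime.strptime(tweet_json['created_at'],
                                      "%a %b %d %H:%M:%S +0000 %Y")
        lowercase_content = tweet_json['text'].lower()
        time_in_minutes = date_time.hour * 60 + date_time.minute
        return lowercase_content, time_in_minutes, date_time, tweet_json
    except (ValueError, KeyError):
        #Signal a bad tweet; the caller checks for None and continues
        return None, None, None, None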