コード例 #1
0
from util import get_regex_from_array,get_from_single_regex
from pymongo import MongoClient

# Scan every tweet in the "iscram" database and write the _id of each tweet
# whose lowercased content hits the censor regex, or contains the substring
# 'pray' or 'donat', to censored_tweets.txt (one id per line).

# Load the censor terms, one per line. The with-block closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open("vieweg_censor.txt") as censor_file:
	vieweg_censor = [line.strip() for line in censor_file]
# NOTE(review): get_regex_from_array lives in util; presumably it combines the
# terms into a single pattern -- confirm against util.
censor_regex = get_regex_from_array(vieweg_censor)

client = MongoClient()
collection = client["iscram"].tweets

found_tweets = 0
# The output file is opened only once setup has succeeded, and the with-block
# guarantees it is flushed and closed even if the scan raises part-way through.
with open("censored_tweets.txt", "w") as out_fil:
	for i, tweet in enumerate(collection.find(), 1):
		# Progress indicator for long scans. A parenthesised single-argument
		# print produces the same output under Python 2 and Python 3.
		if i % 100000 == 0:
			print(i)
		lowercase_content = tweet["content"].lower()
		# NOTE(review): the result is only tested for non-emptiness here;
		# presumably it is a collection of matches -- confirm against util.
		censor_int = get_from_single_regex(censor_regex, lowercase_content)
		# Plain substring tests: 'donat' hits any content containing it
		# (e.g. "donate", "donation").
		if len(censor_int) > 0 or 'pray' in lowercase_content or 'donat' in lowercase_content:
			found_tweets += 1
			out_fil.write(str(tweet["_id"]) + "\n")

print(found_tweets)
コード例 #2
0
		entities.append(spl[0])
	elif type_of_term == 'location':
		locations.append(spl[0])
	elif type_of_term == 'action':
		actions.append(spl[0])
	else:
		print 'TYPE WRONG!!'

# Build the matchers for each term category collected by the loop above.
# NOTE(review): get_regexes is defined outside this view; its return shape is
# not confirmed here.
loc_regex = get_regexes(locations)
entities_regex = get_regexes(entities)
actions_regex = get_regexes(actions)

# Terms to ignore, one per line. The with-block closes the file
# deterministically instead of leaking the handle.
with open("ignore.txt") as ignore_file:
	to_ignore = [line.strip() for line in ignore_file]
ignore_regex = get_regex_from_array(to_ignore)

# Censor terms, one per line, loaded the same way.
with open("vieweg_censor.txt") as censor_file:
	vieweg_censor = [line.strip() for line in censor_file]
censor_regex = get_regex_from_array(vieweg_censor)

# Initial state for the scan over tweet_file that follows.
ush_counter = Counter()
found_tweets = 0
i = 0
last_dt = ""
# timedelta's first positional argument is days, so this is the point 7 days
# after EARTHQUAKE_TWEET_TIME. Presumably the scan stops there -- confirm
# against the loop body.
time_to_break = EARTHQUAKE_TWEET_TIME + timedelta(days=7)
for line in tweet_file:
	i+=1
	if i % 100000 == 0:
		print last_dt