import math
from random import random

import tweetPreprocessor
from pymongo import MongoClient

client = MongoClient()
db = client.tweets
collect = db.test_collection


def spam_run():
    """Take a random collection of tweets that haven't yet been run through
    the spam detector and rate them, 10k at a time, as long as at least 10k
    tweets remain unrated."""
    docs_not_yet_included_in_this_run = collect.count(
        {'spam_rating': {'$exists': False}})
    if docs_not_yet_included_in_this_run >= 10000:
        tweet_ids = []
        tweet_texts = []
        for x in range(10000):
            if x % 100 == 0:
                print 'filled %r random tweets into bucket' % x
            # Skip a random number of documents (within the first fifth of
            # the matches) to approximate a random draw:
            docs_to_skip = math.floor(
                random() * docs_not_yet_included_in_this_run / 5)
            # Grab a single random tweet that doesn't have the spam_rating field:
            tweet = collect.find(
                {'spam_rating': {'$exists': False}}
            ).limit(1).skip(int(docs_to_skip)).next()
            tweet_texts.append(tweet['text'])
            tweet_ids.append(tweet['_id'])
        db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=10000)
        db1.identify_spam()
        db1.strip_and_lower_spam()
        spam_indices = db1.spam_indices
        spam_tweets = db1.spam_tweets_stripped_and_lowered
        # Use ids to write spam_rating back into the database:
        for tweet_index in range(len(tweet_ids)):
            # tweet_index is the index of the tweet in this script;
            # t_id is the _id of the tweet in MongoDB.
            t_id = tweet_ids[tweet_index]
            if tweet_index % 100 == 0:
                print 'inserted %r tweets BACK into mongo' % tweet_index
            rating = 1 if tweet_index in spam_indices else 0
            collect.update(
                {'_id': t_id},
                {'$set': {'spam_rating': rating, 'tweet_processor_version': 2}},
                False)
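# The skip()-based sampling in spam_run() issues one query per tweet and is
# only approximately random. A minimal alternative sketch, assuming
# MongoDB 3.2+ (which added the $sample aggregation stage); the collection
# and field names are the ones used above:
def sample_unrated_tweets(size=10000):
    """Draw `size` random tweets that don't have a spam_rating yet."""
    cursor = collect.aggregate([
        {'$match': {'spam_rating': {'$exists': False}}},
        {'$sample': {'size': size}},
    ])
    tweet_ids = []
    tweet_texts = []
    for tweet in cursor:
        tweet_ids.append(tweet['_id'])
        tweet_texts.append(tweet['text'])
    return tweet_ids, tweet_texts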
# Rate tweets in 5k batches until fewer than 5000 unrated tweets remain:
while True:
    query = {'$and': [{'spam_rating_5ktweetbatches': {'$exists': False}},
                      {'text': {'$exists': True}},
                      {'user': {'$exists': True}}]}
    if collect.count(query) >= 5000:
        found = collect.find(query).limit(5000)
        # Turn them into two lists: one of ids, one of tweet text:
        tweet_ids = []
        tweet_texts = []
        while found.alive:
            tweet = found.next()
            tweet_texts.append(tweet['text'])
            tweet_ids.append(tweet['_id'])
        db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=5000,
                                              sensitivity=.28)
        db1.identify_spam()
        db1.strip_and_lower_spam()
        spam_indices = db1.spam_indices
        spam_tweets = db1.spam_tweets_stripped_and_lowered
        # Use ids to write spam_rating back into the database:
        for tweet_index in range(len(tweet_ids)):
            # tweet_index is the index of the tweet in this script;
            # t_id is the _id of the tweet in MongoDB.
            t_id = tweet_ids[tweet_index]
            if tweet_index % 100 == 0:
                print 'finished %r tweets' % tweet_index
            rating = 1 if tweet_index in spam_indices else 0
            collect.update(
                {'_id': t_id},
                {'$set': {'spam_rating_5ktweetbatches': rating,
                          'tweet_processor_v2_5k_tweet_batches': 1}},
                False)
    else:
        # If there aren't at least 5000 tweets to work with, then break:
        break
import tweetPreprocessor
from pymongo import MongoClient

# Establish mongo info:
client = MongoClient()
db = client.tweets
collect = db.test_collection

found = collect.find()
etsy_ebay = []
counter = 0
while found.alive:
    counter += 1
    if counter % 100 == 0:
        print counter
    tweet = found.next()
    tweet_processed = tweetPreprocessor.singleTweet(tweet['text'])
    tweet_processed.strip_and_lower()
    # Collect tweets whose stripped, lowercased text mentions etsy or ebay:
    if 'etsy' in tweet_processed.tweet or 'ebay' in tweet_processed.tweet:
        etsy_ebay.append(tweet)

etsy_ebay_tweets = [tweet['text'] for tweet in etsy_ebay]
e_e = tweetPreprocessor.tweetDatabase(etsy_ebay_tweets)
e_e.common_twitter_handles.extend(['etsy', 'ebay'])
e_e.identify_spam()
e_e.strip_and_lower_spam()


def remove_non_ascii(text):
    """Replace any non-ASCII character with a space."""
    return ''.join([i if ord(i) < 128 else ' ' for i in text])


with open('etsy_ebay_spam.txt', 'w') as outfile:
    for x in e_e.spam_tweets_stripped_and_lowered:
        outfile.write(remove_non_ascii(x) + '\n')
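# remove_non_ascii() blanks out emoji and accented characters before writing.
# A minimal alternative sketch that preserves them instead, assuming the tweet
# text is unicode: write the file as UTF-8 with the standard library's
# encoding-aware io.open (the output filename here is hypothetical):
import io

with io.open('etsy_ebay_spam_utf8.txt', 'w', encoding='utf-8') as outfile:
    for x in e_e.spam_tweets_stripped_and_lowered:
        outfile.write(x + u'\n')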
import tweetPreprocessor
from pymongo import MongoClient

client = MongoClient()
db = client.tweets
collect = db.random_sample_remote_computer

# Run up to 100 batches of 10k tweets, but only while there are at least
# 10k unrated tweets left in the collection:
for x in range(100):
    if collect.count({'spam_rating': {'$exists': False}}) < 10000:
        break
    found = collect.find({'spam_rating': {'$exists': False}}).limit(10000)
    # Turn them into two lists: one of ids, one of tweet text:
    tweet_ids = []
    tweet_texts = []
    while found.alive:
        tweet = found.next()
        tweet_texts.append(tweet['text'])
        tweet_ids.append(tweet['_id'])
    db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=10000)
    db1.identify_spam()
    db1.strip_and_lower_spam()
    spam_indices = db1.spam_indices
    spam_tweets = db1.spam_tweets_stripped_and_lowered
    # Use ids to write spam_rating back into the database:
    for tweet_index in range(len(tweet_ids)):
        # tweet_index is the index of the tweet in this script;
        # t_id is the _id of the tweet in MongoDB.
        t_id = tweet_ids[tweet_index]
        if tweet_index % 100 == 0:
            print tweet_index
        rating = 1 if tweet_index in spam_indices else 0
        collect.update(
            {'_id': t_id},
            {'$set': {'spam_rating': rating, 'tweet_processor_version': 2}},
            False)
import tweetPreprocessor
from pymongo import MongoClient

client = MongoClient()
db = client.tweets
collect = db.sample_from_2009

# Keep rating tweets in 10k batches until none are left unrated:
while True:
    found = collect.find({'spam_rating': {'$exists': False}}).limit(10000)
    # Turn them into two lists: one of ids, one of tweet text:
    tweet_ids = []
    tweet_texts = []
    while found.alive:
        tweet = found.next()
        tweet_texts.append(tweet['text'])
        tweet_ids.append(tweet['_id'])
    if not tweet_ids:
        # No unrated tweets left, so stop:
        break
    db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=10000)
    db1.identify_spam()
    db1.strip_and_lower_spam()
    spam_indices = db1.spam_indices
    spam_tweets = db1.spam_tweets_stripped_and_lowered
    # Use ids to write spam_rating back into the database:
    for tweet_index in range(len(tweet_ids)):
        # tweet_index is the index of the tweet in this script;
        # t_id is the _id of the tweet in MongoDB.
        t_id = tweet_ids[tweet_index]
        if tweet_index % 100 == 0:
            print 'finished %r tweets' % tweet_index
        rating = 1 if tweet_index in spam_indices else 0
        collect.update(
            {'_id': t_id},
            {'$set': {'spam_rating': rating, 'tweet_processor_version': 2}},
            False)
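# The write-back loop above issues one update() per tweet, i.e. 10k round
# trips per batch. A minimal batching sketch, assuming PyMongo 3.x (which
# added bulk_write and UpdateOne); the variable names are the ones used above:
from pymongo import UpdateOne

ops = []
for tweet_index, t_id in enumerate(tweet_ids):
    rating = 1 if tweet_index in spam_indices else 0
    ops.append(UpdateOne(
        {'_id': t_id},
        {'$set': {'spam_rating': rating, 'tweet_processor_version': 2}}))
if ops:
    # ordered=False lets the server apply these independent updates in any order:
    collect.bulk_write(ops, ordered=False)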
print "NOT ENOUGH TWEETS THAT INCLUDE THE WORD %s, SKIPPING" % target_word del target_words[target_words.index(target_word)] continue found = db.test_collection.find( {'$text' : { '$search' : target_word}}).limit(10000) print 'Working on the word "%r"' % target_word target_word_data[target_word] = [] while found.alive == True: target_word_data[target_word].append(found.next()['text']) def remove_non_ascii(text): return ''.join([i if ord(i) < 128 else ' ' for i in text]) path = '/Users/ilya/Projects/twitter_spam/target_word_spam_data/' spam_tweets = {} for target_word in target_words: tweetsdb = tweetPreprocessor.tweetDatabase(target_word_data[target_word]) tweetsdb.identify_spam() tweetsdb.strip_and_lower_spam() spam_tweets[target_word] = tweetsdb.spam_tweets_stripped_and_lowered with open(path + 'v2_raw_spam_' + str(target_word), 'w') as outfile: for t in spam_tweets[target_word]: outfile.write(remove_non_ascii(t) + '\n') spam_percent = {} for target_word in target_words: spam_percent[target_word] = float(len(spam_tweets[target_word]))/len(target_word_data[target_word]) # Write spam percents to a masterfile: with open('v2_target_word_spam_summary.txt', 'a') as outfile: for k, v in spam_percent.items(): outfile.write(k + ',' + str(v) + '\n')