Example #1
# Assumed setup (mirrors the Mongo handles defined in the later snippets):
import math
from random import random

import tweetPreprocessor
from pymongo import MongoClient

client = MongoClient()
db = client.tweets
collect = db.test_collection


def spam_run():
    """Takes a random collection of tweets that haven't yet been run through the
    spam run number and runs them through the spam detector, 10k at a time, if 
    there are at least 10k that haven't yet been put through that run."""

    docs_not_yet_included_in_this_run = collect.count({'spam_rating': {'$exists': False}})
    if docs_not_yet_included_in_this_run >= 10000:
        tweet_ids = []
        tweet_texts = []
        for x in range(10000):
            if x % 100 == 0: print 'filled %r random tweets into bucket' % x
            # Skip a random distance into the unrated pool (capped at a fifth of it,
            # presumably to keep skip() cheap), then grab one tweet that doesn't
            # yet have the spam_rating field:
            docs_to_skip = math.floor(random() * docs_not_yet_included_in_this_run / 5)
            tweet = collect.find({'spam_rating': {'$exists': False}}).limit(1).skip(int(docs_to_skip)).next()
            tweet_texts.append(tweet['text'])
            tweet_ids.append(tweet['_id'])

        db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=10000)
        db1.identify_spam()
        db1.strip_and_lower_spam()
        spam_indices = set(db1.spam_indices)  # set: O(1) membership checks below
        spam_tweets = db1.spam_tweets_stripped_and_lowered

        # Use ids to write spam_rating back into the database:
        for tweet_index in range(len(tweet_ids)):  # tweet_index: position in the Python lists
            t_id = tweet_ids[tweet_index]  # t_id: the tweet's _id in MongoDB
            if tweet_index % 100 == 0: print 'inserted %r tweets BACK into mongo' % tweet_index
            if tweet_index in spam_indices:
                collect.update({'_id': t_id},
                               {'$set': {'spam_rating': 1, 'tweet_processor_version': 2}}, False)
            else:
                collect.update({'_id': t_id},
                               {'$set': {'spam_rating': 0, 'tweet_processor_version': 2}}, False)
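
# A hedged alternative sketch (not in the source): on MongoDB 3.2+ with
# PyMongo 3.x, a $sample aggregation stage draws the random batch server-side,
# replacing the skip()-based trick inside spam_run().
def sample_unrated(size=10000):
    pipeline = [{'$match': {'spam_rating': {'$exists': False}}},
                {'$sample': {'size': size}}]
    tweets = list(collect.aggregate(pipeline))
    return [t['_id'] for t in tweets], [t['text'] for t in tweets]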
Example #2
while True:
    if collect.count({'$and': [{'spam_rating_5ktweetbatches': {'$exists': False}},
                               {'text': {'$exists': True}},
                               {'user': {'$exists': True}}]}) >= 5000:
        found = collect.find({'$and': [{'spam_rating_5ktweetbatches': {'$exists': False}},
                                       {'text': {'$exists': True}},
                                       {'user': {'$exists': True}}]}).limit(5000)

        # Turn them into two lists: one of ids, one of tweet text:
        tweet_ids = []
        tweet_texts = []
        while found.alive:
            tweet = found.next()
            tweet_texts.append(tweet['text'])
            tweet_ids.append(tweet['_id'])

        db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=5000, sensitivity=0.28)
        db1.identify_spam()
        db1.strip_and_lower_spam()
        spam_indices = set(db1.spam_indices)  # set: O(1) membership checks below
        spam_tweets = db1.spam_tweets_stripped_and_lowered

        # Use ids to write spam_rating back into the database:
        for tweet_index in range(len(tweet_ids)):  # tweet_index: position in the Python lists
            t_id = tweet_ids[tweet_index]  # t_id: the tweet's _id in MongoDB
            if tweet_index % 100 == 0: print 'finished %d tweets' % tweet_index
            if tweet_index in spam_indices:
                collect.update({'_id': t_id},
                               {'$set': {'spam_rating_5ktweetbatches': 1,
                                         'tweet_processor_v2_5k_tweet_batches': 1}}, False)
            else:
                collect.update({'_id': t_id},
                               {'$set': {'spam_rating_5ktweetbatches': 0,
                                         'tweet_processor_v2_5k_tweet_batches': 1}}, False)

    else:
        # If there aren't at least 5000 tweets to work with, then stop:
        break
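
# A hedged alternative (not in the source) to the per-tweet update() loop
# above; assumes PyMongo >= 2.7, whose bulk API batches the writes into far
# fewer round trips.
def write_ratings_bulk(tweet_ids, spam_indices):
    bulk = collect.initialize_unordered_bulk_op()
    for tweet_index, t_id in enumerate(tweet_ids):
        rating = 1 if tweet_index in spam_indices else 0
        bulk.find({'_id': t_id}).update_one({'$set': {
            'spam_rating_5ktweetbatches': rating,
            'tweet_processor_v2_5k_tweet_batches': 1}})
    bulk.execute()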
Example #3
from random import random

import tweetPreprocessor
from pymongo import MongoClient

# Establish mongo info:
client = MongoClient()
db = client.tweets
collect = db.test_collection
found = collect.find()

etsy_ebay = []
counter = 0
while found.alive:
    counter += 1
    if counter % 100 == 0: print counter
    tweet = found.next()
    tweet_processed = tweetPreprocessor.singleTweet(tweet['text'])
    tweet_processed.strip_and_lower()
    if 'etsy' in tweet_processed.tweet or 'ebay' in tweet_processed.tweet:
        etsy_ebay.append(tweet)

etsy_ebay_tweets = [tweet['text'] for tweet in etsy_ebay]

e_e = tweetPreprocessor.tweetDatabase(etsy_ebay_tweets)
e_e.common_twitter_handles.extend(['etsy', 'ebay'])
e_e.identify_spam()
e_e.strip_and_lower_spam()


def remove_non_ascii(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with open('etsy_ebay_spam.txt', 'w') as outfile:
    for x in e_e.spam_tweets_stripped_and_lowered:
        outfile.write(remove_non_ascii(x) + '\n')
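
# Not in the source, but mirrors the spam_percent ratio computed in the last
# example: the share of etsy/ebay tweets the detector flagged as spam.
if etsy_ebay_tweets:
    spam_share = float(len(e_e.spam_tweets_stripped_and_lowered)) / len(etsy_ebay_tweets)
    print 'etsy/ebay spam share: %.3f' % spam_share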
Example #4
client = MongoClient()
db = client.tweets
collect = db.random_sample_remote_computer

# Process tweets in batches of up to 10k, for at most 100 passes:
for x in range(100):
    found = collect.find({'spam_rating': {'$exists': False}}).limit(10000)

    # Turn them into two lists: one of ids, one of tweet text:
    tweet_ids = []
    tweet_texts = []
    while found.alive:
        tweet = found.next()
        tweet_texts.append(tweet['text'])
        tweet_ids.append(tweet['_id'])
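
    # A hedged addition (not in the source): stop early once a pass finds
    # nothing left to rate, instead of always running all 100 passes.
    if not tweet_ids:
        break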

    db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=10000)
    db1.identify_spam()
    db1.strip_and_lower_spam()
    spam_indices = set(db1.spam_indices)  # set: O(1) membership checks below
    spam_tweets = db1.spam_tweets_stripped_and_lowered

    # Use ids to write spam_rating back into the database:
    for tweet_index in range(len(tweet_ids)):  # tweet_index: position in the Python lists
        t_id = tweet_ids[tweet_index]  # t_id: the tweet's _id in MongoDB
        if tweet_index % 100 == 0: print tweet_index
        if tweet_index in spam_indices:
            collect.update({'_id': t_id},
                           {'$set': {'spam_rating': 1, 'tweet_processor_version': 2}}, False)
        else:
            collect.update({'_id': t_id},
                           {'$set': {'spam_rating': 0, 'tweet_processor_version': 2}}, False)
Example #5
client = MongoClient()
db = client.tweets
collect = db.sample_from_2009

# Pick up tweets in batches of up to 10k until none remain:
while True:
    found = collect.find({'spam_rating': {'$exists': False}}).limit(10000)

    # Turn them into two lists: one of ids, one of tweet text:
    tweet_ids = []
    tweet_texts = []
    while found.alive:
        tweet = found.next()
        tweet_texts.append(tweet['text'])
        tweet_ids.append(tweet['_id'])

    # Without this guard the outer loop would never terminate once every tweet is rated:
    if not tweet_ids:
        break

    db1 = tweetPreprocessor.tweetDatabase(tweet_texts, batch_size=10000)
    db1.identify_spam()
    db1.strip_and_lower_spam()
    spam_indices = set(db1.spam_indices)  # set: O(1) membership checks below
    spam_tweets = db1.spam_tweets_stripped_and_lowered

    # Use ids to write spam_rating back into the database:
    for tweet_index in range(len(tweet_ids)):  # tweet_index: position in the Python lists
        t_id = tweet_ids[tweet_index]  # t_id: the tweet's _id in MongoDB
        if tweet_index % 100 == 0: print 'finished %d tweets' % tweet_index
        if tweet_index in spam_indices:
            collect.update({'_id': t_id},
                           {'$set': {'spam_rating': 1, 'tweet_processor_version': 2}}, False)
        else:
            collect.update({'_id': t_id},
                           {'$set': {'spam_rating': 0, 'tweet_processor_version': 2}}, False)

Example #6
# Measure the spam rate among tweets containing each target word. Assumes
# `target_words` (a list of words) is defined upstream, the Mongo setup above
# is in place, and a text index exists on db.test_collection.
target_word_data = {}
for target_word in list(target_words):  # iterate over a copy: the del below mutates the list
    # Threshold assumed to mirror the limit(10000) used below:
    if db.test_collection.count({'$text': {'$search': target_word}}) < 10000:
        print "NOT ENOUGH TWEETS THAT INCLUDE THE WORD %s, SKIPPING" % target_word
        del target_words[target_words.index(target_word)]
        continue
    found = db.test_collection.find( {'$text' : { '$search' : target_word}}).limit(10000)
    print 'Working on the word "%s"' % target_word
    target_word_data[target_word] = []
    while found.alive:
        target_word_data[target_word].append(found.next()['text'])


def remove_non_ascii(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])


path = '/Users/ilya/Projects/twitter_spam/target_word_spam_data/'

spam_tweets = {}
for target_word in target_words:
    tweetsdb = tweetPreprocessor.tweetDatabase(target_word_data[target_word])
    tweetsdb.identify_spam()
    tweetsdb.strip_and_lower_spam()
    spam_tweets[target_word] = tweetsdb.spam_tweets_stripped_and_lowered
    with open(path + 'v2_raw_spam_' + str(target_word), 'w') as outfile:
        for t in spam_tweets[target_word]:
            outfile.write(remove_non_ascii(t) + '\n')

spam_percent = {}
for target_word in target_words:
    spam_percent[target_word] = float(len(spam_tweets[target_word])) / len(target_word_data[target_word])

# Write spam percents to a masterfile:
with open('v2_target_word_spam_summary.txt', 'a') as outfile:
    for k, v in spam_percent.items():
        outfile.write(k + ',' + str(v) + '\n')
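
# A follow-up sketch (not in the source): read the summary file back and rank
# the target words by spam share.
with open('v2_target_word_spam_summary.txt') as infile:
    rows = [line.strip().split(',') for line in infile if line.strip()]
for word, share in sorted(rows, key=lambda r: float(r[1]), reverse=True):
    print '%s: %s' % (word, share)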