Example #1
 def test_create_exist_twitter_warc(self):
     """
     crawling a exist twitter warc will replace new
     """
     wc.create_twitter_warc(TWITTER_HTML)
     os.chdir("..")
     self.assertTrue(os.path.isfile(WARC_TWITTER_DIRECTORY +
                                    "/https:__twitter.com_wesbos_status_" +
                                    "519123918422958081.warc.gz"))
Example #2
 def test_create_wrong_url_twitter_warc(self):
     """
     crawling a wrong twitter article should be going to success because
     there is 404 page
     """
     try:
         os.chdir("..")
         os.remove(WARC_TWITTER_DIRECTORY +
                   "/https:__twitter.com_wesbos_statu.warc.gz")
     except OSError:
         pass
     self.setUp()
     wc.create_twitter_warc(WRONG_TWITTER_HTML)
     os.chdir("..")
     time.sleep(1)
     self.assertTrue(os.path.isfile(WARC_TWITTER_DIRECTORY +
                                    "/https:__twitter.com_wesbos_statu" +
                                    ".warc.gz"))
Example #3
 def test_create_twitter_warc(self):
     """
     create a real twitter url warc should work
     """
     try:
         os.chdir("..")
         os.remove(WARC_TWITTER_DIRECTORY +
                   "/https:__twitter.com_wesbos_status_" +
                   "519123918422958081.warc.gz")
     except OSError:
         pass
     self.setUp()
     wc.create_twitter_warc(TWITTER_HTML)
     os.chdir("..")
     time.sleep(1)
     self.assertTrue(os.path.isfile(WARC_TWITTER_DIRECTORY +
                                    "/https:__twitter.com_wesbos_status_" +
                                    "519123918422958081.warc.gz"))
Example #4
def parse_tweets(twitter_users, keywords, foreign_sites, tweet_number):
    """ (list of str, list of str, list of str, str) -> none
    Parses through tweets of users, looking for keywords and foreign sites.
    Relevant tweets will be sent to a database.

    Keyword arguments:
    twitter_users   -- List of strings as twitter handles
    keywords        -- List of strings as keywords to search for
    foreign_sites   -- List of strings as sources to search for
    tweet_number    -- Number of tweets to fetch per user
    """
    config = configuration()['storage']
    django.setup()
    added, updated, no_match = 0, 0, 0
    start = time.time()

    for user in twitter_users:
        # Check for any new command on communication stream
        check_command()
        processed = 0
        tweets = get_tweets(user, tweet_number)
        tweet_followers = get_follower_count(user)
        tweet_count = len(tweets)

        for tweet in tweets:
            # Check for any new command on communication stream
            check_command()


            # setting correct data for each field
            tweet_id = tweet.id
            tweet_date = timezone.localtime(
                timezone.make_aware(tweet.created_at,
                                    timezone=timezone.get_fixed_timezone(180)))
            tweet_user = tweet.user.screen_name
            tweet_store_date = timezone.localtime(timezone.now())
            tweet_keywords = get_keywords(tweet, keywords)
            tweet_sources = get_sources(tweet, foreign_sites)
            tweet_text = tweet.text

            if not(tweet_keywords == [] and tweet_sources == []):

                tweet_list = Tweet.objects.filter(tweet_id=tweet_id)
                if (not tweet_list):
                    # creating new entry in collection
                    tweet = Tweet(tweet_id=tweet_id, user=tweet_user,
                                  date_added=tweet_store_date,
                                  date_published=tweet_date,
                                  followers=tweet_followers, text=tweet_text)
                    tweet.save()

                    tweet = Tweet.objects.get(tweet_id=tweet_id)

                    for key in tweet_keywords:
                        tweet.keyword_set.create(keyword=key)

                    for source in tweet_sources:
                        tweet.source_set.create(url=source[0],
                                                url_origin=source[1])

                    added += 1

                else:

                    tweet = tweet_list[0]
                    tweet.text = tweet_text
                    tweet.tweet_id = tweet_id
                    tweet.user = tweet_user
                    # tweet.date_added = tweet_store_date
                    tweet.date_published = tweet_date
                    tweet.followers = tweet_followers
                    tweet.save()

                    for key in tweet_keywords:
                        if not T_keyword.objects.filter(keyword=key):
                            tweet.keyword_set.create(keyword=key)

                    for source in tweet_sources:
                        if not Source.objects.filter(url=source[0]):
                            tweet.source_set.create(
                                url=source[0], url_origin=source[1])
                    updated += 1

                warc_creator.create_twitter_warc(
                    'https://twitter.com/' + tweet.user + '/status/' +
                    str(tweet_id))
            else:
                no_match += 1
            processed += 1
            sys.stdout.write("%s (Twitter|%s) %i/%i          \r" %
                             (str(timezone.localtime(timezone.now()))[:-13],
                              user, processed, tweet_count))
            sys.stdout.flush()
        print format("%s (Twitter|%s) %i/%i          " % (
            str(timezone.localtime(timezone.now()))[:-13], user, processed,
            tweet_count))
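A minimal usage sketch for the parse_tweets above. The handles, keywords and site list are illustrative only, and the call assumes the surrounding project has already configured Django settings and the helpers (configuration, get_tweets, check_command) that parse_tweets relies on:

# Illustrative invocation only -- the sample values below are not from the
# original project, and tweet_number is assumed to be a per-user fetch limit.
parse_tweets(twitter_users=["wesbos"],
             keywords=["javascript", "css"],
             foreign_sites=["example.com"],
             tweet_number=50)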
Example #5
def parse_tweets(twitter_users, keywords, source_sites, tweet_number, source_twitter_list):
    """ (list of str, list of str, list of str, str) -> none
    Parses through tweets of users, looking for keywords and foreign sites.
    Relevant tweets will be sent to a database.

    Keyword arguments:
    twitter_users   -- List of strings as twitter handles
    keywords        -- List of strings as keywords to search for
    foreign_sites   -- List of strings as sources to search for
    db_name         -- String of Database
    """
    config = configuration()["storage"]
    django.setup()
    added, updated, no_match = 0, 0, 0
    start = time.time()

    for user in twitter_users:
        # Check for any new command on communication stream
        check_command()
        processed = 0
        tweets = get_tweets(user, tweet_number)
        tweet_followers = get_follower_count(user)
        tweet_count = len(tweets)
        for i in range(tweet_count):
            tweet = tweets[i]
            try:
                # Check for any new command on communication stream
                check_command()
            except (KeyboardInterrupt, SystemExit):
                raise

            # setting correct data for each field
            tweet_id = tweet.id
            tweet_date = timezone.localtime(
                timezone.make_aware(tweet.created_at, timezone=timezone.get_fixed_timezone(180))
            )
            tweet_user = tweet.user.screen_name
            tweet_store_date = timezone.localtime(timezone.now())
            tweet_keywords = get_keywords(tweet.text, keywords)
            tweet_sources = get_source_sites(tweet.entities["urls"], source_sites)
            twitter_accounts = get_sources_twitter(tweet.text, source_twitter_list)
            tweet_text = tweet.text

            if not (tweet_keywords == [] and tweet_sources[0] == [] and twitter_accounts[0] == []):
                retweet_count = tweet.retweet_count
                favorite_count = tweet.favorite_count

                tweet_list = Tweet.objects.filter(tweet_id=tweet_id)
                if not tweet_list:
                    # creating new entry in collection
                    tweet = Tweet(
                        tweet_id=tweet_id,
                        name=tweet_user,
                        date_added=tweet_store_date,
                        date_published=tweet_date,
                        text=tweet_text,
                    )
                    tweet.save()

                    tweet = Tweet.objects.get(tweet_id=tweet_id)

                    tweet.countlog_set.create(
                        retweet_count=retweet_count, favorite_count=favorite_count, date=tweet_store_date
                    )

                    for account in twitter_accounts[0]:
                        tweet.sourcetwitter_set.create(name=account, matched=True)

                    for account in twitter_accounts[1]:
                        tweet.sourcetwitter_set.create(name=account, matched=False)

                    for key in tweet_keywords:
                        tweet.keyword_set.create(name=key)

                    for source in tweet_sources[0]:
                        tweet.sourcesite_set.create(url=source[0], domain=source[1], matched=True)
                    for source in tweet_sources[1]:
                        tweet.sourcesite_set.create(url=source[0], domain=source[1], matched=False)

                    added += 1

                else:

                    tweet = tweet_list[0]
                    tweet.text = tweet_text
                    tweet.tweet_id = tweet_id
                    tweet.name = tweet_user
                    # tweet.date_added = tweet_store_date
                    tweet.date_published = tweet_date
                    tweet.save()

                    if not CountLog.objects.filter(retweet_count=retweet_count, favorite_count=favorite_count):
                        tweet.countlog_set.create(
                            retweet_count=retweet_count, favorite_count=favorite_count, date=tweet_store_date
                        )

                    for key in tweet_keywords:
                        if not TwitterKeyword.objects.filter(name=key):
                            tweet.keyword_set.create(name=key)

                    for source in tweet_sources[0]:
                        if not TwitterSourceSite.objects.filter(url=source[0]):
                            tweet.sourcesite_set.create(url=source[0], domain=source[1], matched=True)

                    for source in tweet_sources[1]:
                        if not TwitterSourceSite.objects.filter(url=source[0]):
                            tweet.sourcesite_set.create(url=source[0], domain=source[1], matched=False)

                    for account in twitter_accounts[0]:
                        if not TwitterSourceTwitter.objects.filter(name=account):
                            tweet.sourcetwitter_set.create(name=account, matched=True)

                    for account in twitter_accounts[1]:
                        if not TwitterSourceTwitter.objects.filter(name=account):
                            tweet.sourcetwitter_set.create(name=account, matched=False)

                    updated += 1

                warc_creator.create_twitter_warc("https://twitter.com/" + tweet.name + "/status/" + str(tweet_id))
            else:
                no_match += 1
            processed += 1
            print (
                "%s (Twitter|%s) %i/%i          \r"
                % (str(timezone.localtime(timezone.now()))[:-13], user, processed, tweet_count)
            )
            tweets[i] = None
        print(
            "%s (Twitter|%s) %i/%i          "
            % (str(timezone.localtime(timezone.now()))[:-13], user, processed, tweet_count)
        )
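Example #5 above (and the process_tweet variants below) unpack the helper results as matched/unmatched pairs: tweet_sources[0] holds sources that matched the monitored list and tweet_sources[1] the rest, each entry being a (url, domain) tuple. A sketch of the shape get_source_sites appears to return, assuming tweepy-style url entities with an "expanded_url" key and source_sites holding bare domains; the body below is illustrative, not the project's actual helper:

def get_source_sites(urls, source_sites):
    # Illustrative only: split a tweet's expanded urls into those whose domain
    # is on the monitored source_sites list (matched) and the rest (unmatched),
    # each as (url, domain) tuples, matching how the examples above consume them.
    matched, unmatched = [], []
    for entry in urls:
        url = entry.get("expanded_url", "") if isinstance(entry, dict) else entry
        domain = url.split("//")[-1].split("/")[0]
        (matched if domain in source_sites else unmatched).append((url, domain))
    return matched, unmatched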
Example #6
def process_tweet(tweet, keywords, source_sites, source_accounts):
    """
    Checks whether the given tweet matches the scope.
    """
    user, tweet_text, tweet_id, tweet_date = tweet.user, tweet.text, tweet.tweet_id, tweet.date
    tweet_store_date = timezone.localtime(timezone.now())
    tweet_keywords = get_keywords(tweet_text, keywords)
    tweet_sources = get_source_sites(tweet.urls, source_sites)
    twitter_accounts = get_source_twitter(tweet.mentions, source_accounts)
    retweet_count, favorite_count = tweet.retweet_count, tweet.favorite_count

    if len(tweet_text) > 450:
        try:
            tweet_text = tweet_text[:450]
        except:
            return NO_MATCH

    # finds match
    if tweet_keywords or tweet_sources[0] or twitter_accounts[0]:
        existing_tweets = Tweet.objects.filter(tweet_id=tweet_id)

        if not existing_tweets:
            tweet = Tweet(tweet_id=tweet_id,
                          name=user,
                          date_added=tweet_store_date,
                          date_published=tweet_date,
                          text=tweet_text)
            tweet.save()
            tweet = Tweet.objects.get(tweet_id=tweet_id)
            tweet.countlog_set.create(retweet_count=retweet_count,
                                      favorite_count=favorite_count,
                                      date=tweet_store_date)

            for account in twitter_accounts[0]:
                tweet.sourcetwitter_set.create(name=account, matched=True)
            for account in twitter_accounts[1]:
                tweet.sourcetwitter_set.create(name=account, matched=False)
            for key in tweet_keywords:
                tweet.keyword_set.create(name=key)
            for source in tweet_sources[0]:
                tweet.sourcesite_set.create(url=source[0],
                                            domain=source[1],
                                            matched=True)
            for source in tweet_sources[1]:
                tweet.sourcesite_set.create(url=source[0],
                                            domain=source[1],
                                            matched=False)
            try:
                warc_creator.create_twitter_warc('https://twitter.com/' +
                                                 tweet.name + '/status/' +
                                                 str(tweet_id))
                # adjustable, give time for warc creation and avoids using too many resources
                time.sleep(3)
            except:
                print("Warc error at {}.{}".format(user, tweet_id))
                logging.error("Warc error at {}.{}".format(user, tweet_id))

            return ADDED

        else:
            tweet = existing_tweets[0]
            if not tweet.countlog_set.filter(retweet_count=retweet_count,
                                             favorite_count=favorite_count):
                tweet.countlog_set.create(retweet_count=retweet_count,
                                          favorite_count=favorite_count,
                                          date=tweet_store_date)

            for key in tweet_keywords:
                if not tweet.keyword_set.filter(name=key):
                    tweet.keyword_set.create(name=key)

            for source in tweet_sources[0]:
                if not tweet.sourcesite_set.filter(url=source[0]):
                    tweet.sourcesite_set.create(url=source[0],
                                                domain=source[1],
                                                matched=True)

            for source in tweet_sources[1]:
                if not tweet.sourcesite_set.filter(url=source[0]):
                    tweet.sourcesite_set.create(url=source[0],
                                                domain=source[1],
                                                matched=False)

            for account in twitter_accounts[0]:
                if not tweet.sourcetwitter_set.filter(name=account):
                    tweet.sourcetwitter_set.create(name=account, matched=True)

            for account in twitter_accounts[1]:
                if not tweet.sourcetwitter_set.filter(name=account):
                    tweet.sourcetwitter_set.create(name=account, matched=False)
            return UPDATED
    return NO_MATCH
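A sketch of how a caller might tally the return codes of the process_tweet above, assuming ADDED, UPDATED and NO_MATCH are distinct module-level constants and get_tweets behaves as in Example #5; the driver loop and sample values are illustrative only:

# Illustrative driver loop -- the sample values are not from the original
# project; ADDED, UPDATED, NO_MATCH and get_tweets are assumed to be defined
# in the same module as process_tweet.
keywords, source_sites, source_accounts = ["election"], ["example.com"], ["some_account"]
added, updated, no_match = 0, 0, 0
for tweet in get_tweets("wesbos", 50):
    result = process_tweet(tweet, keywords, source_sites, source_accounts)
    if result == ADDED:
        added += 1
    elif result == UPDATED:
        updated += 1
    else:
        no_match += 1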
Example #7
def process_tweet(tweet, keywords, source_sites, source_accounts):
    """
    Checks whether the given tweet matches the scope.
    """
    user, tweet_text, tweet_id, tweet_date = tweet.user, tweet.text, tweet.tweet_id, tweet.date
    tweet_store_date = timezone.localtime(timezone.now())    
    tweet_keywords = get_keywords(tweet_text, keywords)
    tweet_sources = get_source_sites(tweet.urls, source_sites)
    twitter_accounts = get_source_twitter(tweet.mentions, source_accounts)
    retweet_count, favorite_count = tweet.retweet_count, tweet.favorite_count

    if len(tweet_text) > 450:
        try:
            tweet_text = tweet_text[:450]
        except:
            return NO_MATCH

    # finds match
    if tweet_keywords or tweet_sources[0] or twitter_accounts[0]:
        existing_tweets = Tweet.objects.filter(tweet_id=tweet_id)

        if not existing_tweets:
            tweet = Tweet(tweet_id=tweet_id,
                          name=user,
                          date_added=tweet_store_date,
                          date_published=tweet_date,
                          text=tweet_text)
            tweet.save()
            tweet = Tweet.objects.get(tweet_id=tweet_id)
            tweet.countlog_set.create(retweet_count=retweet_count,
                                      favorite_count=favorite_count,
                                      date=tweet_store_date)

            for account in twitter_accounts[0]:
                tweet.sourcetwitter_set.create(name=account, matched=True)
            for account in twitter_accounts[1]:
                tweet.sourcetwitter_set.create(name=account, matched=False)
            for key in tweet_keywords:
                tweet.keyword_set.create(name=key)
            for source in tweet_sources[0]:
                tweet.sourcesite_set.create(url=source[0], domain=source[1], matched=True)
            for source in tweet_sources[1]:
                tweet.sourcesite_set.create(url=source[0], domain=source[1], matched=False)
            try:
                warc_creator.create_twitter_warc(
                    'https://twitter.com/' + tweet.name + '/status/' + str(tweet_id))
                # adjustable, give time for warc creation and avoids using too many resources
                time.sleep(3)
            except:
                print("Warc error at {}.{}".format(user, tweet_id))
                logging.error("Warc error at {}.{}".format(user, tweet_id))

            return ADDED

        else:
            tweet = existing_tweets[0]
            if not tweet.countlog_set.filter(retweet_count=retweet_count, favorite_count=favorite_count):
                tweet.countlog_set.create(retweet_count=retweet_count,
                                          favorite_count=favorite_count,
                                          date=tweet_store_date)

            for key in tweet_keywords:
                if not tweet.keyword_set.filter(name=key):
                    tweet.keyword_set.create(name=key)

            for source in tweet_sources[0]:
                if not tweet.sourcesite_set.filter(url=source[0]):
                    tweet.sourcesite_set.create(
                        url=source[0], domain=source[1], matched=True)

            for source in tweet_sources[1]:
                if not tweet.sourcesite_set.filter(url=source[0]):
                    tweet.sourcesite_set.create(
                        url=source[0], domain=source[1], matched=False)

            for account in twitter_accounts[0]:
                if not tweet.sourcetwitter_set.filter(name=account):
                    tweet.sourcetwitter_set.create(name=account, matched=True)

            for account in twitter_accounts[1]:
                if not tweet.sourcetwitter_set.filter(name=account):
                    tweet.sourcetwitter_set.create(name=account, matched=False)
            return UPDATED
    return NO_MATCH