def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('screen_name_file')
    args = parser.parse_args()

    logger = get_console_info_logger()

    ACCESS_TOKEN = Twython(consumer_key, consumer_secret,
                           oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=ACCESS_TOKEN)

    timeline_crawler = CrawlTwitterTimelines(twython, logger)
    ff_finder = FindFriendFollowers(twython, logger)

    screen_names = get_screen_names_from_file(args.screen_name_file)

    for screen_name in screen_names:
        ff_screen_names = ff_finder.get_ff_screen_names_for_screen_name(
            screen_name)
        save_screen_names_to_file(ff_screen_names, "%s.ff" % screen_name,
                                  logger)

        for ff_screen_name in ff_screen_names:
            tweet_filename = "%s.tweets" % ff_screen_name
            if os.path.exists(tweet_filename):
                logger.info(
                    "File '%s' already exists - will not attempt to download Tweets for '%s'"
                    % (tweet_filename, ff_screen_name))
            else:
                try:
                    tweets = timeline_crawler.get_all_timeline_tweets_for_screen_name(
                        ff_screen_name)
                except TwythonError as e:
                    print "TwythonError: %s" % e
                    if e.error_code == 404:
                        logger.warn(
                            "HTTP 404 error - Most likely, Twitter user '%s' no longer exists"
                            % ff_screen_name)
                    elif e.error_code == 401:
                        logger.warn(
                            "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible"
                            % ff_screen_name)
                    else:
                        # Unhandled exception
                        raise e
                else:
                    save_tweets_to_json_file(tweets, tweet_filename)
def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('screen_name_file')
    args = parser.parse_args()

    logger = get_console_info_logger()

    ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=ACCESS_TOKEN)

    timeline_crawler = CrawlTwitterTimelines(twython, logger)
    ff_finder = FindFriendFollowers(twython, logger)

    screen_names = get_screen_names_from_file(args.screen_name_file)

    for screen_name in screen_names:
        ff_screen_names = ff_finder.get_ff_screen_names_for_screen_name(screen_name)
        save_screen_names_to_file(ff_screen_names, "%s.ff" % screen_name, logger)

        for ff_screen_name in ff_screen_names:
            tweet_filename = "%s.tweets" % ff_screen_name
            if os.path.exists(tweet_filename):
                logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, ff_screen_name))
            else:
                try:
                    tweets = timeline_crawler.get_all_timeline_tweets_for_screen_name(ff_screen_name)
                except TwythonError as e:
                    print "TwythonError: %s" % e
                    if e.error_code == 404:
                        logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % ff_screen_name)
                    elif e.error_code == 401:
                        logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % ff_screen_name)
                    else:
                        # Unhandled exception
                        raise e
                else:
                    save_tweets_to_json_file(tweets, tweet_filename)
Esempio n. 3
0
def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Parse and document command line options
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('-sn', dest='screen_name_file', default="example_screen_names.txt",
                   help='A text file with one screen name per line.')
    parser.add_argument('-t', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml",
                    help='A configuration file with Twitter API access tokens. See example_token_file.yaml.')
    parser.add_argument('-d', dest='depth', default=0,
                    help='Friend and follower depth. A value of 1 will gather all tweets for users \
                    in the file as well as all tweets from their friends and followers. Default is 0.')
    args = parser.parse_args()

    # Set up loggers and output directory
    logger = get_console_info_logger()
    output_directory = "data/" + datetime.datetime.now().isoformat() + "/"
    try:
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
    except:
        print "Could not create directory:", directory
        exit(0)
    logger.info("Created directory: %s" % output_directory)

    # Set up API access
    tokens = yaml.safe_load(open(args.token_file))
    ACCESS_TOKEN = Twython(tokens['consumer_key'], tokens['consumer_secret'], oauth_version=2).obtain_access_token()
    twython = Twython(tokens['consumer_key'], access_token=ACCESS_TOKEN)
    crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)

    # Gather unique screen names
    screen_names = get_screen_names_from_file(args.screen_name_file)
    depth = int(args.depth) # todo, validate args.depth
    unique_screen_names = []
    if depth > 0: # don't initiate ff_finder unless we have to
        ff_finder = FindFriendFollowers(twython, logger)
        ff_screen_names = get_ff(screen_names, depth, ff_finder, logger)
        unique_screen_names = set(ff_screen_names)
    else:
        unique_screen_names = set(screen_names) # assume the list has redundant names
    save_screen_names_to_file(unique_screen_names, output_directory + 'screen_names')

    # Gather tweets for each of the unique screen names
    for screen_name in unique_screen_names:
        tweet_filename = output_directory + screen_name + ".tweets"
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
        else:
            try:
                logger.info("Retrieving Tweets for user " + screen_name + " writing to file " + tweet_filename)
                tweets = crawler.get_data(screen_name=screen_name, count=200)
            except TwythonError as e:
                print "TwythonError: %s" % e
                if e.error_code == 404:
                    logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
                elif e.error_code == 401:
                    logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
                else:
                    # Unhandled exception
                    raise e
            else:
                save_tweets_to_json_file(tweets, tweet_filename)