Example 1
def query_user_page(url, retry=10, timeout=60):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER, proxies={"http": proxy},
                                timeout=timeout)
        html = response.text or ''

        user_info = User.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry - 1, timeout)

    logger.error('Giving up.')
    return None
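
This example assumes some module-level setup that is not shown: a logger, a HEADER dict of request headers, and a proxy_pool iterator that yields proxy addresses. Below is a minimal sketch of that context; the User-Agent string and proxy list are illustrative placeholders, not the project's actual configuration.

# Minimal surrounding setup assumed by the example above (illustrative only).
import itertools
import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('twitterscraper')

# Placeholder User-Agent; any realistic browser string would do.
HEADER = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}

# Hypothetical proxy list; itertools.cycle lets next(proxy_pool) rotate forever.
PROXIES = ['127.0.0.1:8080', '127.0.0.1:8081']
proxy_pool = itertools.cycle(PROXIES)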
Example 2
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ""

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_user_page(url, retry - 1)

    logger.error("Giving up.")
    return None
Example 3
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ""
            json_resp = None
        else:
            html = ""
            json_resp = None  # ensure json_resp is bound even if parsing fails
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error("Giving up.")
    return [], None
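
In practice, query_single_page is driven in a loop: the pos value it returns is fed back into the next call until no continuation is returned or enough tweets have been collected. A rough driver, assuming the function above is in scope (the limit handling is an illustrative choice, not the project's):

# Hedged usage sketch: page through search results until no position remains.
def collect_tweets(query, lang='en', limit=100):
    all_tweets = []
    pos = None  # first request returns HTML; later ones are JSON continuations
    while len(all_tweets) < limit:
        tweets, pos = query_single_page(query, lang, pos)
        if not tweets:
            break
        all_tweets.extend(tweets)
        if pos is None:  # no continuation token means the results are exhausted
            break
    return all_tweets[:limit]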
Example 4
def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            json_resp = None  # ensure json_resp is bound even if parsing fails
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, urllib.parse.quote(json_resp['min_position'])

        if from_user:
            return tweets, tweets[-1].id
        else:
            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(url, html_response, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
Example 5
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ''

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry - 1)

    logger.error('Giving up.')
    return None
Example 6
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url,
                                headers=HEADER,
                                proxies={"http": proxy},
                                timeout=timeout)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            json_resp = None  # ensure json_resp is bound even if parsing fails
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                else:
                    pos = None
            except KeyError:
                pass  # keep the previous pos if the response lacks these keys
            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user, timeout)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user,
                                 timeout)

    logger.error('Giving up.')
    return [], None
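
Every query_single_page variant depends on a get_query_url helper that none of these examples define. Its real URL format targeted Twitter's legacy search endpoints, so the sketch below is purely illustrative: the base URL and parameter names are assumptions, not the actual endpoints.

import urllib.parse

# Purely illustrative sketch of the assumed helper; the endpoint and parameter
# names are placeholders, not Twitter's actual legacy search API.
SEARCH_URL = 'https://example.invalid/search?q={q}&l={lang}'
CONTINUATION_URL = SEARCH_URL + '&max_position={pos}'

def get_query_url(query, lang, pos, from_user=False):
    if from_user:
        query = 'from:{}'.format(query)  # restrict the search to one account
    q = urllib.parse.quote(query)
    if pos is None:
        return SEARCH_URL.format(q=q, lang=lang or '')
    return CONTINUATION_URL.format(q=q, lang=lang or '', pos=pos)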
Example 7
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description=__doc__
        )

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01."
                                 "This may take a while. You can increase the number of parallel"
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                                help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag to if you want to scrape tweets from a specific user"
                                 "The query should then consist of the profilename (user) you want to scrape without @")
        parser.add_argument("--profiles", action='store_true',
                            help="Set this flag to if you want to scrape profile info of all the users where you" 
                            "have previously scraped from. After all of the tweets have been scraped it will start"
                            "a new process of scraping profile pages.")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)"
                                 )
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21", metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.", metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20, help="Specify the number of parallel process you want to run. \n"
                            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                            "Set to 1 if you dont want to run any parallel processes.", metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)
            args.enddate = dt.date.today()

        if args.user:
            tweets = query_tweets_from_user(user=args.query, limit=args.limit)
        else:
            tweets = query_tweets(query=args.query, limit=args.limit,
                                  begindate=args.begindate, enddate=args.enddate,
                                  poolsize=args.poolsize, lang=args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets,
                                        x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)

            if args.profiles and tweets:
                list_users = list(set([tweet.user for tweet in tweets]))
                # list_users_info = [query_user_info(elem) for elem in list_users]
                filename = 'userprofiles_' + args.output

                with open(filename, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user","fullname","location","blog","date_joined","id","num_tweets","following","followers","likes","lists"])
                        for elem in list_users:
                            u = query_user_info(elem)
                            if u is None:
                                continue
                            f.writerow([u.user, u.full_name, u.location, u.blog, u.date_joined, u.id,
                                        u.tweets, u.following, u.followers, u.likes, u.lists])

                    else:
                        for elem in list_users:
                            u = query_user_info(elem)
                            if u is None:
                                continue
                            json.dump(u, output, cls=JSONEncoder, indent=2)

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example 8
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description=__doc__
        )

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01."
                                 "This may take a while. You can increase the number of parallel"
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                                help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag to if you want to scrape tweets from a specific user"
                                 "The query should then consist of the profilename you want to scrape without @")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)"
                                 )
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21", metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.", metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20, help="Specify the number of parallel process you want to run. \n"
                            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                            "Set to 1 if you dont want to run any parallel processes.", metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)

        if args.user:
            tweets = query_tweets_from_user(user=args.query, limit=args.limit)
        else:
            tweets = query_tweets(query=args.query, limit=args.limit,
                                  begindate=args.begindate, enddate=args.enddate,
                                  poolsize=args.poolsize, lang=args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets,
                                        x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example 9
def main():
    try:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter, description=__doc__)

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o",
                            "--output",
                            type=str,
                            default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                            "tweets to.")
        parser.add_argument("-l",
                            "--limit",
                            type=int,
                            default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument(
            "-a",
            "--all",
            action='store_true',
            help="Set this flag if you want to get all tweets "
            "in the history of twitter. Begindate is set to 2006-03-01."
            "This may take a while. You can increase the number of parallel"
            "processes depending on the computational power you have.")
        parser.add_argument(
            "-c",
            "--csv",
            action='store_true',
            help=
            "Set this flag if you want to save the results to a CSV format.")
        parser.add_argument(
            "-u",
            "--user",
            action='store_true',
            help=
            "Set this flag to if you want to scrape tweets from a specific user"
            "The query should then consist of the profilename you want to scrape without @"
        )
        parser.add_argument(
            "--profiles",
            action='store_true',
            help=
            "Set this flag to if you want to scrape profile info of all the users where you"
            "have previously scraped from. After all of the tweets have been scraped it will start"
            "a new process of scraping profile pages.")
        parser.add_argument(
            "--lang",
            type=str,
            default=None,
            help=
            "Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
            "en (English)\nar (Arabic)\nbn (Bengali)\n"
            "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
            "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
            "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
            "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
            "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
            "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
            "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
            "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
            "ur (Urdu)\nvi (Vietnamese)\n"
            "zh-cn (Chinese Simplified)\n"
            "zh-tw (Chinese Traditional)")
        parser.add_argument(
            "-d",
            "--dump",
            action="store_true",
            help=
            "Set this flag if you want to dump the tweets \nto the console rather than outputting to a file"
        )
        parser.add_argument(
            "-ow",
            "--overwrite",
            action="store_true",
            help=
            "Set this flag if you want to overwrite the existing output file.")
        parser.add_argument(
            "-bd",
            "--begindate",
            type=valid_date,
            default="2006-03-21",
            help=
            "Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21",
            metavar='\b')
        parser.add_argument(
            "-ed",
            "--enddate",
            type=valid_date,
            default=dt.date.today(),
            help=
            "Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.",
            metavar='\b')
        parser.add_argument(
            "-p",
            "--poolsize",
            type=int,
            default=20,
            help="Specify the number of parallel process you want to run. \n"
            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
            "Set to 1 if you dont want to run any parallel processes.",
            metavar='\b')
        parser.add_argument(
            "-i",
            "--images",
            action="store_true",
            help=
            "Set this flag if you want to download all images from the query.")
        parser.add_argument(
            "-io",
            "--imagesoutput",
            type=str,
            default="./",
            help=
            "The path to the folder to download all of the images to, using -i."
        )
        parser.add_argument(
            "-ex",
            "--onlymedia",
            action="store_true",
            help="Set this flag if you want exclude tweets without media.")
        args = parser.parse_args()

        if isfile(args.output) and not args.dump and not args.overwrite:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)

        if args.user:
            tweets = query_tweets_from_user(user=args.query,
                                            limit=args.limit,
                                            dl_imgs=args.onlymedia)
        else:
            tweets = query_tweets(query=args.query,
                                  limit=args.limit,
                                  begindate=args.begindate,
                                  enddate=args.enddate,
                                  poolsize=args.poolsize,
                                  lang=args.lang,
                                  dl_imgs=args.onlymedia)

        if args.dump:
            pprint([tweet.__dict__ for tweet in tweets])
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output,
                                       delimiter=";",
                                       quoting=csv.QUOTE_NONNUMERIC)
                        f.writerow([
                            "screen_name", "username", "user_id", "tweet_id",
                            "tweet_url", "timestamp", "timestamp_epochs",
                            "text", "text_html", "links", "hashtags",
                            "has_media", "img_urls", "video_url", "likes",
                            "retweets", "replies", "is_replied", "is_reply_to",
                            "parent_tweet_id", "reply_to_users"
                        ])
                        for t in tweets:
                            f.writerow([
                                t.screen_name, t.username, t.user_id,
                                t.tweet_id, t.tweet_url, t.timestamp,
                                t.timestamp_epochs, t.text, t.text_html,
                                t.links, t.hashtags, t.has_media, t.img_urls,
                                t.video_url, t.likes, t.retweets, t.replies,
                                t.is_replied, t.is_reply_to, t.parent_tweet_id,
                                t.reply_to_users
                            ])

                        if args.images:
                            if args.user:
                                download_all_images(tweets,
                                                    args.imagesoutput,
                                                    username=args.query)
                            else:
                                download_all_images(tweets, args.imagesoutput)
                    else:
                        if args.images:
                            if args.user:
                                download_all_images(tweets,
                                                    args.imagesoutput,
                                                    username=args.query)
                            else:
                                download_all_images(tweets, args.imagesoutput)
                        json.dump(tweets, output, cls=JSONEncoder)
            if args.profiles and tweets:
                list_users = list(set([tweet.username for tweet in tweets]))
                list_users_info = [
                    query_user_info(elem) for elem in list_users
                ]
                filename = 'userprofiles_' + args.output

                if args.images:
                    download_all_images(list_users_info, args.imagesoutput)

                with open(filename, "w", encoding="utf-8") as output:
                    json.dump(list_users_info, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example 10
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            json_resp = None  # ensure json_resp is bound even if parsing fails
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception('Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1, from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
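
When from_user is True, the continuation position is simply the last tweet's id, so a user's timeline can be walked by threading that id back in as pos. A small sketch of that pattern (the paging depth is an arbitrary illustrative choice):

# Hedged sketch: walk a user's timeline by reusing the last tweet id as pos.
def collect_user_tweets(screen_name, lang='en', max_pages=5):
    collected = []
    pos = None
    for _ in range(max_pages):
        tweets, pos = query_single_page(screen_name, lang, pos, from_user=True)
        if not tweets:
            break
        collected.extend(tweets)
        if pos is None:
            break
    return collected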