def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
            # Note: the paging call must receive `user`, not an undefined
            # `query` variable, as the query parameter.
            new_tweets, pos = query_single_page(user, lang='', pos=pos,
                                                from_user=True)
            if len(new_tweets) == 0:
                logger.info("Got {} tweets from username {}".format(
                    len(tweets), user))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                logger.info("Got {} tweets from username {}".format(
                    len(tweets), user))
                return tweets
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")

    logger.info("Got {} tweets from username {}.".format(len(tweets), user))
    return tweets

def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
            new_tweets, pos = query_single_page(user, lang="", pos=pos,
                                                from_user=True)
            if len(new_tweets) == 0:
                logger.info("Got {} tweets from username {}".format(
                    len(tweets), user))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                logger.info("Got {} tweets from username {}".format(
                    len(tweets), user))
                return tweets
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")

    logger.info("Got {} tweets from username {}.".format(len(tweets), user))
    return tweets

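# Usage sketch for query_tweets_from_user. It assumes the surrounding module
# provides logger and query_single_page as used above; the screen name and
# limit below are illustrative values, and the _demo_* helper is hypothetical.
def _demo_query_tweets_from_user():
    tweets = query_tweets_from_user('twitter', limit=100)
    # Tweets arrive as twitterscraper Tweet objects; `.id` is the only
    # attribute the user-timeline code in this section relies on.
    for tweet in tweets:
        print(tweet.id)
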
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a user timeline rather than a
                      search.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ""
            json_resp = None
        else:
            html = ""
            # Initialize so json_resp is defined even if parsing fails below.
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        # Propagate from_user so retries keep the same paging semantics.
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error("Giving up.")
    return [], None

def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the URL points at a user timeline rather than a
                      search.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, urllib.parse.quote(json_resp['min_position'])

        if from_user:
            return tweets, tweets[-1].id
        else:
            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # Propagate from_user so retries keep the same paging semantics.
        return query_single_page(url, html_response, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None

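# Sketch of how the URL-based variant above is driven. The INIT_URL /
# RELOAD_URL templates are assumptions modelled on the call sites in
# query_tweets_once_generator below; _demo_paged_fetch is hypothetical.
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'
RELOAD_URL = ('https://twitter.com/i/search/timeline?f=tweets'
              '&vertical=default&include_available_features=1'
              '&include_entities=1&reset_error_state=false&src=typd'
              '&max_position={pos}&q={q}&l={lang}')

def _demo_paged_fetch(query, lang=''):
    # The first request returns plain HTML (html_response=True); follow-ups
    # are JSON reloads keyed on the `pos` cursor from the previous call.
    tweets, pos = query_single_page(INIT_URL.format(q=query, lang=lang), True)
    while pos:
        more, pos = query_single_page(
            RELOAD_URL.format(q=query, pos=pos, lang=lang), False)
        if not more:
            break
        tweets += more
    return tweets
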
def query_tweets_once_generator(query, limit=None, lang='', pos=None,
                                dl_imgs=False):
    """
    Queries twitter for all the tweets you want! It will load all pages it
    gets from twitter. However, twitter might suddenly stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param lang: The language parameter of the query url
    :param pos: Field used as a "checkpoint" to continue where you left off in
                iteration
    :param dl_imgs: Passed through to query_single_page.
    :return: A list of twitterscraper.Tweet objects. You will get at least
             ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(
        ':', '%3A').replace('&', '%26')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos, dl_imgs)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return
    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')

    logger.info('Got {} tweets for {}.'.format(num_tweets, query))

def query_tweets_once_generator(query, limit=None, lang='', db=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it
    gets from twitter. However, twitter might suddenly stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param db: Optional callable invoked with each page of new tweets; when
               given, the function returns after the first page instead of
               yielding.
    :return: A list of twitterscraper.Tweet objects. You will get at least
             ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    pos = None
    num_tweets = 0
    try:
        while True:
            new_tweets, pos = query_single_page(
                INIT_URL.format(q=query, lang=lang) if pos is None
                else RELOAD_URL.format(q=query, pos=pos, lang=lang),
                pos is None)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            if db:
                db(new_tweets)
                return new_tweets[-1], pos
            else:
                for t in new_tweets:
                    yield t, pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return
    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')

    logger.info('Got {} tweets for {}.'.format(num_tweets, query))

def query_tweets_once_generator(query, limit=None, lang='', pos=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it
    gets from twitter. However, twitter might suddenly stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param lang: The language parameter of the query url
    :param pos: Field used as a "checkpoint" to continue where you left off in
                iteration
    :return: A list of twitterscraper.Tweet objects. You will get at least
             ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return
    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')

    logger.info('Got {} tweets for {}.'.format(num_tweets, query))

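# Checkpointed-consumption sketch for the generator variants above: the pos
# yielded alongside each tweet can be persisted and fed back in to resume.
# Only the variants that accept a `pos` keyword support resuming this way;
# _demo_resume is a hypothetical helper.
def _demo_resume(query, checkpoint=None):
    last_pos = checkpoint
    for tweet, pos in query_tweets_once_generator(query, limit=50,
                                                  pos=checkpoint):
        last_pos = pos
        print(tweet.id)
    return last_pos  # store this value to continue the iteration later
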
def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user whose page info is to be scraped
    """
    # Initialize so the final return is defined even if the request is
    # interrupted before query_user_page assigns a value.
    user_info = None
    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info("Got user information from username {}".format(user))
            return user_info
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning user information "
                    "gathered so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning user "
                         "information gathered so far...")

    logger.info("Got user information from username {}".format(user))
    return user_info

def query_tweet_page(user, status_id):
    """Returns the tweets from a single timeline page of ``user`` anchored
    at ``status_id``."""
    tweets = []
    try:
        new_tweets, pos = query_single_page(user, lang='', pos=None,
                                            from_user=True,
                                            status_id=status_id)
        if len(new_tweets) == 0:
            logger.info("Got {} tweets from user {} and status {}".format(
                len(tweets), user, status_id))

        tweets += new_tweets
        return tweets
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")

    logger.info("Got {} tweets from username {}.".format(len(tweets), user))
    return tweets

def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user whose page info is to be scraped
    """
    # Initialize so the final return is defined even if the request is
    # interrupted before query_user_page assigns a value.
    user_info = None
    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info(f"Got user information from username {user}")
            return user_info
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning user information "
                    "gathered so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning user "
                         "information gathered so far...")

    logger.info(f"Got user information from username {user}")
    return user_info

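# Usage sketch for query_user_info. INIT_URL_USER is referenced above but
# never defined in this section; the template below is an assumption in the
# style of twitterscraper's other URL constants, and _demo_user_info is a
# hypothetical helper.
INIT_URL_USER = 'https://twitter.com/{u}'

def _demo_user_info(user):
    info = query_user_info(user)
    if info is None:
        print('no profile data for {}'.format(user))
    return info
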
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the
                user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ""

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_user_page(url, retry - 1)

    logger.error("Giving up.")
    return None

def query_user_page(url, retry=10, timeout=60):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the
                user page)
    :param retry: Number of retries if something goes wrong.
    :param timeout: Timeout in seconds for the HTTP request.
    :return: Returns the scraped user data from a twitter user page.
    """
    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        # Apply the timeout parameter; otherwise it is accepted but unused.
        response = requests.get(url, headers=HEADER, proxies={"http": proxy},
                                timeout=timeout)
        html = response.text or ''

        user_info = User.from_html(html)
        if not user_info:
            return None

        return user_info
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry - 1, timeout)

    logger.error('Giving up.')
    return None

def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the
                user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ''

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry - 1)

    logger.error('Giving up.')
    return None

def query_single_page(query, lang, pos, retry=50, from_user=False,
                      timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a user timeline rather than a
                      search.
    :param timeout: Timeout in seconds for the HTTP request.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER, proxies={"http": proxy},
                                timeout=timeout)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            # Initialize so json_resp is defined even if parsing fails below.
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned 'has_more_items': "
                                    "False.")
                        return [], None
                else:
                    pos = None
            except KeyError:
                pass

            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # Propagate from_user so retries keep the same paging semantics.
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None

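# proxy_pool is consumed with next() above but never defined in this section.
# A minimal sketch is an endless itertools.cycle over "host:port" strings;
# the addresses below are placeholders, not working proxies.
from itertools import cycle

proxy_pool = cycle(['127.0.0.1:8080', '127.0.0.1:8081'])
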
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a user timeline rather than a
                      search.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            # Initialize so json_resp is defined even if parsing fails below.
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # Propagate from_user so retries keep the same paging semantics.
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None

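# Shared module context assumed by every function above. None of these
# imports, constants, or the get_query_url helper appear in this section,
# so this whole block is a reconstruction sketch: the URL templates are
# modelled on the INIT_URL/RELOAD_URL call sites and the user-agent string
# is a generic placeholder.
import json
import logging
import urllib.parse

import requests

logger = logging.getLogger('twitterscraper')
HEADER = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}

def get_query_url(query, lang, pos, from_user=False):
    if from_user:
        # User-timeline lookups pass the screen name as `query`.
        if pos is None:
            return 'https://twitter.com/{}'.format(query)
        return ('https://twitter.com/i/profiles/show/{}/timeline/tweets'
                '?include_available_features=1&include_entities=1'
                '&max_position={}'.format(query, pos))
    if pos is None:
        return ('https://twitter.com/search?f=tweets&vertical=default'
                '&q={}&l={}'.format(query, lang))
    return ('https://twitter.com/i/search/timeline?f=tweets&vertical=default'
            '&include_available_features=1&include_entities=1'
            '&reset_error_state=false&src=typd'
            '&max_position={}&q={}&l={}'.format(pos, query, lang))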