def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: True when scraping a single user's timeline; changes the
        pagination token returned for the next page.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # first page comes back as plain HTML
            html = response.text or ""
            json_resp = None
        else:
            html = ""
            # BUGFIX: pre-initialize json_resp so a JSON parse failure below
            # does not leave it unbound — the later `if json_resp:` checks
            # would raise NameError, which no handler here catches.
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            # No tweets on this page: hand back the next cursor (if any).
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        # BUGFIX: propagate from_user so the pagination-token format is
        # preserved across retries (it was silently reset to False before).
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error("Giving up.")
    return [], None
def query_single_page(self, url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    # Rotate the User-Agent per request to look less like a bot.
    headers = {'User-Agent': random.choice(self.HEADERS_LIST)}

    try:
        # BUGFIX: identity comparison against None uses `is`, not `==`
        # (PEP 8; `==` dispatches to __eq__ and is not an identity test).
        if self.proxy is None:
            response = requests.get(url, headers=headers)
        else:
            response = requests.get(url, proxies=self.proxy, headers=headers)

        if html_response:
            html = response.text or ''
        else:
            json_resp = json.loads(response.text)
            html = json_resp['items_html'] or ''

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        self.logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        self.logger.exception(
            'ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        self.logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        self.logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(
                e, url))
    except ValueError as e:
        self.logger.exception(
            'Failed to parse JSON "{}" while requesting "{}"'.format(
                e, url))

    if retry > 0:
        self.logger.info("Retrying... (Attempts left: {})".format(retry))
        return self.query_single_page(url, html_response, retry - 1)

    self.logger.error("Giving up.")
    return [], None
def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :param from_user: True when scraping a single user's timeline; changes the
        pagination token returned for the next page.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            # Defensive: define json_resp before the parse attempt so a
            # ValueError cannot leave it unbound.
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # BUGFIX: propagate from_user so the pagination-token format is
        # preserved across retries (it was silently reset to False before).
        return query_single_page(url, html_response, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
def query_single_page(url, html_response=True, retry=10):
    """
    Fetch one page of tweets from the given URL.

    :param url: URL to scrape.
    :param html_response: False when the page's HTML is embedded in JSON.
    :param retry: Remaining retry attempts on failure.
    :return: (list of tweets, pagination token for the next page).
    """
    # Pick a random User-Agent for each request.
    request_headers = {'User-Agent': random.choice(HEADERS_LIST)}

    try:
        response = requests.get(url, headers=request_headers)

        # The first page is plain HTML; later pages wrap it in JSON.
        if not html_response:
            payload = response.json()
            html = payload['items_html']
        else:
            html = response.text

        tweets = list(Tweet.from_html(html))

        if tweets:
            if html_response:
                return tweets, "TWEET-{}-{}".format(tweets[-1].id,
                                                    tweets[0].id)
            return tweets, payload['min_position']
        return [], None
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logging.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logging.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(url, html_response, retry - 1)

    logging.error("Quitting.")
    return [], None
def query_single_page(url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    # Rotate the User-Agent per request.
    headers = {'User-Agent': random.choice(HEADERS_LIST)}

    try:
        response = requests.get(url, headers=headers)
        if html_response:
            html = response.text
        else:
            json_resp = response.json()
            html = json_resp['items_html']

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except ValueError as e:
        # BUGFIX: response.json() raises ValueError (json.JSONDecodeError is
        # a subclass) on a malformed body; previously this escaped uncaught
        # and crashed the scrape instead of falling through to the retry.
        logging.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(
                e, url))

    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, html_response, retry-1)

    logging.error("Giving up.")
    return [], None
def query_single_page(url, html_response=True, retry=3):
    """
    Fetch one page of tweets from the given URL via urllib.

    :param url: URL to scrape.
    :param html_response: False when the page's HTML is embedded in JSON.
    :param retry: Remaining retry attempts on failure.
    :return: (list of tweets, pagination token for the next page).
    """
    request = urllib.request.Request(
        url, headers={'User-Agent': random.choice(HEADERS_LIST)})

    try:
        body = urllib.request.urlopen(request).read().decode()

        # First page is plain HTML; subsequent pages wrap it in JSON.
        if not html_response:
            payload = json.loads(body)
            html = payload['items_html']
        else:
            html = body

        tweets = list(Tweet.from_html(html))

        if tweets:
            if html_response:
                return tweets, "TWEET-{}-{}".format(tweets[-1].id,
                                                    tweets[0].id)
            return tweets, payload['min_position']
        return [], None
    # NOTE: HTTPError must precede URLError (it is a subclass).
    except urllib.request.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(
            e.code, url))
    except urllib.request.URLError as e:
        logging.exception('URLError {} while requesting "{}"'.format(
            e.reason, url))

    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, html_response, retry - 1)

    logging.error("Giving up.")
    return [], None
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: True when scraping a single user's timeline; changes the
        pagination token returned for the next page.
    :param timeout: Per-request timeout in seconds.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER,
                                proxies={"http": proxy}, timeout=timeout)

        if pos is None:  # first page comes back as plain HTML
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            # BUGFIX: pre-initialize json_resp so a JSON parse failure below
            # does not leave it unbound (NameError at the later checks).
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                # BUGFIX: was a bare `except: pass`, which swallowed every
                # exception (including NameError). Only a missing key is a
                # legitimate, ignorable condition here.
                try:
                    pos = json_resp['min_position']
                    if not json_resp['has_more_items']:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                except KeyError:
                    pass
            else:
                pos = None
            if retry > 0:
                # BUGFIX: log message was split by a stray newline.
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # BUGFIX: propagate from_user so the pagination-token format is
        # preserved across retries (it was silently reset to False before).
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: True when scraping a single user's timeline; changes the
        pagination token returned for the next page.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # first page comes back as plain HTML
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            # BUGFIX: pre-initialize json_resp so a JSON parse failure below
            # does not leave it unbound — the `if json_resp:` check in the
            # empty-tweets branch would raise NameError otherwise.
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # BUGFIX: propagate from_user so the pagination-token format is
        # preserved across retries (it was silently reset to False before).
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None