def query_single_page(url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    headers = {'User-Agent': random.choice(HEADERS_LIST)}

    try:
        # Fixed: requests has no default timeout, so without an explicit one
        # a stalled connection hangs forever and the Timeout handler below
        # could never fire.
        response = requests.get(url, headers=headers, timeout=30)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logging.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logging.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logging.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(url, html_response, retry - 1)

    logging.error("Giving up.")
    return [], None
def query_single_page(url, user_agent, html_response=True, retry=3):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param user_agent: Value for the User-Agent request header.
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    headers = {'User-Agent': user_agent}
    json_resp = {}
    try:
        proxy_server = get_proxy_server()
        proxies = {"http": proxy_server, "https": proxy_server}
        # Fixed: requests has no default timeout, so a stalled proxy hangs
        # the request forever and the Timeout handler below never fires.
        response = requests.get(url, headers=headers, proxies=proxies,
                                timeout=30)
        # Non-200 responses are treated as end-of-results, not as errors.
        if response.status_code != 200:
            return [], None
        if html_response:
            html = response.text
        else:
            json_resp = response.json()
            html = json_resp['items_html']

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].status_id,
                                            tweets[0].status_id)
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, user_agent, html_response, retry - 1)

    logging.error("Giving up.")
    return [], None
def scroll(self):
    """Fetch the next page of tweets and advance ``self.pos``.

    Updates ``self.tweets`` and ``self.pos`` from the reload endpoint.
    Raises AssertionError when no further tweets can be retrieved —
    callers rely on that signal, so it is kept unchanged.
    """
    try:
        response = requests.get(url=self.RELOAD_URI.format(query=self.query,
                                                           pos=self.pos,
                                                           lang=self.lang),
                                headers=HEADER)
        # Fixed: parse the JSON body once instead of calling .json() twice.
        payload = response.json()
        self.tweets = list(Tweet.from_html(payload["items_html"]))
        self.pos = payload["min_position"]
        if len(self.tweets) == 0:
            # Empty page: jump to the handler below to signal exhaustion.
            raise IndexError
        print(self.tweets)
        return
    except Exception:
        # Fixed: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. (An unreachable `return` after
        # the raise was removed.)
        print("これ以上見つけられないよ")
        raise AssertionError
def search(self, query):
    """Run a search for *query* and store the first page of results.

    URL-encodes the query, fetches the first (HTML) results page, fills
    ``self.tweets`` and — when the stream container is present — the
    ``self.pos`` pagination token for subsequent scrolls.
    """
    self.query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    self.pos = ""
    print(self.query)
    print(self.pos)
    html = requests.get(url=self.BASE_URI.format(query=self.query, lang=self.lang),
                        headers=HEADER)
    self.tweets = list(Tweet.from_html(html.text))  # .json()["items_html"]))
    try:
        self.pos = BeautifulSoup(html.text, "html.parser").find(
            "div", attrs={"class": "stream-container "})["data-min-position"]
        print(self.tweets)
    except (TypeError, KeyError):
        # Fixed: was a bare `except: pass` that hid every failure.
        # find() returns None when the container div is missing (TypeError
        # on subscript) and the attribute lookup raises KeyError when
        # data-min-position is absent; only those are expected here.
        pass
    print(len(self.tweets))
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """Fetch one page of tweets through a rotating proxy.

    :param query: The query parameter of the query url.
    :param lang: The language parameter of the query url.
    :param pos: Pagination token; None means the first (HTML) page.
    :param retry: Number of retries when a page comes back empty.
    :param from_user: True when the query targets a single user's timeline.
    :param timeout: Socket timeout for the HTTP request, in seconds.
    :return: Tuple of (list of tweets, pos argument for the next page).
    """
    url = get_query_url(query, lang, pos, from_user)
    proxy = next(proxy_pool)
    # NOTE(review): only the "http" scheme is proxied; https requests go
    # direct — confirm this is intentional.
    response = requests.get(url, headers=HEADER, proxies={"http": proxy},
                            timeout=timeout)

    if pos is None:
        # First page arrives as raw HTML.
        html = response.text or ''
        json_resp = None
    else:
        # Subsequent pages embed the HTML inside a JSON envelope.
        json_resp = response.json()
        html = json_resp['items_html'] or ''

    tweets = list(Tweet.from_html(html))

    if not tweets:
        if json_resp:
            pos = json_resp['min_position']
            has_more_items = json_resp['has_more_items']
            if not has_more_items:
                return [], None
        else:
            pos = None
        if retry > 0:
            return query_single_page(query, lang, pos, retry - 1, from_user)
        else:
            return [], pos

    # Fixed: an unreachable retry/give-up tail that followed these returns
    # was removed — every path above already returns before reaching it.
    if json_resp:
        return tweets, urllib.parse.quote(json_resp['min_position'])
    if from_user:
        return tweets, tweets[-1].tweet_id
    return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                        tweets[0].tweet_id)
def query_single_page(url, html_response=True, retry=3):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    headers = {'User-Agent': random.choice(HEADERS_LIST)}
    req = Request(url, headers=headers)
    try:
        response = urlopen(req).read().decode('utf-8')
        if html_response:
            html = response
        else:
            json_resp = json.loads(response)
            html = json_resp['items_html']

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(
            e.code, url))
    except URLError as e:
        logging.exception('URLError {} while requesting "{}"'.format(
            e.reason, url))
    except json.decoder.JSONDecodeError as e:
        # Fixed: a malformed JSON payload previously escaped the handlers
        # above and aborted the whole scrape instead of retrying, unlike
        # the sibling requests-based implementation.
        logging.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, html_response, retry - 1)

    logging.error("Giving up.")
    return [], None
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}', url)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:
            # First page arrives as raw HTML.
            html = response.text or ''
            json_resp = None
        else:
            # Subsequent pages embed the HTML inside a JSON envelope.
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                else:
                    pos = None
            except KeyError:
                # Fixed: was a bare `except: pass`; only the dict lookups
                # above are expected to fail here.
                pass

            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        # Fixed: the original literal here contained a raw newline inside a
        # single-quoted string, which is a SyntaxError in Python.
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1)

    logger.error('Giving up.')
    return [], None