def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a single user's timeline
        (changes both the query URL and the pagination cursor format).
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        # Initialize up front so a failed JSON parse below cannot leave
        # json_resp unbound (the original raised NameError on that path).
        json_resp = None
        if pos is None:
            # First page: Twitter serves plain HTML.
            html = response.text or ""
        else:
            # Subsequent pages: the HTML is embedded in a JSON envelope.
            html = ""
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            # Empty page: hand back the next cursor (if any) so the caller
            # can decide whether to keep paging.
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        # Bug fix: propagate from_user so the retried request rebuilds the
        # same URL instead of silently falling back to a generic search.
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error("Giving up.")
    return [], None
def query_single_page(self, url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    # Rotate the User-Agent on every request.
    headers = {'User-Agent': random.choice(self.HEADERS_LIST)}
    try:
        # Idiom fix: compare against None with `is`, not `==`.
        if self.proxy is None:
            response = requests.get(url, headers=headers)
        else:
            response = requests.get(url, proxies=self.proxy, headers=headers)
        if html_response:
            # Guard with '' so Tweet.from_html never receives a falsy body.
            html = response.text or ''
        else:
            json_resp = json.loads(response.text)
            html = json_resp['items_html'] or ''

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            # JSON responses carry the cursor for the next page.
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        self.logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        self.logger.exception(
            'ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        self.logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        self.logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(
                e, url))
    except ValueError as e:
        self.logger.exception(
            'Failed to parse JSON "{}" while requesting "{}"'.format(
                e, url))

    if retry > 0:
        self.logger.info("Retrying... (Attempts left: {})".format(retry))
        return self.query_single_page(url, html_response, retry - 1)

    self.logger.error("Giving up.")
    return [], None
def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the page belongs to a single user's timeline
        (changes the pagination cursor format).
    :return: The list of tweets, the pos argument for getting the next page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            # Cursor for the next page, URL-encoded for reuse in a query.
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        else:
            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # Bug fix: propagate from_user so retries return the same cursor
        # format the original call would have.
        return query_single_page(url, html_response, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
def query_single_page(url, html_response=True, retry=10):
    """
    Return tweets scraped from the given URL.

    :param url: The URL to get the tweets from.
    :param html_response: False, if the HTML is embedded in a JSON payload.
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, plus the pos argument for the next page.
    """
    # Rotate the User-Agent on every request.
    headers = {'User-Agent': random.choice(HEADERS_LIST)}
    try:
        response = requests.get(url, headers=headers)
        if html_response:
            # Page body is the timeline HTML itself; guard against a
            # falsy body so Tweet.from_html always gets a string.
            html = response.text or ''
        else:
            # Timeline HTML arrives wrapped in a JSON envelope.
            json_resp = response.json()
            html = json_resp['items_html'] or ''

        tweets = list(Tweet.from_html(html))

        if not tweets:
            # Nothing parsed from this page: no further pagination possible.
            return [], None

        if not html_response:
            # JSON responses carry the cursor for the next page.
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logging.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logging.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(url, html_response, retry - 1)

    logging.error("Quitting.")
    return [], None
def query_single_page(url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    # Rotate the User-Agent on every request.
    headers = {'User-Agent': random.choice(HEADERS_LIST)}
    try:
        response = requests.get(url, headers=headers)
        if html_response:
            # Guard with '' so Tweet.from_html never receives a falsy body.
            html = response.text or ''
        else:
            json_resp = response.json()
            html = json_resp['items_html'] or ''

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            # JSON responses carry the cursor for the next page.
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except ValueError as e:
        # response.json() raises a ValueError subclass on malformed JSON;
        # previously this escaped to the caller instead of triggering the
        # retry path below.
        logging.exception('Failed to parse JSON "{}" while requesting '
                          '"{}"'.format(e, url))

    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, html_response, retry-1)

    logging.error("Giving up.")
    return [], None
def query_single_page(url, html_response=True, retry=3):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    # Rotate the User-Agent on every request.
    headers = {'User-Agent': random.choice(HEADERS_LIST)}
    req = urllib.request.Request(url, headers=headers)
    try:
        response = urllib.request.urlopen(req).read().decode()
        if html_response:
            html = response
        else:
            json_resp = json.loads(response)
            html = json_resp['items_html']

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            # JSON responses carry the cursor for the next page.
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    # These exceptions live in urllib.error (urllib.request only re-exports
    # them); HTTPError must be caught first since it subclasses URLError.
    except urllib.error.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(
            e.code, url))
    except urllib.error.URLError as e:
        logging.exception('URLError {} while requesting "{}"'.format(
            e.reason, url))

    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, html_response, retry - 1)

    logging.error("Giving up.")
    return [], None
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a single user's timeline.
    :param timeout: Per-request timeout in seconds.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        # Round-robin through the proxy pool to spread requests.
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER,
                                proxies={"http": proxy}, timeout=timeout)
        # Initialize up front so a failed JSON parse cannot leave
        # json_resp unbound when it is inspected below.
        json_resp = None
        if pos is None:
            # First page: Twitter serves plain HTML.
            html = response.text or ''
        else:
            # Subsequent pages: the HTML is embedded in a JSON envelope.
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                else:
                    pos = None
            except KeyError:
                # Malformed payload: keep the previous cursor and fall
                # through to the retry below (was a bare `except: pass`).
                pass
            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # Bug fix: propagate from_user here too, matching the retry in the
        # empty-tweets branch above.
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
def from_soup(cls, soup):
    """Build a profile instance from the parsed HTML of a Twitter
    profile page.

    Each field is scraped inside its own try/except so that one missing
    page element cannot abort the whole parse; the except arms supply the
    per-field fallback value.

    :param soup: BeautifulSoup tree of a profile page.
    :return: cls(...) populated with the scraped profile fields and the
        non-retweet tweets found on the page.
    """
    # Sidebar card holding username / location / join date.
    try:
        sideBar = soup.find('div', 'ProfileHeaderCard')
    except:
        sideBar = ""
    try:
        username = sideBar.find('span', 'username').get_text()
    except:
        username = ""
    # Top navigation bar holding the tweet/follower counters.
    try:
        topBar = soup.find('ul', 'ProfileNav-list')
    except:
        topBar = ""
    try:
        location = sideBar.find(
            'div', 'ProfileHeaderCard-location').get_text().strip() or 0
    except:
        location = 0
    # has_avatar: 0 when the avatar src is a Twitter default image, 1 when
    # the user uploaded their own.
    # NOTE(review): this fallback is "" while the other flags fall back to
    # 0 — looks inconsistent; confirm what downstream consumers expect.
    try:
        has_avatar = 0 if 'default_profile_images' in soup.find(
            'img', 'ProfileAvatar-image')['src'] else 1
    except:
        has_avatar = ""
    # Join date, e.g. "1:23 PM - 4 Jan 2016" -> "2016-01-04".
    try:
        joined = sideBar.find('span',
                              'ProfileHeaderCard-joinDateText')['title']
        created = datetime.strptime(
            joined, "%I:%M %p - %d %b %Y").strftime("%Y-%m-%d")
    except Exception as e:
        print(str(e))
        created = 0
    # Presence of a custom header/background image (the lookup itself is
    # the test: it raises when the element is absent).
    try:
        soup.find('div', 'ProfileCanopy-headerBg').find('img')['src']
        has_background = 1
    except:
        has_background = 0
    # Protected accounts render a dedicated timeline heading.
    try:
        a = soup.find('h2', 'ProtectedTimeline-heading')
        if a == None:
            protected = 0
        else:
            protected = 1
    except:
        protected = 0
    # Collect the page's tweets, skipping anything marked as a retweet.
    tweets = soup.find_all('div', 'tweet')
    all_tweets = []
    for tweet in tweets:
        if " Retweeted" not in tweet.get_text():
            all_tweets.append(Tweet.from_soup(tweet))
    try:
        isVerified = 0 if sideBar.find('span',
                                       'Icon--verified') == None else 1
    except:
        isVerified = 0
    # Counter values come from the 'data-count' attributes in the nav bar.
    try:
        total_tweets = topBar.find('li', 'ProfileNav-item--tweets').find(
            'span', 'ProfileNav-value')['data-count'] or 0
    except:
        total_tweets = 0
    try:
        total_following = topBar.find(
            'li', 'ProfileNav-item--following').find(
                'span', 'ProfileNav-value')['data-count'] or 0
    except:
        total_following = 0
    try:
        total_followers = topBar.find(
            'li', 'ProfileNav-item--followers').find(
                'span', 'ProfileNav-value')['data-count'] or 0
    except:
        total_followers = 0
    try:
        total_likes = topBar.find('li', 'ProfileNav-item--favorites').find(
            'span', 'ProfileNav-value')['data-count'] or 0
    except:
        total_likes = 0
    # profile_modified flags any customization (background, avatar or
    # location); has_location mirrors whether a location was scraped.
    return cls(username=username,
               location=location,
               has_location=0 if location == 0 else 1,
               created=created,
               is_verified=isVerified,
               total_tweets=total_tweets,
               total_following=total_following,
               total_followers=total_followers,
               total_likes=total_likes,
               has_avatar=has_avatar,
               has_background=has_background,
               is_protected=protected,
               profile_modified=1 if has_background == 1 or has_avatar == 1
               or location != 0 else 0,
               tweets=all_tweets)
def __from_soup(self, tweet_div):
    """Extract one tweet from its container <div> soup node, build a
    Tweet from the scraped fields, and pass it to self.__tweet_line.

    :param tweet_div: BeautifulSoup element for a single tweet container
        (carries the data-* attributes read below).
    :return: the result of self.__tweet_line for the built Tweet.
    """
    # user name & id
    screen_name = tweet_div["data-screen-name"].strip('@')
    username = tweet_div["data-name"]
    user_id = tweet_div["data-user-id"]

    # tweet basic data
    tweet_id = tweet_div["data-tweet-id"]  # equal to 'data-item-id'
    tweet_url = tweet_div["data-permalink-path"]
    timestamp_epochs = int(
        tweet_div.find('span', '_timestamp')['data-time'])
    timestamp = datetime.datetime.utcfromtimestamp(timestamp_epochs)

    # tweet text
    soup_html = tweet_div \
        .find('div', 'js-tweet-text-container') \
        .find('p', 'tweet-text')
    text_html = str(soup_html) or ""
    text = soup_html.text or ""
    links = [
        atag.get('data-expanded-url', atag['href'])
        for atag in soup_html.find_all('a', class_='twitter-timeline-link')
        if 'pic.twitter' not in atag.text  # eliminate picture
    ]
    hashtags = [tag.strip('#') for tag in re.findall(r'#\w+', text)]

    # tweet media
    # --- imgs
    soup_imgs = tweet_div.find_all('div', 'AdaptiveMedia-photoContainer')
    img_urls = [img['data-image-url']
                for img in soup_imgs] if soup_imgs else []

    # --- videos
    video_div = tweet_div.find('div', 'PlayableMedia-container')
    video_url = video_div.find(
        'div')['data-playable-media-url'] if video_div else ''
    has_media = True if img_urls or video_url else False

    # update 'links': eliminate 'video_url' from 'links' for duplicate
    links = list(filter(lambda x: x != video_url, links))

    # tweet actions numbers
    action_div = tweet_div.find('div', 'ProfileTweet-actionCountList')
    # --- likes
    likes = int(
        action_div.find('span', 'ProfileTweet-action--favorite').find(
            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count']
        or '0')
    # --- RT
    retweets = int(
        action_div.find('span', 'ProfileTweet-action--retweet').find(
            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count']
        or '0')
    # --- replies
    replies = int(
        action_div.find(
            'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
                'span',
                'ProfileTweet-actionCount')['data-tweet-stat-count']
        or '0')
    is_replied = False if replies == 0 else True

    # detail of reply to others: a tweet is a reply when its conversation
    # id differs from its own tweet id
    # - reply to others
    parent_tweet_id = tweet_div['data-conversation-id']  # parent tweet
    if tweet_id == parent_tweet_id:
        is_reply_to = False
        parent_tweet_id = ''
        reply_to_users = []
    else:
        is_reply_to = True
        soup_reply_to_users = \
            tweet_div.find('div', 'ReplyingToContextBelowAuthor') \
            .find_all('a')
        reply_to_users = [{
            'screen_name': user.text.strip('@'),
            'user_id': user['data-user-id']
        } for user in soup_reply_to_users]

    return self.__tweet_line(
        Tweet(screen_name, username, user_id, tweet_id, tweet_url,
              timestamp, timestamp_epochs, text, text_html, links,
              hashtags, has_media, img_urls, video_url, likes, retweets,
              replies, is_replied, is_reply_to, parent_tweet_id,
              reply_to_users))
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a single user's timeline
        (changes both the query URL and the pagination cursor format).
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:
            # First page: Twitter serves plain HTML.
            html = response.text or ''
            json_resp = None
        else:
            # Subsequent pages: the HTML is embedded in a JSON envelope.
            html = ''
            json_resp = None
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            # Empty page: advance the cursor (if known) and retry.
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # Bug fix: propagate from_user here too, matching the retry in the
        # empty-tweets branch above.
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None