def parse_following(self, response: HtmlResponse, username, user_id, variables):
    """Parse one page of the accounts that ``username`` follows.

    Schedules the next page when the response reports one, then yields a
    single ``InstaparserItem`` summarising every account on this page.

    Args:
        response: GraphQL response whose JSON carries
            ``data.user.edge_follow``.
        username: Login of the account whose followings are collected.
        user_id: Identifier of that account.
        variables: GraphQL query variables used for this page. The mapping is
            copied before the cursor is added, so the caller's dict is left
            untouched.

    Yields:
        A follow-up request for the next page (if any), then one item with
        the ``followings`` list for this page.
    """
    import json

    edge_follow = response.json()["data"]["user"]["edge_follow"]

    # A missing "page_info" is treated as "no further pages" instead of
    # failing on subscripting None.
    page_info = edge_follow.get("page_info") or {}
    if page_info.get("has_next_page"):
        next_variables = deepcopy(variables)
        next_variables["after"] = page_info["end_cursor"]
        # Serialise as real JSON: str(dict) emits True/False/None and breaks
        # on values that contain spaces or quotes.
        str_variables = quote(json.dumps(next_variables, separators=(",", ":")))
        url = (
            f"{self.graphql_url}query_hash={self.following_hash}"
            f"&variables={str_variables}"
        )
        yield response.follow(
            url,
            callback=self.parse_following,
            cb_kwargs={
                "username": username,
                "user_id": user_id,
                "variables": next_variables,
            },
        )

    followings_summary = [
        {
            "username": edge["node"]["username"],
            "user_id": edge["node"]["id"],
            "photo": edge["node"]["profile_pic_url"],
            "is_private": edge["node"]["is_private"],
        }
        for edge in edge_follow["edges"]
    ]
    yield InstaparserItem(
        followings=followings_summary,
        user_id=user_id,
        username=username,
    )
def user_subscribers_parse(self, response: HtmlResponse, username, user_id, variables):
    """Parse one page of the subscribers (followers) of ``username``.

    Args:
        response: GraphQL response whose JSON carries
            ``data.user.edge_followed_by``.
        username: Login of the account being scraped.
        user_id: Identifier of that account.
        variables: Query variables for this page; ``'after'`` is set on it in
            place when a next page exists.

    Yields:
        A request for the next page (if any), then one ``InstaparserItem`` per
        subscriber on this page.
    """
    followed_by = response.json().get('data').get('user').get('edge_followed_by')

    paging = followed_by.get('page_info')
    if paging.get('has_next_page'):
        variables['after'] = paging.get('end_cursor')
        next_url = f'{self.graphql_url}query_hash={self.subscriber_hash}&{urlencode(variables)}'
        next_kwargs = {
            'username': username,
            'user_id': user_id,
            # Snapshot, so later mutation of ``variables`` cannot leak into
            # the scheduled request.
            'variables': deepcopy(variables),
        }
        yield response.follow(
            next_url,
            callback=self.user_subscribers_parse,
            cb_kwargs=next_kwargs,
        )

    for edge in followed_by.get('edges'):
        node = edge.get('node')
        yield InstaparserItem(
            subscribe_user_id=user_id,
            photo=node.get('profile_pic_url'),
            user_id=node.get('id'),
            user_name=node.get('username'),
            user_data=node,
        )
def parse_hidden_news(self, response: HtmlResponse, payload, i):
    """Parse the response for a single news id that was absent from the feed.

    The requested id is always appended to ``self.parced_items``. When the
    response status is ``'Ok'`` and the news id is at least
    ``self.last_item_id``, the real news is yielded with ``visible=False``.
    When the status is anything else, a placeholder item titled ``'empty'``
    is yielded for the requested id.

    Args:
        response: Response of the single-news endpoint.
        payload: Session id forwarded by the caller; not used here.
        i: The news id that was requested.

    Yields:
        At most one ``DakotaparserItem``.
    """
    post = response.json()
    self.parced_items.append(i)

    if post['status'] != 'Ok':
        # No news behind this id: emit a placeholder for the requested id.
        yield DakotaparserItem(
            type='news',
            visible=False,
            published_at='2020-01-01T00:00:01+03:00',
            post_id=i,
            title='empty',
            body='',
            img_big='',
            tickers='',
            provider='',
            item='',
        )
        return

    news = post['payload']['news']
    if news['id'] >= self.last_item_id:
        yield DakotaparserItem(
            type='news',
            visible=False,
            published_at=news['date'],
            post_id=news['id'],
            title=news['title'],
            body=news['body'],
            img_big=news['img_big'],
            tickers=news['tickers'],
            provider=news['provider']['name'],
            item=post['payload'],
        )
def login_response_callback(self, response: HtmlResponse):
    """Handle the login response and drive the followings/followers scrolls.

    When the response JSON reports ``authenticated``, an ``InstaparserItem``
    for the logged-in account is yielded first. Then, for every entry of
    ``self.insta_users``, the results of ``self.following_scroll`` and
    ``self.followers_scroll`` are yielded for as long as ``self.parse_state``
    reports that the respective scroll should continue.

    Any exception raised along the way is logged and swallowed.

    Args:
        response: Response of the login request.

    Yields:
        The item for the current user, followed by whatever the scroll
        helpers return.
    """
    try:
        login_reply = response.json()
        own_id = login_reply.get(Fields.userId)

        if not login_reply.get("authenticated"):
            self.__logger.warning(f"Отказано в авторизации")
            return

        self.__logger.info(f"{self.login} успешно авторизован")
        # Store the currently logged-in user.
        yield InstaparserItem(
            userId=own_id,
            friend_id=own_id,
            username=self.login,
            full_name=self.full_name,
            profile_pic_url=None,
        )

        state = self.parse_state
        for account in self.insta_users:
            user_id = account[Fields.userId]
            self.__logger.info(
                f"Получение даных пользователя {account[Fields.username]}"
            )

            # Collect the list of followings.
            while state.get_continue_scroll_followings(user_id):
                yield self.following_scroll(response, user_id)

            # Collect the list of followers.
            while state.get_continue_scroll_followers(user_id):
                yield self.followers_scroll(response, user_id)
    except Exception as ex:
        self.__logger.error(f"Ошибка получения подписок/подписчиков: {ex}")
def user_subscription_parse(self, response: HtmlResponse, username, user_id, variables):
    """Parse one page of the accounts that ``username`` is subscribed to.

    Args:
        response: GraphQL response whose JSON carries
            ``data.user.edge_follow``.
        username: Login of the account being scraped.
        user_id: Identifier of that account.
        variables: Query variables for this page; ``'after'`` is set on it in
            place when a next page exists.

    Yields:
        A request for the next page (if any), then one ``InstaparserItem`` per
        subscription on this page.
    """
    edge_follow = response.json().get('data').get('user').get('edge_follow')

    paging = edge_follow.get('page_info')
    if paging.get('has_next_page'):
        variables['after'] = paging.get('end_cursor')
        next_url = f'{self.graphql_url}query_hash={self.subscription_hash}&{urlencode(variables)}'
        next_kwargs = {
            'username': username,
            'user_id': user_id,
            # Snapshot, so later mutation of ``variables`` cannot leak into
            # the scheduled request.
            'variables': deepcopy(variables),
        }
        yield response.follow(
            next_url,
            callback=self.user_subscription_parse,
            cb_kwargs=next_kwargs,
        )

    for edge in edge_follow.get('edges'):
        node = edge.get('node')
        yield InstaparserItem(
            user_id=user_id,
            subscription_id=node.get('id'),
            subscription_username=node.get('username'),
            subscriber_id=user_id,
            subscriber_username=username,
            full_name=node.get('full_name'),
            profile_pic_url=node.get('profile_pic_url'),
            post_data=node,
        )
def user_login(self, response: HtmlResponse):
    """Follow the profile page of every target user after a successful login.

    Args:
        response: Response of the login request; its JSON is checked for a
            truthy ``authenticated`` value.

    Yields:
        One request per entry of ``self.parse_users``, handled by
        ``self.user_data_parse`` with the username passed as a keyword.
    """
    j_data = response.json()
    # ``.get`` keeps a reply without the key (e.g. an error payload) from
    # raising KeyError; it is simply treated as "not authenticated".
    if not j_data.get('authenticated'):
        return

    for parse_user in self.parse_users:
        yield response.follow(
            f'/{parse_user}',
            callback=self.user_data_parse,
            cb_kwargs={'username': parse_user},
        )
def user_login(self, response: HtmlResponse):
    """Request each target profile once the login reply is authenticated.

    Args:
        response: Response of the login request.

    Yields:
        One request per entry of ``self.parse_users``, routed to
        ``self.parse_user_data`` with ``username`` as a callback keyword.
    """
    if not response.json().get('authenticated'):
        return

    for target in self.parse_users:
        profile_path = f'/{target}'
        yield response.follow(
            profile_path,
            callback=self.parse_user_data,
            cb_kwargs={'username': target},
        )
def user_login(self, response: HtmlResponse):
    """Follow the profile page of every target user after a successful login.

    Args:
        response: Response of the login request; its JSON is printed and then
            checked for a truthy ``authenticated`` value.

    Yields:
        One request per element of ``self.parse_user``, handled by
        ``self.user_data_parse`` with the username passed as a keyword.
    """
    j_data = response.json()
    # NOTE(review): this prints the whole login reply to stdout — confirm it
    # never carries anything sensitive.
    print(f"j_data={j_data}")

    # ``.get`` keeps a reply without the key from raising KeyError; it is
    # treated as "not authenticated".
    if not j_data.get('authenticated'):
        return

    print('auth')
    # Assumes self.parse_user is an iterable of usernames rather than a single
    # string (a string would be iterated per character) — TODO confirm.
    for usr in self.parse_user:
        yield response.follow(
            f'/{usr}',
            callback=self.user_data_parse,
            cb_kwargs={'username': usr},
        )
def parse_hidden_ticker(self, response: HtmlResponse):
    """Attach a fetched ticker to its item and decide what to emit next.

    The item travels in ``response.meta['item']``. The response JSON is
    appended to its ``tickers`` list, and ``self._handle_tickers`` is called
    with the item and ``response.meta['t']``.

    Yields:
        The helper's result when it is not a ``RuinvestingcomItem``
        (presumably a further request — verify against ``_handle_tickers``).
        Otherwise the item itself, after its ``post_id`` has been recorded in
        ``self.parced_items``.
    """
    meta = response.meta
    item = meta['item']
    item['tickers'].append(response.json())

    outcome = self._handle_tickers(item, meta['t'])
    if isinstance(outcome, RuinvestingcomItem):
        self.parced_items.append(int(item['post_id']))
        yield item
    else:
        yield outcome
def user_login(self, response: HtmlResponse):
    """Request each target profile once the login reply is authenticated.

    Args:
        response: Response of the login request.

    Yields:
        One request per element of ``self.parse_user``, routed to
        ``self.user_data_parse`` with a copy of the username as keyword.
    """
    login_reply = response.json()
    if not login_reply.get('authenticated'):
        return

    for name in self.parse_user:
        # The trailing slash avoids being answered with a 301 redirect.
        profile_path = f'/{name}/'
        yield response.follow(
            profile_path,
            callback=self.user_data_parse,
            cb_kwargs={'username': deepcopy(name)},
        )
def response_sid(self, response: HtmlResponse):
    """Start reading the news feed once a session id has been obtained.

    Nothing is yielded unless the response JSON has ``resultCode == 'OK'``.

    Args:
        response: Response of the session request; ``payload`` is used as the
            session id and ``trackingId`` is forwarded to the feed parser.

    Yields:
        A request for the first feed page, handled by ``self.parce_news``
        with an empty cursor.
    """
    session_reply = response.json()
    if session_reply.get('resultCode') != 'OK':
        return

    session_id = session_reply.get('payload')
    tracking_id = session_reply.get('trackingId')
    first_page_url = f'{self.feed_url}?sessionId={session_id}'
    feed_kwargs = {
        'payload': session_id,
        'trackingId': tracking_id,
        'cursor': '',
    }
    yield response.follow(
        first_page_url,
        callback=self.parce_news,
        cb_kwargs=feed_kwargs,
    )
def user_login(self, response: HtmlResponse):
    """Remember the logged-in user id and follow every profile to scrape.

    Nothing happens unless the login reply has truthy ``user`` and
    ``authenticated`` values.

    Args:
        response: Response of the login request.

    Yields:
        One request per entry of ``self.users_to_scrape``, handled by
        ``self.user_data_parse`` with the username passed as a keyword.
    """
    json_data = response.json()
    # ``.get`` treats a reply that lacks either key as a failed login instead
    # of raising KeyError.
    if not (json_data.get("user") and json_data.get("authenticated")):
        return

    self.user_id = json_data["userId"]

    # Build each URL next to its username; a separate URL list zipped back
    # against the usernames adds nothing and iterates the source twice.
    for user_to_scrape in self.users_to_scrape:
        yield response.follow(
            f'/{user_to_scrape}',
            callback=self.user_data_parse,
            cb_kwargs={"username": user_to_scrape},
        )
def followings_scroll_response_callback(self, response: HtmlResponse, userId):
    """Process one scroll page of the accounts that ``userId`` follows.

    Updates the followings scroll state in ``self.parse_state`` and yields one
    ``InstaparserItem`` per user found on the page.

    Args:
        response: Scroll response; the user list is read from the JSON key
            named by ``self.users``.
        userId: Identifier of the account whose followings are scrolled.

    Yields:
        ``InstaparserItem`` objects linking ``userId`` to each followed user.
    """
    data = response.json()
    # A reply without the users key counts as an empty page. Calling len() on
    # None would raise before the state update below, leaving the scroll
    # state untouched.
    users = data.get(self.users) or []
    has_users = bool(users)

    # The third argument is forwarded as-is; presumably the page size —
    # TODO confirm against parse_state.
    self.parse_state.set_state_followings(userId, has_users, 12)

    for user in users:
        yield InstaparserItem(
            userId=userId,
            friend_id=user.get("pk"),
            username=user.get(Fields.username),
            full_name=user.get(Fields.full_name),
            profile_pic_url=user.get(Fields.profile_pic_url),
        )
def followers_scroll_response_callback(self, response: HtmlResponse, userId):
    """Process one scroll page of the followers of ``userId``.

    Stores in ``self.parse_state`` whether another page exists (a non-``None``
    ``next_max_id`` in the reply) together with that cursor, then yields one
    ``InstaparserItem`` per user found on the page.

    Args:
        response: Scroll response; the user list is read from the JSON key
            named by ``self.users``.
        userId: Identifier of the account whose followers are scrolled.

    Yields:
        ``InstaparserItem`` objects linking ``userId`` to each follower.
    """
    data = response.json()
    # A reply without the users key counts as an empty page instead of
    # failing on len(None).
    users = data.get(self.users) or []

    next_max_id = data.get("next_max_id")
    has_next = next_max_id is not None
    self.parse_state.set_state_followers(userId, has_next, next_max_id)

    for user in users:
        yield InstaparserItem(
            userId=userId,
            friend_id=user.get("pk"),
            username=user.get(Fields.username),
            full_name=user.get(Fields.full_name),
            profile_pic_url=user.get(Fields.profile_pic_url),
        )
def tickers_list_get(self, response: HtmlResponse, url):
    """Request the ticker list for ``url`` once a session id is available.

    Nothing is yielded unless the response JSON has ``resultCode == 'OK'``.

    Args:
        response: Response of the session request; ``payload`` is used as the
            session id.
        url: Key into ``self.info_urls`` selecting the ``'list'`` endpoint and
            its ``'list_request'`` body.

    Yields:
        A JSON ``POST`` request (duplicate filtering disabled) handled by
        ``self.tickers_info``.
    """
    session_reply = response.json()
    if session_reply.get('resultCode') != 'OK':
        return

    session_id = session_reply.get('payload')
    tracking_id = session_reply.get('trackingId')
    list_url = f"{self.info_urls[url]['list']}?sessionId={session_id}"
    request_body = self.info_urls[url]['list_request']

    yield response.follow(
        list_url,
        callback=self.tickers_info,
        method='POST',
        headers={'Content-Type': 'application/json'},
        dont_filter=True,
        body=json.dumps(request_body),
        cb_kwargs={
            'payload': session_id,
            'trackingId': tracking_id,
            'url': url,
        },
    )
def parce_news(self, response: HtmlResponse, payload, trackingId, cursor):
    """Parse one page of the news feed and schedule follow-up requests.

    For a response whose ``status`` is ``'Ok'`` and which contains posts,
    this generator:

    1. requests the next feed page when the first post on this page has an id
       greater than ``self.last_item_id``;
    2. yields a ``DakotaparserItem`` for every ``news``/``review`` post, and
       for every entry nested in a ``company_news`` group, whose id is at
       least ``self.last_item_id``, recording each id in
       ``self.parced_items``;
    3. requests every id in ``range(borders['min'], borders['max'])`` (as
       returned by ``self.get_min_and_max``) that was not emitted from this
       page, handing it to ``self.parse_hidden_news``.

    Args:
        response: Feed response; its JSON is expected to carry ``status``,
            ``trackingId`` and ``payload`` (with ``meta.cursor`` and
            ``items``).
        payload: Session id appended to every request URL as ``sessionId``.
        trackingId: Tracking id from the caller; superseded by the value in
            this response when the next page is requested.
        cursor: Cursor from the caller; superseded likewise.

    Yields:
        Follow-up requests and ``DakotaparserItem`` objects.
    """
    j_body = response.json()
    if j_body.get('status') != 'Ok':
        return

    feed = j_body.get('payload')
    posts = feed.get('items')
    if not posts:
        # An empty page has no first post to inspect; indexing posts[0] below
        # would raise.
        return

    # Ids emitted from this page; a set keeps the hidden-id filter O(1).
    seen_ids = set()

    def visible_item(entry):
        """Build the item for a visible feed entry and record its id."""
        news = entry['item']
        item = DakotaparserItem(
            type=entry['type'],
            visible=True,
            published_at=entry['published_at'],
            post_id=news['id'],
            title=news['title'],
            body=news['body'],
            img_big=news['img_big'],
            tickers=news['tickers'],
            provider=news['provider']['name'],
            item=entry,
        )
        self.parced_items.append(news['id'])
        seen_ids.add(news['id'])
        return item

    # A company_news group nests its entries one level deeper, so the id of
    # the first post has to be read from the group's first entry.
    first_post = posts[0]
    if first_post['type'] == 'company_news':
        start_item_id = first_post['item']['items'][0]['item']['id']
    else:
        start_item_id = first_post['item']['id']

    if start_item_id > self.last_item_id:
        cursor = feed.get('meta').get('cursor')
        request_url = f'{self.feed_url}?sessionId={payload}&cursor={quote_plus(cursor)}'
        yield response.follow(
            request_url,
            callback=self.parce_news,
            cb_kwargs={
                'payload': payload,
                'trackingId': j_body.get('trackingId'),
                'cursor': cursor,
            },
        )

    borders = self.get_min_and_max(posts)

    for post in posts:
        if post['type'] == 'company_news':
            for company_news in post['item']['items']:
                if company_news['item']['id'] >= self.last_item_id:
                    yield visible_item(company_news)
        elif post['item']['id'] >= self.last_item_id:
            if post['type'] in ('news', 'review'):
                yield visible_item(post)
            else:
                # Other post types (e.g. 'idea') are not parsed yet.
                print(f"found new type: {post['type']}")

    # Parse hidden news: ids inside the page's id range that the feed did not
    # list. Kept best-effort, as before: any failure is printed, not raised.
    try:
        for i in range(borders['min'], borders['max']):
            if i in seen_ids:
                continue
            hidden_url = f'{self.single_news_url}/{i}?sessionId={payload}'
            yield response.follow(
                hidden_url,
                callback=self.parse_hidden_news,
                cb_kwargs={'payload': payload, 'i': i},
            )
    except Exception as e:
        print(f'{e}')