Example #1
0
 def parse_following(self, response: HtmlResponse, username, user_id,
                     variables):
     """Parse one page of the ``edge_follow`` list of a user.

     Reads ``data.user.edge_follow`` from the JSON response. When the page
     info reports another page, a follow-up request for it is yielded first
     (with ``variables["after"]`` advanced to the reported end cursor); then
     a single ``InstaparserItem`` with a summary of every edge on this page
     is yielded.

     Args:
         response: Response whose body is the GraphQL JSON payload.
         username: Passed through to the item and to the next callback.
         user_id: Passed through to the item and to the next callback.
         variables: Query variables; ``"after"`` is updated in place and a
             deep copy is handed to the next callback.
     """
     import json

     data = response.json()["data"]["user"]["edge_follow"]

     # A payload without "page_info" is treated as the last page instead of
     # failing on a ``None`` subscript.
     page_info = data.get("page_info") or {}
     if page_info.get("has_next_page"):
         variables["after"] = page_info["end_cursor"]
         # json.dumps yields valid compact JSON (true/false/null, spaces
         # inside values preserved), unlike rewriting the dict's repr.
         str_variables = quote(
             json.dumps(variables, separators=(",", ":"), ensure_ascii=False))
         url = self.graphql_url + f"query_hash={self.following_hash}&variables={str_variables}"
         yield response.follow(url,
                               callback=self.parse_following,
                               cb_kwargs={
                                   "username": username,
                                   "user_id": user_id,
                                   "variables": deepcopy(variables)
                               })

     followings_summary = [
         {
             'username': edge['node']['username'],
             'user_id': edge['node']['id'],
             'photo': edge['node']['profile_pic_url'],
             'is_private': edge['node']['is_private'],
         }
         for edge in data["edges"]
     ]
     yield InstaparserItem(followings=followings_summary,
                           user_id=user_id,
                           username=username)
Example #2
0
 def user_subscribers_parse(self, response: HtmlResponse, username, user_id,
                            variables):
     """Parse one page of the ``edge_followed_by`` list of a user.

     If the page info reports a next page, the end cursor is stored in
     ``variables['after']`` and a request for that page is yielded with this
     method as callback. Afterwards one ``InstaparserItem`` is yielded per
     edge on the current page.
     """
     payload = response.json()
     followed_by = payload.get('data').get('user').get('edge_followed_by')

     paging = followed_by.get('page_info')
     if paging.get('has_next_page'):
         variables['after'] = paging.get('end_cursor')
         next_page_url = f'{self.graphql_url}query_hash={self.subscriber_hash}&{urlencode(variables)}'
         next_kwargs = {
             'username': username,
             'user_id': user_id,
             'variables': deepcopy(variables)
         }
         yield response.follow(next_page_url,
                               callback=self.user_subscribers_parse,
                               cb_kwargs=next_kwargs)

     for edge in followed_by.get('edges'):
         node = edge.get('node')
         yield InstaparserItem(subscribe_user_id=user_id,
                               photo=node.get('profile_pic_url'),
                               user_id=node.get('id'),
                               user_name=node.get('username'),
                               user_data=node)
Example #3
0
 def parse_hidden_news(self, response: HtmlResponse, payload, i):
     """Handle the response for a single news entry requested by id.

     ``i`` is always recorded in ``self.parced_items``. When the response
     status is not ``'Ok'`` a placeholder item with ``post_id=i`` is
     yielded. Otherwise an item built from ``payload.news`` is yielded, but
     only if its id is not below ``self.last_item_id``. The ``payload``
     argument is not used by this method.
     """
     post = response.json()
     self.parced_items.append(i)

     if post['status'] != 'Ok':
         # No usable entry under this id: emit a stub so the id is stored.
         yield DakotaparserItem(type='news',
                                visible=False,
                                published_at='2020-01-01T00:00:01+03:00',
                                post_id=i,
                                title='empty',
                                body='',
                                img_big='',
                                tickers='',
                                provider='',
                                item='')
         return

     news = post['payload']['news']
     if news['id'] >= self.last_item_id:
         yield DakotaparserItem(type='news',
                                visible=False,
                                published_at=news['date'],
                                post_id=news['id'],
                                title=news['title'],
                                body=news['body'],
                                img_big=news['img_big'],
                                tickers=news['tickers'],
                                provider=news['provider']['name'],
                                item=post['payload'])
Example #4
0
    def login_response_callback(self, response: HtmlResponse):
        """Handle the login response and drive the followings/followers scroll.

        On a successful login an item describing the logged-in account is
        yielded, then for every entry of ``self.insta_users`` the results of
        ``following_scroll`` and ``followers_scroll`` are yielded for as long
        as ``self.parse_state`` reports that scrolling should continue. Any
        exception raised along the way is logged, not propagated.
        """
        try:
            login_data = response.json()
            own_id = login_data.get(Fields.userId)

            if not login_data.get("authenticated"):
                self.__logger.warning(f"Отказано в авторизации")
                return

            self.__logger.info(f"{self.login} успешно авторизован")
            # Store the currently logged-in user.
            yield InstaparserItem(userId=own_id,
                                  friend_id=own_id,
                                  username=self.login,
                                  full_name=self.full_name,
                                  profile_pic_url=None)

            for target in self.insta_users:
                target_id = target[Fields.userId]
                self.__logger.info(
                    f"Получение даных пользователя {target[Fields.username]}"
                )

                # Collect the list of followings.
                while self.parse_state.get_continue_scroll_followings(
                        target_id):
                    yield self.following_scroll(response, target_id)

                # Collect the list of followers.
                while self.parse_state.get_continue_scroll_followers(
                        target_id):
                    yield self.followers_scroll(response, target_id)
        except Exception as ex:
            self.__logger.error(f"Ошибка получения подписок/подписчиков: {ex}")
Example #5
0
    def user_subscription_parse(self, response: HtmlResponse, username,
                                user_id, variables):
        """Parse one page of the ``edge_follow`` list of a user.

        If the page info reports a next page, the end cursor is stored in
        ``variables['after']`` and a request for that page is yielded with
        this method as callback. Afterwards one ``InstaparserItem`` is
        yielded per edge on the current page.
        """
        payload = response.json()
        follow_edge = payload.get('data').get('user').get('edge_follow')

        paging = follow_edge.get('page_info')
        if paging.get('has_next_page'):
            variables['after'] = paging.get('end_cursor')
            next_page_url = f'{self.graphql_url}query_hash={self.subscription_hash}&{urlencode(variables)}'
            next_kwargs = {
                'username': username,
                'user_id': user_id,
                'variables': deepcopy(variables)
            }
            yield response.follow(next_page_url,
                                  callback=self.user_subscription_parse,
                                  cb_kwargs=next_kwargs)

        for edge in follow_edge.get('edges'):
            node = edge.get('node')
            yield InstaparserItem(user_id=user_id,
                                  subscription_id=node.get('id'),
                                  subscription_username=node.get('username'),
                                  subscriber_id=user_id,
                                  subscriber_username=username,
                                  full_name=node.get('full_name'),
                                  profile_pic_url=node.get('profile_pic_url'),
                                  post_data=node)
Example #6
0
 def user_login(self, response: HtmlResponse):
     """Follow each profile in ``self.parse_users`` after a successful login.

     Nothing is yielded when the response's ``authenticated`` flag is falsy.
     """
     auth_result = response.json()
     if not auth_result['authenticated']:
         return

     for account in self.parse_users:
         profile_path = f'/{account}'
         yield response.follow(profile_path,
                               callback=self.user_data_parse,
                               cb_kwargs={'username': account})
Example #7
0
 def user_login(self, response: HtmlResponse):
     """Follow each profile in ``self.parse_users`` once authenticated.

     A response without a truthy ``authenticated`` entry yields nothing.
     """
     login_reply = response.json()
     if not login_reply.get('authenticated'):
         return

     for account in self.parse_users:
         target = f'/{account}'
         yield response.follow(target,
                               callback=self.parse_user_data,
                               cb_kwargs={'username': account})
Example #8
0
 def user_login(self, response: HtmlResponse):
     """Print the login payload and follow each profile in ``self.parse_user``.

     Profile requests are only yielded when the ``authenticated`` flag of
     the response is truthy.
     """
     login_info = response.json()
     print(f"j_data={login_info}")
     if not login_info['authenticated']:
         return

     print('auth')
     for account in self.parse_user:
         profile_path = f'/{account}'
         yield response.follow(profile_path,
                               callback=self.user_data_parse,
                               cb_kwargs={'username': account})
Example #9
0
 def parse_hidden_ticker(self, response: HtmlResponse):
     """Attach a ticker response to the pending item and continue handling.

     The JSON body is appended to ``item['tickers']`` (the item travels in
     ``response.meta``), then ``_handle_tickers`` is called. If it returns a
     ``RuinvestingcomItem`` the post id is recorded in ``self.parced_items``
     and the item is yielded; any other return value is yielded as-is.
     """
     item = response.meta['item']
     item['tickers'].append(response.json())

     outcome = self._handle_tickers(item, response.meta['t'])
     if isinstance(outcome, RuinvestingcomItem):
         self.parced_items.append(int(item['post_id']))
         yield item
     else:
         yield outcome
 def user_login(self, response: HtmlResponse):
     """Follow each profile in ``self.parse_user`` once authenticated.

     A response without a truthy ``authenticated`` entry yields nothing.
     """
     login_reply = response.json()
     if not login_reply.get('authenticated'):
         return

     for account in self.parse_user:
         # Trailing slash avoids being answered with a 301 redirect.
         profile_path = f'/{account}/'
         yield response.follow(
             profile_path,
             callback=self.user_data_parse,
             cb_kwargs={'username': deepcopy(account)}
         )
Example #11
0
 def response_sid(self, response: HtmlResponse):
     """Start the feed crawl using the value returned as ``payload``.

     When ``resultCode`` is ``'OK'`` the response's ``payload`` is used as
     the ``sessionId`` query parameter of ``self.feed_url`` and a request
     handled by ``parce_news`` is yielded with an empty cursor.
     """
     reply = response.json()
     if reply.get('resultCode') != 'OK':
         return

     session_value = reply.get('payload')
     tracking_value = reply.get('trackingId')
     callback_args = {
         'payload': session_value,
         'trackingId': tracking_value,
         'cursor': ''
     }
     yield response.follow(f'{self.feed_url}?sessionId={session_value}',
                           callback=self.parce_news,
                           cb_kwargs=callback_args)
Example #12
0
 def user_login(self, response: HtmlResponse):
     """Remember the logged-in user id and follow every profile to scrape.

     Requests are only yielded when both the ``user`` and ``authenticated``
     entries of the login response are truthy; in that case ``userId`` is
     stored on ``self.user_id`` first.
     """
     print()
     login_info = response.json()
     if not (login_info["user"] and login_info["authenticated"]):
         return

     self.user_id = login_info["userId"]
     profile_paths = [f'/{name}' for name in self.users_to_scrape]
     for profile_path, name in zip(profile_paths, self.users_to_scrape):
         yield response.follow(profile_path,
                               callback=self.user_data_parse,
                               cb_kwargs={"username": name})
Example #13
0
    def followings_scroll_response_callback(self, response: HtmlResponse,
                                            userId):
        """Emit one item per user found on a followings page.

        The state kept in ``self.parse_state`` is updated with whether the
        page contained any users before the items are yielded.
        """
        payload = response.json()
        found = payload.get(self.users)
        has_users = len(found) > 0
        self.parse_state.set_state_followings(userId, has_users, 12)

        if not has_users:
            return

        for entry in found:
            picture = entry.get(Fields.profile_pic_url)
            yield InstaparserItem(userId=userId,
                                  friend_id=entry.get("pk"),
                                  username=entry.get(Fields.username),
                                  full_name=entry.get(Fields.full_name),
                                  profile_pic_url=picture)
Example #14
0
    def followers_scroll_response_callback(self, response: HtmlResponse,
                                           userId):
        """Emit one item per user found on a followers page.

        ``next_max_id`` from the response, together with whether it is
        present, is stored through ``self.parse_state`` before the items are
        yielded.
        """
        payload = response.json()
        found = payload.get(self.users)
        next_max_id = payload.get("next_max_id")
        self.parse_state.set_state_followers(userId,
                                             next_max_id is not None,
                                             next_max_id)

        if len(found) == 0:
            return

        for entry in found:
            picture = entry.get(Fields.profile_pic_url)
            yield InstaparserItem(userId=userId,
                                  friend_id=entry.get("pk"),
                                  username=entry.get(Fields.username),
                                  full_name=entry.get(Fields.full_name),
                                  profile_pic_url=picture)
Example #15
0
 def tickers_list_get(self, response: HtmlResponse, url):
     """POST the list request configured under ``self.info_urls[url]``.

     When ``resultCode`` is ``'OK'`` the response's ``payload`` is used as
     the ``sessionId`` query parameter of the ``'list'`` URL, and the
     ``'list_request'`` entry is sent as a JSON body. The answer is handled
     by ``tickers_info``.
     """
     reply = response.json()
     if reply.get('resultCode') != 'OK':
         return

     session_value = reply.get('payload')
     tracking_value = reply.get('trackingId')
     endpoint = self.info_urls[url]
     list_url = f"{endpoint['list']}?sessionId={session_value}"
     callback_args = {
         'payload': session_value,
         'trackingId': tracking_value,
         'url': url
     }
     yield response.follow(list_url,
                           callback=self.tickers_info,
                           method='POST',
                           headers={'Content-Type': 'application/json'},
                           dont_filter=True,
                           body=json.dumps(endpoint['list_request']),
                           cb_kwargs=callback_args)
Example #16
0
    def parce_news(self, response: HtmlResponse, payload, trackingId, cursor):
        """Parse one page of the feed and schedule the follow-up requests.

        For a response with status ``'Ok'`` this generator yields, in order:

        1. a request for the next feed page, if the id of the first post is
           greater than ``self.last_item_id``;
        2. one ``DakotaparserItem`` per post (or per nested entry of a
           ``'company_news'`` post) whose id is not below
           ``self.last_item_id``; plain posts are only emitted for the types
           ``'news'`` and ``'review'``, any other type is just reported;
        3. one request per id inside the range returned by
           ``get_min_and_max`` that was not emitted from this page, handled
           by ``parse_hidden_news``.

        An empty page yields nothing.

        Args:
            response: Response whose body is the feed JSON.
            payload: Value sent as the ``sessionId`` query parameter.
            trackingId: Replaced by the ``trackingId`` of the response before
                being passed on.
            cursor: Replaced by the cursor reported in the response meta
                before being passed on.
        """
        j_body = response.json()
        if j_body.get('status') != 'Ok':
            return

        feed = j_body.get('payload')
        cursor = feed.get('meta').get('cursor')
        request_url = f'{self.feed_url}?sessionId={payload}&cursor={quote_plus(cursor)}'
        trackingId = j_body.get('trackingId')
        posts = feed.get('items')
        if not posts:
            # Without a first post there is no id to compare against and
            # nothing to emit; indexing posts[0] would raise here.
            return

        # A set keeps the membership test in the hidden-id filter O(1).
        this_parsed = set()

        def emit(entry):
            """Build the item for ``entry`` and record its id as parsed."""
            content = entry['item']
            item = DakotaparserItem(type=entry['type'],
                                    visible=True,
                                    published_at=entry['published_at'],
                                    post_id=content['id'],
                                    title=content['title'],
                                    body=content['body'],
                                    img_big=content['img_big'],
                                    tickers=content['tickers'],
                                    provider=content['provider']['name'],
                                    item=entry)
            self.parced_items.append(content['id'])
            this_parsed.add(content['id'])
            return item

        # A 'company_news' post wraps its entries in a nested list, so the
        # id of the first post has to be read from the first nested entry.
        first_post = posts[0]
        if first_post['type'] == 'company_news':
            start_item_id = first_post['item']['items'][0]['item']['id']
        else:
            start_item_id = first_post['item']['id']

        if start_item_id > self.last_item_id:
            yield response.follow(request_url,
                                  callback=self.parce_news,
                                  cb_kwargs={
                                      'payload': payload,
                                      'trackingId': trackingId,
                                      'cursor': cursor
                                  })

        borders = self.get_min_and_max(posts)

        for post in posts:
            if post['type'] == 'company_news':
                for company_news in post['item']['items']:
                    if company_news['item']['id'] >= self.last_item_id:
                        yield emit(company_news)
            elif post['item']['id'] >= self.last_item_id:
                if post['type'] in ('news', 'review'):
                    yield emit(post)
                else:
                    # Other post types are not converted to items.
                    print(f"found new type: {post['type']}")

        # Request every id inside the borders that this page did not emit.
        try:
            for i in range(borders['min'], borders['max']):
                if i in this_parsed:
                    continue
                hidden_url = f'{self.single_news_url}/{i}?sessionId={payload}'
                yield response.follow(hidden_url,
                                      callback=self.parse_hidden_news,
                                      cb_kwargs={
                                          'payload': payload,
                                          'i': i
                                      })
        except Exception as e:
            print(f'{e}')