コード例 #1
0
ファイル: instagram.py プロジェクト: superbe/datamining
 def user_parse(self, response: HtmlResponse):
     """Queue a profile request for every username in ``self.parse_user``.

     Each request is routed to ``self.userdata_parse`` with the username
     forwarded through ``cb_kwargs`` so the callback can label its data.
     """
     # The old body parsed the response JSON into a local that was never
     # read; the dead assignment has been removed.
     for user in self.parse_user:
         yield response.follow(f'/{user}',
                               callback=self.userdata_parse,
                               cb_kwargs={'username': user})
コード例 #2
0
 def user_parse(self, response: HtmlResponse):
     """After login, follow the target user's profile if authenticated.

     The login endpoint answers with JSON; only an ``authenticated: true``
     payload leads to a follow-up request for the followers callback.
     """
     auth_state = json.loads(response.text)
     if not auth_state['authenticated']:
         # Login failed — nothing to crawl.
         return
     yield response.follow(f'/{self.parse_user}',
                           callback=self.user_followers_parse,
                           cb_kwargs={'username': self.parse_user})
コード例 #3
0
    def f_parse(self, response: HtmlResponse, user_id: str, level: int):
        """Expand the friendship graph from ``user_id`` at depth ``level``.

        Pulls the user's friend list from the VK API and records the parent
        node for this depth in ``self.graph``. When ``self.user2_id`` shows
        up among the friends, yields a ``GraphItem`` describing the found
        path and closes the spider; otherwise schedules every friend for
        the next depth level.
        """
        try:
            resp = self.vk_api.friends.get(user_id=user_id, v="5.8")
        except vk.exceptions.VkAPIError as e:
            # Profiles we cannot read are logged and skipped:
            #vk.exceptions.VkAPIError: 15. Access denied: this profile is private
            #vk.exceptions.VkAPIError: 18. User was deleted or banned
            print(
                f'User {user_id} {self.vk_api.users.get(user_id=user_id, v="5.8")} -- {e.message}'
            )
            return None
        f_level = level + 1

        user_friends = resp["items"]
        # Do not walk back to the search origin. Only ValueError ("x not in
        # list") is expected here; the previous bare except hid every other
        # failure as well.
        try:
            user_friends.remove(self.user1_id)
        except ValueError:
            pass

        print(
            f'PARENT_USER:{user_id} ({self.get_username(user_id)}), level:{level} ,friends:{len(user_friends)}'
        )
        # The parent of every friend at this depth is the same user, so
        # record it once instead of re-assigning on every loop iteration.
        self.graph[level] = user_id
        for idx, f in enumerate(user_friends):
            if f == self.user2_id:
                self.graph[f_level] = f
                print(
                    f'F_PARSE(user:{user_id}): graph has been found on level:{f_level} user(idx:{idx}, {f}, {self.get_username(f)})!'
                )
                print(self.graph)
                graph_ = list(filter(lambda x: x is not None, self.graph))
                # sleeps throttle the extra VK username lookups below
                time.sleep(1)
                graph_names_ = list(map(lambda x: self.get_username(x),
                                        graph_))
                time.sleep(1)
                item = GraphItem(
                    user1={
                        "link": self.USER1,
                        "id": self.user1_id,
                        "username": self.get_username(self.user1_id)
                    },
                    user2={
                        "link": self.USER2,
                        "id": self.user2_id,
                        "username": self.get_username(self.user2_id)
                    },
                    graph_id=tuple(graph_),
                    graph_username=graph_names_,
                )
                yield item
                raise CloseSpider('We found graph')

            f_path = 'https://vk.com/id' + str(f)
            yield response.follow(
                f_path,
                callback=self.f_parse,
                cb_kwargs={
                    'user_id': f,
                    'level': f_level
                }
            )
        # len(user_friends) rather than the loop index: the old f-string
        # interpolated `idx`, which is unbound (NameError) when the friend
        # list is empty and reported one less than the real count otherwise.
        print(
            f"Пройдены все друзья ({len(user_friends)}) пользователя {user_id} ({self.get_username(user_id)})"
        )
コード例 #4
0
 def parse(self, response: HtmlResponse):
     """Follow every book link on the listing page, then paginate.

     Stops cleanly on the final page, where no "next" button exists.
     """
     # ::attr(href) extracts the link target itself; extracting the whole
     # <a> element produced an HTML snippet that response.follow would try
     # to use as a relative URL.
     button_next = response.css("a[title=Следующая]::attr(href)").extract_first()
     links = response.xpath("//a[@class='cover']/@href").extract()
     for i in links:
         yield response.follow(i, callback=self.book_parse)
     # On the last page extract_first() returns None, and following None
     # raises a TypeError inside Scrapy — guard against it.
     if button_next:
         yield response.follow(button_next, callback=self.parse)
コード例 #5
0
 def parse_car_spec(self, response: HtmlResponse, item, page_id):
     """Attach the car-spec JSON to the item loader and chase the VIN teaser.

     :param item: item loader accumulating the ad's data.
     :param page_id: Avito page identifier used to build the teaser URL.
     """
     # response.text replaces body_as_unicode(), which is deprecated and
     # removed in Scrapy >= 2.6; the other callbacks in this file already
     # use response.text.
     json_car_spec = json.loads(response.text)
     item.add_value('json_car_spec', json_car_spec)
     url_VIN_db = f'https://www.avito.ru/web/1/swaha/v1/autoteka/teaser/{page_id}'
     yield response.follow(url_VIN_db, callback=self.parse_VIN_db, cb_kwargs={'item': item, 'page_id': page_id})
コード例 #6
0
 def parse(self, response: HtmlResponse):
     """Schedule one town-page request per configured town.

     The town name travels along in ``cb_kwargs`` so ``parse_town`` can
     tag the results.
     """
     base_url = self.start_urls[0]
     for town_name in self.town_list:
         town_url = urljoin(base_url, town_name)
         yield response.follow(town_url,
                               callback=self.parse_town,
                               cb_kwargs={'town': town_name})
コード例 #7
0
ファイル: leroymerlinru.py プロジェクト: bostspb/parsing
 def parse(self, response: HtmlResponse):
     """Generate one catalogue-page request per page of the pagination.

     Reads the total page count from the <uc-pagination> widget and
     requests ``?page=N`` for every N from 1 to that total.
     """
     total = response.xpath("//uc-pagination/@total").extract_first()
     # Fall back to a single page when the pagination widget is absent;
     # int(None) previously raised a TypeError here.
     last_page_number = int(total) if total else 1
     for page_number in range(1, last_page_number + 1):
         page_link = response.url + '?page=' + str(page_number)
         yield response.follow(page_link, callback=self.page_parse)
コード例 #8
0
 def parse(self, response: HtmlResponse):
     """Follow every product-card href found on the listing page."""
     # Iterating the SelectorList directly hands each href selector to
     # response.follow, which resolves it to a URL.
     for link_sel in response.xpath("//uc-plp-item-new/@href"):
         yield response.follow(link_sel, callback=self.good_parse)
コード例 #9
0
ファイル: bankchartru.py プロジェクト: bostspb/parsing
 def atms_parse(self, response: HtmlResponse):
     """Queue a request for each ATM directory entry in the A-Z index."""
     atm_links = response.xpath("//div[@class='alphabet__list']//a[contains(@href,'/spravochniki/bankomaty/')]/@href").extract()
     for atm_link in atm_links:
         yield response.follow(atm_link, callback=self.atm_parse)
コード例 #10
0
 def parse(self, response: HtmlResponse):
     """Open every advertisement anchor found on the search-results page."""
     ad_anchors = response.xpath(
         '//div[contains(@data-marker, "item")]/div[@class="item__line"]//h3/a[@itemprop="url"]'
     )
     # response.follow accepts <a> selectors and reads their href itself.
     for anchor in ad_anchors:
         yield response.follow(anchor, callback=self.avd_parse)
コード例 #11
0
ファイル: bankchartru.py プロジェクト: bostspb/parsing
 def branches_parse(self, response: HtmlResponse):
     """Queue a request for each bank-branch entry in the A-Z index."""
     branch_links = response.xpath("//div[@class='alphabet__list']//a[contains(@href,'/spravochniki/otdeleniya/')]/@href").extract()
     for branch_link in branch_links:
         yield response.follow(branch_link, callback=self.branch_parse)
コード例 #12
0
ファイル: geekbrains.py プロジェクト: egorsemevskiy/parisng
 def parse(self, response: HtmlResponse):
     """Paginate through the blog listing and schedule every post.

     Stops paginating cleanly on the last page, where no rel=next link
     exists.
     """
     next_page = response.css('ul.gb__pagination li.page a[rel=next]::attr(href)').extract_first()
     # On the last page extract_first() returns None; following None raises
     # a TypeError and aborts the crawl — guard against it.
     if next_page:
         yield response.follow(next_page, callback=self.parse)
     posts = response.css('div.post-items-wrapper div.post-item a.post-item__title::attr(href)').extract()
     for post in posts:
         yield response.follow(post, callback=self.post_parse)
コード例 #13
0
ファイル: avito_spider.py プロジェクト: Irina991/third_course
 def parse(self, response: HtmlResponse):
     """Follow every advertisement link found on the results page."""
     for ad_link in response.xpath(
             '//a[@class="styles-link-36uWZ"]/@href').extract():
         # Callback passed positionally, matching response.follow's
         # (url, callback) signature.
         yield response.follow(ad_link, self.parse_ads)
コード例 #14
0
 def parse(self, response: HtmlResponse):
     """Schedule every vacancy on the page, then move to the next page.

     Stops cleanly on the final results page.
     """
     next_page = response.css("a.HH-Pager-Controls-Next::attr(href)").extract_first()
     vacancy_links = response.xpath("//a[@class='bloko-link HH-LinkModifier']/@href").extract()
     for link in vacancy_links:
         yield response.follow(link, callback=self.vacancy_parse)
     # The last results page has no "next" control; following None would
     # raise a TypeError and kill the crawl — guard against it.
     if next_page:
         yield response.follow(next_page, callback=self.parse)
コード例 #15
0
ファイル: leroymerlinru.py プロジェクト: bostspb/parsing
 def page_parse(self, response: HtmlResponse):
     """Open every product card linked from the catalogue page."""
     # response.follow accepts <a> selectors and reads their href itself.
     for product_anchor in response.xpath("//a[@slot='picture']"):
         yield response.follow(product_anchor, callback=self.product_parse)
コード例 #16
0
 def parse(self, response: HtmlResponse) -> Generator:
     """Follow every suggestion link found in the topic list."""
     suggestion_links = response.css(
         ".topic-title > a:last-of-type::attr('href')").extract()
     for link in suggestion_links:
         # NOTE(review): the callback is referenced through the class, not
         # self — presumably parse_suggestion needs no instance state;
         # confirm against the class definition.
         yield response.follow(link, Wuppertal2017Spider.parse_suggestion)
コード例 #17
0
 def parse(self, response: HtmlResponse):
     """Queue a detail request for every ad snippet on the page."""
     snippet_hrefs = response.xpath('//a[@class="snippet-link"]/@href').extract()
     for href in snippet_hrefs:
         yield response.follow(href, callback=self.parse_ads)
コード例 #18
0
ファイル: avito.py プロジェクト: kesch9/Scrumping
 def parse(self, response: HtmlResponse):
     """Follow each classified-ad link found on the listing page."""
     ads_links = response.xpath(
         '//a[@class="item-description-title-link"]/@href').extract()
     for link in ads_links:
         yield response.follow(link, callback=self.parse_ads)
     # Dropped a trailing no-op `pass` statement left over from editing.
コード例 #19
0
ファイル: avito.py プロジェクト: superbe/datamining
 def parse(self, response: HtmlResponse):
     """Follow every advertisement URL extracted from the item titles."""
     ad_links = response.xpath(
         '//h3[contains(@data-marker, "item-title")]/a[contains(@itemprop, "url")]/@href'
     ).extract()
     for ad_link in ad_links:
         yield response.follow(ad_link, callback=self.ad_parse)
コード例 #20
0
 def parse(self, response: HtmlResponse):
     """Follow every vacancy anchor inside the user-content span."""
     anchors = response.xpath('//span[@class="g-user-content"]/a[@href]')
     # response.follow accepts <a> selectors and reads their href itself.
     for anchor in anchors:
         yield response.follow(anchor, callback=self.vac_parse)
コード例 #21
0
 def page_parse(self, response: HtmlResponse):
     """Schedule a post-detail request for every snippet title link."""
     post_links = response.css(
         'div.snippet-title-row h3.snippet-title a.snippet-link::attr(href)'
     ).extract()
     for href in post_links:
         yield response.follow(href, callback=self.post_parse)
コード例 #22
0
ファイル: lerua.py プロジェクト: matveykortsev/data_mining
 def parse(self, response: HtmlResponse):
     """Schedule an item request for every product-title link."""
     item_hrefs = response.xpath(
         '//a[contains(@class,"plp-item__info__title")]//@href').getall()
     for href in item_hrefs:
         yield response.follow(href, callback=self.parse_item)
コード例 #23
0
ファイル: FB.py プロジェクト: VokiVon-P/FB_shortcut
    def parse_page(self, response: HtmlResponse, w_driver, level):
        """Process one Facebook profile page with a shared Selenium driver.

        Extracts the person's numeric id, checks the Mongo collection for a
        previously stored copy (a hit yields the total handshake count and
        stops the crawl via ``self.BINGO``), scrolls the friends list until
        fully loaded, stores the friends in an item and in the level-ordered
        work queue (``self.level_manager``), then follows the next queued
        page.

        :param w_driver: Selenium WebDriver instance shared across callbacks.
        :param level: current handshake depth for this profile.
        """

        # A previous callback already found the target person: stop.
        if self.BINGO:
            return

        tmp_driver = w_driver
        # Navigate to this person's friends page.
        url_friends = self.make_friends_url(self.get_clear_url(response.url))
        tmp_driver.get(url_friends)
        # time.sleep(2)
        """
        Секция обработки персональных данных
        """
        # Personal-data section: the al:ios:url meta tag embeds the numeric
        # profile id; the [5:] slice strips the URL scheme prefix before
        # splitting on '/'.
        item_id = tmp_driver.find_element_by_xpath(
            '//meta[@property="al:ios:url"]').get_attribute(
                'content')[5:].split('/')[1]

        print(f"ID_PERSON = {item_id}, LEVEL = {level}")
        try:
            # Check whether this user id is already stored in the database.
            item = self.mongo_base[self.name].find_one({"id_person": item_id},
                                                       {
                                                           "id_person": 1,
                                                           "level": 1
                                                       })
            # If present — success: the handshake count is the stored depth
            # plus the current depth; flag BINGO and finish.
            if item is not None:
                item_level = item['level']

                bingo = item_level + level
                print(f"BINGOOOO! Number of handshakes = {bingo}")
                self.BINGO = True
                return
        except:
            # NOTE(review): bare except deliberately treats any Mongo
            # failure as "no match" and continues the crawl.
            pass
        """
        Секция обработки список друзей
        """
        # Friends-list section: scroll the page until the number of loaded
        # friend entries stops growing (lazy loading exhausted).
        body = tmp_driver.find_element_by_tag_name('body')
        _friend_list_item = '//div[@data-testid="friend_list_item"]/a'
        friends_len = len(tmp_driver.find_elements_by_xpath(_friend_list_item))
        while True:
            body.send_keys(Keys.PAGE_DOWN)
            body.send_keys(Keys.PAGE_DOWN)
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(2)
            tmp_len = len(tmp_driver.find_elements_by_xpath(_friend_list_item))
            if friends_len == tmp_len:
                break
            friends_len = len(
                tmp_driver.find_elements_by_xpath(_friend_list_item))

        # If there are no publicly visible friends, skip the friends section
        # (the queue below is still processed).
        if friends_len != 0:
            # Collect the friends list and normalize each profile URL.
            friends_list = tmp_driver.find_elements_by_xpath(_friend_list_item)
            friends_list = list(
                map(lambda x: x.get_attribute('href'), friends_list))
            print(f'Друзей = {len(friends_list)}')
            item_friends_href_list = list(map(self.get_clear_url,
                                              friends_list))

            # Build (level, url) tuples and append them to the work queue.
            level_urls = [(level + 1, href) for href in friends_list]
            self.level_manager.extend(level_urls)
            print(f'Список обработки = {len(self.level_manager)}\n')

            # Persist this person's record to the database.
            yield FbParserItem(id_person=item_id,
                               level=level,
                               friends_count=friends_len,
                               friends=item_friends_href_list)

        # Keep walking the queue: pop the first queued entry and schedule
        # its page for processing.
        level = -1
        page_url = None
        if len(self.level_manager):
            try:
                level, page_url = self.level_manager.popleft()
            except:
                # NOTE(review): an empty/failed pop leaves the sentinel
                # values, which the guard below treats as "stop".
                pass
        # If a valid entry was popped — continue the crawl.
        if page_url is not None and level >= 0:
            yield response.follow(page_url,
                                  callback=self.parse_page,
                                  cb_kwargs={
                                      'w_driver': w_driver,
                                      'level': level
                                  })