def user_parse(self, response: HtmlResponse):
    """Schedule one profile-page request per followed username.

    :param response: the page just fetched; used only to build relative
        follow-up requests via ``response.follow``.
    :yields: scrapy Requests to ``/<user>`` handled by ``userdata_parse``.
    """
    # NOTE(review): the original assigned ``j_body = json.loads(response.text)``
    # and never used it — dead statement removed.
    # presumably self.parse_user is an iterable of usernames — verify caller.
    for user in self.parse_user:
        yield response.follow(
            f'/{user}',
            callback=self.userdata_parse,
            cb_kwargs={'username': user},
        )
def user_parse(self, response: HtmlResponse):
    """Follow the target user's profile page once the login succeeded.

    The response body is a JSON document with an ``authenticated`` flag.
    """
    login_state = json.loads(response.text)
    if not login_state['authenticated']:
        return
    yield response.follow(
        f'/{self.parse_user}',
        callback=self.user_followers_parse,
        cb_kwargs={'username': self.parse_user},
    )
def f_parse(self, response: HtmlResponse, user_id: str, level: int):
    """BFS step over the VK friend graph.

    Expands ``user_id``'s friend list; when ``self.user2_id`` is found the
    accumulated handshake chain is emitted as a ``GraphItem`` and the spider
    is closed, otherwise every friend is scheduled at the next level.

    :param user_id: VK numeric id of the profile being expanded.
    :param level: current depth; ``self.graph`` is indexed by depth.
    :returns: ``None`` when the profile is private/deleted (API error).
    """
    try:
        resp = self.vk_api.friends.get(user_id=user_id, v="5.8")
    except vk.exceptions.VkAPIError as e:
        # vk.exceptions.VkAPIError: 15. Access denied: this profile is private
        # vk.exceptions.VkAPIError: 18. User was deleted or banned
        print(
            f'User {user_id} {self.vk_api.users.get(user_id=user_id, v="5.8")} -- {e.message}'
        )
        return None
    f_level = level + 1
    user_friends = resp["items"]
    # The start user must never re-enter the graph; ValueError just means
    # they were not in this friend list (original used a bare ``except``).
    try:
        user_friends.remove(self.user1_id)
    except ValueError:
        pass
    print(
        f'PARENT_USER:{user_id} ({self.get_username(user_id)}), level:{level} ,friends:{len(user_friends)}'
    )
    for idx, f in enumerate(user_friends):
        self.graph[level] = user_id
        if f == self.user2_id:
            self.graph[f_level] = f
            print(
                f'F_PARSE(user:{user_id}): graph has been found on level:{f_level} user(idx:{idx}, {f}, {self.get_username(f)})!'
            )
            print(self.graph)
            # Drop unfilled levels, then resolve ids to usernames
            # (sleeps throttle the extra VK API calls).
            graph_ = [node for node in self.graph if node is not None]
            time.sleep(1)
            graph_names_ = [self.get_username(node) for node in graph_]
            time.sleep(1)
            item = GraphItem(
                user1={
                    "link": self.USER1,
                    "id": self.user1_id,
                    "username": self.get_username(self.user1_id)
                },
                user2={
                    "link": self.USER2,
                    "id": self.user2_id,
                    "username": self.get_username(self.user2_id)
                },
                graph_id=tuple(graph_),
                graph_username=graph_names_,
            )
            yield item
            raise CloseSpider('We found graph')
        f_path = 'https://vk.com/id' + str(f)
        yield response.follow(
            f_path,
            callback=self.f_parse,
            cb_kwargs={'user_id': f, 'level': f_level},
        )
    # Guard: ``idx`` is undefined when the friend list is empty — the
    # original raised NameError here in that case.
    if user_friends:
        print(
            f"Пройдены все друзья ({idx}) пользователя {user_id} ({self.get_username(user_id)})"
        )
def parse(self, response: HtmlResponse):
    """Follow every book cover link on the listing page, then paginate."""
    # Select the href itself: the original extracted the whole <a> element's
    # HTML and passed it to response.follow(), which expects a URL string.
    button_next = response.css(
        "a[title=Следующая]::attr(href)").extract_first()
    links = response.xpath("//a[@class='cover']/@href").extract()
    for i in links:
        yield response.follow(i, callback=self.book_parse)
    # Guard: on the last page there is no "next" link; follow(None) raises.
    if button_next:
        yield response.follow(button_next, callback=self.parse)
def parse_car_spec(self, response: HtmlResponse, item, page_id):
    """Attach the car-spec JSON to *item* and chain to the autoteka teaser.

    :param item: an item loader — ``add_value`` is the only method used.
    :param page_id: advert id interpolated into the teaser endpoint URL.
    """
    # response.text replaces the deprecated body_as_unicode() (same value);
    # the rest of the file already uses response.text.
    json_car_spec = json.loads(response.text)
    item.add_value('json_car_spec', json_car_spec)
    url_VIN_db = f'https://www.avito.ru/web/1/swaha/v1/autoteka/teaser/{page_id}'
    yield response.follow(url_VIN_db,
                          callback=self.parse_VIN_db,
                          cb_kwargs={'item': item, 'page_id': page_id})
def parse(self, response: HtmlResponse):
    """Fan out one request per configured town page."""
    base_url = self.start_urls[0]
    for town in self.town_list:
        town_url = urljoin(base_url, town)
        yield response.follow(town_url,
                              callback=self.parse_town,
                              cb_kwargs={'town': town})
def parse(self, response: HtmlResponse):
    """Read the pager total and schedule every listing page."""
    total_attr = response.xpath("//uc-pagination/@total").extract_first()
    last_page_number = int(total_attr)
    for page_number in range(1, last_page_number + 1):
        page_link = response.url + '?page=' + str(page_number)
        yield response.follow(page_link, callback=self.page_parse)
def parse(self, response: HtmlResponse):
    """Follow each product tile's href on the listing page."""
    for link in response.xpath("//uc-plp-item-new/@href"):
        yield response.follow(link, callback=self.good_parse)
def atms_parse(self, response: HtmlResponse):
    """Follow every ATM directory link from the alphabet index."""
    atm_xpath = ("//div[@class='alphabet__list']"
                 "//a[contains(@href,'/spravochniki/bankomaty/')]/@href")
    for link in response.xpath(atm_xpath).extract():
        yield response.follow(link, callback=self.atm_parse)
def parse(self, response: HtmlResponse):
    """Follow each advert anchor found on the listing page."""
    ad_anchor_xpath = ('//div[contains(@data-marker, "item")]'
                       '/div[@class="item__line"]//h3/a[@itemprop="url"]')
    for url in response.xpath(ad_anchor_xpath):
        yield response.follow(url, callback=self.avd_parse)
def branches_parse(self, response: HtmlResponse):
    """Follow every branch-office directory link from the alphabet index."""
    branch_xpath = ("//div[@class='alphabet__list']"
                    "//a[contains(@href,'/spravochniki/otdeleniya/')]/@href")
    for link in response.xpath(branch_xpath).extract():
        yield response.follow(link, callback=self.branch_parse)
def parse(self, response: HtmlResponse):
    """Schedule the next listing page, then each post on this page."""
    next_page = response.css(
        'ul.gb__pagination li.page a[rel=next]::attr(href)').extract_first()
    # Guard: extract_first() is None on the last page; follow(None) raises.
    if next_page:
        yield response.follow(next_page, callback=self.parse)
    posts = response.css(
        'div.post-items-wrapper div.post-item a.post-item__title::attr(href)'
    ).extract()
    for post in posts:
        yield response.follow(post, callback=self.post_parse)
def parse(self, response: HtmlResponse):
    """Follow every advert link on the page."""
    link_selector = response.xpath('//a[@class="styles-link-36uWZ"]/@href')
    for link in link_selector.extract():
        yield response.follow(link, self.parse_ads)
def parse(self, response: HtmlResponse):
    """Follow each vacancy on the search-results page, then paginate."""
    next_page = response.css(
        "a.HH-Pager-Controls-Next::attr(href)").extract_first()
    vacancy_links = response.xpath(
        "//a[@class='bloko-link HH-LinkModifier']/@href").extract()
    for link in vacancy_links:
        yield response.follow(link, callback=self.vacancy_parse)
    # Guard: there is no "next" control on the last page; follow(None) raises.
    if next_page:
        yield response.follow(next_page, callback=self.parse)
def page_parse(self, response: HtmlResponse):
    """Follow every product picture anchor on a listing page."""
    for product_link in response.xpath("//a[@slot='picture']"):
        yield response.follow(product_link, callback=self.product_parse)
def parse(self, response: HtmlResponse) -> Generator:
    """Follow each suggestion link from the topic list."""
    suggestion_urls = response.css(
        ".topic-title > a:last-of-type::attr('href')").extract()
    for suggestion_url in suggestion_urls:
        # Bound ``self.parse_suggestion`` instead of the class attribute:
        # the plain function object would receive the response in the
        # ``self`` slot when Scrapy invokes the callback (also works if
        # parse_suggestion is a staticmethod).
        yield response.follow(suggestion_url, self.parse_suggestion)
def parse(self, response: HtmlResponse):
    """Follow every snippet advert link on the page."""
    snippet_links = response.xpath('//a[@class="snippet-link"]/@href')
    for link in snippet_links.extract():
        yield response.follow(link, callback=self.parse_ads)
def parse(self, response: HtmlResponse):
    """Follow every advert title link on the listing page."""
    ads_links = response.xpath(
        '//a[@class="item-description-title-link"]/@href').extract()
    for link in ads_links:
        yield response.follow(link, callback=self.parse_ads)
    # NOTE(review): a dead trailing ``pass`` statement was removed.
def parse(self, response: HtmlResponse):
    """Follow every advert URL located via the item-title marker."""
    ad_xpath = ('//h3[contains(@data-marker, "item-title")]'
                '/a[contains(@itemprop, "url")]/@href')
    for ad_url in response.xpath(ad_xpath).extract():
        yield response.follow(ad_url, callback=self.ad_parse)
def parse(self, response: HtmlResponse):
    """Follow each vacancy anchor inside user-content spans."""
    anchors = response.xpath('//span[@class="g-user-content"]/a[@href]')
    for url in anchors:
        yield response.follow(url, callback=self.vac_parse)
def page_parse(self, response: HtmlResponse):
    """Follow every post link from the snippet title rows."""
    post_css = ('div.snippet-title-row h3.snippet-title '
                'a.snippet-link::attr(href)')
    for post in response.css(post_css).extract():
        yield response.follow(post, callback=self.post_parse)
def parse(self, response: HtmlResponse):
    """Follow every product-card title link on the listing page."""
    title_xpath = '//a[contains(@class,"plp-item__info__title")]//@href'
    for link in response.xpath(title_xpath).getall():
        yield response.follow(link, callback=self.parse_item)
def parse_page(self, response: HtmlResponse, w_driver, level):
    """Selenium-driven breadth-first walk over friend lists.

    Loads the friends page for the current profile, stops the whole crawl
    when the profile is already in MongoDB (handshake count found), otherwise
    scrolls the friend list to the end, records it, enqueues each friend at
    ``level + 1`` and follows the next queued profile.

    :param w_driver: a Selenium WebDriver instance shared across requests.
    :param level: current handshake depth of the profile in ``response``.
    """
    # Global stop flag: set once the target person has been found.
    if self.BINGO:
        return
    tmp_driver = w_driver
    # Navigate the shared driver to the person's friends page.
    url_friends = self.make_friends_url(self.get_clear_url(response.url))
    tmp_driver.get(url_friends)
    # time.sleep(2)
    """ Секция обработки персональных данных """
    # --- section: personal-data handling ---
    # The profile id is parsed out of the al:ios:url meta tag
    # (content looks like "fb://profile/<id>..." — the [5:] strips "fb://").
    item_id = tmp_driver.find_element_by_xpath(
        '//meta[@property="al:ios:url"]').get_attribute(
            'content')[5:].split('/')[1]
    print(f"ID_PERSON = {item_id}, LEVEL = {level}")
    try:
        # Check whether this user id is already stored in the database.
        item = self.mongo_base[self.name].find_one({"id_person": item_id}, {
            "id_person": 1,
            "level": 1
        })
        # If present — success: the handshake count is the sum of the stored
        # depth and the current depth; flag the crawl to stop.
        if item is not None:
            item_level = item['level']
            bingo = item_level + level
            print(f"BINGOOOO! Number of handshakes = {bingo}")
            self.BINGO = True
            return
    except:
        # NOTE(review): bare except deliberately left as-is — treats any DB
        # failure as "not found" and continues crawling (best-effort lookup).
        pass
    """ Секция обработки список друзей """
    # --- section: friend-list handling ---
    # Scroll until the lazily-loaded friend list stops growing.
    body = tmp_driver.find_element_by_tag_name('body')
    _friend_list_item = '//div[@data-testid="friend_list_item"]/a'
    friends_len = len(tmp_driver.find_elements_by_xpath(_friend_list_item))
    while True:
        body.send_keys(Keys.PAGE_DOWN)
        body.send_keys(Keys.PAGE_DOWN)
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(2)  # give the page time to append the next batch
        tmp_len = len(tmp_driver.find_elements_by_xpath(_friend_list_item))
        if friends_len == tmp_len:
            break
        friends_len = len(
            tmp_driver.find_elements_by_xpath(_friend_list_item))
    # If there are no publicly visible friends — skip recording this profile.
    if friends_len != 0:
        # Collect friend profile URLs from the fully-scrolled list.
        friends_list = tmp_driver.find_elements_by_xpath(_friend_list_item)
        friends_list = list(
            map(lambda x: x.get_attribute('href'), friends_list))
        print(f'Друзей = {len(friends_list)}')
        item_friends_href_list = list(map(self.get_clear_url, friends_list))
        # Build (depth, url) tuples and append them to the processing queue.
        level_urls = [(level + 1, href) for href in friends_list]
        self.level_manager.extend(level_urls)
        print(f'Список обработки = {len(self.level_manager)}\n')
        # Persist this profile and its friends to the database.
        yield FbParserItem(id_person=item_id,
                           level=level,
                           friends_count=friends_len,
                           friends=item_friends_href_list)
    # Continue walking the queue: pop the first element and schedule it.
    # Sentinels mark "nothing to do" when the queue is empty.
    level = -1
    page_url = None
    if len(self.level_manager):
        try:
            level, page_url = self.level_manager.popleft()
        except:
            # NOTE(review): bare except left as-is — a concurrent pop on an
            # emptied deque falls through to the sentinel values below.
            pass
    # If a queued profile was obtained — follow it at its recorded depth.
    if page_url is not None and level >= 0:
        yield response.follow(page_url,
                              callback=self.parse_page,
                              cb_kwargs={
                                  'w_driver': w_driver,
                                  'level': level
                              })