def most_popular_page(self, response: HtmlResponse):
    description_list = response.css('div.descriptionContainer')
    for item in description_list:
        title = item.css('a::text').extract_first()
        sub_link = item.css('a::attr(href)').extract_first()
        channel_url = response.urljoin(sub_link)
        self.logger.warning('get channel:{0} ,link is:{1}'.format(title, channel_url))
        yield scrapy.Request(channel_url, callback=self.channel_page_see_all)
    # determine whether there is a next page
    next_page_li = response.css('li.page.next.wl-page')
    if next_page_li:
        next_page_sub_link = next_page_li.css('a::attr(href)').extract_first()
        page_number = int(next_page_sub_link.split('page=')[1])
        page_number_start = self.settings.get('PAGE_NUMBER_START')
        page_number_end = self.settings.get('PAGE_NUMBER_END')
        if page_number_end is not None:
            if page_number_start < page_number <= page_number_end:
                next_page_url = response.urljoin(next_page_sub_link)
                self.logger.warning('has next page, url is:{0}'.format(next_page_url))
                yield scrapy.Request(next_page_url, callback=self.most_popular_page)
            else:
                self.logger.warning('has next page, but it is outside the configured page range')
        else:
            next_page_url = response.urljoin(next_page_sub_link)
            self.logger.warning('has next page, url is:{0}'.format(next_page_url))
            yield scrapy.Request(next_page_url, callback=self.most_popular_page)

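# The page-range check in most_popular_page reads two custom Scrapy settings.
# A minimal sketch of the corresponding settings.py entries; the setting names come
# from the code above, the values below are only assumptions for illustration:
PAGE_NUMBER_START = 1   # only follow a next page whose number is greater than this
PAGE_NUMBER_END = 10    # ...and less than or equal to this; None disables the upper bound
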
def categories_parse(self, response: HtmlResponse, category):
    # pagination: when two 'a.button.prev' links are present, the second one points
    # to the next page; otherwise follow the only one there is
    next_url_list = response.css('a.button.prev::attr(href)').getall()
    if next_url_list:
        next_url = next_url_list[1] if len(next_url_list) > 1 else next_url_list[0]
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.categories_parse,
                             cb_kwargs={'category': category})
    for item in response.css('div.thumb-video.cf').css('a.thumb-video-link::attr(href)').getall():
        yield scrapy.Request(url=item, callback=self.video_parse,
                             cb_kwargs={'category': category})

def parse(self, response: HtmlResponse):
    # debug output: response type, encoding and HTTP status
    print(type(response))
    print(response.encoding)
    print(response.status)
    # keep a local snapshot of the page for inspection
    with open('books.html', 'w', encoding='utf8') as f:
        f.write(response.text)
    subjects = response.xpath('//li[@class="subject-item"]')
    for subject in subjects:
        item = DoubanbookItem()
        title = subject.xpath('.//h2/a/text()').extract_first()
        item['title'] = title.strip()
        rate = subject.xpath('.//span[@class="rating_nums"]/text()').extract_first()
        item['rate'] = rate
        publish = subject.xpath('.//div[@class="pub"]/text()').extract_first()
        item['publish'] = publish.strip()
        yield item
    # follow the next page
    next_pag = response.xpath('//div[@class="paginator"]/a/@href').extract_first()
    url = response.urljoin(next_pag)
    yield scrapy.Request(url=url, callback=self.parse)

def ajax_model_page(self, response: HtmlResponse):
    model_info_list = response.css('li.pcVideoListItem')
    for item in model_info_list:  # type: SelectorList
        video_url = item.css('span.title').css('a::attr(href)').get()
        yield scrapy.Request(response.urljoin(video_url),
                             callback=self.video_page, priority=100)

def channel_page(self, response: HtmlResponse):
    video_css = response.css('span.title')
    for item in video_css:
        video_sub_link = item.css('a::attr(href)').extract_first()
        video_url = response.urljoin(video_sub_link)
        self.logger.warning('send to parse real video, url is:{0}'.format(video_url))
        yield scrapy.Request(video_url, callback=self.video_page)

def parse(self, response: HtmlResponse):
    videos_list = response.css('ul.videos.row-5-thumbs.videosGridWrapper')
    video_css = videos_list.css('span.title')
    for item in video_css:  # type: SelectorList
        video_sub_link = item.css('a::attr(href)').get()
        video_url = response.urljoin(video_sub_link)
        title = item.css('a::text').get()
        self.logger.info('send [%s] to parse real video', title)
        yield scrapy.Request(video_url, callback=self.video_page, priority=100)
    # determine whether there is a next page
    next_page_li = response.css('li.page_next')
    if next_page_li:
        next_page_sub_link = next_page_li.css('a::attr(href)').get()
        next_page_url = response.urljoin(next_page_sub_link)
        yield scrapy.Request(next_page_url)

def porn_star_page(self, response: HtmlResponse):
    # pornstar listings are crawled via the "next" button rather than page numbers,
    # because jumping straight to page=2 does not show all of that page's videos
    li_list = response.css('div.videoUList').css('ul').css('li')
    for li_tag in li_list:  # type: SelectorList
        video_url = li_tag.css('span.title').css('a::attr(href)').get()
        yield scrapy.Request(response.urljoin(video_url),
                             callback=self.video_page, priority=100)
    # check whether there is a next button
    page_element = response.css('div.pagination3')
    if page_element:
        # on the last page the page_next element does not exist
        next_element = page_element.css('li.page_next')
        if next_element:
            next_url = next_element.css('a::attr(href)').get()
            yield scrapy.Request(response.urljoin(next_url),
                                 callback=self.porn_star_page, priority=10)

def parse_next_link(self, response: HtmlResponse) -> str:
    next_page_tag = response.css('a[href*="?category=long&viewtype=basic"]')
    next_link = None
    for item in next_page_tag:
        if '»' == item.css('a::text').extract_first():
            ori_link = item.css('a::attr(href)').extract_first()
            next_link = response.urljoin(ori_link)
    return next_link

def parse_list_of_tasks(self, response: HtmlResponse, max_number=0,
                        next_number=0, step=5, subsection: str = ''):
    task_urls = response.css('.problemsmallnumber .componentboxlink::attr(href)').extract()
    for task_url in task_urls:
        callback = partial(self.parse_task, subsection=subsection)
        yield response.follow(response.urljoin(task_url), callback=callback)
    if next_number < max_number:
        url = set_get_parameter(response.url, 'start', next_number)
        callback = partial(self.parse_list_of_tasks, max_number=max_number,
                           next_number=next_number + step, subsection=subsection)
        yield response.follow(url, callback=callback)

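# parse_list_of_tasks (and parse_table_of_context further below) paginate through a
# helper called set_get_parameter that is not shown here. A minimal sketch, assuming
# it simply sets or replaces one query-string parameter of a URL:
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse


def set_get_parameter(url: str, key: str, value) -> str:
    """Return `url` with the query parameter `key` set to `value` (assumed behaviour)."""
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query[key] = [str(value)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))
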
def model_page(self, response: HtmlResponse):
    # parse current page
    li_list = response.css('div.videoUList').css('ul').css('li')
    for li_tag in li_list:  # type: SelectorList
        video_url = li_tag.css('span.title').css('a::attr(href)').get()
        yield scrapy.Request(response.urljoin(video_url),
                             callback=self.video_page, priority=100)
    # check whether there is a "Load More" button
    more_button = response.css('#moreDataBtnStream')
    if more_button:
        max_page = more_button.css('::attr(data-maxpage)').get()
        load_more_ori_str = more_button.css('::attr(onclick)').get()
        # the AJAX endpoint is the first single-quoted argument of the onclick handler
        ajax_url = load_more_ori_str.split("'")[1]
        for i in range(2, int(max_page) + 1):
            new_link = '{0}&page={1}'.format(response.urljoin(ajax_url), i)
            yield scrapy.Request(new_link, callback=self.ajax_model_page, priority=10)

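# model_page extracts the AJAX endpoint by taking the first single-quoted substring of
# the button's onclick attribute. A quick illustration of that split; the onclick markup
# below is invented, not copied from the site:
onclick_example = "loadMoreVideos('/users/example/videos/upload/ajax', 2);"
assert onclick_example.split("'")[1] == '/users/example/videos/upload/ajax'
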
def video_parse(self, response: HtmlResponse, category):
    link = response.urljoin(
        response.css("a.full_download_link[onclick*='mp43000']::attr(href)").get())
    title = ''
    for i in response.css('div.title_bar::text').getall():
        i = i.strip()
        if i:
            title = i
            break
    # skip entries whose download link resolves to the bare members page
    if link != 'http://www.hotwiferio.com/members/':
        yield HotItem(name=title, link=link, category=category)

def parse_country_links(self, response: HtmlResponse) -> Request:
    table = response.css(self.config.countries_table)
    all_link_tags = table.css('a')
    country_links = [link.attrib['href'] for link in all_link_tags]
    for country_link in country_links:
        full_country_link = response.urljoin(country_link)
        current_country = country_link.split('/')[1]
        yield scrapy.Request(full_country_link, callback=self.parse_country,
                             cb_kwargs={"country": current_country})

def parse(self, response: HtmlResponse, **kwargs):
    for li in response.css('nav').css('li'):
        if li.css('::text').get() == 'Scenes':
            for child in li.css('ul>li'):
                try:
                    year = int(child.css('::text').get())
                    link = child.css('::attr(href)').get()
                    yield scrapy.Request(url=response.urljoin(link),
                                         cb_kwargs={'category': year},
                                         callback=self.parse_page)
                except ValueError:
                    # entries that are not a year are skipped
                    pass

def parse(self, response: HtmlResponse):
    model_filter_list = self.settings.getlist('MODEL_FILTER_LIST')
    li_tag_list = response.css('div.sectionWrapper').css('ul#moreData').css('li')
    for item in li_tag_list:  # type: SelectorList
        sub_link = item.css('a.usernameLink').css('a::attr(href)').get()
        model_name = sub_link.split('/')[-1]
        if model_name in model_filter_list or len(model_filter_list) == 0:
            # filter user, model, pornStar
            if '/model/' in sub_link:
                yield scrapy.Request(response.urljoin(sub_link + '/videos/upload'),
                                     callback=self.model_page, priority=10)
            elif '/pornstar/' in sub_link:
                yield scrapy.Request(response.urljoin(sub_link + '/videos/upload'),
                                     callback=self.porn_star_page, priority=10)
            else:
                yield scrapy.Request(response.urljoin(sub_link + '/videos/public'),
                                     callback=self.model_page, priority=10)

def parse_table_of_context(self, response: HtmlResponse):
    ul = response.css('ul.componentboxlist')
    themes = {text.strip(): [url, 0]
              for text, url in zip(ul.css('a::text').extract(),
                                   ul.css('a::attr(href)').extract())}
    titles = ul[0].css('::text').extract()
    for i, s in enumerate(titles):
        s = s.strip()
        if s in themes:
            # the text node right after a theme title holds its task count
            s_n = titles[i + 1]
            n = int(s_n.strip().split()[0][1:])
            themes[s][1] = n
    for subsection, (url, n) in themes.items():
        url = set_get_parameter(response.urljoin(url), 'start', 0)
        callback = partial(self.parse_list_of_tasks, max_number=n,
                           next_number=5, subsection=subsection)
        yield response.follow(url, callback=callback)

def parse(self, response: HtmlResponse):
    poems = response.css('.left > .sons')
    for poem in poems:
        # determine whether this block is a full poem or just quoted verses
        title = poem.css('b::text').extract()
        item = GushiwenItem()
        if title:
            # title
            item['title'] = title[0]
            # [dynasty, author]
            source = poem.css('.source > a::text').extract()
            # dynasty
            item['dynasty'] = source[0]
            # author
            item['author'] = source[1]
            # content
            item['content'] = ''.join(poem.css('.contson *::text').getall()).strip()
            yield item
        else:
            # single verses
            verses = poem.css('.cont')
            for verse in verses:
                # formatted as author《title》
                source = verse.css('a::text').extract()[1]
                # author
                item['author'] = re.search(r'(.*?)《(.*?)》', source).group(1)
                # title
                item['title'] = re.search(r'(.*?)《(.*?)》', source).group(2)
                # verse text
                item['content'] = verse.css('a::text').extract()[0]
                # dynasty
                item['dynasty'] = ''
                yield item
    next_href = response.css('#amore::attr(href)').get()
    if next_href:
        next_url = response.urljoin(next_href)
        request = scrapy.Request(next_url)
        yield request

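# The verse branch above splits strings shaped like 作者《标题》 (author《title》) with a
# non-greedy regex. A quick illustration of how the two groups come apart; the sample
# string is made up:
import re

match = re.search(r'(.*?)《(.*?)》', '李白《静夜思》')
assert match.group(1) == '李白'    # author
assert match.group(2) == '静夜思'  # title
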
def parse(self, response: HtmlResponse):
    channel_list = []
    description_list = response.css('div.descriptionContainer')
    for item in description_list:
        title = item.css('a::text').get()
        sub_link = item.css('a::attr(href)').get()
        self.logger.info('get channel:{0} ,link is:{1}'.format(title, sub_link))
        save_name = sub_link.split('/')[2]
        channel_list.append(save_name + '\n')
    # determine whether there is a next page
    next_page_li = response.css('li.page_next')
    if next_page_li:
        next_page_sub_link = next_page_li.css('a::attr(href)').get()
        next_page_url = response.urljoin(next_page_sub_link)
        self.logger.info('has next page, url is:{0}'.format(next_page_url))
        yield scrapy.Request(next_page_url)
    with open('channel.txt', 'a+') as f:
        f.writelines(channel_list)

def parse(self, response: HtmlResponse):
    json_resp = json.loads(response.text)
    current_page = json_resp['result']['page']
    max_pages = json_resp['result']['pages']
    if current_page < max_pages:
        next_url_suffix = '?page={0}'.format(current_page + 1)
        next_url = response.urljoin(next_url_suffix)
        self.logger.warning('next url:{0}'.format(next_url))
        yield scrapy.Request(next_url)
    video_info_list = json_resp['result']['docs']
    self.logger.warning('page {0} contains {1} videos'.format(current_page, len(video_info_list)))
    for info in video_info_list:
        title = info['title']
        video_id = info['videoId']
        splice_url = 'https://www.netflav.com/video?id={0}'.format(video_id)
        addition_meta = {'name': title}
        yield scrapy.Request(url=splice_url, callback=self.real_video_parse,
                             meta=addition_meta)

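# The JSON parser above assumes a payload of roughly the following shape. The field
# names come from the code; the concrete values are invented for illustration:
EXAMPLE_LIST_RESPONSE = {
    'result': {
        'page': 1,      # current page number
        'pages': 42,    # total number of pages
        'docs': [       # one entry per video on this page
            {'title': 'example title', 'videoId': 'abc123'},
        ],
    },
}
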
def _parse_principals(self, response: HtmlResponse):
    """
    Parse principals and yield a request to retrieve each one's documents.

    Parsing documents is not cached because different principals of the
    same country can have the same documents.
    """
    field = self._retrieve_table_field(response)
    processed_rows = 0
    while True:
        try:
            data = {
                'url': response.urljoin(
                    next(field).css('a::attr(href)').extract_first()),
                'foreign_principal': self._extract_next(field),
                'date': self._parse_date(self._extract_next(field)),
                'address': self._extract_next(field),
                'state': self._extract_next(field),
                'country': self._extract_next(field),
                'registrant': self._extract_next(field),
                'reg_num': self._extract_next(field),
            }
            next(field)  # just skip 'reg_date'
            documents_request = scrapy.Request(
                url=data['url'],
                callback=self._parse_documents,
                dont_filter=True,
            )
            documents_request.meta['data'] = data
            yield documents_request
        except StopIteration:
            break
        processed_rows += 1
    # fewer rows than a full page means this was the last page
    if processed_rows < self.ROWS_FETCH:
        return
    # otherwise request the next page of results through the same form
    meta = response.meta
    meta['first_row'] += processed_rows
    meta['form_data']['p_widget_action_mod'] = self._generate_next_page_action(
        meta['first_row'])
    request = scrapy.FormRequest(
        url=meta['url'],
        headers=meta['headers'],
        formdata=meta['form_data'],
        callback=self._parse_principals,
    )
    for key in ['url', 'headers', 'form_data', 'first_row']:
        request.meta[key] = meta[key]
    yield request

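# _parse_principals walks the table cells through a helper named _extract_next that is
# not shown here. A minimal sketch, assuming it returns the text of the next cell
# produced by the selector iterator:
def _extract_next(self, field) -> str:
    """Return the stripped text of the next cell yielded by `field` (assumed behaviour)."""
    value = next(field).css('::text').extract_first()
    return value.strip() if value else value
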
def collect_all_tagets(self, driver, starturl: str):
    """
    Starting from the entry page `starturl`, walk through every result page and
    collect the URLs of all listed items. `driver.get(starturl)` must already
    have been called before this function runs.
    """
    try:
        total = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.pagination.J_Pagination .page-total')))
    except TimeoutException:
        self.logger.debug("pagination page is broken, failed to get navigation: {}".format(starturl))
        return
    # read the total number of pages from the pagination bar
    max_page_num = int(total.text)
    try:
        nav = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.pagination.J_Pagination a')))
    except TimeoutException:
        self.logger.debug("pagination page is broken, failed to get navigation: {}".format(starturl))
        return
    # jump to the first page; the second anchor in the pagination bar is page 1
    driver.execute_script("arguments[0].click();", nav[1])
    # walk through every page starting from page 1: collect all item URLs, then move on
    self.logger.info("will collect {} pages, starturl: {}".format(max_page_num, starturl))
    max_retry_times = 5  # allow up to 5 retries
    retrys = max_retry_times
    for i in range(max_page_num):
        # progress logging
        if i < max_page_num - 1 and i % 50 == 0:
            self.logger.info("collecting page {}".format(i))
        elif i == max_page_num - 1:
            self.logger.info("collected {} pages".format(i))
        try:
            _ = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, ".sf-content .sf-item-list li a")))
            # in practice the live DOM keeps changing, so snapshot the page source and parse that
            response = HtmlResponse(driver.current_url,
                                    body=str.encode(driver.page_source),
                                    encoding='utf-8')
            yield list(
                set(
                    map(lambda x: response.urljoin(x),
                        response.css(".sf-content .sf-item-list li a::attr(href)").getall())))
        except TimeoutException:
            self.logger.info("failed to collect item URLs on page {}".format(i))
        # no need to jump to the next page once we are on the last one
        if i < max_page_num - 1:
            # jump to the next page
            if self._to_next_page(driver) == -1:
                retrys -= 1
                if retrys <= 0:
                    break  # give up after too many failed jumps
            else:
                retrys = max_retry_times
    return

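# collect_all_tagets relies on a _to_next_page helper that is not shown here. A minimal
# sketch, assuming it clicks the "next page" control and returns -1 on failure (that is
# the return value the caller checks); the CSS selector below is an assumption:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def _to_next_page(self, driver) -> int:
    try:
        next_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div.pagination.J_Pagination a.next')))
        driver.execute_script("arguments[0].click();", next_btn)
        return 0
    except TimeoutException:
        return -1
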
def channel_page_see_all(self, response: HtmlResponse):
    # get the "see all" button
    see_all = response.css('a.greyButton.light::attr(href)').extract_first()
    all_url = response.urljoin(see_all)
    yield scrapy.Request(all_url, callback=self.channel_page)

def extract_task(self, content: scrapy.Selector, response: HtmlResponse):
    # TODO: update the documentation
    """
    Extracts the task information from the page content.

    :return: a dict keyed by the task section names, with values of the form
             (text, images), where images is a list of tuples (image_url, tex_view)
    """
    # Split the raw HTML into tokens
    # text = list(map(lambda s: s.strip(), ''.join(content.extract()).split('\n')))
    # text = list(map(lambda s: s.strip(), re.split(r'(<br>|<p>|</p>|>\n)', ''.join(content.extract()))))
    text = list(map(lambda s: s.strip(),
                    re.split(r'(</?\w{,10}|>)', ''.join(content.extract()))))
    task_dict = {}
    current_section = ''
    session_text = []
    image_urls = []
    images_urls_of_section = []
    text_iterator = enumerate(text)
    tex_used = False
    while True:
        try:
            i, line = next(text_iterator)
            new_section = None
        except StopIteration:
            break
        # a <h3 token starts a new section heading
        if line == '<h3':
            next(text_iterator)
            next(text_iterator)
            i, line = next(text_iterator)
            new_section = re.findall(
                r'(Условие|Подсказка|Решение|Ответ|Источники и прецеденты использования)',
                line)
        if (not new_section) and (not current_section):
            continue
        if new_section:
            # flush the text collected for the previous section
            if current_section:
                session_text = ' '.join(filter(lambda s: s, session_text))
                session_text = ' '.join(session_text.split())
                session_text = session_text.replace('\'', '')
                task_dict[DEFAULT_NAMES[current_section]] = session_text, images_urls_of_section
            current_section = new_section[0]
            if current_section == 'Источники и прецеденты использования':
                break
            session_text = []
            images_urls_of_section = []
            continue
        if '<img' in line:
            i, line = next(text_iterator)
            src = re.search(r'src=\".+\d+\"', line).group()
            if src:
                # images with a TeX alt text are formulas, not pictures
                tex = re.search(r'alt=\"\$(.|\n)+\$\"', line)
                if tex is None:
                    image_src = src[5:-1]
                    image_url = response.urljoin(image_src)
                    image_urls.append(image_url)
                    images_urls_of_section.append(image_url)
                else:
                    tex_used = True
            continue
        # skip the remaining attributes of an opening tag
        if re.match(r'<\w{1,10}', line):
            while line != '>':
                i, line = next(text_iterator)
            continue
        # skip other markup tokens
        if re.match(r'(</?\w{,10}|>)', line):
            continue
        if line:
            if 'Также доступны документы в формате' in line or \
                    ('href' in line or line == 'TeX') or \
                    (line.endswith('>') and not line.endswith('-->')):
                continue
            line = line.strip()
            line = re.sub(r'(^>|!-- MATH|--)', '', line, flags=re.S).strip()
            line = line.replace('\\begin{displaymath}', '$').replace('\\end{displaymath}', '$')
            if line:
                session_text.append(line)
    return task_dict, image_urls, tex_used

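# extract_task maps the Russian section headings onto dictionary keys via a
# DEFAULT_NAMES constant that is not shown here. A plausible sketch; the Russian
# headings come from the regex above, the English key names are assumptions:
DEFAULT_NAMES = {
    'Условие': 'statement',
    'Подсказка': 'hint',
    'Решение': 'solution',
    'Ответ': 'answer',
}
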