Example #1
    def most_popular_page(self, response: HtmlResponse):
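        # Walk the "most popular" channel listing: follow every channel link,
        # then paginate within the optional PAGE_NUMBER_START / PAGE_NUMBER_END
        # bounds taken from the spider settings.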
        description_list = response.css('div.descriptionContainer')
        for item in description_list:
            title = item.css('a::text').extract_first()
            sub_link = item.css('a::attr(href)').extract_first()
            channel_url = response.urljoin(sub_link)
            self.logger.warning('get channel:{0} ,link is:{1}'.format(
                title, channel_url))
            yield scrapy.Request(channel_url,
                                 callback=self.channel_page_see_all)

        # determine has next page
        next_page_li = response.css('li.page.next.wl-page')
        if next_page_li:
            next_page_sub_link = next_page_li.css(
                'a::attr(href)').extract_first()
            page_number = int(next_page_sub_link.split('page=')[1])
            page_number_start = self.settings.get('PAGE_NUMBER_START')
            page_number_end = self.settings.get('PAGE_NUMBER_END')
            if page_number_end is not None:
                if page_number_start < page_number <= page_number_end:
                    next_page_url = response.urljoin(next_page_sub_link)
                    self.logger.warning(
                        'has next page, url is:{0}'.format(next_page_url))
                    yield scrapy.Request(next_page_url,
                                         callback=self.most_popular_page)
                else:
                    self.logger.warning('has next page, but is in limit')
            else:
                next_page_url = response.urljoin(next_page_sub_link)
                self.logger.warning(
                    'has next page, url is:{0}'.format(next_page_url))
                yield scrapy.Request(next_page_url,
                                     callback=self.most_popular_page)
Example #2
    def categories_parse(self, response: HtmlResponse, category):
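        # Queue the next listing page first (the second 'a.button.prev' link when
        # there is more than one), then schedule every video thumbnail for
        # video_parse, passing the category along via cb_kwargs.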
        next_url_list = response.css('a.button.prev::attr(href)').getall()
        if len(next_url_list) > 1:
            yield scrapy.Request(url=response.urljoin(next_url_list[1]), callback=self.categories_parse,
                                 cb_kwargs={'category': category})
        elif next_url_list:
            yield scrapy.Request(url=response.urljoin(next_url_list[0]), callback=self.categories_parse,
                                 cb_kwargs={'category': category})

        for item in response.css('div.thumb-video.cf').css('a.thumb-video-link::attr(href)').getall():
            yield scrapy.Request(url=item, callback=self.video_parse, cb_kwargs={'category': category})
Example #3
    def parse(self, response: HtmlResponse):
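        # Dump the raw page to books.html for debugging, build a DoubanbookItem
        # (title, rating, publishing info) for each subject entry, then follow
        # the paginator.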
        print(type(response), '+++++++++++++++++++++++++')
        print(response.encoding)
        print(response.status)

        with open('books.html', 'w', encoding='utf8') as f:
            f.write(response.text)

        subjects = response.xpath('//li[@class="subject-item"]')
        for subject in subjects:
            item = DoubanbookItem()

            title = subject.xpath('.//h2/a/text()').extract_first()
            item['title'] = title.strip()

            rate = subject.xpath(
                './/span[@class="rating_nums"]/text()').extract_first()
            item['rate'] = rate

            publish = subject.xpath(
                './/div[@class="pub"]/text()').extract_first()
            item['publish'] = publish.strip()
            yield item

        # follow the first paginator link (Scrapy's dupefilter drops repeats)
        next_page = response.xpath(
            '//div[@class="paginator"]/a/@href').extract_first()
        url = response.urljoin(next_page)
        yield scrapy.Request(url=url, callback=self.parse)
Example #4
 def ajax_model_page(self, response: HtmlResponse):
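     # Parse one AJAX-loaded page of a model's video list and schedule every
     # video detail page with a high priority.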
     model_info_list = response.css('li.pcVideoListItem')
     for item in model_info_list:  # type: SelectorList
         video_url = item.css('span.title').css('a::attr(href)').get()
         yield scrapy.Request(response.urljoin(video_url),
                              callback=self.video_page,
                              priority=100)
Example #5
 def channel_page(self, response: HtmlResponse):
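     # Schedule every video listed on the channel page for detail parsing.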
     video_css = response.css('span.title')
     for item in video_css:
         video_sub_link = item.css('a::attr(href)').extract_first()
         video_url = response.urljoin(video_sub_link)
         self.logger.warning(
             'send to parse real video, url is:{0}'.format(video_url))
         yield scrapy.Request(video_url, callback=self.video_page)
Example #6
    def parse(self, response: HtmlResponse):
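        # Parse the video grid: schedule each video detail page with a high
        # priority, then follow the "next page" link if one exists.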
        videos_list = response.css('ul.videos.row-5-thumbs.videosGridWrapper')
        video_css = videos_list.css('span.title')
        for item in video_css:  # type: SelectorList
            video_sub_link = item.css('a::attr(href)').get()
            video_url = response.urljoin(video_sub_link)
            title = item.css('a::text').get()
            self.logger.info('send [%s] to parse real video', title)
            yield scrapy.Request(video_url,
                                 callback=self.video_page,
                                 priority=100)

        # determine has next page
        next_page_li = response.css('li.page_next')
        if next_page_li:
            next_page_sub_link = next_page_li.css('a::attr(href)').get()
            next_page_url = response.urljoin(next_page_sub_link)
            yield scrapy.Request(next_page_url)
Example #7
 def porn_star_page(self, response: HtmlResponse):
     # the pornstar listing cannot be paged by number: requesting page=2
     # directly does not return all of that page's videos, so only follow
     # the "next" button
     li_list = response.css('div.videoUList').css('ul').css('li')
     for li_tag in li_list:  # type: SelectorList
         video_url = li_tag.css('span.title').css('a::attr(href)').get()
         yield scrapy.Request(response.urljoin(video_url),
                              callback=self.video_page,
                              priority=100)
     # check has next button
     page_element = response.css('div.pagination3')
     if page_element:
         # if in last page, page_next css not exist
         next_element = page_element.css('li.page_next')
         if next_element:
             next_url = next_element.css('a::attr(href)').get()
             yield scrapy.Request(response.urljoin(next_url),
                                  callback=self.porn_star_page,
                                  priority=10)
Example #8
 def parse_next_link(self, response: HtmlResponse) -> str:
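     # Scan the category pagination links and return the absolute URL behind
     # the '»' (next page) anchor, or None if there is no next page.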
     next_page_tag = response.css(
         'a[href*="?category=long&viewtype=basic"]')
     next_link = None
     for item in next_page_tag:
         if '»' == item.css('a::text').extract_first():
             ori_link = item.css('a::attr(href)').extract_first()
             next_link = response.urljoin(ori_link)
     return next_link
Example #9
 def parse_list_of_tasks(self, response: HtmlResponse, max_number=0, next_number=0, step=5, subsection:str = ''):
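     # Schedule every task on the current listing page (binding the subsection
     # via partial), then request the next slice of the listing while
     # next_number is still below max_number.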
     task_urls = response.css('.problemsmallnumber .componentboxlink::attr(href)').extract()
     for task_url in task_urls:
         callback = partial(self.parse_task, subsection=subsection)
         yield response.follow(response.urljoin(task_url), callback=callback)
     if next_number < max_number:
         url = set_get_parameter(response.url, 'start', next_number)
         callback = partial(self.parse_list_of_tasks, max_number=max_number, next_number=next_number + step, subsection=subsection)
         yield response.follow(url, callback=callback)
Example #10
 def model_page(self, response: HtmlResponse):
     # parse current page
     li_list = response.css('div.videoUList').css('ul').css('li')
     for li_tag in li_list:  # type: SelectorList
         video_url = li_tag.css('span.title').css('a::attr(href)').get()
         yield scrapy.Request(response.urljoin(video_url),
                              callback=self.video_page,
                              priority=100)
     # check has "Load More" button
     more_button = response.css('#moreDataBtnStream')
     if more_button:
         max_page = more_button.css('::attr(data-maxpage)').get()
         load_more_ori_str = more_button.css('::attr(onclick)').get()
         ajax_url = load_more_ori_str.split("'")[1]
         for i in range(2, int(max_page) + 1):
             new_link = '{0}&page={1}'.format(response.urljoin(ajax_url), i)
             yield scrapy.Request(new_link,
                                  callback=self.ajax_model_page,
                                  priority=10)
Example #11
 def video_parse(self, response: HtmlResponse, category):
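     # Extract the mp43000 full-download link and the first non-empty line of
     # the title bar; skip the entry when only the generic members URL remains.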
     link = response.urljoin(response.css("a.full_download_link[onclick*='mp43000']::attr(href)").get())
     title = ''
     for i in response.css('div.title_bar::text').getall():
         i = i.strip()
         if i:
             title = i
             break
     if link != 'http://www.hotwiferio.com/members/':
         yield HotItem(name=title, link=link, category=category)
Example #12
    def parse_country_links(self, response: HtmlResponse) -> Request:
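        # Collect every country link from the configured countries table and
        # request each country page, passing the country slug via cb_kwargs.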
        table = response.css(self.config.countries_table)
        all_link_tags = table.css('a')
        country_links = [link.attrib['href'] for link in all_link_tags]

        for country_link in country_links:
            full_country_link = response.urljoin(country_link)
            current_country = country_link.split('/')[1]
            yield scrapy.Request(full_country_link,
                                 callback=self.parse_country,
                                 cb_kwargs={"country": current_country})
Example #13
 def parse(self, response: HtmlResponse, **kwargs):
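     # Find the 'Scenes' entry in the nav menu and request each year sub-page,
     # passing the year through cb_kwargs as the category.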
     for li in response.css('nav').css('li'):
         if li.css('::text').get() == 'Scenes':
             for child in li.css('ul>li'):
                 try:
                     year = int(child.css('::text').get())
                     link = child.css('::attr(href)').get()
                     yield scrapy.Request(url=response.urljoin(link), cb_kwargs={'category': year},
                                          callback=self.parse_page)
                 except ValueError:
                     # the entry is not a year, skip it
                     pass
Example #14
 def parse(self, response: HtmlResponse):
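     # For every username link in the listing, schedule the matching videos
     # page (model, pornstar or plain user profile), respecting the optional
     # MODEL_FILTER_LIST setting.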
     model_filter_list = self.settings.getlist('MODEL_FILTER_LIST')
     li_tag_list = response.css('div.sectionWrapper').css(
         'ul#moreData').css('li')
     for item in li_tag_list:  # type: SelectorList
         sub_link = item.css('a.usernameLink').css('a::attr(href)').get()
         model_name = sub_link.split('/')[-1]
         if model_name in model_filter_list or len(model_filter_list) == 0:
             # filter user, model, pornStar
             if '/model/' in sub_link:
                 yield scrapy.Request(response.urljoin(sub_link +
                                                       '/videos/upload'),
                                      callback=self.model_page,
                                      priority=10)
             elif '/pornstar/' in sub_link:
                 yield scrapy.Request(response.urljoin(sub_link +
                                                       '/videos/upload'),
                                      callback=self.porn_star_page,
                                      priority=10)
             else:
                 yield scrapy.Request(response.urljoin(sub_link +
                                                       '/videos/public'),
                                      callback=self.model_page,
                                      priority=10)
Example #15
    def parse_table_of_context(self, response: HtmlResponse):
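        # Build a {theme: [url, task_count]} map from the componentboxlist,
        # read each theme's task count from the text that follows it, then
        # start a paginated task-list crawl per subsection.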
        ul = response.css('ul.componentboxlist')
        themes = {text.strip(): [url, 0] for text, url in zip(ul.css('a::text').extract(), ul.css('a::attr(href)').extract())}
        titles = ul[0].css('::text').extract()
        for i, s in enumerate(titles):
            s = s.strip()
            if s in themes:
                s_n = titles[i + 1]
                n = int(s_n.strip().split()[0][1:])
                themes[s][1] = n

        for subsection, (url, n) in themes.items():
            url = set_get_parameter(response.urljoin(url), 'start', 0)
            callback = partial(self.parse_list_of_tasks, max_number=n, next_number=5, subsection=subsection)
            yield response.follow(url, callback=callback)
Example #16
    def parse(self, response: HtmlResponse):
        poems = response.css('.left > .sons')
        for poem in poems:
            # check whether this entry is a whole poem or just a single verse
            title = poem.css('b::text').extract()
            item = GushiwenItem()
            if title:
                # title
                item['title'] = title[0]
                # [dynasty, author]
                source = poem.css('.source > a::text').extract()
                # dynasty
                item['dynasty'] = source[0]
                # author
                item['author'] = source[1]
                # content
                item['content'] = ''.join(
                    poem.css('.contson *::text').getall()).strip()

                yield item
            else:
                # single verses only
                verses = poem.css('.cont')
                for verse in verses:
                    # the link text looks like "author《title》"
                    source = verse.css('a::text').extract()[1]
                    # author
                    item['author'] = re.search(r'(.*?)《(.*?)》',
                                               source).group(1)
                    # title
                    item['title'] = re.search(r'(.*?)《(.*?)》', source).group(2)
                    # verse content
                    item['content'] = verse.css('a::text').extract()[0]
                    # dynasty
                    item['dynasty'] = ''

                    yield item

        next_href = response.css('#amore::attr(href)').get()
        if next_href:
            next_url = response.urljoin(next_href)
            request = scrapy.Request(next_url)
            yield request
Example #17
    def parse(self, response: HtmlResponse):
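        # Collect the channel slugs from the listing, follow the next page if
        # there is one, and append the collected slugs to channel.txt.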
        channel_list = []
        description_list = response.css('div.descriptionContainer')
        for item in description_list:
            title = item.css('a::text').get()
            sub_link = item.css('a::attr(href)').get()
            self.logger.info('get channel:{0} ,link is:{1}'.format(
                title, sub_link))
            save_name = sub_link.split('/')[2]
            channel_list.append(save_name + '\n')

        # determine has next page
        next_page_li = response.css('li.page_next')
        if next_page_li:
            next_page_sub_link = next_page_li.css('a::attr(href)').get()
            next_page_url = response.urljoin(next_page_sub_link)
            self.logger.info('has next page, url is:{0}'.format(next_page_url))
            yield scrapy.Request(next_page_url)

        with open('channel.txt', 'a+') as f:
            f.writelines(channel_list)
Example #18
    def parse(self, response: HtmlResponse):
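        # Parse the JSON listing: request the next page while more pages remain,
        # then schedule each video's detail page with its title carried in the
        # request meta.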
        json_resp = json.loads(response.text)
        current_page = json_resp['result']['page']
        max_pages = json_resp['result']['pages']

        if current_page < max_pages:
            next_url_suffix = '?page={0}'.format(current_page + 1)
            next_url = response.urljoin(next_url_suffix)
            self.logger.warning('next url:{0}'.format(next_url))
            yield scrapy.Request(next_url)

        video_info_list = json_resp['result']['docs']
        self.logger.warning('page {0} has {1} videos'.format(
            current_page, len(video_info_list)))
        for info in video_info_list:
            title = info['title']
            video_id = info['videoId']
            splice_url = 'https://www.netflav.com/video?id={0}'.format(
                video_id)
            addition_meta = {'name': title}
            yield scrapy.Request(url=splice_url,
                                 callback=self.real_video_parse,
                                 meta=addition_meta)
Example #19
    def _parse_principals(self, response: HtmlResponse):
        """
        Parse principal and yield to retrieve one's documents.
        Parsing documents is not cached because different principals of the
        same country can have the same documents.
        """
        field = self._retrieve_table_field(response)

        processed_rows = 0
        while True:
            try:
                data = {
                    'url':
                    response.urljoin(
                        next(field).css('a::attr(href)').extract_first()),
                    'foreign_principal':
                    self._extract_next(field),
                    'date':
                    self._parse_date(self._extract_next(field)),
                    'address':
                    self._extract_next(field),
                    'state':
                    self._extract_next(field),
                    'country':
                    self._extract_next(field),
                    'registrant':
                    self._extract_next(field),
                    'reg_num':
                    self._extract_next(field),
                }
                next(field)  # just skip 'reg_date'

                documents_request = scrapy.Request(
                    url=data['url'],
                    callback=self._parse_documents,
                    dont_filter=True,
                )
                documents_request.meta['data'] = data
                yield documents_request

            except StopIteration:
                break

            processed_rows += 1

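        # Fewer rows than a full page means this was the last page, so stop
        # requesting further pages.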
        if processed_rows < self.ROWS_FETCH:
            return

        meta = response.meta
        meta['first_row'] += processed_rows
        meta['form_data'][
            'p_widget_action_mod'] = self._generate_next_page_action(
                meta['first_row'])

        request = scrapy.FormRequest(
            url=meta['url'],
            headers=meta['headers'],
            formdata=meta['form_data'],
            callback=self._parse_principals,
        )

        for field in ['url', 'headers', 'form_data', 'first_row']:
            request.meta[field] = meta[field]

        yield request
Example #20
    def collect_all_tagets(self, driver, starturl: str):
        """
        Starting from the entry page starturl, walk every result page and collect all target URLs.
        `driver.get(starturl)` must already have been called before invoking this method.
        """
        try:
            total = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     'div.pagination.J_Pagination  .page-total')))
        except TimeoutException:  # TimeoutException
            self.logger.debug("navigation page is broken, failed to get pagination: {}".format(starturl))
            return
        # analyse the pagination info
        max_page_num = int(total.text)

        try:
            nav = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, 'div.pagination.J_Pagination a')))
        except TimeoutException:  # TimeoutException
            self.logger.debug("navigation page is broken, failed to get pagination: {}".format(starturl))
            return
        # jump to the first page; the second nav element is page 1
        driver.execute_script("arguments[0].click();", nav[1])

        # starting from the first page, collect all targets on each page, then move on to the next page
        self.logger.info("will collect {} pages, starturl {}".format(max_page_num, starturl))

        max_retry_times = 5  # retry at most 5 times
        retrys = max_retry_times
        for i in range(max_page_num):
            # progress logging
            if i < max_page_num - 1 and i % 50 == 0:
                self.logger.info("start collecting page {}".format(i))
            elif i == max_page_num - 1:
                self.logger.info("collected {} pages".format(i))

            try:
                _ = WebDriverWait(driver, 20).until(
                    EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR, ".sf-content .sf-item-list li a")))
                # in practice the page keeps changing, so snapshot the source and parse the snapshot
                response = HtmlResponse(driver.current_url,
                                        body=str.encode(driver.page_source),
                                        encoding='utf-8')
                yield list(
                    set(
                        map(
                            lambda x: response.urljoin(x),
                            response.css(
                                ".sf-content .sf-item-list li a::attr(href)").
                            getall())))

            except TimeoutException:
                self.logger.info("failed to collect target urls on page {}".format(i))

            # no need to jump any further once the last page is reached
            if i < max_page_num - 1:
                # jump to the next page
                if self._to_next_page(driver) == -1:
                    retrys -= 1
                    if retrys <= 0:
                        break  # give up after repeated jump failures
                else:
                    retrys = max_retry_times
        # return urls
        return
Example #21
 def channel_page_see_all(self, response: HtmlResponse):
     # get see all button
     see_all = response.css(
         'a.greyButton.light::attr(href)').extract_first()
     all_url = response.urljoin(see_all)
     yield scrapy.Request(all_url, callback=self.channel_page)
Example #22
    def extract_task(self, content: scrapy.Selector, response: HtmlResponse):
        # TODO: update the documentation
        """
        Extracts the task information from the page content.
        :return: a dict keyed by the task section names, with values of the form
                 (text, images), where images is a list of (image_url, tex_view) tuples
        """
        # extract the raw text
        # text = list(map(lambda s: s.strip(), ''.join(content.extract()).split('\n')))
        # text = list(map(lambda s: s.strip(), re.split(r'(<br>|<p>|</p>|>\n)', ''.join(content.extract()))))
        text = list(map(lambda s: s.strip(), re.split(r'(</?\w{,10}|>)', ''.join(content.extract()))))

        task_dict = {}
        current_section = ''
        session_text = []
        image_urls = []
        images_urls_of_section = []
        text_iterator = enumerate(text)
        tex_used = False
        while True:
            try:
                i, line = next(text_iterator)
                new_section = None
            except StopIteration:
                break

            if line == '<h3':
                next(text_iterator)
                next(text_iterator)
                i, line = next(text_iterator)
                new_section = re.findall(r'(Условие|Подсказка|Решение|Ответ|Источники и прецеденты использования)', line)

            if (not new_section) and (not current_section):
                continue
            if new_section:
                if current_section:
                    session_text = ' '.join(filter(lambda s: s, session_text))
                    session_text = ' '.join(session_text.split())
                    session_text = session_text.replace('\'', '')
                    task_dict[DEFAULT_NAMES[current_section]] = session_text, images_urls_of_section
                current_section = new_section[0]
                if current_section == 'Источники и прецеденты использования':
                    break
                session_text = []
                images_urls_of_section = []
                continue
            if '<img' in line:
                i, line = next(text_iterator)
                src = re.search(r'src=\".+\d+\"', line).group()
                if src:
                    tex = re.search(r'alt=\"\$(.|\n)+\$\"', line)
                    if tex is None:
                        image_src = src[5:-1]
                        image_url = response.urljoin(image_src)
                        image_urls.append(image_url)
                        images_urls_of_section.append(image_url)
                    else:
                        tex_used = True
                continue
            if re.match(r'<\w{1,10}', line):
                while line != '>':
                    i, line = next(text_iterator)
                continue
            if re.match(r'(</?\w{,10}|>)', line):
                continue
            if line:
                old_line = line
                if 'Также доступны документы в формате' in line or \
                        ('href' in line or line == 'TeX') or \
                        (line.endswith('>') and not line.endswith('-->')):
                    continue
                line = line.strip()
                line = re.sub(r'(^>|!-- MATH|--)', '', line, flags=re.S).strip()
                line = line.replace('\\begin{displaymath}', '$').replace('\\end{displaymath}', '$')
                if line:
                    session_text.append(line)

        return task_dict, image_urls, tex_used