Code Example #1
 def vacancy_parse(self, response: HtmlResponse):
     # host without subdomains
     parsed_uri = tldextract.extract(response.url)
     host = "{domain}.{suffix}".format(domain=parsed_uri.domain,
                                       suffix=parsed_uri.suffix).lower()
     # url
     url = urljoin(response.url, urlparse(response.url).path)
     # name
     name = response.css('h1.header *::text').extract_first()
     # company
     company = response.css(
         '*.vacancy-company-name-wrapper > span[itemprop="identifier"] > meta[itemprop="name"]::attr(content)'
     ).extract_first().strip()
     # salary
     salary_data = response.css(
         'div.vacancy-title > span[itemprop="baseSalary"]')
     salary_min = salary_data.css(
         'meta[itemprop="minValue"]::attr(content)').extract_first()
     salary_max = salary_data.css(
         'meta[itemprop="maxValue"]::attr(content)').extract_first()
     salary_currency = salary_data.css(
         'meta[itemprop="currency"]::attr(content)').extract_first()
     # return result
     yield JobparserItem(
         host=host,
         url=url,
         name=name,
         company=company,
         salary_min=salary_min,
         salary_max=salary_max,
         salary_currency=salary_currency,
     )
Code Example #2
    def parse(self, response: HtmlResponse):
        # Called by the engine after the download to extract the data (parsing);
        # yields dict objects
        print(type(response))
        print('--->', response.url)

        item = {}
        # returns a list of Selector objects: [<Selector>, ...]
        articles = response.css('.article')
        for article in articles:

            item['info_url'] = article.css('.author').xpath('./a/@href').get()
            item['name'] = article.css('.author').xpath('./a/img/@alt').get()
            item['img'] = article.css('.author').xpath('./a/img/@src').get()
            content = ''.join(
                article.css('.content').xpath('./span/text()').extract())

            item['content'] = base64.b16encode(
                content.encode(encoding='utf-8')).decode()

            yield item

        # Find the URL of the next page
        next_url = response.css('.pagination').xpath(
            './/li[last()]/a/@href').get()
        # Issue the request for the next page
        yield Request('http://www.qiushibaike.com' + next_url,
                      callback=self.parse,
                      priority=10,
                      dont_filter=True)
Code Example #3
    def book_parse(self, response: HtmlResponse):

        # Get the URL and parse the book information
        book_url = response.url
        book_title = response.xpath(
            '//div[@id="product-title"]/h1/text()').extract_first()
        book_authors = response.xpath(
            '//div[@class="authors"]/a[@data-event-label="author"]/text()'
        ).extract()
        book_rating = response.xpath(
            '//div[@id="rate"]/text()').extract_first()

        # Parse the price. Three different tags.
        price_normal = response.css(
            'span.buying-price-val-number::text').extract_first()
        price_new = response.css(
            'span.buying-pricenew-val-number::text').extract_first()
        price_old = response.css(
            'span.buying-priceold-val-number::text').extract_first()

        # Return a single item
        yield BookparserItem(url=book_url,
                             title=book_title,
                             authors=book_authors,
                             price_normal=price_normal,
                             price_new=price_new,
                             price_old=price_old,
                             rating=book_rating)
Code Example #4
 def test_working_selector_after_reload(self):
     response = HtmlResponse(url='http://127.0.0.1/aaa',
                             body='<html><body><h1>test</h1></body></html>')
     response.css('h1')
     filename = self.storage.save(response)
     response2 = self.storage.load(filename)
     response2.css('h1')
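
The test above relies on a self.storage helper whose implementation is not shown here. A minimal, purely illustrative sketch of such a storage (pickle-based; the class name and layout are assumptions, not the project's actual API) that keeps .css() working after a reload could look like this:

import pickle

from scrapy.http import HtmlResponse


class ResponseStorage:
    # Illustrative only: persist url/body and rebuild an HtmlResponse on load.

    def __init__(self, path='response.pickle'):
        self.path = path

    def save(self, response):
        with open(self.path, 'wb') as f:
            pickle.dump({'url': response.url, 'body': response.body}, f)
        return self.path

    def load(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        # A freshly constructed HtmlResponse builds its own Selector,
        # so .css() works on the reloaded object as well.
        return HtmlResponse(url=data['url'], body=data['body'], encoding='utf-8')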
Code Example #5
File: toysrus.py  Project: iandees/all-the-places
    def parse(self, response):
        marker_txt = re.findall(re.compile("markerData.*\}", re.MULTILINE), response.body_as_unicode())
        if not len(marker_txt):
            return
        markers_json = "{\"" + marker_txt[0]
        markers = list(json.loads(markers_json).values())[0]

        if not len(markers):
            return
        for marker in markers:
            marker_response = HtmlResponse(url="", body=marker["info"].encode("utf-8"))
            hours = re.findall(r"\{\"label.*\}", marker["info"])
            hours = hours[0]
            parsed_hours = json.loads(hours)

            addr_parts = marker_response.css(".address span:not(.phone)::text").extract()
            url = marker_response.css("header a").xpath("@href").extract_first()
            city, state = addr_parts[-1].split(",")

            yield GeojsonPointItem(lat=marker.get("lat"), lon=marker.get("lng"),
                                   name=marker_response.css("header a::text").extract_first(default=None),
                                   addr_full=", ".join(addr_parts),
                                   city=city.strip(),
                                   state=state.strip(),
                                   country="United States",
                                   phone=marker_response.css(".phone::text").extract_first(),
                                   website=url,
                                   opening_hours=get_hours(parsed_hours["days"]),
                                   ref=url.split("/")[-1].split(".")[0])
Code Example #6
    def post_parse(self, response: HtmlResponse):
        # title = response.css('div.title-info-main h1.title-info-title').extract_first()
        title = response.css(
            'div.title-info-main span.title-info-title-text::text'
        ).extract_first()
        price = response.css(
            'div.item-price-value-wrapper span.js-item-price::attr(content)'
        ).extract_first()

        list = response.css(
            'ul.item-params-list li.item-params-list-item').extract()
        list2 = response.css(
            'ul.item-params-list span.item-params-label::text').extract()
        list3 = response.css(
            'ul.item-params-list li.item-params-list-item::text').extract()

        for i, item in enumerate(list3[:]):
            if item == ' ':
                list3.remove(item)
        dict = {}
        for i, item in enumerate(list2):
            dict[item] = list3[i]

#        param_name = 1

        yield {
            'title': title,
            'price': price,
            'list': dict  #list,
        }
Code Example #7
File: MyFollow.py  Project: lizhaode/Scrapy91
    def parse_my_follow_real_link(self, response: HtmlResponse):
        self.logger.warn('开始解析{0}真实视频'.format(response.url))
        title = response.css('#viewvideo-title::text').extract_first().strip()
        author = response.css('a[href*="uprofile.php"]').css(
            'span::text').extract_first().strip()
        # Some videos share the same title and author; only the viewkey in the URL differs
        view_key = response.url.split('viewkey=')[1]
        # A '/' in the title would create a directory, so strip it out
        if '/' in title:
            title = title.replace('/', '')

        encrypted_url = response.css('video').extract_first().split(
            'strencode("')[1].split('"))')[0]
        first_encrypted = encrypted_url.split('"')[0]
        second_excrypted = encrypted_url.split('"')[2]
        video_link = ParseRealUrl.get_url(first_encrypted, second_excrypted)

        if video_link:
            # Normalize URLs like http://185.38.13.130//mp43/2998... that contain a doubled slash
            video_link_list = video_link.split('//')
            real_video_link = video_link_list[0] + '//' + video_link_list[
                1] + '/' + video_link_list[2]
            self.logger.warn('视频:{0} 分析完毕,丢入下载 pipelines'.format(title))
            down_file_name = title + '-' + author + '-' + view_key
            yield DownloadVideoItem(file_urls=real_video_link,
                                    file_name=down_file_name)
        else:
            self.logger.warn('获取视频下载地址失败,地址:{0}'.format(response.url))
Code Example #8
    def parse(self, response: HtmlResponse):

        items = response.css('#main-ad-list').css(
            '.item:not(.list_native):not(.yap-loaded)')

        new_item_count = 0

        for item in items:
            attrib = item.attrib
            id = attrib['data-list_id']

            product = Product.get_by_olx_id(int(id))

            if product is not None:
                continue

            new_item_count = new_item_count + 1

            url: str = item.css('.OLXad-list-link::attr(href)').get()
            if url is None:
                url = item.css('.OLXad-list-link-featured::attr(href)').get()

            yield response.follow(url,
                                  callback=self.parse_item,
                                  meta=response.meta.copy())

        if new_item_count > 0:
            for next_link in response.css('a.link[rel="next"]'):
                yield response.follow(next_link,
                                      callback=self.parse,
                                      meta=response.meta)
Code Example #9
    def parse(self, response):
        marker_txt = re.findall(re.compile("markerData.*\}", re.MULTILINE), response.body_as_unicode())
        if not len(marker_txt):
            return
        markers_json = "{\"" + marker_txt[0]
        markers = list(json.loads(markers_json).values())[0]

        if not len(markers):
            return
        for marker in markers:
            marker_response = HtmlResponse(url="", body=marker["info"].encode("utf-8"))
            hours = re.findall(r"\{\"label.*\}", marker["info"])
            hours = hours[0]
            parsed_hours = json.loads(hours)

            addr_parts = marker_response.css(".address span:not(.phone)::text").extract()
            url = marker_response.css("header a").xpath("@href").extract_first()
            city, state = addr_parts[-1].split(",")

            yield GeojsonPointItem(lat=marker.get("lat"), lon=marker.get("lng"),
                                   name=marker_response.css("header a::text").extract_first(default=None),
                                   addr_full=", ".join(addr_parts),
                                   city=city.strip(),
                                   state=state.strip(),
                                   country="United States",
                                   phone=marker_response.css(".phone::text").extract_first(),
                                   website=url,
                                   opening_hours=get_hours(parsed_hours["days"]),
                                   ref=url.split("/")[-1].split(".")[0])
Code Example #10
    def start_requests(self):
        self.driver.get(self.start_urls[0])
        res = HtmlResponse(url='index html',
                           body=self.driver.page_source,
                           encoding="utf-8")
        title_text = res.css('#main section h1 a::text')[0].root.strip()
        self.driver.find_element_by_link_text(title_text).click()  # click through to the article
        time.sleep(2)
        if title_text in self.driver.page_source:
            self.detail_parse(self.driver.page_source, title_text)

        while True:
            try:
                key_word = self.driver.find_elements_by_class_name(
                    "next")[0].text  # 进行下一篇文章抓取
            except (TypeError, IndexError):
                self.driver.quit()
                key_word = None
            if not key_word:
                break
            self.driver.find_element_by_link_text(key_word).click()
            time.sleep(2)
            res = HtmlResponse(url='next html',
                               body=self.driver.page_source,
                               encoding="utf-8")
            title_text = res.css('.article-info h1 a::text').extract_first()
            self.detail_parse(self.driver.page_source, title_text)
Code Example #11
def getAudio(author, sing):
    reqStr = author + ' ' + sing
    reqStr = reqStr.replace('& ', '')
    #print reqStr
    reqStr = reqStr.replace(' ', '+')
    #print reqStr
    googurl = 'https://www.google.ru/search?newwindow=1&ei=O41KWrz4J-nX6QTFgbWYDQ&q=%s' % reqStr

    req = requests.get(googurl, headers=headers)
    response = HR(url=googurl, body=req.text, encoding='utf-8')

    try:
        links = response.css('.kv ._Rm::text').extract()
        mp3party = ''
        for link in links:
            if 'mp3party.net' in link:
                mp3party = 'http://' + link
                break
        print(mp3party)
        req = requests.get(mp3party, headers=headers)
        response = HR(url=mp3party, body=req.text, encoding='utf-8')
        audio = response.css('.jp-play::attr(href)').extract_first()
        print(audio)
    except:
        audio = ''
    return audio
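
A hypothetical call (artist and title are made up for illustration) returns either an mp3 link found via mp3party.net or an empty string:

audio_url = getAudio('Some Artist', 'Some Song')
if audio_url:
    print(audio_url)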
Code Example #12
    def vacancy_parse(self, response: HtmlResponse):
        name = response.css('div.vacancy-title h1.header ::text').extract()

        salary = [
            response.css(
                'span[itemprop="baseSalary"] meta[itemprop="minValue"] ::attr(content)'
            ).extract_first(),
            response.css(
                'span[itemprop="baseSalary"] meta[itemprop="maxValue"] ::attr(content)'
            ).extract_first(),
            response.css(
                'span[itemprop="baseSalary"] meta[itemprop="currency"] ::attr(content)'
            ).extract_first()
        ]

        vacancy_link = response.url
        site_scraping = self.allowed_domains[0]

        yield JobParserItem(
            name=name,
            salary=salary,
            vacancy_link=vacancy_link,
            site_scraping=site_scraping
        )
Code Example #13
File: spider.py  Project: HanMuJiang/pc
def spider():
    driver = webdriver.Chrome()  # start ChromeDriver and open the browser
    url = 'https://www.shiyanlou.com/courses/427'
    driver.get(url)  # open the page to be scraped
    page = 1
    result = []
    while True:
        WebDriverWait(driver, 10).until(  # explicit wait of up to 10 s, polling every 0.5 s by default
            EC.text_to_be_present_in_element(  # the condition to wait for
                (By.XPATH, '//ul[@class="pagination"]/li[@class="active"]'),
                str(page)  # the text that must appear in the element located above
            ))
        html = driver.page_source
        response = HtmlResponse(url=url, body=html.encode('utf-8'))
        for i in response.css('div.comment-item-wrapper'):
            d = {
                'username': i.css('a.username::text').extract_first().strip(),
                'content': reduce(lambda a, b: a + b,
                                  (i.css('p::text').extract()))
            }
            result.append(d)
        if response.css('div.comment-box li.disabled.next-page'):
            break  # an li tag with class 'disabled' means there is no next page
        page += 1
        # chromedriver cannot automatically reach off-screen parts of the page;
        # the next two lines scroll the target element into view
        ac = driver.find_element_by_css_selector(
            'div.comment-box li.next-page a')
        ActionChains(driver).move_to_element(ac).perform()
        driver.find_element_by_css_selector(
            'div.comment-box li.next-page').click()
    driver.quit()
    with open('comments.json', 'w') as f:
        json.dump(result, f)
Code Example #14
File: hhru.py  Project: SeredinGs/pythonDataMining
 def vacansy_parse(self, response: HtmlResponse):
     name_vac = ''
     # Vacancy title
     name1 = response.css(
         'div.vacancy-title h1.header span.highlighted::text').getall()
     # HH added a span to the title header, so extra handlers and
     # the corresponding checks are needed
     name2 = response.css(
         'div.vacancy-title h1.header::text').extract_first()
     if len(name1) == 0:
         if name2 is not None:
             name_vac = name2
     else:
         if name2 is None:
             name_vac = name1
         else:
             name_vac = name1[0] + name2
     # Salary
     salary = response.css(
         'div.vacancy-title p.vacancy-salary::text').extract_first()
     min_sal, max_sal = self.parse_salara(salary)
     url = response.url
     # Primitive debug output showing the result
     #print('name = {}, salary = {}, min_sal = {}, max_sal = {}, url = {}'.format(name, salary, min_sal, max_sal, url))
     yield JobparserItem(
         name=name_vac,
         min_salary=min_sal,
         max_salary=max_sal,
         link=url,
         source='HH')  # Pass the assembled item to the pipeline
Code Example #15
 def parse(self, response: HtmlResponse):
     next_page = response.css('a[rel=next]::attr(href)').extract_first()
     yield response.follow(next_page, callback=self.parse)
     vacancy = response.css('a[target=_blank]::attr(href)').extract()
     print(vacancy)
     for link in vacancy[0:19]:
         yield response.follow(link, self.vacancy_parse)
Code Example #16
File: xh_dx.py  Project: disenQF/xpy1901
    def parse_info(self, response: HtmlResponse):
        # Callback that parses the detail page
        item = XHItem()

        # Read the uid passed in through the request's meta
        item['uid'] = response.meta.get('uid')
        item['name'] = response.meta.get('name')
        item['image_urls'] = [
            'http://www.521609.com' +
            response.css('#bigimg').xpath('./@src').get()
        ]
        item['images'] = []
        yield item

        # Next page URL
        next_url = response.css('.pagelist').xpath(
            './li[last()]/a/@href').get()
        if next_url != '#':
            next_url = 'http://www.521609.com/daxuexiaohua/' + next_url
            yield Request(next_url,
                          callback=self.parse_info,
                          meta={
                              'uid': item['uid'],
                              'name': item['name']
                          },
                          priority=10)
Code Example #17
    def find_features(self, response):

        room_features = response.css('div.room__features').getall()
        features_room = []
        features_apartment = []
        isroom = 1
        for room_features_obj in room_features:
            room_features_response = HtmlResponse(url=room_features_obj,
                                                  body=room_features_obj,
                                                  encoding='utf-8')
            room__feature = room_features_response.css(
                'div.room__feature').getall()
            for room_feature_obj in room__feature:
                room_feature_response = HtmlResponse(url=room_feature_obj,
                                                     body=room_feature_obj,
                                                     encoding='utf-8')
                strong = room_feature_response.css('p strong::text').get()
                paragraph = room_feature_response.css('p::text').get()
                line = ""
                if strong:
                    line = strong
                if paragraph:
                    line = line + paragraph
                if isroom and line:
                    features_room.append(line)
                elif line:
                    features_apartment.append(line)
            isroom = 0
        features = {
            'features_room': features_room,
            'features_shared_apartment': features_apartment
        }
        return features
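
The method above re-wraps each extracted HTML fragment in a new HtmlResponse before querying it. A shorter sketch of the same traversal, assuming the page structure implied by the selectors above, works directly on nested Scrapy selectors without the intermediate responses:

    def find_features(self, response):
        # Same result as above, but querying nested selectors directly
        features_room, features_apartment = [], []
        for index, block in enumerate(response.css('div.room__features')):
            # only the first block describes the room itself
            target = features_room if index == 0 else features_apartment
            for feature in block.css('div.room__feature'):
                strong = feature.css('p strong::text').get() or ''
                paragraph = feature.css('p::text').get() or ''
                line = strong + paragraph
                if line:
                    target.append(line)
        return {
            'features_room': features_room,
            'features_shared_apartment': features_apartment
        }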
Code Example #18
    def book_parse(self, response: HtmlResponse):

        # Get the URL and parse the book information
        book_url = response.url
        book_title = response.css(
            'h1.item-detail__title::text').extract_first()
        book_authors = response.xpath(
            '//div[@class="item-tab__chars-list"]/div[1]/span[2]/a/text()'
        ).extract()
        book_rating = response.css(
            'span.rating__rate-value::text').extract_first()

        # Parse the price. Two different tags
        price_normal = response.css(
            'div.item-actions__price b::text').extract_first()
        price_old = response.css(
            'div.item-actions__price-old::text').extract_first()

        # Return a single item
        yield BookparserItem(url=book_url,
                             title=book_title,
                             authors=book_authors,
                             price_normal=price_normal,
                             price_new=None,
                             price_old=price_old,
                             rating=book_rating)
Code Example #19
 def vacancy_parse(self, response: HtmlResponse):
     vaca = response.xpath(
         '//div[contains(@class,"vacancy-title")]//h1[@class="header"]//text()'
     ).extract()
     salary = response.css(
         'div.vacancy-title p.vacancy-salary::text').extract()
     try:
         sal_min = response.css(
             'div.vacancy-title meta[itemprop="minValue"]::attr(content)'
         ).extract()
     except:
         sal_min = 'NaN'
     try:
         sal_max = response.css(
             'div.vacancy-title meta[itemprop="maxValue"]::attr(content)'
         ).extract()
     except:
         sal_max = 'NaN'
     link = response.css(
         'div.bloko-column_xs-4 div[itemscope="itemscope"] meta[itemprop="url"]::attr(content)'
     ).extract()
     yield JobparserItem(name=vaca[0],
                         link=link[0],
                         salary=salary,
                         min_salary=sal_min,
                         max_salary=sal_max,
                         source='hh.ru')
Code Example #20
File: sj.py  Project: VadimSpb/parcing_learning
 def vacansy_parse(self, response: HtmlResponse):
     name = response.css('h1.rFbjy::text').extract_first()  # Vacancy title
     salary = response.css('span._2Wp8I').extract_first()  # Salary
     url = response.request.url
     source = 'superjob.ru'
     yield JobparserItem(name=name, min_salary=salary, max_salary=salary, url=url,
                         source=source)  # Pass the assembled item to the pipeline
Code Example #21
    def parse_ads(self, response: HtmlResponse):
        loader = ItemLoader(item=AvitoRealEstate(), response=response)
        loader.add_xpath('photos',
                         '//div[contains(@class, "gallery-img-wrapper")]//div[contains(@class, "gallery-img-frame")]/@data-url')
        loader.add_css('title', 'h1.title-info-title span.title-info-title-text::text')
        par_names = response.css('li.item-params-list-item span.item-params-label::text').extract()
        for i in range(len(par_names)):
            par_names[i] = par_names[i].replace(' ', '')

        par_data = response.css('li.item-params-list-item::text').extract()
        my_dict = {'Этаж:': 'floor',
                   'Этажейвдоме:': 'house_floors',
                   'Типдома:': 'house_type',
                   'Количествокомнат:': 'rooms',
                   'Общаяплощадь:': 'total_s',
                   'Жилаяплощадь:': 'living_s',
                   'Площадькухни:': 'kitchen_s',
                   'Отделка:': 'otdelka'}

        result_dict = {}
        for i in range(len(par_names)):
            result_dict[my_dict[par_names[i]]] = par_data[i * 2 + 1]

        for keys in result_dict:
            loader.add_value(keys, result_dict[keys])
        print(1)
        yield loader.load_item()
Code Example #22
File: AbnormalSpider.py  Project: LWping/scrapyAccess
    def parse_detail(self, response):
        json_response = json.loads(response.body_as_unicode())
        # print(json_response)
        if json_response['data'] is None or json_response['data']['navigationBar'] is None:
            print("接口服务调用异常,接口url:{}, 响应:{}".format(response.url, json_response))
            return

        html_response = HtmlResponse(url="detail HTML string",
                                     body=json_response['data']['navigationBar'],
                                     encoding='utf-8')

        categoryfacetsecond = html_response.css('div.sub-title-nav ul li:last-child a::attr(categoryfacetsecond)').get()

        catalogids = html_response.css('div.sub-title-nav ul li:last-child a::attr(catalogids)').get().split(',')

        classificationid = html_response.css('div.sub-title-nav ul li:last-child a::attr(classificationid)').get()

        detail_payload = {
            'categoryFacetSecond': categoryfacetsecond,
            'catalogIds': catalogids,
            'classificationId': classificationid,
            'zzbh': self.zzbh
        }

        print(detail_payload)

        yield response.follow(self.nav_detail_url, callback=self.parse_nav_detail,
                              method='POST',
                              headers={'Content-Type': 'application/json'},
                              body=json.dumps(detail_payload))
Code Example #23
File: hhru.py  Project: SpiritIV/job_scrapper
 def vacansy_parse(self, response: HtmlResponse):
     link_vac = response.url
     name = response.css('div.vacancy-title h1.header::text').extract_first()
     salary = response.css('div.vacancy-title p.vacancy-salary::text').extract()
     link = 'hh.ru'
     # print(name, salary)
     yield JobparserItem(name=name, salary_from=''.join(salary), salary_to='', link_vac=link_vac, link_site=link)
Code Example #24
    def vacancy_parse(self, response: HtmlResponse):
        vac = response.css('div._3MVeX h1._3mfro::text').extract_first()
        salary = response.css('div._3MVeX span._2Wp8I span::text').extract()

        yield JobparserItem(name=vac,
                            salary=salary,
                            link=response.url,
                            source='superjob.ru')
Code Example #25
    def post_parse(self, response: HtmlResponse):
        title = response.css('article h1::text').extract_first()
        date = response.css('article time::attr(datetime)').extract_first()

        yield {
            'title': title,
            'date': date,
        }
Code Example #26
    def parse(self, response: HtmlResponse):
        vacancies = response.css("a.icMQ_._6AfZ9::attr(href)").extract()
        for vacancy in vacancies:
            yield response.follow(vacancy, callback=self.vacancy_parse)

        next_page = response.css("a.icMQ_._1_Cht._3ze9n.f-test-button-dalshe.f-test-link-Dalshe::attr(href)").extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Code Example #27
File: sanguo_wiki.py  Project: thomaszdxsn/SpiderNest
    def parse(self, response: HtmlResponse):
        detail_urls = response.css('.excerpt h2 a::attr(href)').extract()
        for detail_url in detail_urls:
            yield response.follow(detail_url, callback=self.parse_detail)

        next_page = response.css('li.next-page a::attr(href)').extract_first(None)
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Code Example #28
File: hhru.py  Project: Belfi-Gor/data_mining
 def vacancy_parse(self, response: HtmlResponse):
     name = response.css(
         "div.vacancy-title  h1.header::text").extract_first()
     salary = response.css(
         "div.vacancy-title  p.vacancy-salary::text").extract_first()
     company = ''.join(
         response.css('a.vacancy-company-name span::text').extract())
     yield JobparserItem(name=name, salary=salary, company=company)
Code Example #29
File: hh.py  Project: Selen34/scraping
 def parse(self, response: HtmlResponse):
     next_page = response.css(
         'a.HH-Pager-Controls-Next::attr(href)').extract_first()
     yield response.follow(next_page, callback=self.parse)
     vacancies = response.css(
         'div.vacancy-serp-item__info a.bloko-link::attr(href)').extract()
     for link in vacancies:
         yield response.follow(link, callback=self.vacancy_parse)
Code Example #30
File: sjru.py  Project: nawww83/data_acquisition
 def parse(self, response: HtmlResponse):
     next_page = response.css(
         'a.f-test-button-dalshe::attr(href)').extract_first()
     yield response.follow(next_page, callback=self.parse)
     vacancy = response.css(
         'div.f-test-vacancy-item a::attr(href)').extract()
     for link in vacancy:
         yield response.follow(link, self.vacancy_parse)
Code Example #31
 def parse(self, response: HtmlResponse):
     next_page = response.css(
         'a.pagination-next__text::attr(href)').extract_first()
     books_links = response.css(
         'a.product-title-link::attr(href)').extract()
     for link in books_links:
         yield response.follow(link, callback=self.book_parse)
     yield response.follow(next_page, callback=self.parse)
Code Example #32
 def parse_field(self, html, fn):
     response = HtmlResponse('http://localhost/test.html',
                             body='<table><tr>%s</tr></table>' % html)
     row = response.css('tr')[0]
     node = response.css('td')[0]
     lobbyist = Loader(self.spider, response, Lobbyist(), row)
     lobbyist.add_value(None, fn(node))
     item = lobbyist.load_item()
     actual = dict(item)
     return actual
Code Example #33
	def parse_json(self,response):
		data = response.body[1:-1]
		js = json.loads(data)
		response = HtmlResponse(url=response.url,body=js['data'].encode('utf8'))
		for href in response.css(settings["el_nacional"]['links']):
			full_url = response.urljoin(href.extract())
			yield scrapy.Request(full_url, callback=self.parse_links)
Code Example #34
 def parse_row(self, html, fn):
     response = HtmlResponse('http://localhost/test.html',
                             body='<table>%s</table>' % html)
     row = response.css('tr')[0]
     item = fn(response, row)
     actual = dict(item)
     return actual
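
A usage sketch for the helper above; the test name, the row markup, and the assumption that Lobbyist declares a 'name' field are all illustrative, not taken from the original project:

    def test_parse_row_name(self):
        actual = self.parse_row(
            '<tr><td class="name">Jane Doe</td></tr>',
            lambda response, row: Lobbyist(
                name=row.css('td.name::text').extract_first()))
        # parse_row returns the item as a plain dict
        self.assertEqual(actual, {'name': 'Jane Doe'})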
Code Example #35
 def parse_lobbyist(self, html, fn):
     response = HtmlResponse('http://localhost/test.html',
                             body='<book>%s</book>' % html)
     rows = response.css('row')
     item = fn(response, rows)
     actual = dict(item)
     return actual
Code Example #36
	def parse_links(self, response):
		try:
			response.css
		except:
			response = HtmlResponse(url=response.url,body=response.body)
		fecha = limpiar_autor_tc(response.css(settings[self.name]['fecha']).extract()[0].split('|')[1])
		current_date = True
		if(len(fecha)>10):
			current_date = obtener_fecha_tipo6(fecha.split(" ")[0])
		if(current_date):
			titulo = limpiar_autor_tc(response.css(settings[self.name]['titulo']).extract()[0])
			body = limpiar_ult_n(response.css(settings[self.name]['body']).extract())
			yield {
			'titulo': titulo,
			'autor': response.css(settings[self.name]['autor']).extract()[0],
			'fecha': fecha,
			'body': [body],
			'link': response.url,
			}
Code Example #37
File: wienerlinien_at.py  Project: Lukas0907/feeds
 def parse(self, response):
     # Wiener Linien returns HTML with an XML content type which creates an
     # XmlResponse.
     response = HtmlResponse(url=response.url, body=response.body)
     for item in response.css(".block-news-item"):
         il = FeedEntryItemLoader(
             response=response,
             timezone="Europe/Vienna",
             ignoretz=True,
             base_url="https://www.{}".format(self.name),
         )
         link = response.urljoin(item.css("a::attr(href)").extract_first())
         il.add_value("link", link)
         il.add_value("title", item.css("h3::text").extract_first())
         il.add_value("updated", item.css(".date::text").extract_first())
         yield scrapy.Request(link, self.parse_item, meta={"il": il})