def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []

        # extract urls from the src attribute of img tags
        # Example: <img src="http://url"> will extract http://url
        images = hxs.select('//img')
        for image in images:
            item = CrawlerItem()
            item['page'] = response.url
            item['picture'] = image.select('@src').extract()[0]
            item = self.CALLBACK_processImage(item['picture'], item)
            items.append(item)

        urls = hxs.select('//a')
        for url in urls:
            if url.select('img'):
                item = CrawlerItem()
                item['page'] = response.url
                item['picture'] = url.select('img/@src').extract()[0]
                item['picture_destination'] = url.select('@href').extract()[0]
                item = self.CALLBACK_processImage(item['picture_destination'], item)
                items.append(item)

        #if len(items):
        log.msg('%s images were found.' % len(items), level=log.INFO)
        return items
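Every snippet on this page fills a project-specific CrawlerItem whose declaration is not shown. A minimal sketch of a matching Scrapy item, with field names inferred only from the parse_item callback above (the real project may declare more fields):

import scrapy

class CrawlerItem(scrapy.Item):
    # Fields assumed from the usage above.
    page = scrapy.Field()
    picture = scrapy.Field()
    picture_destination = scrapy.Field()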
    def parse_comment(self, response):
        picture_href = response.xpath('//div[re:match(@class,"media-preview-content.*")]/a/@href').extract_first()
        global valid_link
        if picture_href is not None:
            response.meta['info']["url"] = picture_href
            response.meta['info']["type"] = picture_href.split(".")[-1]
            valid_link = True
            yield CrawlerItem(response.meta['info'])
        elif self.picture_type.match(response.meta['info']["url"]):
            response.meta['info']["type"] = response.meta['info']["url"].split(".")[-1]
            valid_link = True
            yield CrawlerItem(response.meta['info'])
        else:
            valid_link = False
            print(response.url, "is not a picture. data-url is", response.meta['info']["url"])

        if valid_link:
            comments = response.xpath(
                '//div[@id="' + "siteTable_" + response.meta["info"]["_id"] + '"]/div[re:match(@class," ?thing id-.*")]')
            if comments:
                get_comments = self.analysis_comment(response.meta["info"]['_id'], response.meta["info"]['href'],
                                                     response.meta["info"]["_id"], comments)
                i = 0
                while i < len(comments):
                    comment_item, num = next(get_comments)
                    if num == 0:
                        i = i + 1
                    if comment_item:
                        yield comment_item
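parse_comment expects response.meta['info'] to already carry a dict with at least '_id', 'href' and 'url' keys. A hedged sketch of how the originating request might forward that metadata (the helper name is hypothetical, not taken from the original spider):

def request_comment_page(self, info):
    # Hypothetical helper: pass the partially filled record to parse_comment
    # through the request meta so the callback can complete and yield it.
    return scrapy.Request(info['href'], callback=self.parse_comment,
                          meta={'info': info})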
    def parse_article(self, response):
        main = response.xpath('//div[@id="main-content"]')
        title = main.xpath('//h1[@id="page-title"]/span/text()').extract()
        content = main.xpath('//div[@class="field-item even"]/p').extract()
        item = CrawlerItem()
        if len(content) == 0:
            return
        item["url"] = response.url
        item["content"] = []
        cleanr = re.compile('<.*?>')
        for c in content:
            if len(c) > 1:
                c = re.sub(cleanr, '', c).replace('\n', '')
                item["content"].append(c)

        count = 0
        for line in item["content"]:
            count += len(line.split())
        if count < 500:
            # print(count)
            return

        item["title"] = title[0].replace('<', '').replace('>', '').replace(
            ':', '').replace('"', '').replace('/',
                                              '').replace('|', '').replace(
                                                  '*', '').replace('?', '')
        return item
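The chained replace calls in parse_article strip characters that are not allowed in Windows file names; assuming that is the intent, the same cleanup can be written as a single substitution:

import re

def clean_title(title):
    # Hypothetical helper: drop the same character set (< > : " / | * ?) in one pass.
    return re.sub(r'[<>:"/|*?]', '', title)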
Example #4
    def process_TaoCan(self, pageindex):
        res_items = []
        values = {'parameter': 'FF32=&FF33=',
                  'areaCode': '025',
                  'sortFlag': 'xl',
                  'cssFlag': 'down',
                  'pageindex': str(pageindex),
                  'tableNumber': '03'}

        pj = json.loads(
            post('http://js.189.cn/nmall/product/queryPackageList.do', values))
        pageTotal = pj['pageCount']

        for offer in pj['offerList']:
            item = CrawlerItem()
            item['url'] = ('http://js.189.cn/nmall/product/queryPackageXq/'
                           + offer['FNUMBER'] + '.html')
            item['title'] = offer['FNUMBER']
            item['table'] = json.dumps(offer,
                                       ensure_ascii=False).encode('utf-8')
            item['table2'] = ''
            item['need_know'] = ''
            item['faq'] = ''
            res_items.append(item)

        # Only the first call paginates, and extend() keeps res_items a flat
        # list; otherwise every recursive call would re-crawl every page.
        if pageindex == 1:
            for i in range(2, pageTotal + 1):
                res_items.extend(self.process_TaoCan(i))

        return res_items
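process_TaoCan relies on a module-level post(url, values) helper that is not shown here and apparently returns the JSON response body as text. A minimal sketch under that assumption, using the requests library (the original project may use something else):

import requests

def post(url, values):
    # Form-encoded POST; return the raw body so json.loads() can parse it.
    resp = requests.post(url, data=values, timeout=10)
    resp.raise_for_status()
    return resp.text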
Example #5
    def parse_broadbandInfo(self, response):
        self.log('%s' % response.url)
        bodys = response.body.split('<html>')
        items = []
        filename = 'G:/Github/crawler/out/broadbandInfo/' + response.url.split(
            '/')[-1]
        fout = open(filename, 'wb')
        for body in bodys:
            soup = bs(body, 'html.parser')
            kd_xqinfo_res = soup.find('div', class_='kd_xqinfo')
            if kd_xqinfo_res is None:
                continue
            else:
                item = CrawlerItem()
                item['url'] = response.url
                item['title'] = kd_xqinfo_res.find('h2').string
                fout.write(item['title'] + '\n')
                tr_s = kd_xqinfo_res.find_all(has_tr_no_displayNone)
                tableContent = ''
                for tr in tr_s:
                    for ss in tr.stripped_strings:
                        tableContent += ss + '\n'
                item['table'] = tableContent
                item['need_know'] = ''
                items.append(item)
                fout.write(item['table'])
                break

        fout.close()
        # Return the collected items so Scrapy's pipelines also receive them.
        return items
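The has_tr_no_displayNone predicate passed to find_all above is defined elsewhere in the project; judging from its name, it keeps table rows that are not hidden with an inline style. A hedged sketch:

def has_tr_no_displayNone(tag):
    # Assumed behaviour: accept <tr> tags whose inline style does not hide them.
    style = (tag.get('style') or '').replace(' ', '').lower()
    return tag.name == 'tr' and 'display:none' not in style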
    def parse_details(self, response):
        items = CrawlerItem()
        job_page = response.selector.xpath('//div[@class= "section single"]')
        #jobs_meta = response.selector.xpath('//div[starts-with(@class, "job-meta")]')
        jobs_meta = job_page.xpath('//div[starts-with(@class, "job-meta")]')
        items['url'] = response.url
        items['title'] = job_page.xpath(
            'div/h1[@class= "title"]/text()').extract_first().strip()
        items['jobtype'] = jobs_meta.xpath(
            'span[@class = "job-type"]/span/text()').extract_first()
        items['location'] = jobs_meta.xpath(
            'span[@class = "location"]/span/text()').extract_first()
        items['organisation'] = jobs_meta.xpath(
            'span[@class = "company"]/text()').extract_first()
        items['date_posted'] = jobs_meta.xpath(
            'span[@class = "date"]/text()').extract_first()
        footer_div = job_page.xpath(
            '//div[@class= "content-bar iconfix foot"]')
        items['category'] = footer_div.xpath(
            'p[@class= "meta"]//a/text()').extract_first()
        items['days_to_expiry'] = footer_div.xpath(
            'p[@class= "meta"]//span[@class= "expiry"]/text()').extract_first(
            )
        contents_table = bs(response.body,
                            'html.parser').find('div', {
                                'class': 'section_content'
                            }).children
        items['job_details'] = self.process_details_soup(contents_table)
        #items['job_details'] = html.fromstring(job_page.xpath('//div[@class= "section_content"]').extract()).text_content().strip()

        yield items
Example #7
    def parse(self, response):

        telephones = Selector(response).xpath('//body/section/ul/li')

        for telephone in telephones:

            item = CrawlerItem()
            item['name'] = telephone.xpath('a/h3/text()').extract_first()
            item['price'] = telephone.xpath(
                'a/div[@class="price"]/strong/text()').extract_first()
            item['img'] = telephone.xpath(
                'a/img/@data-original').extract_first()
            if item['img'] is None or len(item['img']) < 1:
                item['img'] = telephone.xpath('a/img/@src').extract_first()

            for row in telephone.xpath('a/figure/span/text()'):

                key = row.extract().split(":")[0]
                value = row.extract().split(":")[1]
                key = 'screen' if key.lower() == "màn hình" else key.lower()

                if key.lower() == "ram":
                    ram_data = row.extract().split(",")
                    for data in ram_data:
                        k = data.split(":")[0]
                        v = data.split(":")[1]
                        k = k.strip().lower()
                        # print(k)
                        item[k] = v
                else:
                    item[key] = value
                    print(key, value)
                item["screen"] = item["screen"].replace('\"', '"')
            yield item
Example #8
    def parse(self, response):
        logger.debug('Getting URL: %s', response.url)
        items = []

        if is_document(response.url):
            item = CrawlerItem()
            item['body'] = response
            item['link'] = response.url
            items.append(item)
            return items

        try:
            sel = Selector(response)
            for link in sel.xpath('//a'):
                href = link.xpath('@href').extract()

                if not href:
                    continue

                lnk = href[0].strip()
                if lnk.startswith('#') or not lnk or lnk.startswith('mailto:'):
                    continue

                url = urllib.parse.urljoin(response.url, lnk)
                items.append(Request(url))
        except Exception as e:
            logger.error(e)

        return items
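is_document(response.url) is a helper defined outside this snippet; it presumably flags URLs that point at downloadable documents rather than HTML pages. One possible, assumed implementation keyed on the file extension:

from urllib.parse import urlparse

def is_document(url):
    # Assumption: treat common document extensions as non-HTML resources.
    path = urlparse(url).path.lower()
    return path.endswith(('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'))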
Example #9
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath('//time/@datetime').extract())
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.css(
             '.article-content>.rtf-content-wrapper>P').xpath(
                 './/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//div[@class="name"]/a[@rel="author"]/text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="news_keywords"]/@content').extract()
     ]
     return item
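get_first, used throughout these parse_page examples, is not defined on this page; from its call sites it returns the first extracted value or a default when the list is empty. A hedged sketch:

def get_first(values, default=''):
    # Return the first extracted value, or the default when nothing matched.
    return values[0] if values else default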
Example #10
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath(
             '//span[@class="Datum"]/@content').extract())
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:description"]/@content').extract()
     ).strip()
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.xpath(
             '//div[@class="FAZArtikelText"]/div/p/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//span[@class="Autor"]/span[@class="caps last"]/a/span/text()'
         ).extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     return item
Example #11
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath('//meta[@name="date"]/@content').extract())
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8')
         for s in response.selector.css('.article__item').css(
             '.paragraph').xpath('.//text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.css('.byline').css(
             'span[itemprop="name"]').xpath('./text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     # Handle next pages
     next_page = get_first(
         response.selector.xpath('//link[@rel="next"]/@href').extract())
     if next_page:
         self.logger.debug("Next page found: " + next_page)
         yield Request(next_page, callback=self.parse_page)
     yield item
Example #12
    def parse(self, response):
        print(
            "----------------------------------------------------------------------------"
        )
        print(
            "----------------------------------------------------------------------------"
        )

        # XPath matching rules
        for each in response.xpath("//li"):
            item = CrawlerItem()
            try:
                item["title"] = each.xpath("./div/a/text()").extract()[0]
            except:
                item["title"] = '空'

            try:
                item["name"] = each.xpath(
                    "./div/div[1]/div/a/text()").extract()[0]
            except:
                item["name"] = '空'

            try:
                item["href"] = 'https://www.jianshu.com' + each.xpath(
                    "./div/div[2]/a[2]/@href").extract()[0]
            except:
                item["href"] = '空'

            try:
                item["type"] = each.xpath(
                    "./div/div[2]/a[1]/text()").extract()[0]
            except:
                item["type"] = '空'

            try:
                item["time"] = each.xpath(
                    "./div/div[1]/div/span/@data-shared-at").extract()[0]
            except:
                item["time"] = '空'

            try:
                extract = 'https://www.jianshu.com' + each.xpath(
                    "./div/div[2]/a[1]/@href").extract()[0]
                if extract not in DemoSpider.start_urls:
                    DemoSpider.start_urls.append(extract)
            except:
                pass
            # Hand the item to the item pipeline
            yield item

        DemoSpider.index = DemoSpider.index + 1
        if DemoSpider.index < len(DemoSpider.start_urls) - 1:
            # Hand the next request back to the scheduler
            yield scrapy.Request(self.url[DemoSpider.index],
                                 callback=self.parse)
Example #13
    def parse_page_contents(self, response):
        item = CrawlerItem()
        id_ = response.xpath(
            '//*[@id="main"]/div[1]/div/div[2]/h2/text()')[0].extract()

        reviews = response.xpath('//*[@id="main"]/div[3]/div/div[2]/ol/li')

        for review in reviews:

            item["title"] = review.xpath(
                './div/div/div/div/div/div[1]/div[1]/div[1]/div[1]/a/text()'
            )[0].extract()

            item["id"] = id_

            item["score"] = review.xpath(
                './div/div/div/div/div/div[1]/div[1]/div[2]/div/text()'
            )[0].extract()

            # Prefer the nested review span; fall back to the plain span text
            # when it is empty.
            tmp = review.xpath(
                './div/div/div/div/div/div[1]/div[3]/span/span[2]/text()'
            ).extract()
            if not tmp:
                tmp = review.xpath(
                    './div/div/div/div/div/div[1]/div[3]/span/text()').extract()
            item["review"] = ''.join(tmp)

            yield item
Example #14
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = parser.parse(
         get_first(response.selector.xpath(
             '//time/@datetime').extract())).isoformat().encode('utf-8')
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.css(
             '.article>.body>p').xpath('.//text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8')
         for s in response.selector.css('.authorContainer').xpath(
             './/span/strong/span/text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="news_keywords"]/@content').extract()
     ]
     item['resource'] = self.name
     item['publication_id'] = hashlib.sha1(
         (str(item['url']) + str(item['published']))).hexdigest()
     return item
Example #15
    def parse(self, response):

        item = CrawlerItem()
        item['url'] = response.url
        item['raw'] = None
        item['is_visited'] = 'Y'
        item['rvrsd_domain'] = self.get_rvrsd_domain(
            response.request.meta.get('download_slot'))

        try:
            item['status'] = response.status
            raw = response.text
            if response.status == 200:
                item['parsed'] = self.parse_text(raw)
            else:
                item['parsed'] = None

            self.counter = self.counter + 1
            if self.counter % 100 == 0:
                print('[%d] Sleep...' % self.counter)
                sleep(1)

            print('[%d] Parsed: %s' % (self.counter, response.url))

        except AttributeError as e:
            item['status'] = -3
            item['parsed'] = None
            self.logger.error('Fail to Parse: %s , because %s' %
                              (response.url, e))
            print('[%d] Fail to Parse: %s , because %s' %
                  (self.counter, response.url, e))

        return item
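get_rvrsd_domain is defined elsewhere in this spider; the name and the download_slot argument suggest it converts a host name into its reversed-label form (e.g. 'news.example.com' becomes 'com.example.news'). A sketch under that assumption:

def get_rvrsd_domain(self, host):
    # Hypothetical: reverse the dot-separated labels of the download slot's host.
    if not host:
        return None
    return '.'.join(reversed(host.split('.')))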
Example #16
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = get_first(
         response.selector.xpath('//meta[@name="date"]/@content').extract())
     item['title'] = get_first(
         response.selector.css('.headline').xpath('./text()').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@name="description"]/@content').extract())
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.xpath(
             '//div[@class="article-section clearfix"]/p/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//p[@class="author"]/a/text()').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="news_keywords"]/@content').extract()
     ]
     return item
    def parse_page(self, response):
        """Scrapes information from pages into items"""
        #settings = get_project_settings()
        published = parser.parse(get_first(response.selector.xpath('//meta[@name="date"]/@content').extract()))
        published = published.replace(tzinfo=timezone('UTC'))
        #earliest = parser.parse(settings.get('EARLIEST_PUBLISHED'))
        #if published < earliest:
        #    raise DropItem('Dropping this article published on %s at %s which is before earliest published global setting %s' % (self.name, published.isoformat(), earliest.isoformat()))
        #    #raise CloseSpider('Article was published on %s at %s which is before earliest published global setting %s' % (self.name, published.isoformat(), earliest.isoformat()))
        #else:
        item = CrawlerItem()
        item['url'] = response.url.encode('utf-8')
        item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
        item['published'] = published.isoformat().encode('utf-8')
        item['title'] = get_first(response.selector.xpath('//meta[@property="og:title"]/@content').extract())
        item['description'] = get_first(response.selector.xpath('//meta[@name="description"]/@content').extract())
        #item['text'] = "".join([s.strip().encode('utf-8') for s in response.selector.css('.article__item').css('.paragraph').xpath('.//text()').extract()])
        item['author'] = [s.encode('utf-8') for s in response.selector.css('.byline').css('span[itemprop="name"]').xpath('./text()').extract()]
        item['keywords'] = [s.encode('utf-8') for s in response.selector.xpath('//meta[@name="keywords"]/@content').extract()]
        item['resource'] = self.name
        item['publication_id'] = hashlib.sha1((str(item['url']) + str(item['published']))).hexdigest()
        # Handle next pages
        next_page = get_first(response.selector.xpath('//link[@rel="next"]/@href').extract())
        if next_page:
            self.logger.debug("Next page found: " + next_page)
            yield Request(next_page, callback=self.parse_page)
        #else:
        #    raise CloseSpider('Article was published on %s at %s which is before earliest published global setting %s' % (self.name, published.isoformat(), earliest.isoformat()))
        yield item
Example #18
 def parse2(self, response):
     item = CrawlerItem()
     try:
         page = Selector(response=response).xpath('//ul[@class="pagination"]')
         author = Selector(response=response).xpath('//h1[@class="title J_title"]/text()').get().strip()
         author = ' '.join(author.split())
         print(author)
         if len(page) == 0:
             print('Only one page of comments')
             comments=self.comms(response)
             for comment in comments:
                 if comment == " " or comment == "  " :
                     pass
                 else:
                     item['author'] = author
                     item['comment'] = comment
                     yield item
             #####
         else:
             print('This item has multiple pages of comments')
             page_num = page[0].xpath('./li/a/text()').getall()
             print(page_num)
             num = int(page_num[-2])
             print(num)
             for n in range(1,num+1):
                 print(f'Extracting page {n}')
                 if n == 1:
                     url = response.request.url + '/#comments'
                 else:
                     url = response.request.url + f'/p{n}/#comments'
                 yield scrapy.Request(url=url, callback=self.parse3, dont_filter=False)
     except Exception as e:
         print(e)
         print('Failed to scrape the phone detail page link')
Example #19
        def parse_article(self, response):
            def extract_with_css(query):
                return response.css(query).extract_first(
                    default='Not-Found').strip()

            if response.xpath(
                    "//span[@itemprop='name']/text()").extract_first() is None:
                author = response.xpath(
                    "//p[@class='byline']/text()").extract_first().strip()
            else:
                author = response.xpath(
                    "//span[@itemprop='name']/text()").extract_first().strip()

            item = CrawlerItem()
            item['Date'] = dt.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
            item['Headline'] = extract_with_css('h1.content__headline::text')
            item['Author'] = author
            item['Topic'] = extract_with_css(
                'div.content__section-label a::text')
            item['Snippet'] = extract_with_css('p::text')
            item['Tags'] = response.css(
                'li.submeta__link-item a::text').extract()
            item['DateUpdated'] = extract_with_css(
                'p.content__dateline time::text')
            yield item
Example #20
    def start_requests(self):
        strategy = 0
        try:
            strategy = int(self.strategy)
            print(f'Using the specified strategy: {strategy}')
        except Exception as e:
            print('Using the default tracking strategy')

        col = f'kw-{self.keyword}'
        cols = self.db.list_collection_names(filter={"name":{"$regex":r"^kw-"}})
        if col not in cols:
            yield scrapy.Request(f'https://listado.mercadolibre.com.mx/{self.keyword}')
        else:
            if strategy:
                yield scrapy.Request(f'https://listado.mercadolibre.com.mx/{self.keyword}')
            else:
                docs_cursor = self.db[col].aggregate([
                    {'$group': {'_id': {'pid': '$pid'},
                                'pid': {'$last': '$pid'},
                                'src': {'$last': '$src'},
                                'sales': {'$last': '$sales'}}},
                    {'$match': {'sales': {'$gt': 0}}},
                ])
                docs = list(docs_cursor)
                if not docs:
                    yield scrapy.Request(f'https://listado.mercadolibre.com.mx/{self.keyword}')
                else:
                    for doc in docs:
                        item = CrawlerItem()
                        item['src'] = doc['src']
                        item['pid'] = doc['pid']
                        yield scrapy.Request(item['src'], self.parse_item, cb_kwargs={'item': item})
Example #21
    def download_errback(self, failure, url):
        item = CrawlerItem()
        item['url'] = url
        item['is_visited'] = 'Y'
        item['rvrsd_domain'] = None
        item['raw'] = None
        item['parsed'] = None

        if failure.check(IgnoreRequest):
            self.logger.debug('Forbidden by robot rule')
            item['status'] = -1

        elif failure.check(DNSLookupError):
            self.logger.info('Fail to DNS lookup.')
            item['status'] = -2

        elif failure.check(DNSMismatch):
            self.logger.info('Fail to DNS match.')
            item['status'] = -2

        elif failure.check(NoRouteError):
            self.logger.info('No route error.')
            item['status'] = -4

        elif failure.check(HttpError):
            status = failure.value.response.status
            self.logger.info('Http error [%s].' % status)
            item['status'] = status

        else:
            self.logger.info('Unknown error.')
            item['status'] = -255

        yield item
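An errback such as download_errback only runs if it is attached to the outgoing request. A hedged sketch of that wiring (the request loop is illustrative, not taken from the original spider):

def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url,
            callback=self.parse,
            # Bind the URL through a default argument so the errback receives it too.
            errback=lambda failure, u=url: self.download_errback(failure, u),
        )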
Example #22
 def parse_page(self, response):
     """Scrapes information from pages into items"""
     item = CrawlerItem()
     item['url'] = response.url.encode('utf-8')
     item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
     item['published'] = parser.parse(
         get_first(
             response.selector.xpath(
                 '//meta[@property="vr:published_time"]/@content').extract(
                 ))).isoformat().encode('utf-8')
     item['title'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:title"]/@content').extract())
     item['description'] = get_first(
         response.selector.xpath(
             '//meta[@property="og:description"]/@content').extract()
     ).strip()
     item['text'] = "".join([
         s.strip().encode('utf-8') for s in response.selector.xpath(
             '//div[@class="main-text "]/p/text()').extract()
     ])
     item['author'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="author"]/@content').extract()
     ]
     item['keywords'] = [
         s.encode('utf-8') for s in response.selector.xpath(
             '//meta[@name="keywords"]/@content').extract()
     ]
     item['resource'] = self.name
     item['publication_id'] = hashlib.sha1(
         (str(item['url']) + str(item['published']))).hexdigest()
     return item
Example #23
 def crawldata(self, response):
     questions = response.xpath(
         '//*[@id="list-comment"]/div[@class="f-cmt-ask"]')
     for quest in questions[1:]:
         items = CrawlerItem()
         items['Comment'] = quest.xpath(
             'div[@class="f-cmmain"]/text()').extract_first()
         yield items
Example #24
 def parse(self, response):
     items = CrawlerItem()
     imdbId = str(response.url)[-8:-1]
     items['imdbId'] = str(imdbId)
     # Extract the rating value with XPath
     score = response.xpath('//span[@itemprop="ratingValue"]/text()').extract()[0]
     items['score'] = score
     return items
Example #25
 def parse(self, response):
     item = CrawlerItem()
     item['jpg_urls'] = []
     linkextractors = LxmlLinkExtractor(
         allow=[r'\.jpg', r'\.tif'], deny_extensions=['md5', 'xmp', 'html'])
     for link in linkextractors.extract_links(response):
         item['jpg_urls'].append(link.url)
     return item
Example #26
 def parse(self, response):
     sel = Selector(response)
     sites = sel.css('a[href$=".gz"]')
     for site in sites:
         item = CrawlerItem()
         item['url'] = site.xpath('@href').extract()
         with open('enlaces.txt', 'a') as f:
             f.write('{0}\n'.format(item['url'][0]))
 def parse_item(self, response):
     item = CrawlerItem()
     item['url'] = response.url
     item['title'] = response.css('#firstHeading::text').get()
     item['overview'] = re.sub(
         self.content_regex, '',
         response.css('#mw-content-text div p:not(.mw-empty-elt)').get())
     item['content'] = re.sub(self.content_regex, '',
                              response.css('#mw-content-text div').get())
     return item
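self.content_regex is a spider attribute defined outside this snippet; given how it is fed to re.sub above, it most likely matches HTML tags so they can be stripped. A plausible, assumed definition:

import re

# Assumed definition: match any HTML tag so re.sub() can remove it.
content_regex = re.compile(r'<[^>]+>')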
Example #28
 def parse(self, response):
     print('================> Start crawling URL: ' + response.url)
     # Keep the selector nodes (not extracted href strings) so .css() can be used on each link.
     links = Selector(response).xpath("//div[@class='title-news']/a")
     for link in links:
         url = CrawlerItem()
         url['title_link'] = link.css('title::text').get()
         url['author_link'] = link.css('.fck_detail strong::text').get()
         url['publish_time'] = link.css('.date::text').get()
         yield url
Example #29
    def parse_item(self, response):
        questions = response.xpath('//div[@class="summary"]/h3')

        for question in questions:
            item = CrawlerItem()
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            yield item
Example #30
 def parse_apk(self, response):
     for position in response.xpath('//ul[@id="iconList"]/li'):
         l = APKItemLoader(item=CrawlerItem(), selector=position)
         l.add_value('category', response.meta.get('cate', ''))
         l.add_value('apk_from', '360')
         l.add_xpath('apk_name', 'h3/a/text()')
         l.add_xpath('apk_url',
                     'a[starts-with(@href, "zhushou360:")]/@href',
                     re=r'.*&url=(.*)')
         yield l.load_item()
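APKItemLoader is a project-specific ItemLoader subclass that is not shown here. A minimal sketch, assuming each field should collapse to its first collected value (in older Scrapy versions TakeFirst is imported from scrapy.loader.processors instead):

from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class APKItemLoader(ItemLoader):
    # Assumption: keep only the first value gathered for each field.
    default_output_processor = TakeFirst()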