Code Example #1
File: jobbole.py Project: Mr2277/baidu
    def parse(self, response):
        # First collect the nodes for every article on the listing page
        post_nodes = response.xpath('//*[@id="archive"]//div[@class="post-thumb"]')
        for post_node in post_nodes:
            post_url = post_node.xpath('.//a/@href').extract_first()  # article URL
            image_url = post_node.xpath('.//img/@src').extract_first()  # article cover image URL
            image_url = parse.urljoin(response.url, image_url)  # image paths can be relative, so join them with the page URL
            # Hand each article page to parse_detail, passing the image URL along in meta
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail,
                          meta={'image_url': image_url})

        # Extract the URL of the next listing page and crawl it with the same callback
        next_url = response.xpath('//*[@id="archive"]//a[contains(@class,"next")]/@href').extract_first()
        if next_url:
            yield Request(url=next_url, callback=self.parse)

    def parse_detail(self, response):
        # Load the Item through an ItemLoader
        image_url = response.meta.get('image_url')  # image URL passed along from parse()
        item_loader = ArticleItemLoader(item=ArticleItem(), response=response)  # use the custom ItemLoader subclass
        item_loader.add_xpath('title', '//*[@class="entry-header"]/h1/text()')  # extract fields via XPath
        item_loader.add_value('url', response.url)  # add a literal value
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_xpath('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()[1]')
        item_loader.add_value('image_url', [image_url])
        item_loader.add_xpath('praise_nums', "//span[contains(@class,'vote-post-up')]/h10/text()")
        item_loader.add_xpath('fav_nums', "//span[contains(@class,'bookmark-btn')]/text()")
        item_loader.add_xpath('comment_nums', "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath('content', '//*[@class="entry"]/p | //*[@class="entry"]/h3 | //*[@class="entry"]/ul')

        article_item = item_loader.load_item()  # apply the declared rules; each field comes back as a list
        yield article_item
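Code Example #1 relies on an ArticleItem, a custom ArticleItemLoader, and a get_md5 helper that are not shown above, plus `from scrapy import Request` and `from urllib import parse`. A minimal sketch of what those missing pieces might look like (field names are taken from the loader calls above; everything else is an assumption):

import hashlib

import scrapy
from scrapy import Request                      # used by parse() above
from urllib import parse                        # used for parse.urljoin() above
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # itemloaders.processors on newer Scrapy


def get_md5(url):
    # Hash the URL into a fixed-length string that can serve as a primary key
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    image_url = scrapy.Field()
    praise_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    content = scrapy.Field()


class ArticleItemLoader(ItemLoader):
    # Return a single value instead of a list for every field by default
    default_output_processor = TakeFirst()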
Code Example #2
File: fang.py Project: myluke/dataems
 def parse(self, response):
     district_name = response.xpath(
         '//div[@class="qxName"]/a[not(@class="org bold") and not(contains(text(),"周边"))]/text()'
     ).extract()
     district_url = response.xpath(
         '//div[@class="qxName"]/a[not(@class="org bold") and not(contains(text(),"周边"))]/@href'
     ).extract()
     for name, url in zip(district_name, district_url):
         print("开始区=========")
         print(url)
         yield scrapy.Request(url=response.urljoin(url), callback=self.town)
         time.sleep(random.randint(1, 3))
Code Example #3
 def town(self, response):
     town_names = response.xpath(
         '//div[@class="option-list sub-option-list gio_plate"]/a[@class!="on"]/text()'
     ).extract()
     town_urls = response.xpath(
         '//div[@class="option-list sub-option-list gio_plate"]/a[@class!="on"]/@href'
     ).extract()
     for name, url in zip(town_names, town_urls):
         print("开始镇=========")
         print(url)
         yield scrapy.Request(url=response.urljoin(url),
                              callback=self.town_data)
         time.sleep(random.randint(1, 3))
Code Example #4
File: fang.py Project: myluke/dataems
 def town(self, response):
     town_names = response.xpath(
         '//p[@id="shangQuancontain"]/a[not(@class="org bold")]/text()'
     ).extract()
     town_urls = response.xpath(
         '//p[@id="shangQuancontain"]/a[not(@class="org bold")]/@href'
     ).extract()
     for name, url in zip(town_names, town_urls):
         print("开始镇=========")
         print(url)
         yield scrapy.Request(url=response.urljoin(url),
                              callback=self.town_data)
         time.sleep(random.randint(1, 3))
Code Example #5
 def town(self, response):
     global page_num
     town_names = response.xpath(
         '//div[@class="sub-items"]/a[@data-id!="全部"]/@data-id').extract()
     town_urls = response.xpath(
         '//div[@class="sub-items"]/a[@data-id!="全部"]/@href').extract()
     page_num = 1
     for name, url in zip(town_names, town_urls):
         print("开始镇===========" + url)
         yield scrapy.Request(url=url,
                              headers=self.headers,
                              callback=self.town_data)
         time.sleep(random.randint(1, 3))
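Code Examples #2 through #5 (and the town_data methods further down) are fragments of district-to-town-to-listing crawlers; they assume a surrounding spider class with the usual imports, a headers dict, and a module-level page_num counter. A minimal sketch of that context, with the class name, start URL, and User-Agent as placeholders:

import random
import time

import scrapy

page_num = 1  # page counter updated from town_data via "global page_num"


class FangSpider(scrapy.Spider):  # hypothetical name; the real class is not shown in the snippets
    name = "fang"
    start_urls = ["https://example.com/xiaoqu/"]  # placeholder listing URL
    headers = {"User-Agent": "Mozilla/5.0"}       # placeholder headers used by the requests above

    # parse() -> town() -> town_data(), as in the fragments above and below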
Code Example #6
File: cars.py Project: Hristiyan-Bonev/cars_crawler
 def parse_category(self, response):
     total_pages = int(response.xpath('//span[contains(@class,"pageNumbersInfo")]/b/text()').re(r'\d+$')[0])
     ads_available = response.xpath('//table[contains(@class,"tablereset")]//a[contains(@href,"act=4")]')
     if not ads_available:
         logging.error(f"[#!#] No ads for | {response.meta['manufacturer']} {response.meta['model']} |")
         return {}
     logging.warning(
         f"Processing total of {len(ads_available)} posts for {response.meta['manufacturer']} {response.meta['model']}")
     manufacturer = response.meta['manufacturer']
     CarsCrawlerMobile.processed_cars_count[manufacturer] = (
         CarsCrawlerMobile.processed_cars_count.get(manufacturer, 0) + len(ads_available))
     url_without_page_number = "=".join(response.url.split('=')[:-1])
     for page_number in range(1, total_pages + 1):
         next_page = f'{url_without_page_number}={page_number}'
         yield Request(next_page, callback=self._parse_page, meta=response.meta)
Code Example #7
    def parse_detail(self, response):
        """提取内容"""
        node_list = response.xpath("//div[@id='J_goodsList']/ul/li")

        cnt = 0
        print(len(node_list))
        for node in node_list:
            item = JdItem()
            item['spu_id'] = node.xpath("./@data-spu").extract_first()
            item['sku_id'] = node.xpath("./@data-sku").extract_first()
            item['name'] = node.xpath(
                ".//div[@class='p-name p-name-type-2']/a/em/text()"
            ).extract_first()
            item['price'] = node.xpath(
                ".//div[@class='p-price']/strong/i/text()").extract_first()
            detail_url = node.xpath(
                ".//div[@class='p-img']/a/@href").extract_first()
            item['detail_url'] = 'https:' + detail_url if detail_url else ''
            default_url = node.xpath(
                ".//div[@class='p-img']/a/img/@src").extract_first()
            item['default_url'] = 'https:' + default_url if default_url else ''
            item['comment'] = node.xpath(
                ".//div[@class='p-commit']/strong/a/text()").extract_first()
            item['shop'] = node.xpath(
                ".//div[@class='p-shop']/span/a/text()").extract_first()
            item['is_self'] = node.xpath(
                ".//div[@class='p-icons']/i[1]/text()").extract_first()

            cnt += 1
            yield item
            # yield scrapy.Request(url=url)
        print('--' * 40)
        print(cnt)
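Code Example #7 fills a JdItem that is defined elsewhere in the project. Judging from the keys assigned above, a minimal definition would look roughly like this:

import scrapy


class JdItem(scrapy.Item):
    spu_id = scrapy.Field()
    sku_id = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    detail_url = scrapy.Field()
    default_url = scrapy.Field()
    comment = scrapy.Field()
    shop = scrapy.Field()
    is_self = scrapy.Field()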
Code Example #8
File: cars.py Project: Hristiyan-Bonev/cars_crawler
 def _parse_ad(self, response):
     price, title, *_ = sorted(response.xpath('//div/strong/text()').extract())
     car_details = dict(zip(response.xpath('//ul[@class="dilarData"]/li/text()').extract()[::2],
                            response.xpath('//ul[@class="dilarData"]/li/text()').extract()[1::2]))
     car_features = [x[2:] for x in
                     response.xpath('//div[contains(@style,"margin-bottom:") and contains(.,"•")]/text()').extract()]
     car_features_mapped = {x: 1 if x in car_features else 0 for x in FEATURES_LOOKUP}
     ad_description = "\n".join(
         response.xpath('//div[contains(.,"Допълнителна")]/following::table[1]/tr/td/text()').extract())
     yield {
         'manufacturer': response.meta['manufacturer'].upper(),
         'model': response.meta['model'].upper(),
         'date': datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S'),
         'ad_title': title,
         'price': price,
         'ad_description': ad_description,
         **car_details,
         **car_features_mapped,
     }
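The car_features_mapped dict in Code Example #8 one-hot encodes the ad's features against a FEATURES_LOOKUP constant defined elsewhere in cars.py. A small standalone illustration of the same pattern (the feature names here are made up):

# One-hot encode which of the known features appear in this ad
FEATURES_LOOKUP = ["ABS", "Airbag", "Navigation"]   # illustrative values only
car_features = ["ABS", "Navigation"]                # as parsed from the ad

car_features_mapped = {x: 1 if x in car_features else 0 for x in FEATURES_LOOKUP}
print(car_features_mapped)  # {'ABS': 1, 'Airbag': 0, 'Navigation': 1}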
Code Example #9
 def parse(self, response):
     district_name = response.xpath(
         '//span[@class="item-title" and contains(text(), "区域")]/../span[@class="elems-l"]/a[@class="" and not(contains(text(),"周边"))]/text()'
     ).extract()
     district_url = response.xpath(
         '//span[@class="item-title" and contains(text(), "区域")]/../span[@class="elems-l"]/a[@class="" and not(contains(text(),"周边"))]/@href'
     ).extract()
     # Only the district whose position in the list equals today's day of the month is crawled
     dy = datetime.date.today().day
     count = 1
     for name, url in zip(district_name, district_url):
         print("开始区========")
         print(url)
         if dy == count:
             yield scrapy.Request(url=url,
                                  headers=self.headers,
                                  callback=self.town,
                                  meta={'name': name})
             time.sleep(random.randint(1, 3))
         count = count + 1
Code Example #10
File: test.py Project: langtung/NLP-cho-tieng-Viet
	def getTho(self, response):
		hxs = Selector(response)
		# Poems in the Han layout keep their text in the table cell; otherwise fall back to the PoemBody div
		b = response.xpath("//p[contains(@class,'Poem_Han')]")
		if b:
			sites = response.xpath("//td[contains(@valign, 'top')]/p//text()")
		else:
			sites = hxs.xpath('//div[contains(@id,"PoemBody")]//text()')

		author = hxs.xpath('//a[contains(@href,"viewauthor")]/b/text()')[0].extract()
		title = hxs.xpath('//a[contains(@name,"POEM")]/text()').extract()[0]
		filename = "(" + author + ") " + title + ".txt"
		print("***************************************************************************************")
		with open(filename, 'w', encoding='utf-8') as thefile:
			for site in sites:
				thefile.write(site.extract() + "\n")
Code Example #11
	def getTho(self, response):
		hxs = Selector(response)
		# Join every paragraph of the poem into a single string
		a = response.xpath("//p[contains(@class,'Normal')]//text()")
		a = [''.join(a1 for a1 in a.extract())]
		title = hxs.xpath('//div/h1//text()')[0].extract()
		print(title)
		filename = title + ".txt"
		print("***************************************************************************************")
		with open(filename, 'w', encoding='utf-8') as thefile:
			for a1 in a:
				thefile.write(a1 + "\n")
Code Example #12
    def parse(self, response):
        # Collect the chapter links from the table of contents
        infos = response.xpath("//dl//dd/a/@href").extract()

        for v in infos:
            # Follow each chapter link; dont_filter bypasses Scrapy's duplicate filter
            yield scrapy.Request("https://www.booktxt.net/0_790/" + v,
                                 dont_filter=True,
                                 callback=self.parse_Item)
Code Example #13
    def parse(self, response):
        hack = response.xpath('//*[@id="container"]')
        post_title = response.xpath('//*[@class=" subject_new"]/a/text()')
        post_time = response.xpath('//*[@class="lastpost smalltext"]/text()')
        post_lastpost = response.xpath('//*[@class="lastpost smalltext"]/a/text()')
        post_author = response.xpath('//*[@class="author smalltext"]/a/text()')
        post_link = response.xpath('//*[@class=" subject_new"]/a/@href')
        items = []
        for post_title, post_time, post_author, post_lastpost, post_link in zip(post_title, post_time, post_author, post_lastpost, post_link):
            item = HackforumsItem()
            item['Title'] = re.sub(r'[^a-z A-Z0-9?]', '', post_title.extract().strip())
            item['Time'] = post_time.extract().strip()  # field name assumed; the item definition is not shown here
            item['Lastpost'] = post_lastpost.extract().strip()
            item['Author'] = post_author.extract().strip()
            item['Link'] = post_link.extract().strip()
            items.append(item)
            yield item

        # TODO: the next-page XPath expression is missing here; supply the real selector
        next_page_url = response.xpath().extract_first()
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url, callback=self.parse)
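Code Example #13 assumes a HackforumsItem with one Field per key used above; a minimal sketch (the Time field mirrors the assumption noted in the loop):

import scrapy


class HackforumsItem(scrapy.Item):
    Title = scrapy.Field()
    Time = scrapy.Field()      # assumed field, see the note in the loop above
    Lastpost = scrapy.Field()
    Author = scrapy.Field()
    Link = scrapy.Field()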
Code Example #14
    def town_data(self, response):
        ershou = ajk()
        data_area = response.xpath('//div[@_soj="xqlb"]')
        print(len(data_area))
        global page_num
        names = []
        prices = []
        bdyears = []
        bdaddrs = []
        bddists = []
        lats = []
        lngs = []
        cdates = []
        for data in data_area:
            name = data.xpath('div[@class="li-info"]/h3/a/text()').extract()
            price = data.xpath(
                'div[@class="li-side"]/p/strong/text()').extract()
            bdyear = data.xpath(
                'div[@class="li-info"]/p[@class="date"]/text()').extract()
            bdaddr = data.xpath(
                'div[@class="li-info"]/address/text()').extract()
            if name:
                names.append(name[0])
                #                gpslocation = self.get_gps(name[0])
                #                if gpslocation:
                #                    lats.append(gpslocation['lat'])
                #                    lngs.append(gpslocation['lng'])
                #                elif not gpslocation:
                #                    lats.append(' ')
                #                    lngs.append(' ')
                lats.append(' ')
                lngs.append(' ')
                if price:
                    prices.append(price[0])
                else:
                    prices.append('暂无均价')
                if bdyear:
                    bdyears.append(self.get_year(bdyear[0]))
                else:
                    bdyears.append('9999')
                if bdaddr:
                    address = unicodedata.normalize('NFKC', bdaddr[0].strip())
                    print(address)
                    # Pull the district out of the bracketed "[district-...]" prefix
                    m = re.findall(r"\[.+\-", address)
                    bddists.append(m[0].replace("[", "").replace("-", "").replace("]", ""))
                    # Keep the rest, after the closing bracket, as the street address
                    m = re.findall(r"\].*", address)
                    bdaddrs.append(m[0].replace("[", "").replace("]", "").lstrip())
                else:
                    bdaddrs.append('暂无地址')
                    bddists.append('暂无地址')
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                cdates.append(dt)
            else:
                break

        assert len(names) == len(prices)

        for block_name, block_price, block_bdyear, block_bdaddr, block_bddist, block_lat, block_lng, block_date in zip(
                names, prices, bdyears, bdaddrs, bddists, lats, lngs, cdates):
            ershou = ajk()  # build a fresh item for each listing
            ershou['house_name'] = block_name
            ershou['house_price'] = block_price
            ershou['house_bdyear'] = block_bdyear
            ershou['house_bdaddr'] = block_bdaddr
            ershou['house_bddist'] = block_bddist
            ershou['house_lat'] = block_lat
            ershou['house_lng'] = block_lng
            ershou['craw_date'] = block_date
            ershou['source_web'] = "anjuke"
            yield ershou
        next_link = response.xpath(
            '//div[@class="multi-page"]/a[contains(text(), "下一页")]/@href'
        ).extract()
        print(next_link)
        if next_link:
            url = next_link[0]
            page_num = page_num + 1
            print('next page =============' + url)
            time.sleep(random.randint(1, 3))
            yield scrapy.Request(url=url,
                                 headers=self.headers,
                                 callback=self.town_data)
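Code Examples #14 through #16 all populate an ajk item whose definition is not included in the snippets. Judging from the keys assigned above, it would look roughly like this:

import scrapy


class ajk(scrapy.Item):
    house_name = scrapy.Field()
    house_price = scrapy.Field()
    house_bdyear = scrapy.Field()
    house_bdaddr = scrapy.Field()
    house_bddist = scrapy.Field()
    house_lat = scrapy.Field()
    house_lng = scrapy.Field()
    craw_date = scrapy.Field()
    source_web = scrapy.Field()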
Code Example #15
File: fang.py Project: myluke/dataems
    def town_data(self, response):
        ershou = ajk()
        data_area = response.xpath(
            '//div[@class="houseList"]/div[@class="list rel"]')
        names = []
        prices = []
        bdyears = []
        bdaddrs = []
        bddists = []
        lats = []
        lngs = []
        cdates = []
        for data in data_area:
            print("开始小区=========")
            name = data.xpath(
                'dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/text()'
            ).extract()
            price = data.xpath(
                'div[@class="listRiconwrap"]/p[@class="priceAverage"]/span[1]/text()'
            ).extract()
            bdaddr = data.xpath(
                'dl[@class="plotListwrap clearfix"]/dd/p[2]/text()').extract()
            bddistrict = data.xpath(
                'dl[@class="plotListwrap clearfix"]/dd/p[2]/a[1]/text()'
            ).extract()
            bdyear = data.xpath(
                'dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[3]/text()'
            ).extract()
            if name:
                names.append(name[0].strip().rstrip().lstrip())
                lats.append(" ")
                lngs.append(" ")

                bddists.append(bddistrict[0])
                bdaddrs.append("  ")

                if bdyear:
                    bdyears.append(bdyear[0])
                else:
                    bdyears.append('9999')

                if price:
                    tmp = re.findall("[-+]?\d+[\.]?\d*",
                                     price[0].strip().rstrip().lstrip())
                    if tmp:
                        prices.append(tmp[0])
                    elif not tmp:
                        prices.append('暂无均价')
                elif not price:
                    prices.append('暂无均价')
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                cdates.append(dt)

        for block_name, block_price, block_bdyear, block_bdaddr, block_bddist, block_lat, block_lng, block_date in zip(
                names, prices, bdyears, bdaddrs, bddists, lats, lngs, cdates):
            ershou = ajk()  # build a fresh item for each listing
            ershou['house_name'] = block_name
            ershou['house_price'] = block_price
            ershou['house_bdyear'] = block_bdyear
            ershou['house_bdaddr'] = block_bdaddr
            ershou['house_bddist'] = block_bddist
            ershou['house_lat'] = block_lat
            ershou['house_lng'] = block_lng
            ershou['craw_date'] = block_date
            ershou['source_web'] = "fang"
            yield ershou

        # Follow the "下一页" (next page) link once, outside the per-listing loop
        next_link = response.xpath(
            '//div[@class="fanye gray6"]/a[contains(text(), "下一页")]/@href'
        ).extract()
        if next_link:
            url = next_link[0]
            print('next page =============' + url)
            time.sleep(random.randint(1, 3))
            yield scrapy.Request(url=response.urljoin(url),
                                 callback=self.town_data)
Code Example #16
    def town_data(self, response):
        ershou = ajk()
        data_area = response.xpath(
            '//div[@class="list-wrap"]/ul[@class="house-lst"]/li')
        names = []
        prices = []
        bdyears = []
        bdaddrs = []
        lats = []
        lngs = []
        cdates = []
        for data in data_area:
            name = data.xpath('div[@class="info-panel"]/h2/a/text()').extract()
            price = data.xpath(
                'div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]/span[@class="num"]/text()'
            ).extract()
            bdaddr = data.xpath(
                'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/a[@class="actshowMap_list"]/@xiaoqu'
            ).extract()
            bddistrict = data.xpath(
                'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/a[@class="actshowMap_list"]/@districtname'
            ).extract()
            bdyear = data.xpath(
                'div[@class="info-panel"]/div[@class="col-1"]/div[@class="other"]/div[@class="con"]/text()'
            ).extract()
            if name:
                names.append(name[0].strip().rstrip().lstrip())
                if bdaddr and bdaddr[0]:
                    tmp = re.findall(r"[-+]?\d+\.?\d*", bdaddr[0])
                    if tmp:
                        lngs.append(tmp[0])
                        lats.append(tmp[1])
                    else:
                        lats.append(" ")
                        lngs.append(" ")
                else:
                    lats.append(" ")
                    lngs.append(" ")

                bdaddrs.append(bddistrict[0])

                if (len(bdyear) >= 4):
                    if bdyear[3]:
                        tmp = bdyear[3].strip().rstrip().lstrip()
                        if tmp:
                            bdyears.append(tmp)
                        elif not tmp:
                            bdyears.append('9999')
                    elif not bdyear[3]:
                        bdyears.append('9999')
                else:
                    bdyears.append('9999')

                if price:
                    tmp = re.findall(r"[-+]?\d+\.?\d*",
                                     price[0].strip())
                    if tmp:
                        prices.append(tmp[0])
                    else:
                        prices.append('暂无均价')
                else:
                    prices.append('暂无均价')
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                cdates.append(dt)

        for block_name, block_price, block_bdyear, block_bdaddr, block_lat, block_lng, block_date in zip(
                names, prices, bdyears, bdaddrs, lats, lngs, cdates):
            ershou = ajk()  # build a fresh item for each listing
            ershou['house_name'] = block_name
            ershou['house_price'] = block_price
            ershou['house_bdyear'] = block_bdyear
            ershou['house_bdaddr'] = block_bdaddr
            ershou['house_bddist'] = block_bdaddr
            ershou['house_lat'] = block_lat
            ershou['house_lng'] = block_lng
            ershou['craw_date'] = block_date
            ershou['source_web'] = "lianjia"
            yield ershou

        # Follow the "下一页" (next page) link once, outside the per-listing loop
        next_link = response.xpath(
            '//div[@class="page-box house-lst-page-box"]/a[contains(text(), "下一页")]/@href'
        ).extract()
        if next_link:
            url = next_link[0]
            print('next page =============' + url)
            time.sleep(random.randint(1, 3))
            yield scrapy.Request(url=response.urljoin(url),
                                 callback=self.town_data)
Code Example #17
    def parse(self, response):
        print("Processing --> " + response.url)
        responseMetaInfo = response.request.meta or {}

        # Scrape the visible text of the page, skipping script/noscript/style nodes
        data = response.xpath(
            '//*[not(ancestor-or-self::script or descendant-or-self::script or ancestor-or-self::noscript or descendant-or-self::noscript or ancestor-or-self::style or descendant-or-self::style)]/text()'
        ).getall()
        seller_relation = ''
        adstxtstatus = 0
        track = 0

        # Store the appropriate HTTP status code
        if response.status == 200:
            httpstatus = response.status
            adstxtstatus = 1
        elif 'redirect_reasons' in responseMetaInfo:
            httpstatus = responseMetaInfo['redirect_reasons'][-1]
            adstxtstatus = 2
        else:
            httpstatus = response.status
            adstxtstatus = 3

        # Recover the original URL if the request was redirected
        if responseMetaInfo.get('redirect_urls'):
            url = responseMetaInfo['redirect_urls'][0]
        else:
            url = response.request.url

        domain_name = url.replace('http://', '').replace('/ads.txt', '')

        # Write the scraped data into a temporary text file
        with open('Temp.txt', 'w', newline='\n', encoding='utf-8') as tempFile:
            for entry in data:
                tempFile.write(entry.rstrip("\r\n"))

        # Read the temporary file back and store ads.txt entries in scraped_info,
        # to be written to the final CSV output
        row = ''
        with open('Temp.txt', 'r', encoding='utf-8') as file:
            for row in file:
                seller_relation = "Direct" if "direct" in row.lower() else "Reseller"

                if "direct" in row.lower() or "reseller" in row.lower():
                    track = 1
                    self.scraped_info = {
                        'Domain Name': domain_name,
                        'Ads.txt': adstxtstatus,
                        'Ads.txt Line': row.rstrip("\n"),
                        'Seller Relation': seller_relation,
                        'HTTP Status Code': httpstatus,
                    }
                    yield self.scraped_info

        if track == 0:
            self.scraped_info = {
                'Domain Name': domain_name,
                'Ads.txt': adstxtstatus,
                'Ads.txt Line': row.rstrip("\n"),
                'Seller Relation': seller_relation,
                'HTTP Status Code': httpstatus,
            }
            yield self.scraped_info
Code Example #18
File: cars.py Project: Hristiyan-Bonev/cars_crawler
 def _parse_page(self, response):
     ads = ["https:" + x for x in response.xpath('//td[@class="valgtop"]//a[@class="mmm"]/@href').extract()]
     for ad_url in ads:
         yield Request(ad_url, callback=self._parse_ad, meta=response.meta)
Code Example #19
File: test1.py Project: langtung/NLP-cho-tieng-Viet
	def getTho(self, response):
		# Collect the text of every link whose href contains "java"
		a = response.xpath('//td//a[contains(@href,"java")]/text()')
		with open('testLink.txt', 'w', encoding='utf-8') as thefile:
			for a1 in a:
				thefile.write(a1.extract() + "\n")