Code example #1
File: yahoo.py Project: Lvxingpai/Andaman
    def parse_state_url(self, response):  # draw the state
        sel = Selector(response)
        tempcountryname = sel.xpath(
            '//div[@id="MediaWeatherRegion"]/div[@class="hd"]/div[@class="yom-bread"]/text()').extract()
        match = re.search(r'[\w\s]+$', tempcountryname[0])
        if match:
            countryname = match.group().strip()
        else:
            self.log('没有国家名', log.WARNING)
            return

        data_1 = response.meta['data']

        for node in sel.xpath('//div[@id="page1"]/ul/li/a'):
            state_name = node.xpath('./span/text()').extract()[0].strip()
            state_href = node.xpath('./@href').extract()[0]

            yield Request(url='https://weather.yahoo.com' + state_href, callback=self.parse_city,
                          meta={'data': {'data_1': data_1, 'countryname': countryname, 'state': state_name}})

            country_code = data_1['countrycode']

            # Get states and provinces
            item = YahooCityItem()
            item['country'] = {'countrycode': country_code, 'countryname': countryname}
            item['state'] = state_name
            item['level'] = 1
            item['abroad'] = data_1['abroad']
            yield item
Code example #2
File: kaidy.py Project: miminus/Spider_demo
 def parse_torrent(self,response):  
     all_content = BeautifulSoup(response.body,'html5lib')
     sel = Selector(text=all_content.prettify(), type="html")
     topic_item = response.meta['topic_item']
     topic_item['thread_content'] = response.body
     
     topic_item['topic_board']='凯迪社区'
     print '+++++++++++++++++++'
     try:
         homepage = sel.xpath('//div[re:test(@class,"postspecific")]//span[re:test(@class,"c-main")]/a/@href').extract()[0].strip()
         topic_item['homepage'] = homepage 
         
         user_id = re.findall(self.urser_id_pa,homepage)[0]
         topic_item['poster_id'] = user_id
     except:
         topic_item['homepage'] = '' 
         topic_item['poster_id'] = '111'
     
     topic_item['data_type']=2
     topic_item['site_id']=8        
     
     scratch_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
     topic_item['scratch_time'] = scratch_time    
         
         
     return topic_item
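The attribute self.urser_id_pa used above is a regular expression defined elsewhere on the spider and is not part of this snippet. A purely hypothetical definition, only to illustrate how the poster id might be pulled out of the homepage URL (the real pattern depends on the site's actual profile URL scheme):

import re

# hypothetical pattern, not the project's real one; assumes profile URLs carry a numeric uid parameter
urser_id_pa = re.compile(r'uid=(\d+)')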
Code example #3
    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="item"]/div[@class="info"]')
        items = []
        for site in sites:
            item = Doubantop250FilmItem()
            item['name'] = str("".join(site.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()))
            item['rate'] = str("".join(site.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()))
            item['url'] = str("".join(site.xpath('div[@class="hd"]/a/@href').extract()))
            item['rate_num'] = str(site.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[1])
            item['summary'] = str("".join(site.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()))

            direct_actor = str(site.xpath('div[@class="bd"]/p/text()').extract()[0]).replace("\n", "")
            if "主演" in direct_actor:
                item['director'] = direct_actor.split("主演")[0].strip().split("导演")[1].strip().replace(":", "")
                item['actor'] = direct_actor.split("主演")[1].strip().replace(":", "")
            else:
                item['director'] = direct_actor.split("导演")[1].strip().replace(":", "")
                item['actor'] = 'unknown'

            releaseDate_nation_type = str(site.xpath('div[@class="bd"]/p/text()').extract()[1]).replace("\n", "")
            item['releaseDate'] = releaseDate_nation_type.split("/")[0].strip()
            item['nation'] = releaseDate_nation_type.split("/")[1].strip()
            item['type'] = releaseDate_nation_type.split("/")[2].strip()
            items.append(item)
        return items
Code example #4
File: gamepedia.py Project: kylezwarich/PoE-Scrapy
 def parse(self, response):
     """
     The lines below are a spider contract. For more info see:
     http://doc.scrapy.org/en/latest/topics/contracts.html
     @url http://pathofexile.gamepedia.com/List_of_unique_XXX
     @scrapes pathofexile.gamepedia.com
     """
     if not self._is_valid_url(response.url):
         return None
     url_parts = urlparse(response.url)
     self.set_path(url_parts)
     #self.log('A response from %s just arrived!' % response.url)
     sel = Selector(response)        
     items = sel.xpath(".//tr[@id]")
     unique_items = []
     for an_item in items: 
         unique_item = UniqueItem()
         unique_item['name'] = an_item.xpath("./td[1]/a[1]/@title").extract()[0]
         num_spans = len(an_item.xpath("./td[last()]//div[@class='itemboxstatsgroup']/span"))
         if num_spans == 1:
             unique_item['implicit_mods'] = []
         else:
             unique_item['implicit_mods'] = an_item.xpath("./td[last()]//div[@class='itemboxstatsgroup'][1]//span/text()").extract()
         affix_mods = an_item.xpath("./td[last()]//div[@class='itemboxstatsgroup'][last()]//span/text()").extract()                
         unique_item['affix_mods'] = affix_mods
         unique_item['url'] = "{}://{}{}".format(url_parts.scheme, 
                                                 url_parts.netloc, an_item.xpath("./td[1]/a[1]/@href").extract()[0])
         unique_item['category'] = self.get_category()
         unique_items.append(unique_item)
     return unique_items
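The docstring above is a Scrapy spider contract: the @url line gives a sample page for the contract runner to fetch and check the callback against. Assuming the spider lives in a regular Scrapy project, contracts are run with the built-in command scrapy check <spider_name>.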
Code example #5
	def parse_job_details(self, response):
		hxs = Selector(response)
		item = BrightermondaySampleItem()
		item['link'] = response.url
		item['title'] = hxs.xpath('//h2/text()').extract()[0]
		item['desc'] = hxs.xpath('//article[@class="resultDetail"]/p/text()').extract()[0]
		return item
Code example #6
File: gspider.py Project: tiagodiz/Lab
    def parse(self, response):
        item = Bet()
        item['bookmaker'] = 'TheGreek'
        item['sport'] = 'Soccer'
        item['eventDate'] = '23'
        item['moneyLine'] = {}
        item['totals'] = {}
        item['spreads'] = {}

        
        
        leagues = Selector(response).xpath('//div[@class="table-container"]')
        for league in leagues:
            item['league'] = league.xpath('h4/text()').extract()[0].strip()
            lines = league.xpath('div[@class="lines"]')  # odds rows within the current league
            for line in lines:
                item['homeTeam'] = line.xpath('ul/li[@class="name"]/a/text()').extract()[0].strip()
                item['awayTeam'] = line.xpath('ul/li[@class="name"]/a/text()').extract()[1].strip()
                item['moneyLine']['home'] = line.xpath('ul/li[@id="ml"]/a/text()').extract()[0].strip()
                item['moneyLine']['draw'] = line.xpath('ul/li[@id="ml"]/a/text()').extract()[1].strip()
                item['moneyLine']['away'] = line.xpath('ul/li[@id="ml"]/a/text()').extract()[2].strip()
                item['totals']['points'] = line.xpath('ul/li[@id="gt"]/a/text()').extract()[0].strip().encode("utf8")
                item['totals']['over'] = line.xpath('ul/li[@id="gt"]/a/text()').extract()[1].strip()
                item['totals']['under'] = line.xpath('ul/li[@id="gt"]/a/text()').extract()[3].strip()
                item['spreads']['hdp'] = line.xpath('ul/li[@id="spread_home"]/a/text()').extract()[0].strip().encode("utf8")
                item['spreads']['home'] = line.xpath('ul/li[@id="spread_home"]/a/text()').extract()[1].strip()
                item['spreads']['away'] = line.xpath('ul/li[@id="spread_away"]/a/text()').extract()[1].strip()
                yield item
Code example #7
File: mobile_spider.py Project: prm10/python_project
    def parse(self, response):
        sel=Selector(response)
        sites=sel.xpath('//li[@class="item"]/div[@class="item-wrap"]')
        items=[]
        info=str(len(sites))+" mobile info have been found"
        logging.info(info.encode('utf-8'))
        for site in sites:
            item=WebCrawlingItem()

            site1=site.xpath('div[@class="item-detail"]/div[@class="item-title"]')
            site2=site.xpath('div[@class="item-detail"]/div[@class="item-rela"]/a')
            site3=site.xpath('div[@class="item-sales"]')

            name=site1.xpath('h3/a/text()').extract()
            describe=site1.xpath('span/text()').extract()
            level=site2.xpath('div[@class="score-num"]/text()').extract()
            price=site3.xpath('div[@class="price price-now"]/a/text()').extract()

            # print(name)
            item['name'] = self.str_join([d.encode('utf-8') for d in name])
            item['describe'] = self.str_join([d.encode('utf-8') for d in describe])
            item['level'] = self.str_join([d.encode('utf-8') for d in level])
            item['price'] = self.str_join([d.encode('utf-8') for d in price])
            items.append(item)
            logging.info("Appending item "+item['name'])
        logging.info("Append done.")
        return items
Code example #8
File: spiders.py Project: Q3B0/PythonSpider_Scrapy
    def parse_item(self, response):

        selector = Selector(response)
        companyInfo = selector.xpath('//td[@class="cont_company"]//td[@class="td_r"]/text()')
        jobInfo = selector.xpath('//*[@id="DataList1"]//table/tr')
        contactInfo = selector.xpath('//td[@class="cont_contact"]')
        contact_text = contactInfo.xpath('text()').extract()[0] + ' ' + contactInfo.xpath('text()').extract()[1] + ' ' + contactInfo.xpath('text()').extract()[2]

        #print self.mailre.findall(contact_text)
        #print self.phonePartern.match(contactInfo.xpath('text()').extract()[0])
        #print self.emainPartern(contactInfo.xpath('text()').extract()[1])
        #print (contactInfo.xpath('text()').extract()[2]).replace(' ','')

        for each in jobInfo:
            item = TsrcwItem()
            print each.extract()
            jobList = []
            try:
                for i in each.xpath('td[@class="td-grey"]/text()'):
                    if not (i.extract()).strip() == "":
                        jobList.append((i.extract()).strip())
                item['email'] = self.mailre.findall(contact_text)[0]
                item['companyName'] = (companyInfo.extract()[0]).strip()
                item['industryName'] = (companyInfo.extract()[1]).strip()
                item['companyNature'] = (companyInfo.extract()[2]).strip()
                item['jobName'] = (each.xpath('td[@class="td-grey"]/a/text()').extract()[0]).strip()
                item['jobDetail'] = self.baseUrl+(each.xpath('td[@class="td-grey"]/a/@href').extract()[0]).strip()
                item['jobRegion'] = jobList[0]
                item['requiredDegree'] = jobList[1]
                item['salary'] = jobList[2]
                item['endDate'] = jobList[3]
                yield item
            except Exception,e:
                continue
Code example #9
File: utils.py Project: bittirousku/hepcrawl
def get_node(text, namespaces=None):
    """Get a scrapy selector for the given text node."""
    node = Selector(text=text, type="xml")
    if namespaces:
        for ns in namespaces:
            node.register_namespace(ns[0], ns[1])
    return node
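A short usage sketch for get_node; the XML snippet and the dc namespace mapping below are invented for illustration:

xml_text = (
    '<record xmlns:dc="http://purl.org/dc/elements/1.1/">'
    '<dc:title>An example title</dc:title>'
    '</record>'
)
node = get_node(xml_text, namespaces=[('dc', 'http://purl.org/dc/elements/1.1/')])
titles = node.xpath('//dc:title/text()').extract()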
Code example #10
File: probehost.py Project: samwellzuk/test
async def _parse_result(host, resp):
    """
    Parse the homepage response; if any error is detected return a short error code, otherwise return None.
    """
    rurl = urlparse(str(resp.url))
    # check whether the site returned a "friendly" error page, e.g. a URL like http://www.test.com/504.htm
    m = _url_error_re.search(rurl.path)
    if m:
        return "SITE:%s" % m.group(1)
    # check whether the site redirected to another domain; the nginx config may be wrong or the host may have been reclaimed by the DNS provider
    if not rurl.netloc.lower().endswith(host.lower()):
        return "SITE:REDIRECT"
    # body check
    html = await resp.text()
    sel = Selector(text=html)
    emlist = sel.xpath('//body/*').extract()
    sbody = ''.join(emlist)
    # check whether the site homepage is blank
    if len(sbody) == 0:
        return "BODY:Blank"
    else:
        m = _nginx_error_page.search(sbody)
        if m:
            return "NGX:%s" % m.group(1)
        elif len(sbody) < _url_body_min:
            return "BODY:Min"
    return None
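_url_error_re, _nginx_error_page and _url_body_min are module-level helpers that are not shown in this snippet. One plausible shape for them, consistent with how they are used above (assumptions, not the original definitions):

import re

# matches "friendly" error pages such as /504.htm and captures the status code
_url_error_re = re.compile(r'/(\d{3})\.html?$', re.I)
# matches the heading of a default nginx error page and captures the status code
_nginx_error_page = re.compile(r'<center>\s*<h1>\s*(\d{3})[^<]*</h1>', re.I)
# bodies shorter than this many characters are treated as suspiciously small
_url_body_min = 512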
Code example #11
    def parse_alpha(self, response):
        """ extract the alpha letters links"""
        sel = Selector(response)
        urls = sel.css("ul.alpha li a::attr(href)").extract()

        for url in urls:
            yield Request(url, callback=self.parse_page)
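For comparison, the CSS expression above is roughly equivalent to the following XPath (shown only to illustrate the equivalence, not part of the original spider):

urls = sel.xpath('//ul[contains(concat(" ", normalize-space(@class), " "), " alpha ")]//li//a/@href').extract()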
Code example #12
 def pages(self, response):
     """
     Extract the number of result pages for each case type and issue the corresponding requests.
     """
     sel = Selector(text=response.body)
     self.cases(response)   # extract the contents of the first page
     total = sel.xpath("//table/tbody//script/text()").re(u"共[\D]*?([\d]*?)[\D]*?页")
     try:
         total = int(total[0]) + 1
         for i in xrange(2, total):
             self.data['page'] = str(i)
             con = ["=".join(item) for item in self.data.items()]
             tail = "&".join(con)
             url = self.model_urls + "?" + tail
             fp = self.url_fingerprint(url)
             isexist = self.myRedis.sadd(self.url_have_seen,fp)
             if isexist:
                 # sadd inserts the fingerprint and returns 1 when it is not yet
                 # in the redis set ppai_dup_redis, otherwise it returns 0
                 yield Request(url, callback=self.cases,\
                     dont_filter=False)
             else:
                 pass
     except Exception, e:
         log.msg("only_one url==%s== error=%s" %(response.url,\
             e), level=log.ERROR)
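self.url_fingerprint is a helper defined elsewhere on the spider. A minimal sketch of what it might look like (an assumption, not the project's implementation): hash the URL into a stable digest so it can be stored compactly in the redis de-duplication set.

import hashlib

def url_fingerprint(self, url):
    # stable hex digest of the url, used as the member stored in the redis set
    return hashlib.sha1(url.encode('utf-8')).hexdigest()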
Code example #13
File: zappos_spider.py Project: mtaziz/jaycluster
    def parse_item(self, response):
        sel = Selector(response)
        item = ZapposItem()
        self._enrich_base_data(item, response, is_update=False)
        item['productId'] = ''.join(sel.xpath('//form[@id="prForm"]/input[@name="productId"]/@value').extract()).strip()

        if item['productId'] in self.seen_products:
            self.crawler.stats.inc_total_pages(response.meta['crawlid'], response.meta['spiderid'],
                                               response.meta['appid'], -1)
            return
        else:
            self.seen_products.add(item['productId'])
        self._enrich_same_part(item, response)
        positions = ['p', '1', '2', '3', '4', '5', '6']
        all_images = []
        image_urls = []
        for one_colorId in item['colorIds']:
            for one_position in positions:
                reg_str = r"pImgs\[%s\]\[\'4x\'\]\[\'%s\'\] = . filename: '(.*?)'," % (one_colorId, one_position)
                image_file = re_search(reg_str, response.body, dotall=False)
                image_file = image_file.replace("'", "").replace('"', "")  # str.replace returns a new string, so reassign it
                all_images.append([one_colorId, one_position, image_file])
                if len(image_file) > 0:
                    image_urls.append(image_file)

        item['color_images'] = all_images
        item['image_urls'] = image_urls
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )

        return item
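re_search is a project helper that is not included in this snippet. A minimal sketch of the behaviour the calls above appear to rely on (an assumption, not the project's implementation):

import re

def re_search(pattern, text, dotall=True):
    # return the first capture group of the first match, or an empty string when nothing matches
    flags = re.S if dotall else 0
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    m = re.search(pattern, text, flags)
    return m.group(1) if m else ''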
Code example #14
File: qyer.py Project: Lvxingpai/Andaman
    def parse_country_helper(self, response):
        if 'cont' in self.param:
            self.cont_map = {tmp: self.cont_map[tmp] for tmp in self.param['cont']}

        if 'country' in self.param:
            self.country_filter = [int(tmp) for tmp in self.param['country']]

        sel = Selector(response)

        for cont in self.cont_map:
            cont_node = sel.xpath('//div[@class="pla_indcountrylists"]/div[@id="%s"]' % self.cont_map[cont])[0]
            for region_node in cont_node.xpath('.//li[@class="item"]'):
                is_hot = bool(region_node.xpath('./p[@class="hot"]').extract())
                tmp = region_node.xpath('.//a[@href and @data-bn-ipg]')
                if not tmp:
                    continue
                region_node = tmp[0]
                zh_name = region_node.xpath('./text()').extract()[0].strip()
                en_name = region_node.xpath('./span[@class="en"]/text()').extract()[0].strip()
                tmp = region_node.xpath('./@data-bn-ipg').extract()[0]
                pid = int(re.search(r'place-index-countrylist-(\d+)', tmp).group(1))
                href = region_node.xpath('./@href').extract()[0]
                url = self.build_href(response.url, href)

                if self.country_filter and pid not in self.country_filter:
                    continue

                item = {'type': 'country'}
                data = {'zhName': zh_name, 'enName': en_name, 'alias': {zh_name.lower(), en_name.lower()},
                        'isHot': is_hot, 'id': pid, 'url': url}
                item['data'] = data

                yield item
Code example #15
File: qyer.py Project: Lvxingpai/Andaman
    def parse_homepage(self, response):
        sel = Selector(response)

        def func(node, hot):
            country_url = node.xpath('./@href').extract()[0].strip()
            country_name = node.xpath('./text()').extract()[0].strip()
            ret = node.xpath('./span[@class="en"]/text()').extract()
            country_engname = ret[0].lower().strip() if ret else None

            if 'country' in self.param and country_engname.lower() not in self.param['country']:
                return None

            sights_url = urlparse.urljoin(country_url, './sight')
            m = {"country_name": country_name, "country_url": country_url, "country_popular": hot,
                 "country_engname": country_engname, "sights_url": sights_url}
            return Request(url=sights_url, callback=self.parse_countrysights, meta={"country": m})

        for req in map(lambda node: func(node, False),
                       sel.xpath('//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/a[@href]')):
            yield req

        for req in map(lambda node: func(node, True),
                       sel.xpath(
                               '//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/p[@class="hot"]/a[@href]')):
            yield req
Code example #16
    def stepTwo(self, response):
        hxs = Selector(response)
        translatedRows = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "status-current") ]/td[@class="original"]')
       
        # print ( len(untranslatedRows) )
        # pdb.set_trace()

        for rows in translatedRows:

            aux = ""

            for r in rows.xpath('./child::node()').extract():        
                aux = aux + r.strip() + ' '        
            
            i = self.compareStrings(aux) 
            
            if i is not None:
                #scrapy item
                # traductionItem = W
                # traductionItem['originalString'] = aux
                self.untranslated[i]['translatedString'] = rows.xpath('./..//td[@class="translation foreign-text"]/text()').extract()[0].strip()
                
        paginaSiguiente = []
        paginaSiguiente = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href')

        try:            
            fullUrl_toNextPage = response.urljoin( paginaSiguiente[0].extract() )
            return fullUrl_toNextPage
        except Exception:
            return None
Code example #17
File: getUrls.py Project: tienhv/LM_Scraper
def parseAreas(data):

	global pURL4
	hxs = Selector(text=data)

	for x in hxs.xpath("//div[@id=\'location_area\']/div[@class=\'facet-values\']/a/@href").extract():
		pURL4.append(addURL+x)
Code example #18
File: MafengwoSpider.py Project: Lvxingpai/Andaman
    def parse_reply(self, response):
        items = []
        sel = Selector(response)
        item = MafengwoYoujiItem()
        item_data = response.meta['item']

        item['author'] = item_data['author']
        item['title'] = item_data['title']
        item['reply'] = item_data['reply']
        item['place'] = item_data['place']
        item['public_time'] = item_data['public_time']
        item['way'] = item_data['way']
        item['days'] = item_data['days']
        item['contents'] = item_data['contents']
        item['cost'] = item_data['cost']
        page = response.meta['page']
        max_page = response.meta['max_page']
        template_url = response.meta['url'][:-1]

        reply = sel.xpath('//div[@class="post_item"]/div/div[contains(@class,"a_con_text reply")]').extract()
        if reply:
            item['reply'].extend(reply)
        max_page = int(max_page)
        if page < max_page:
            page += 1
            url = "%s%d" % (template_url, page)
            return Request(url=url, callback=self.parse_reply,
                           meta={'item': item, 'page': page, 'max_page': max_page, 'url': url})
        else:
            items.append(item)
            return items
Code example #19
    def stepOne(self, response):
        
        hxs = Selector(response)
        # untranslatedRows = hxs.xpath('//table[@id="translations"]/tr[@class="preview untranslated priority-normal no-warnings"]/td[@class="original"]')
        untranslatedRows = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "untranslated") ]/td[@class="original"]')

        for rows in untranslatedRows:

            aux = WordpressTranslationHackItem()
            aux['originalString'] = ''

            for r in rows.xpath('./child::node()').extract():        
                aux['originalString'] = aux['originalString'] + r.strip() + ' '        
            
            self.untranslated.append( aux )
                
            # print ( self.untranslated[-1] )
            # print ( '------------------' )
            # pdb.set_trace()

        paginaSiguiente = []
        paginaSiguiente = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href')        

        try:            
            fullUrl_toNextPage = response.urljoin( paginaSiguiente[0].extract() )
            return fullUrl_toNextPage
        except Exception:
            return None
Code example #20
File: detail.py Project: Syhen/heretofore
 def parse_fans(self, response):
     fans_page = response.meta['fans_page']
     fans_level = response.meta['fans_level']
     item = response.meta['item']
     if u'暂无霸王票' in response.body.decode('gbk', 'ignore'):
         item['fans'] = []
         if fans_level:
             counter = Counter(fans_level)
             item['fans'] = [{'name': k, 'value': counter[k]} for k in counter]
         yield item
         return
     sel = Selector(text=response.body.decode('gbk', 'ignore'))
     fans_level.extend(sel.xpath('//*[@id="rank"]/div[2]/table/tr/td[2]/text()').extract())
     fans_page += 1
     if fans_page > 5:
         counter = Counter(fans_level)
         item['fans'] = [{'name': k, 'value': counter[k]} for k in counter]
         yield item
     else:
         yield Request(
             url='http://www.jjwxc.net/reader_kingticket.php?novelid={0}&page={1}'.format(
                 item['book_id'], fans_page),
             meta={'item': item, 'fans_page': fans_page, 'fans_level': fans_level},
             callback=self.parse_fans,
             dont_filter=True
         )
Code example #21
 def parse(self, response):
     zip_file = open('CANADA_ZIPCODES.txt', 'r+')
     zip_list = filter(None, zip_file.read().split('\n'))
     for zip_item in zip_list:
         print "*** zip_item"
         print zip_item
         geo_url = 'https://maps.google.com/?q=%s canada'%(zip_item)
         try:
             map_url_content = requests.get(geo_url).content
         except:
             sleep(15)
             map_url_content = requests.get(geo_url).content
         sleep(3)
         sell = Selector(text=map_url_content)
         map_error_1 = sell.xpath(
             '//div[@class="sp-error-msg"]|//div[@class="noprint res"]/div//div[contains(@id,"marker_B")]')
         latlong = ' '.join(sell.xpath('//script').extract()) if not map_error_1 else ''
         lat_lng = re.findall(r'",\[(-?\d+\.?\d*),(-?\d+\.?\d*)\]\]', latlong, re.I)
         venue_latitude, venue_longitude = lat_lng[0] if lat_lng else ('', '')
         print venue_latitude, venue_longitude
         if not venue_latitude or not venue_longitude:
             with open('missing_lat_lng.txt', 'a+') as d:
                 print "*** DROPPED ZIP - %s"%(zip_item)
                 d.write(zip_item+'\n')
             print "NO LATITUDE OR LONGITUDE"
         else:
             fetch_url = 'http://api.invisalign.com/svc/rd?pc=%s&cl=CA&lat=%s&lng=%s&it=us'%(zip_item, venue_latitude, venue_longitude)
             meta_data = {'venue_latitude': venue_latitude,
                          'venue_longitude': venue_longitude,
                          'zip_code': zip_item}
             yield Request(url = fetch_url, dont_filter=True, callback=self.parse_result, meta=meta_data)
Code example #22
File: alternate.py Project: Overdaix/Project5
 def parse_item(self, response):
     items = []
     sel = Selector(response)
     print("test1")
     products = sel.xpath('//*[@id="coreProductInfos"]/div[2]')
    # breadcrumbs = sel.xpath('//div[@id ="contentWrapper"]')\
     table = sel.xpath('//tr[contains(td, "techDataCol")]')
     category = sel.xpath('//*[@id="contentWrapper"]/div[1]/span[2]/a/span/text()').extract()
     print(category)
     for product in products:
         if 'Geheugen' in category:
             item = Memory()
             print (table.xpath('//td/text()').extract())
             item['Category'] = category
             item['Name'] = product.xpath('//td[contains(td[1], "Modelnaam")]/td[2]/table/tbody/tr/td/text()').extract()
             item['Brand'] = product.xpath('//*[@id="details"]/div[4]/div/table/tbody/tr[2]/td[2]/table/tbody/tr/td/text()').extract()
             item['Quantity'] = product.xpath('//tr[contains(td[1], "Aantal")]/td[2]/text()').extract()
             item['Size'] = product.xpath('//tr[contains(td[1], "Modulegrootte")]/td[2]/text()').extract()
             item['PriceGB'] = product.xpath('//tr[contains(td[1], "Prijs per GB")]/td[2]/text()').extract()
             item['Type'] = product.xpath('//tr[contains(td[1], "Geheugentype")]/td[2]/text()').extract()
             item['Specification'] = product.xpath('//tr[contains(td[1], "Geheugen Specificatie")]/td[2]/text()').extract()
             item['LowVoltage'] = product.xpath('//tr[contains(td[1], "Low Voltage DDR")]/td[2]/text()').extract()
             item['Voltage'] = product.xpath('//tr[contains(td[1], "Spanning")]/td[2]/text()'). extract()
             item['Warranty'] = product.xpath('//tr[contains(td[1], "Fabrieksgarantie")]/td[2]/text()').extract()
             item['Ean'] = product.xpath('//tr[contains(td[1], "EAN")]/td[2]/text()').extract()
             item['Sku'] = product.xpath('//tr[contains(td[1], "SKU")]/td[2]/text()').extract()
             print("Geheugen!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
             items.append(item)
         return items
Code example #23
 def detail(self, response):
     """
     extract detail info
     """
     sel = Selector(text=response.body)
     
     condition = sel.xpath(self.xpathSen["brand"]).extract()
     if len(condition) != 0:
         xpath_keys = ["type_auto","brand","level","BSX",
              "CSJG","ZWGS","PL","RLXS","QDFS"]
         xpath_conf = ["DDTC","DDTJZY","ESP","GPS","DSXH",
              "DCLD","DGLFXP"]
         keys_info = []
         for xpath_str in xpath_keys:
             tmp = sel.xpath(self.xpathSen[xpath_str]).extract()
             try:
                 keys_info.append(tmp[0])
             except Exception, e:
                 keys_info.append("")
                 log.msg("error info=%s keys_info=%s" %(e, "\001".join(keys_info)), level=log.ERROR)
         
         conf_info = []
         for xpath_s in xpath_conf:
             tmp = sel.xpath(self.xpathSen[xpath_s]).extract()
             try:
                 conf_info.append(tmp[0])
             except Exception, e:
                 conf_info.append("-")
                 log.msg("error info=%s conf_info=%s"%(e, \
                     "\001".join(conf_info)), level=log.ERROR)
Code example #24
    def parse_location(self, response):
        sel = Selector(response)
        print(" **************** LOCATION LIST *************")
        print(response.url)
        print(" **************** LOCATION LIST *************")

        location = sel.xpath("//ul[@class='geoList']")
        for loc in location:
            state_link = loc.xpath("li/a/@href").extract()
            print(" **************** Attraction List starts *************")

            for link in state_link:
                url_link = response.urljoin(link)
                print(url_link)
                # "https://www.tripadvisor.com/Attractions-g34345-Activities-Key_West_Florida_Keys_Florida.html"
                yield scrapy.Request(url_link, callback=self.parse_attraction)
            print(" **************** Attraction List  ends *************")



            # yield scrapy.Request(url_link,callback=self.parse_test)

        locations = sel.xpath("//a[@class='guiArw sprite-pageNext  pid0']/@href").extract()
        print(" **************** LOCATION LIST  PAGINATION  starts *************")
        print(locations)
        print(" **************** LOCATION Link *************")

        for location in locations:
            if location:
                location_link = response.urljoin(location)
                print(location_link)
                yield scrapy.Request(location_link, callback=self.parse_location)
        print(" **************** LOCATION Link *************")

        print(" **************** LOCATION LIST  PAGINATION  ends *************")
Code example #25
    def parse_channel(self, response):
        hxs = Selector(response)
        item = response.meta['record']
        item['video_url'] = hxs.xpath("body//div[@id='divVideoHolder']/@videosrc").extract()[0]
        item["title"] = hxs.xpath("body//div[@id='divTitrGrid']/text()").extract()[0]

        return item
Code example #26
File: migration.py Project: 01-/portia
def load_annotations(body):
    """Create slybot annotations from annotated html."""
    if not body:
        return {'annotations-plugin': {'extracts': []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    for elem in sel.xpath('//*[@data-scrapy-annotate]'):
        attributes = elem._root.attrib
        annotation = json.loads(unquote(attributes['data-scrapy-annotate']))
        if (isinstance(elem._root, _Element) and
                elem._root.tag.lower() == 'ins'):
            annotation.update(find_generated_annotation(elem))
        else:
            annotation['tagid'] = attributes.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation['id'])
        annotations.append(annotation)
    for elem in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        attributes = elem._root.attrib
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        ignore = {attribute[len('data-scrapy-'):]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore['id'])
        annotations.append(ignore)
    return {'annotations-plugin': {'extracts': annotations}}
Code example #27
	def parse_page_2(self, response):
		hxs = Selector(response)
		url = hxs.xpath("//a[@class='post-get-it-button--primary']/@href").extract()[0]
		return Request(
			url=url,
			callback=self.parse_end_page,
			meta={'item': response.meta['item']})
Code example #28
    def parse(self, response):
        driver = response.meta['driver']
        for _, value in self.df.iterrows():
            driver.get(value['url'])
            time.sleep(2)

            html = driver.page_source
            resp_obj = Selector(text=html)

            check1 = resp_obj.xpath("//div[@data-type='items']")
            check2 = resp_obj.xpath(
                "//span[text()='Shop by Category' or text()='Shop by category']/parent::span/parent::button/following-sibling::div/div/ul/li"
            )
            check3 = resp_obj.xpath(
                "//h2[text()='Shop by category']/parent::div/parent::div/following-sibling::div//div[@class='TempoCategoryTile-tile valign-top']"
            )
            if check1:
                cntr = 1
                while True:
                    html = driver.page_source
                    resp_obj = Selector(text=html)
                    listings = resp_obj.xpath("//div[@data-type='items']")
                    for prods in listings:
                        product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}'''
                        product_name = prods.xpath(
                            "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())"
                        ).get()
                        price = prods.xpath(
                            "normalize-space(.//span[@class='price-main-block']/span/span/text())"
                        ).get()
                        if not product_name:
                            product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}'''
                            product_name = prods.xpath(
                                "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())"
                            ).get()
                        if not price:
                            price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}'''
                        yield {
                            'product_url': product_url,
                            'product_name': product_name,
                            'product_price': price,
                            'lvl1_cat': value['lvl1_cat'],
                            'lvl2_cat': value['lvl2_cat'],
                            'lvl3_cat': value['lvl3_cat'],
                            'lvl4_cat': None
                        }

                    next_page = resp_obj.xpath(
                        "//span[text()='Next Page']/parent::button")
                    cntr += 1
                    if next_page:
                        next_page = resp_obj.xpath(
                            f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href"
                        ).get()
                        driver.get(f"https://www.walmart.com{next_page}")
                        time.sleep(2)
                    else:
                        break

            elif check2:
                driver.execute_script("window.open('');")
                driver.switch_to.window(driver.window_handles[1])
                for listings in check2:
                    lvl4_cat = listings.xpath(".//a/span/text()").get()
                    url = listings.xpath(".//a/@href").get()
                    driver.get(f"https://www.walmart.com{url}")
                    cntr = 1
                    while True:
                        html = driver.page_source
                        resp_obj = Selector(text=html)
                        listings = resp_obj.xpath("//div[@data-type='items']")
                        for prods in listings:
                            product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}'''
                            product_name = prods.xpath(
                                "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())"
                            ).get()
                            price = prods.xpath(
                                "normalize-space(.//span[@class='price-main-block']/span/span/text())"
                            ).get()
                            if not product_name:
                                product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}'''
                                product_name = prods.xpath(
                                    "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())"
                                ).get()
                            if not price:
                                price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}'''
                            yield {
                                'product_url': product_url,
                                'product_name': product_name,
                                'product_price': price,
                                'lvl1_cat': value['lvl1_cat'],
                                'lvl2_cat': value['lvl2_cat'],
                                'lvl3_cat': value['lvl3_cat'],
                                'lvl4_cat': lvl4_cat
                            }

                        next_page = resp_obj.xpath(
                            "//span[text()='Next Page']/parent::button")
                        cntr += 1
                        if next_page:
                            next_page = resp_obj.xpath(
                                f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href"
                            ).get()
                            driver.get(f"https://www.walmart.com{next_page}")
                            time.sleep(2)
                        else:
                            break
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            elif check3:
                driver.execute_script("window.open('');")
                driver.switch_to.window(driver.window_handles[1])
                for listings in check3:
                    lvl4_cat = listings.xpath(".//span/text()").get()
                    url = listings.xpath(".//following-sibling::a/@href").get()
                    driver.get(f"https://www.walmart.com{url}")
                    cntr = 1
                    while True:
                        html = driver.page_source
                        resp_obj = Selector(text=html)
                        listings = resp_obj.xpath("//div[@data-type='items']")
                        for prods in listings:
                            product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}'''
                            product_name = prods.xpath(
                                "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())"
                            ).get()
                            price = prods.xpath(
                                "normalize-space(.//span[@class='price-main-block']/span/span/text())"
                            ).get()
                            if not product_name:
                                product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}'''
                                product_name = prods.xpath(
                                    "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())"
                                ).get()
                            if not price:
                                price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}'''
                            yield {
                                'product_url': product_url,
                                'product_name': product_name,
                                'product_price': price,
                                'lvl1_cat': value['lvl1_cat'],
                                'lvl2_cat': value['lvl2_cat'],
                                'lvl3_cat': value['lvl3_cat'],
                                'lvl4_cat': lvl4_cat
                            }

                        next_page = resp_obj.xpath(
                            "//span[text()='Next Page']/parent::button")
                        cntr += 1
                        if next_page:
                            next_page = resp_obj.xpath(
                                f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href"
                            ).get()
                            driver.get(f"https://www.walmart.com{next_page}")
                            time.sleep(2)
                        else:
                            break
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            else:
                pass
Code example #29
    def parse_detail(self, response):
        try:
            # 数据获取不全
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # 共有字段
            fileTitle = data.xpath(
                '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
            ).extract_first()
            # 正文标题
            textTitle = data.xpath(
                '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()'
            ).extract_first()
            supllyType = response.meta.get('supllyType').strip()
            administration = response.meta.get('administration').strip()
            supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip()
            publishTime = response.meta.get('publishTime').strip()
            projectName = ''
            parcelNumber = ''
            parcelLocation = ''
            landPurpose = ''
            landArea = ''
            transferTimeLimit = ''
            transferPrice = ''
            landPurposeDetail = ''
            transferUnit = ''
            remark = ''
            publicityPeriod = ''
            contactUnit = ''
            unitAddr = ''
            postalCode = ''
            contactTel = ''
            contacter = ''
            email = ''
            lanServiceCondition = ''

            # 公告类型
            # noticeType =
            # 公示期
            publicityPeriod = reFunction(u'公示期:([\s\S]*)三、',
                                         reFunction('四、[\s\S]*',
                                                    items)).strip()
            # 联系单位
            contactUnit = reFunction(u'联系单位:([\s\S]*)单位地址',
                                     reFunction('四、[\s\S]*', items)).strip()
            # 单位地址
            unitAddr = reFunction(u'单位地址:([\s\S]*)邮政编码',
                                  reFunction('四、[\s\S]*', items)).strip()
            # 邮政编码
            postalCode = reFunction(u'邮政编码:([\s\S]*)联系电话',
                                    reFunction('四、[\s\S]*', items)).strip()
            # 联系电话
            contactTel = reFunction(u'联系电话:([\s\S]*)联 系 人',
                                    reFunction('四、[\s\S]*', items)).strip()
            # 联系人
            contacter = reFunction(u'联 系 人:([\s\S]*)电子邮件',
                                   reFunction('四、[\s\S]*', items)).strip()
            # 电子邮件
            email = reFunction(u'电子邮件:([\w\.\@]*)(?:[\S]*)',
                               reFunction('四、[\s\S]*', items)).strip()
            if '宗地编号' in items:
                for item in [
                        '宗地编号' + _ for _ in re.findall('([\s\S]*)二、', items)
                    [0].split('宗地编号')[1:]
                ]:
                    # 宗地编号
                    parcelNumber = reFunction('宗地编号:(?:\s*)([\s\S]*)地块位置',
                                              item).strip()
                    # 地块位置	parcelArea
                    parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:',
                                                item).strip()
                    # 土地用途
                    landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)',
                                             item).strip()
                    # 土地面积(公顷)
                    landArea = reFunction(
                        '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip()
                    # 项目名称
                    projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细',
                                             item).strip()
                    # 出让年限
                    transferTimeLimit = reFunction(
                        '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
                    # 成交价(万元)
                    transferPrice = reFunction(
                        '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip()
                    # 土地用途明细(用途名称、面积)
                    landPurposeDetail = reFunction(
                        '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                        item).strip() if reFunction(
                            '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                            item).strip() else reFunction(
                                '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip()
                    # 受让单位
                    transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)',
                                              item).strip()
                    # 土地使用条件
                    lanServiceCondition = reFunction(
                        '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
                    # 备注
                    # remark = reFunction(u'备注:(?:\s*)([\w}/,、\u4e00-\uffe5()《》:\-\.<≤。{\u3002\uff1f\uff01\uff0c\u3001\uff1b\uff1a\u201c\u201d\u2018\u2019\uff08\uff09\u300a\u300b\u3008\u3009\u3010\u3011\u300e\u300f\u300c\u300d\ufe43\ufe44\u3014\u3015\u2026\u2014\uff5e\ufe4f\uffe5]*)(?:\s*)', item).strip()
                    remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?',
                                        item).strip()
                    # 爬取时间
                    crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                    # 爬取地址url
                    url = response.url
                    # 唯一标识
                    md5Mark = encrypt_md5(parcelNumber + publishTime +
                                          parcelLocation + url)

                    # 存储数据
                    csvFile = [
                        administration, supplyNoticeTitle, publishTime,
                        fileTitle, textTitle, projectName, parcelNumber,
                        parcelLocation, landPurpose, landArea,
                        transferTimeLimit, transferPrice, landPurposeDetail,
                        transferUnit, remark, publicityPeriod, contactUnit,
                        unitAddr, postalCode, contactTel, contacter, email,
                        lanServiceCondition, crawlingTime, url, md5Mark
                    ]
                    self.fileDetail.write(','.join([
                        _.replace(',', ' ').replace('\n', '').replace(
                            '\r', '') if _ else _ for _ in csvFile
                    ]))
                    self.fileDetail.write('\n')
                    yield
                    #TODO
            elif '地块编号' in items:
                for item in [
                        '地块编号' + _ for _ in re.findall('([\s\S]*)二、', items)
                    [0].split('地块编号')[1:]
                ]:
                    # 地块编号
                    parcelNumber = reFunction('地块编号:(?:\s*)([\s\S]*)地块位置',
                                              item).strip()
                    # 地块位置	parcelArea
                    parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:',
                                                item).strip()
                    # 土地用途
                    landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)',
                                             item).strip()
                    # 土地面积(公顷)
                    landArea = reFunction(
                        '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip()
                    # 项目名称
                    projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细',
                                             item).strip()
                    # 出让年限
                    transferTimeLimit = reFunction(
                        '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
                    # 成交价(万元)
                    transferPrice = reFunction(
                        '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip()
                    # 土地用途明细(用途名称、面积)
                    landPurposeDetail = reFunction(
                        '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                        item).strip() if reFunction(
                            '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                            item).strip() else reFunction(
                                '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip()
                    # 受让单位
                    transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)',
                                              item).strip()
                    # 土地使用条件
                    lanServiceCondition = reFunction(
                        '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
                    # 备注
                    remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?',
                                        item).strip()
                    # 爬取时间
                    crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                    # 爬取地址url
                    url = response.url
                    # 唯一标识
                    md5Mark = encrypt_md5(parcelNumber + publishTime +
                                          parcelLocation + url)

                    # 存储数据
                    csvFile = [
                        administration, supplyNoticeTitle, publishTime,
                        fileTitle, textTitle, projectName, parcelNumber,
                        parcelLocation, landPurpose, landArea,
                        transferTimeLimit, transferPrice, landPurposeDetail,
                        transferUnit, remark, publicityPeriod, contactUnit,
                        unitAddr, postalCode, contactTel, contacter, email,
                        lanServiceCondition, crawlingTime, url, md5Mark
                    ]
                    self.fileDetail.write(','.join([
                        _.replace(',', ' ').replace('\n', '').replace(
                            '\r', '') if _ else _ for _ in csvFile
                    ]))
                    self.fileDetail.write('\n')

            #TODO
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
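reFunction and encrypt_md5 are helpers imported from elsewhere in the project. A minimal sketch consistent with how they are called above (assumptions, not the original implementations):

import hashlib
import re

def reFunction(pattern, text):
    # return the first capture group when the pattern defines one, otherwise the whole match; '' when there is no match
    m = re.search(pattern, text)
    if not m:
        return ''
    return m.group(1) if m.groups() else m.group(0)

def encrypt_md5(text):
    # md5 hex digest used as a stable de-duplication key for a record
    return hashlib.md5(text.encode('utf-8')).hexdigest()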
Code example #30
 def parseNews(self, response):
     self.response_body_decode(response)
     sel = Selector(response)
     homeurl = tools.getHomeUrl(response.url)
     brandname = response.meta['brandname']
     news = None  # news holds the SelectorList for the main body of the article
     pagerule = None
     # check whether the page rule has already been determined
     if response.meta.has_key('pagerule'):
         pagerule = response.meta['pagerule']
         news = sel.xpath(pagerule['pageform'])
     else:
         # try every rule in the news-page rule library and crawl this type of news page with the first rule that matches
         for each_rule in newspage_type.page_rules:
             news = sel.xpath(each_rule['pageform'])
             if len(news) > 0:
                 pagerule = each_rule
                 break
     if pagerule is None:
         raise ValueError('Error processing (' + response.url +
                          ') This page do not have corresponding rules')
     # get the allpage and nextpage urls
     if pagerule['allpage'] is None:
         allpage = []
     else:
         allpage = news.xpath(pagerule['allpage']).extract()
     if pagerule['nextpage'] is None:
         nextpage = []
     else:
         nextpage = news.xpath(pagerule['nextpage']).extract()
     # if a "read the whole article on one page" url is present, handle it here
     if len(allpage) > 0:
         if tools.isCompleteUrl(allpage[0]):
             url = allpage[0]
         else:
             url = homeurl + allpage[0]
         r = Request(url, callback=self.parseNews)
         r.meta['brandname'] = brandname
         r.meta['pagerule'] = pagerule
         yield r
     elif len(nextpage) > 0:
         # if there is a next page, handle it here
         if tools.isCompleteUrl(nextpage[0]):
             url = nextpage[0]
         else:
             url = homeurl + nextpage[0]
         # extract the title, date and content of the current page, store them in article and pass it on to the next request
         title = news.xpath(pagerule['title']).extract()
         date = self.getDate(news, response.url, pagerule['date'])
         content = self.getContent(news, pagerule['content'])
         article = {
             'brandname': brandname,
             'title': title,
             'date': date,
             'content': content
         }
         r = Request(url, callback=self.parseNextPage)
         r.meta['article'] = article
         r.meta['pagerule'] = pagerule
         yield r
     else:
         # if the article has only one page, extract its content directly
         title = news.xpath(pagerule['title']).extract()
         date = self.getDate(news, response.url, pagerule['date'])
         content = self.getContent(news, pagerule['content'])
         item = NewsItem()
         item['brandname'] = brandname
         item['date'] = date
         item['title'] = "".join(title)
         item['content'] = "".join(content)
         yield item
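newspage_type.page_rules is the rule library this spider matches pages against. Judging from the keys used above, one entry has roughly the shape sketched below; the XPath values are placeholders, not the project's real rules:

page_rules = [
    {
        'pageform': '//div[@class="article"]',        # locates the main news block
        'allpage':  './/a[@class="page_all"]/@href',  # link to the single-page view, or None
        'nextpage': './/a[@class="page_next"]/@href', # link to the next page, or None
        'title':    './/h1/text()',
        'date':     './/span[@class="pub_date"]/text()',
        'content':  './/div[@class="article_body"]//text()',
    },
]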
Code example #31
def preview_result(Xpath, inputtext):
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    n = len(result)
    for idx, element in enumerate(result[:min(4,n)], start=1):
        print(f"Element {idx}: {element}", end=sp)
Code example #32
def number_of_element(Xpath, inputtext):
    sel = Selector(text=inputtext)
    print(f"Number of selected element(s): {len(sel.xpath(Xpath))} elements", end=sp)
Code example #33

import requests
from scrapy import Selector


def preview_result(Xpath, inputtext):
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    n = len(result)
    for idx, element in enumerate(result[:min(4, n)], start=1):
        print(f"Element {idx}: {element}", end=sp)


sp = '\n\n'
url = 'https://www.cdc.gov/nchs/tutorials/NHANES/index_continuous.htm'
# res = requests.get(url)
# html = res.text 
html = requests.get(url).text  # Selector(text=...) expects a str, so use .text rather than .content



xpath = '//p'
xpath2 = '//*'
sel = Selector(text=html)
sll = sel.xpath('//p')[2].extract()  # extract the 3rd element (here a paragraph) of the SelectorList
sll_ = sel.xpath('//p')  # without extract(), the SelectorList gives only a truncated preview of each paragraph
slla = sel.xpath('//p').extract()
sllf = sel.xpath('//p').extract_first()



# print(sll, slla, sllf, sep=sp)

# the helpers print their results and return None, so call them directly
# (number_of_element is the helper defined in the previous example)
number_of_element(xpath, html)
number_of_element(xpath2, html)
preview_result(xpath, html)
Code example #34
    def parse(self, response):
        sites = json.loads(response.text)

        spider_name = response.meta['spider_name']

        # html of the page
        data = sites["items_html"]
        min_position = sites["min_position"]

        # first tweet
        position = ''

        if 'max_position' in sites:
            position = sites["max_position"]
        else:
            position = min_position.split('-')[2]


        if data == "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n":
            print ("抓取完成!!!,更新种子")
            self.db.updateSeedTag(spider_name)
            self.db.updateSeedCountLocation(spider_name, position)
        else:
            # is there another page?
            #has_more_items = sites["has_more_items"]

            item = SpiderTwitterItem()

            # get the author of the tweets
            twitter_author = re.compile('data-name="(.+)" data-user-id=').findall(data)[0]

            selector_app = Selector(text=data)
            twitter_group = selector_app.xpath("//li[@class='js-stream-item stream-item stream-item\n']").extract()
            twitter_group_count = len(twitter_group)

            next_page_id = ""

            for twitter_personal in twitter_group:
                selector_content = Selector(text=twitter_personal)
                twitter_id = selector_content.xpath("//li[@class='js-stream-item stream-item stream-item\n']/@data-item-id").extract()

                if len(twitter_id) > 0:
                    next_page_id = twitter_id[0]

                    if self.db.getTwitterById(next_page_id):

                        # check whether we have crawled back to the previously recorded position
                        if self.db.isSeedLocation(spider_name, next_page_id):

                            print ("%s最新推文抓取完毕"%spider_name)
                            self.db.updateSeedCountLocation(spider_name, position)
                            return

                        print ("%s已存在,进行去重过滤"%next_page_id)
                        continue
                    else:
                        item['twitter_id'] = twitter_id

                else:
                    item['twitter_id'] = ''

                twitter_content_whole = ""
                twitter_content_list = selector_content.xpath("//div[@class='js-tweet-text-container']").extract()

                for twitter_content in twitter_content_list:
                    selector_content_text = Selector(text=twitter_content)
                    twitter_content_text = selector_content_text.xpath("//text()").extract()
                    twitter_content_text_num = len(twitter_content_text)
                    for i in range(twitter_content_text_num):
                        if twitter_content_text[i] != "  " and twitter_content_text[i] != "\n  ":
                            twitter_content_add = twitter_content_text[i].replace("\n","")
                            twitter_content_whole += twitter_content_add

                twitter_content_whole_trun = twitter_content_whole.replace('"','\\"')
                twitter_href = selector_content.xpath("//small[@class='time']/a/@href").extract()
                twitter_time = selector_content.xpath("//small[@class='time']/a/@title").extract()
                twitter_num = selector_content.xpath("//span[@class='ProfileTweet-actionCountForAria']/text()").extract()
               
                if len(twitter_num) > 0:
                    twitter_reply = twitter_num[0]
                    twitter_trunsmit = twitter_num[1]
                    twitter_zan = twitter_num[2]
                else:
                    twitter_reply = ''
                    twitter_trunsmit = ''
                    twitter_zan = ''

                twitter_img = selector_content.xpath("//div[@class='AdaptiveMedia-photoContainer js-adaptive-photo ']/@data-image-url").extract()
                print ("目标:%s" % twitter_id[0])
                print ("内容:%s" % twitter_content_whole_trun)
                if len(twitter_author) > 0:
                    author = twitter_author
                    item['twitter_author'] = author
                else:
                    item['twitter_author'] = ''
                if len(twitter_id) > 0:
                    tw_id = twitter_id[0]
                    item['twitter_id'] = tw_id
                else:
                    item['twitter_id'] = ''
                if twitter_content_whole:
                    content = twitter_content_whole_trun
                    item['twitter_content'] = content
                else:
                    item['twitter_content'] = ''
                if len(twitter_href) > 0:
                    href = "https://twitter.com%s"%twitter_href[0]
                    item['twitter_href'] = href
                else:
                    item['twitter_href'] = ''
                if len(twitter_time) > 0:
                    time = twitter_time[0]
                    item['twitter_time'] = time
                else:
                    item['twitter_time'] = ''
                if len(twitter_num) > 0:
                    reply = twitter_reply
                    item['twitter_reply'] = reply
                else:
                    item['twitter_reply'] = ''
                if len(twitter_num) > 0:
                    trunsmit = twitter_trunsmit
                    item['twitter_trunsmit'] = trunsmit
                else:
                    item['twitter_trunsmit'] = ''
                if len(twitter_num) > 0:
                    zan = twitter_zan
                    item['twitter_zan'] = zan
                else:
                    item['twitter_zan'] = ''
                if len(twitter_img) == 1:
                    img = twitter_img[0]
                    item['twitter_img'] = img
                elif len(twitter_img) > 1:
                    img_list = []
                    for img in twitter_img:
                        img_list.append(img)
                    item['twitter_img'] = img_list
                else:
                    item['twitter_img'] = ''
                yield item

            print ("下一页等待中...")

            # has_more_items being true means there is another page
            yield Request(url=self.next_page_url.format(spider_name,self.now_time, next_page_id, position), callback=self.parse,headers={'Referer': "https://twitter.com/"}, meta={'spider_name': spider_name})
コード例 #35
0
ファイル: kstr.py プロジェクト: byom26/Web-Scrapping
    def parse(self, response):
        driver = response.meta['driver']
        driver.maximize_window()
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[0])
        for _, value in self.df.iterrows():
            cntr = 199
            while True:
                location = value['Location']
                category = value['Category']
                subCat = value['Subcategory']
                url = f"{value['URL']}{cntr}"
                driver.get(url)
                cntr += 1
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located((
                        By.XPATH,
                        "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']"
                    )))

                html = driver.page_source
                respObj = Selector(text=html)

                count = respObj.xpath(
                    "normalize-space(//b[contains(@class, 'count')]/text())"
                ).get()
                pCount = int("".join(re.findall(r'\d+', count)))

                driver.switch_to.window(driver.window_handles[1])
                items = respObj.xpath(
                    "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']"
                )
                for item in items:
                    title = item.xpath("normalize-space(.//h3/text())").get()
                    if title not in self.li:
                        self.li.append(title)
                        url = item.xpath(".//@href").get()
                        driver.get(url)
                        time.sleep(1)
                        WebDriverWait(driver, 15).until(
                            EC.visibility_of_element_located((
                                By.XPATH,
                                "//a[@data-modal-title='About the creator']")))
                        html1 = driver.page_source
                        respObj1 = Selector(text=html1)
                        title = respObj1.xpath(
                            "normalize-space(//h2/span/a/text())").get()
                        creator = respObj1.xpath(
                            "normalize-space(//a[@data-modal-title='About the creator']/text())"
                        ).get()
                        backers = respObj1.xpath(
                            "normalize-space(//b[contains(text(), 'backers')]/text())"
                        ).get()
                        money = respObj1.xpath(
                            "normalize-space(//span[@class='money']/text())"
                        ).get()
                        driver.find_element_by_xpath(
                            "//a[@data-modal-title='About the creator']"
                        ).click()
                        time.sleep(2)
                        html2 = driver.page_source
                        respObj2 = Selector(text=html2)
                        yield {
                            'Title':
                            title,
                            'Creator':
                            creator,
                            'Backers':
                            backers.replace(" backers", ""),
                            'Money':
                            money,
                            'Website':
                            respObj2.xpath(
                                "//h4[contains(text(), 'Websites')]/following-sibling::ul/li/a/@href"
                            ).getall(),
                            'Location':
                            location,
                            'Category':
                            category,
                            'Sub Category':
                            subCat
                        }
                    else:
                        pass
                driver.switch_to.window(driver.window_handles[0])
                a = pCount // 12   # total page count (computed but not used below)
                if pCount % 12 != 0:
                    a += 1
                else:
                    a += 0
                if cntr > 200:
                    break
コード例 #36
0
    def person_registered(self, response):
        test = Selector(
            response=response).xpath('//dl/div/text()').extract_first()
        if test != '没有数据':  # proceed only when the page does not report "no data"
            if response.meta['staff_type'] == 1:
                info_data = Selector(response=response).xpath('//dl')
                for i in info_data:
                    person_data = {
                        'companyName': response.meta['company_name'],
                        'licenseNum': '',
                        'name': response.meta['person_name'],
                        'area': '青海省',
                        'sex': '',
                        'idCard': response.meta['id_card'],
                        'grade': '',
                        'major': '',
                        'num': '',
                        'regNum': '',
                        'validTime': '',
                        'tel': '',
                        'tokenKey': self.token
                    }
                    dd = i.xpath('./dd')

                    # registration category
                    grade = dd[0].xpath('./b/text()').extract_first()
                    if grade:
                        person_data['grade'] = grade

                    # registered specialty
                    major = dd[1].xpath('./text()').extract_first()
                    if major:
                        person_data['major'] = major

                    # registration number
                    num = dd[3].xpath('./text()').extract_first()
                    if num:
                        person_data['num'] = num
                    print(person_data, 'registered personnel')
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_data,
                        callback=self.person_post,
                        meta={
                            'data': person_data,
                            'company_name': response.meta['company_name']
                        },
                        dont_filter=True,
                    )
            elif response.meta['staff_type'] == 2:
                info_data = Selector(response=response).xpath('//dl')
                for i in info_data:
                    person_data = {
                        'companyName': response.meta['company_name'],
                        'licenseNum': '',
                        'name': response.meta['person_name'],
                        'area': '青海省',
                        'sex': '',
                        'idCard': response.meta['id_card'],
                        'grade': '',
                        'major': '',
                        'num': '',
                        'regNum': '',
                        'validTime': '',
                        'tel': '',
                        'tokenKey': self.token
                    }
                    dd = i.xpath('./dd')

                    # registration category
                    try:
                        grade = dd[0].xpath('./b/text()').extract_first()
                    except IndexError:
                        continue
                    if grade:
                        person_data['grade'] = grade

                    # registered specialty
                    major = dd[1].xpath('./text()').extract_first()
                    if major:
                        person_data['major'] = major

                    # professional title number
                    num = dd[2].xpath('./text()').extract_first()
                    if num:
                        person_data['num'] = num
                    print(person_data, 'professional personnel')
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_data,
                        callback=self.person_post,
                        meta={
                            'data': person_data,
                            'company_name': response.meta['company_name']
                        },
                        dont_filter=True,
                    )
            elif response.meta['staff_type'] == 3:
                info_data = Selector(response=response).xpath('//dl')
                for i in info_data:
                    person_data = {
                        'companyName': response.meta['company_name'],
                        'licenseNum': '',
                        'name': response.meta['person_name'],
                        'area': '青海省',
                        'sex': '',
                        'idCard': response.meta['id_card'],
                        'grade': '',
                        'major': '',
                        'num': '',
                        'regNum': '',
                        'validTime': '',
                        'tel': '',
                        'tokenKey': self.token
                    }
                    dd = i.xpath('./dd')

                    # registered specialty
                    number = dd[3].xpath('./text()').extract_first()
                    if number:
                        person_data['num'] = number

                    # # professional title number
                    # validTime = dd[4].xpath('./text()').extract_first()
                    # if validTime:
                    #     person_data['validTime'] = validTime.replace('/')
                    print(person_data, 'safety three-category personnel')
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_data,
                        callback=self.person_post,
                        meta={
                            'data': person_data,
                            'company_name': response.meta['company_name']
                        },
                        dont_filter=True,
                    )

            elif response.meta['staff_type'] == 4:
                info_data = Selector(response=response).xpath('//dl')
                for i in info_data:
                    person_data = {
                        'companyName': response.meta['company_name'],
                        'licenseNum': '',
                        'name': response.meta['person_name'],
                        'area': '青海省',
                        'sex': '',
                        'idCard': response.meta['id_card'],
                        'grade': '',
                        'major': '',
                        'num': '',
                        'regNum': '',
                        'validTime': '',
                        'tel': '',
                        'tokenKey': self.token
                    }
                    dd = i.xpath('./dd')

                    # registered specialty
                    try:
                        major = dd[0].xpath('./text()').extract_first()
                    except IndexError:
                        continue
                    if major:
                        person_data['major'] = major

                    # qualification specialty
                    grade = dd[1].xpath('./text()').extract_first()
                    if grade:
                        person_data['grade'] = grade

                    # certificate number
                    num = dd[3].xpath('./text()').extract_first()
                    if num:
                        person_data['num'] = num

                    # # professional title number
                    # validTime = dd[4].xpath('./text()').extract_first()
                    # if validTime:
                    #     person_data['validTime'] = validTime
                    print(person_data, 'professional post certificate')
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_data,
                        callback=self.person_post,
                        meta={
                            'data': person_data,
                            'company_name': response.meta['company_name']
                        },
                        dont_filter=True,
                    )

            elif response.meta['staff_type'] == 5:
                info_data = Selector(response=response).xpath('//dl')
                for i in info_data:
                    person_data = {
                        'companyName': response.meta['company_name'],
                        'licenseNum': '',
                        'name': response.meta['person_name'],
                        'area': '青海省',
                        'sex': '',
                        'idCard': response.meta['id_card'],
                        'grade': '',
                        'major': '',
                        'num': '',
                        'regNum': '',
                        'validTime': '',
                        'tel': '',
                        'tokenKey': self.token
                    }
                    dd = i.xpath('./dd')

                    # registered specialty
                    major = dd[0].xpath('./text()').extract_first()
                    if major:
                        person_data['major'] = major

                    # qualification specialty
                    grade = dd[1].xpath('./text()').extract_first()
                    if grade:
                        person_data['grade'] = grade

                    # certificate number
                    num = dd[3].xpath('./text()').extract_first()
                    if num:
                        person_data['num'] = num

                    # valid until
                    validTime = dd[4].xpath('./text()').extract_first()
                    if validTime:
                        person_data['validTime'] = validTime
                    print(person_data, 'technical personnel')
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_data,
                        callback=self.person_post,
                        meta={
                            'data': person_data,
                            'company_name': response.meta['company_name']
                        },
                        dont_filter=True,
                    )

            elif response.meta['staff_type'] == 6:
                info_data = Selector(response=response).xpath('//dl')
                for i in info_data:
                    person_data = {
                        'companyName': response.meta['company_name'],
                        'licenseNum': '',
                        'name': response.meta['person_name'],
                        'area': '青海省',
                        'sex': '',
                        'idCard': response.meta['id_card'],
                        'grade': '',
                        'major': '',
                        'num': '',
                        'regNum': '',
                        'validTime': '',
                        'tel': '',
                        'tokenKey': self.token
                    }
                    dd = i.xpath('./dd')

                    # qualification category
                    try:
                        major = dd[0].xpath('./text()').extract_first()
                    except IndexError:
                        continue
                    if major:
                        person_data['major'] = major

                    # grade
                    grade = dd[1].xpath('./text()').extract_first()
                    if grade:
                        person_data['grade'] = grade

                    # grade (certificate number)
                    num = dd[3].xpath('./text()').extract_first()
                    if num:
                        person_data['num'] = num

                    # professional title number
                    validTime = dd[4].xpath('./text()').extract_first()
                    if validTime:
                        person_data['validTime'] = validTime
                    print(person_data, 'vocational skill personnel')
                    yield scrapy.FormRequest(
                        url=
                        'https://api.maotouin.com/rest/companyInfo/addCompanyRecordEngineer.htm',
                        formdata=person_data,
                        callback=self.person_post,
                        meta={
                            'data': person_data,
                            'company_name': response.meta['company_name']
                        },
                        dont_filter=True,
                    )
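All six staff_type branches above rebuild the same person_data dictionary before filling in a few fields. A minimal refactoring sketch, using only field names that already appear in the code (the helper name _base_person_data is hypothetical):

    def _base_person_data(self, response):
        # Hypothetical helper: the fields shared by every staff_type branch above.
        return {
            'companyName': response.meta['company_name'],
            'licenseNum': '',
            'name': response.meta['person_name'],
            'area': '青海省',
            'sex': '',
            'idCard': response.meta['id_card'],
            'grade': '',
            'major': '',
            'num': '',
            'regNum': '',
            'validTime': '',
            'tel': '',
            'tokenKey': self.token,
        }

Each branch would then start from person_data = self._base_person_data(response) and only set the grade/major/num/validTime values it reads from its dd cells.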
コード例 #37
0
    def parse(self, response):
        ''' 
        Scrape archive for articles
        Parameters
        ----------
        self:
            the PostillonSpider object
        response:
            The response from a scrapy request
        '''
        def init_selenium_driver():
            '''
            Initialize and return a Firefox or Chrome Selenium driver, depending on the option SELENIUM_DRIVER.

            Returns
            -------
            A firefox or chrome selenium driver depending on the option SELENIUM_DRIVER
            '''
            if SELENIUM_DRIVER == 'Firefox':
                firefoxOptions = webdriver.FirefoxOptions()
                firefoxOptions.headless = True
                desired_capabilities = firefoxOptions.to_capabilities()
                driver = webdriver.Firefox(
                    desired_capabilities=desired_capabilities)
            else:  # Chrome driver
                chrome_options = Options()
                chrome_options.headless = True
                driver = webdriver.Chrome(options=chrome_options)
            return driver

        def get_closed_elements():
            '''
            Returns all or some closed year and month elements, depending on the limit definitions.

            Returns
            -------
            All or some closed year and month elements, depending on the limit definitions.
            '''
            # Get all closed months of year to crawl, that are newer or equal to the limit specified by LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL
            if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL:
                # get year
                element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name(
                    'year-' + str(YEAR_TO_CRAWL))

                # Get closed months
                xpath = ".//li[contains(@class, 'closed') and (contains(@class, 'month-12')"
                for month in range(LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL - 1, 12):
                    month_plus_1 = month + 1
                    xpath += " or contains(@class, 'month-" + "{:02d}".format(
                        month + 1) + "')"
                xpath = xpath + ")]"

                closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_xpath(
                    xpath)
                closed_elements.append(element_of_YEAR_TO_CRAWL)

            # Get all closed months of year to crawl
            elif YEAR_TO_CRAWL:
                element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name(
                    'year-' + str(YEAR_TO_CRAWL))

                closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_class_name(
                    'closed')
                closed_elements.append(element_of_YEAR_TO_CRAWL)

            # Get all closed years/months of the entire archive
            else:
                # also finds closed months inside closed years
                closed_elements = driver.find_elements_by_class_name('closed')

            return closed_elements

        def waitForLoad():
            '''
            Wait until at least one article per year has been loaded.
            If the current year is being crawled, wait until an article from January or from LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL
            has been loaded (because the current month of the current year is already loaded on page load).

            '''
            CURRENT_YEAR = datetime.now().year
            TIMEOUT = 20
            wait = WebDriverWait(driver, TIMEOUT)
            try:
                # xpath for tag that with class 'date' and content that includes '2020' or '1.2020' or '<LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL>.2020',
                # depending on what is to be crawled
                xpath = "//a/div/div/div[contains(@class, 'date') and contains(string(), '"
                if YEAR_TO_CRAWL:
                    # If the current year is crawled wait for an article of the first month to be loaded.
                    # This is necessary because the current month is already loaded on page load.
                    if YEAR_TO_CRAWL == CURRENT_YEAR:
                        if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL:
                            xpath += str(
                                LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL) + "."
                        else:
                            xpath += "1."

                    xpath += str(YEAR_TO_CRAWL) + "')]"
                    wait.until(
                        EC.presence_of_element_located((By.XPATH, xpath)))

                # Wait for one article per year
                else:
                    base_xpath = xpath
                    for i in range(2008, CURRENT_YEAR + 1):
                        # xpath for tag with class 'date' and the content that includes the year i
                        xpath = base_xpath + str(i) + "')]"
                        wait.until(
                            EC.presence_of_element_located((By.XPATH, xpath)))

            except TimeoutException as e:
                logging.warning(
                    "TimeoutException has been thrown while waiting for articles to load: %s",
                    e)

        def click_elements(elements):
            '''"
            Click all elements in elements

            Parameters
            ----------
            elements:
                HTML Elements to be clicked
            '''
            for element in elements:
                try:
                    # element.click() causes Exception: "could not be scrolled into view"
                    driver.execute_script("arguments[0].click();", element)
                    # print("click: " + element.get_attribute('class').split()[1])

                except Exception as e:
                    logging.warning(
                        "An exception has been thrown while clicking closed years/months: %s",
                        e)

        driver = init_selenium_driver()
        driver.get(root)

        # Close all years/months
        click_elements(driver.find_elements_by_class_name('open'))

        # Open closed years/months to load articles
        click_elements(get_closed_elements())

        # Wait for articles to be loaded
        waitForLoad()

        # Hand-off between Selenium and Scrapy
        sel = Selector(text=driver.page_source)

        # for all ul tags with class 'month-inner' get all contained li tags and get their direct a-tag children
        articleList = sel.xpath('//ul[@class="month-inner"]//li/a')

        articleList = utils.limit_crawl(articleList, TESTRUN_ARTICLES_LIMIT)

        if articleList:
            for article in articleList:
                # extract the value of the href attribute from article
                long_url = article.xpath('./@href').extract()[0]
                # extract the content of div-tags with class 'date' contained by article
                published_time = article.xpath(
                    './/div[@class="date"]/text()').extract()
                published_time = published_time[0] if len(
                    published_time) > 0 else ''

                if long_url and not utils.is_url_in_db(long_url):
                    yield scrapy.Request(long_url,
                                         callback=self.parse_article,
                                         cb_kwargs=dict(
                                             long_url=long_url,
                                             published_time=published_time))

                else:
                    utils.log_event(utils(), self.name, long_url, 'exists',
                                    'info')
                    logging.info('%s already in db', long_url)

        # Quit the selenium driver and close every associated window
        driver.quit()
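One robustness note on the Selenium hand-off above: if clicking or waiting raises before the final driver.quit(), the headless browser is left running. A sketch of how the tail of parse() could be guarded, structure only, reusing the helpers defined above:

        driver = init_selenium_driver()
        try:
            driver.get(root)
            click_elements(driver.find_elements_by_class_name('open'))   # close all years/months
            click_elements(get_closed_elements())                        # reopen them to trigger article loading
            waitForLoad()
            sel = Selector(text=driver.page_source)
            # ... build articleList and yield the requests exactly as above ...
        finally:
            driver.quit()   # runs even when an exception is raised, so no orphaned browser process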
コード例 #38
0
ファイル: user.py プロジェクト: yuanxiaodu/WeiboSpider
 def parse(self, response):
     user_item = UserItem()
     user_item['crawl_time'] = int(time.time())
     selector = Selector(response)
     user_item['_id'] = re.findall('(\d+)/info', response.url)[0]
     user_info_text = ";".join(
         selector.xpath('body/div[@class="c"]//text()').extract())
     nick_name = re.findall('昵称;?:?(.*?);', user_info_text)
     gender = re.findall('性别;?:?(.*?);', user_info_text)
     place = re.findall('地区;?:?(.*?);', user_info_text)
     brief_introduction = re.findall('简介;?:?(.*?);', user_info_text)
     birthday = re.findall('生日;?:?(.*?);', user_info_text)
     sex_orientation = re.findall('性取向;?:?(.*?);', user_info_text)
     sentiment = re.findall('感情状况;?:?(.*?);', user_info_text)
     vip_level = re.findall('会员等级;?:?(.*?);', user_info_text)
     authentication = re.findall('认证;?:?(.*?);', user_info_text)
     labels = re.findall('标签;?:?(.*?)更多>>', user_info_text)
     if nick_name and nick_name[0]:
         user_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
     if gender and gender[0]:
         user_item["gender"] = gender[0].replace(u"\xa0", "")
     if place and place[0]:
         place = place[0].replace(u"\xa0", "").split(" ")
         user_item["province"] = place[0]
         if len(place) > 1:
             user_item["city"] = place[1]
     if brief_introduction and brief_introduction[0]:
         user_item["brief_introduction"] = brief_introduction[0].replace(
             u"\xa0", "")
     if birthday and birthday[0]:
         user_item['birthday'] = birthday[0]
     if sex_orientation and sex_orientation[0]:
         if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
             user_item["sex_orientation"] = "同性恋"
         else:
             user_item["sex_orientation"] = "异性恋"
     if sentiment and sentiment[0]:
         user_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
     if vip_level and vip_level[0]:
         user_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
     if authentication and authentication[0]:
         user_item["authentication"] = authentication[0].replace(
             u"\xa0", "")
     if labels and labels[0]:
         user_item["labels"] = labels[0].replace(u"\xa0",
                                                 ",").replace(';',
                                                              '').strip(',')
     education_info = selector.xpath('//div[contains(text(),"学习经历")]/following-sibling::div[1]'). \
         xpath('string(.)').extract()
     if education_info:
         user_item['education'] = education_info[0].replace(u"\xa0", "")
     work_info = selector.xpath('//div[contains(text(),"工作经历")]/following-sibling::div[1]'). \
         xpath('string(.)').extract()
     if work_info:
         user_item['work'] = work_info[0].replace(u"\xa0", "")
     request_meta = response.meta
     request_meta['item'] = user_item
     yield Request(self.base_url + '/u/{}'.format(user_item['_id']),
                   callback=self.parse_further_information,
                   meta=request_meta,
                   dont_filter=True,
                   priority=1)
コード例 #39
0
ファイル: scraper.py プロジェクト: underwhelmed-ape/Pupdate
 def _select_data_table_from_page(self, pupdate_table):
     table_section = pupdate_table.xpath(f'//div[@class="{self.table_name}"]').extract()
     return Selector(text=table_section[0])
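The pattern in the method above -- extract() an HTML fragment and wrap it in a fresh Selector -- appears in several of these examples (the Twitter spider does the same per tweet). It works, but chaining a relative XPath on the sub-selector avoids the second parse. A small self-contained comparison (the markup is made up):

from scrapy import Selector

html = '<div class="pupdate_table"><table><tr><td>row</td></tr></table></div>'
sel = Selector(text=html)

# Re-parse approach, as in _select_data_table_from_page above:
fragment = sel.xpath('//div[@class="pupdate_table"]').extract()[0]
rows_reparsed = Selector(text=fragment).xpath('//tr')

# Chained approach: keep working on the sub-selector, no second parse needed.
rows_chained = sel.xpath('//div[@class="pupdate_table"]').xpath('.//tr')

print(len(rows_reparsed), len(rows_chained))   # both select the same single row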
コード例 #40
0
class JCpenneySpider(BaseCheckoutSpider):
    name = 'jcpenney_checkout_products'
    allowed_domains = ['jcpenney.com']  # do not remove comment - used in find_spiders()

    SHOPPING_CART_URL = 'http://www.jcpenney.com/jsp/cart/viewShoppingBag.jsp'
    CHECKOUT_PAGE_URL = "https://www.jcpenney.com/dotcom/" \
                        "jsp/checkout/secure/checkout.jsp"

    def start_requests(self):
        yield scrapy.Request('http://www.jcpenney.com/')

    def _get_colors_names(self):
        swatches = self._find_by_xpath(
            '//ul[@class="small_swatches"]'
            '/li[not(@class="sku_not_available_select")]'
            '//a[not(span[@class="no_color"]) and '
            'not(span[@class="color_illegal"])]/img')
        return [x.get_attribute("name") for x in swatches]

    def select_size(self, element=None):
        default_attr_xpath = '*//div[@id="skuOptions_size"]//' \
                             'li[@class="sku_select"]'
        avail_attr_xpath = '*//*[@id="skuOptions_size"]//' \
                           'li[not(@class="sku_not_available" or @class="sku_illegal")]/a'
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_color(self, element=None, color=None):
        default_attr_xpath = '*//li[@class="swatch_selected"]'
        avail_attr_xpath = ('*//*[@class="small_swatches"]'
                            '//a[not(span[@class="no_color"]) and '
                            'not(span[@class="color_illegal"])]')

        if color and color in self.available_colors:
            default_attr_xpath = '*//*[@class="small_swatches"]//a' \
                                 '[img[@name="%s"]]' % color

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)
        self._find_by_xpath('//h1')[0].click()
        time.sleep(1)

    def click_condition(self, default_xpath, all_xpaths):
        return self._find_by_xpath(default_xpath) or self._find_by_xpath(
            all_xpaths)

    def select_attribute(self, default_attr_xpath, avail_attr_xpath, element):
        max_retries = 20
        retries = 0
        if self.click_condition(default_attr_xpath, avail_attr_xpath):
            self._click_attribute(default_attr_xpath, avail_attr_xpath,
                                  element)
            while self.driver.find_elements(
                    By.ID, 'page_loader') and retries < max_retries:
                time.sleep(1)
                retries += 1
            print(inspect.currentframe().f_back.f_code.co_name)

    def select_width(self, element=None):
        default_attr_xpath = '*//div[@id="skuOptions_width"]//' \
                             'li[@class="sku_select"]'
        avail_attr_xpath = '*//*[@id="skuOptions_width"]//' \
                           'li[not(@class="sku_not_available" or @class="sku_illegal")]/a'

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_waist(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_waist"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_waist"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_inseam(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_inseam"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_inseam"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_neck(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_neck size"]//li[@class="sku_select"]')

        avail_attr_xpath = ('*//*[@id="skuOptions_neck size"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_sleeve(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_sleeve"]//li[@class="sku_select"]')

        avail_attr_xpath = ('*//*[@id="skuOptions_sleeve"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def _parse_attributes(self, product, color, quantity):
        time.sleep(10)
        self.select_color(product, color)
        self.select_size(product)
        self.select_width(product)
        self.select_waist(product)
        self.select_inseam(product)
        self.select_neck(product)
        self.select_sleeve(product)
        self._set_quantity(product, quantity)

    def _get_products(self):
        return self._find_by_xpath(
            '//*[@id="regularPP"]|//*[contains(@class,"product_row")]')

    def _add_to_cart(self):
        addtobagbopus = self._find_by_xpath('//*[@id="addtobagbopus"]')
        addtobag = self._find_by_xpath('//*[@id="addtobag"]')

        if addtobagbopus:
            self._click_on_element_with_id('addtobagbopus')
        elif addtobag:
            self._click_on_element_with_id('addtobag')
        time.sleep(5)

    def _do_others_actions(self):
        skip_this_offer = self._find_by_xpath(
            '//a[contains(@href,"javascript:skipThisOffer")]')
        if skip_this_offer:
            skip_this_offer[0].click()
            time.sleep(4)

    def _set_quantity(self, product, quantity):
        quantity_option = Select(
            self.driver.find_element_by_xpath('*//*[@name="prod_quantity"]'))
        try:
            quantity_option.select_by_value(str(quantity))
            quantity_selected = quantity_option.first_selected_option.text
            if quantity_selected != str(quantity):
                time.sleep(4)
            self.log('Quantity "{}" selected'.format(quantity))
        except:
            pass

    def _get_product_list_cart(self):
        time.sleep(1)
        self.page_source = self.driver.page_source
        self.page_selector = Selector(text=self.page_source)
        try:
            item_info = re.findall('var jcpORDERJSONjcp = (\{.+?\});',
                                   self.page_source, re.MULTILINE)[0]
            self.item_info = json.loads(item_info)
            return self.item_info
        except IndexError:
            return None

    def _get_products_in_cart(self, product_list):
        return product_list.get('purchasedItems')

    def _get_subtotal(self):
        return self.item_info.get('merchantTotalWithSavings')

    def _get_total(self):
        return self.item_info.get('orderTotal')

    def _get_item_name(self, item):
        return item.get('displayName')

    def _get_item_id(self, item):
        return item.get('itemNumber')[2:]

    def _get_item_price(self, item):
        return str(item.get('lineTotalPrice'))

    def _get_item_price_on_page(self, item):
        price_on_page_from_json = float(item.get('lineUnitPrice'))
        price_on_page_from_html = self.page_selector.xpath(
            '//span[contains(@data-anid, "product_CurrentSellingPrice")]/text()'
        ).re(FLOATING_POINT_RGEX)
        price_on_page_from_html = float(is_empty(price_on_page_from_html, 0))
        return price_on_page_from_json if price_on_page_from_json >= 0 else price_on_page_from_html

    def _get_item_color(self, item):
        selector = scrapy.Selector(text=self.page_source)
        color_new = is_empty(
            selector.xpath(
                '//span[@class="size" and '
                'contains(text(),"color:")]/text()').re('color\:\n(.+)'))
        color_old = is_empty(
            selector.xpath(
                '//span[@class="size" and contains(text(),"color:")]'
                '/strong/text()').extract())
        return color_new or color_old

    def _get_item_quantity(self, item):
        return item.get('quantity')

    def _enter_promo_code(self, promo_code):
        self.log('Enter promo code: {}'.format(promo_code))
        promo_field = self._find_by_xpath('//*[@id="cr-code"]')[0]
        promo_field.send_keys(promo_code)
        time.sleep(2)
        promo_field.send_keys(Keys.ENTER)
        time.sleep(5)
        self.driver.refresh()
        time.sleep(5)
        self.item_info = self._get_product_list_cart()

    def _remove_promo_code(self):
        self.log('Remove promo code')
        try:
            remove_field = self._find_by_xpath(
                '//a[@title="remove" and @class="cr-remove"]')[0]
            remove_field.click()
            time.sleep(10)
        except IndexError:
            self.log('Invalid promo code')

    def _get_promo_total(self):
        return self._get_total()

    def _get_promo_subtotal(self):
        return str(self._get_subtotal())

    def _parse_no_longer_available(self):
        return bool(self._find_by_xpath('//*[@class="error_holder"]'))
コード例 #41
0
ファイル: Spiders.py プロジェクト: jimist/yelp_crawler
    def parse(self, response):
        page = Selector(response)
        review_boxes = page.xpath(
            '//ul[@class="ylist ylist-bordered reviews"]/li')
        del review_boxes[0]
        for review_box in review_boxes:
            rv = Review()
            rv.business_id = self.biz_id
            rv.user_id = review_box.xpath(
                './/li[@class="user-name"]/a/@href').extract_first()
            if rv.user_id != None:
                user_url = rv.user_id
                rv.user_id = rv.user_id[rv.user_id.rfind("=") + 1:]
                if (self.session.query(YelpUser).filter(
                        YelpUser.yelp_id == rv.user_id).count() == 0):
                    user = self.fetch_userdata('https://www.yelp.com' +
                                               user_url)
                    self.session.add(user)

            else:
                user = YelpUser()
                user.yelp_id = None
                user.name = "Qype User"
                user.location = review_box.xpath(
                    './/li[@class="user-location responsive-hidden-small"]/b/text()'
                ).extract_first().strip()
                user.photos_count = review_box.xpath(
                    './/li[@class="photo-count responsive-small-display-inline-block"]/b/text()'
                ).extract_first()
                user.friends_count = review_box.xpath(
                    './/li[@class="friend-count responsive-small-display-inline-block"]/b/text()'
                ).extract_first()
                user.reviews_count = review_box.xpath(
                    './/li[@class="review-count responsive-small-display-inline-block"]/b/text()'
                ).extract_first()
                user.meta = None
                self.session.add(user)

            rv.text = review_box.xpath(
                './/div[@class="review-content"]/p/text()').extract_first()
            rv.rating = review_box.xpath(
                './/div[@class="review-content"]/div[@class="biz-rating biz-rating-large clearfix"]/div/div/@title'
            ).extract_first()
            rv.rating = rv.rating[0:rv.rating.find(" ")]
            rv.date = review_box.xpath(
                './/div[@class="review-content"]/span[@class="rating-qualifier"]/text()'
            ).extract_first()
            self.session.add(rv)

        if (self.session.query(CrawlData).filter(
                CrawlData.url == response.url).count() != 0):
            crawl_data = CrawlData()
            crawl_data.body = response.body
            crawl_data.requestHeader = str(response.request.headers)
            crawl_data.url = response.url
            self.session.add(crawl_data)

        self.session.commit()
        next_page = page.xpath('//link[@rel="next"]/@href').extract_first()
        if (next_page != None):
            yield response.follow(next_page, self.parse)
コード例 #42
0
    def parse_house_info(self, resp):
        """
        解析二手房信息
        :return:
        """
        item = dict()
        response = Selector(resp)
        generalXpath = "//span[text()='{}']/../text()"
        # Lianjia listing code
        item['houseCode'] = response.xpath(
            "//div[@class='houseRecord']/span[2]/text()").extract_first(
                "").strip()
        # residential community name
        item['houseName'] = response.xpath(
            "//div[@class='communityName']/a[1]/text()").extract_first(
                "").strip()
        # orientation
        item['houseDirection'] = response.xpath(
            generalXpath.format("房屋朝向")).extract_first("").strip()
        # floor plan
        item['houseType'] = response.xpath(
            generalXpath.format("房屋户型")).extract_first("").strip()
        # elevator
        item['houseElevator'] = response.xpath(
            generalXpath.format("配备电梯")).extract_first("").strip()
        # area / district
        item['houseAddress'] = response.xpath(
            "//div[@class='areaName']/a/text()").extract_first("").strip()
        item['houseDistrict'] = response.xpath(
            "//div[@class='areaName']/span[@class='info']/a[2]/text()"
        ).extract_first("").strip()
        item['houseRegion'] = response.xpath(
            "//div[@class='areaName']/span[@class='info']/a[1]/text()"
        ).extract_first("").strip()
        # floor
        item['houseFloor'] = response.xpath(
            generalXpath.format("所在楼层")).extract_first("").strip()
        # built-up area
        item['houseSize'] = response.xpath(
            generalXpath.format("建筑面积")).extract_first("").strip()
        # decoration status
        item['houseStatus'] = response.xpath(
            generalXpath.format("装修情况")).extract_first("").strip()
        # price per square meter
        item['houseUnitPrice'] = response.xpath(
            "//span[@class='unitPriceValue']/text()").extract_first(
                "").strip()
        # total price
        item['houseAllPrice'] = response.xpath(
            "//div[@class='price ']/span[@class='total']/text()"
        ).extract_first("").strip()
        # year of construction
        item['houseYear'] = response.xpath(
            "//div[@class='area']/div[@class='subInfo']/text()").re_first(
                r"(\d+)")

        # source URL
        item['url'] = resp.url

        # longitude and latitude
        postions = self.pattern_position.search(resp.text)
        # extract the coordinates
        item['Longitude'] = postions.group(1)
        item['Latitude'] = postions.group(2)
        self.db.update_set('houseCode', item)
        self.lianjia_spider_log.info(f'parse item success:{resp.url}')
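The generalXpath template used above is a handy label-based lookup: find the <span> whose text equals the label, step up to its parent, and read the parent's own text nodes. A standalone illustration on a made-up fragment shaped like the listing page:

from scrapy import Selector

html = ("<ul>"
        "<li><span>房屋户型</span>2室1厅1厨1卫</li>"
        "<li><span>房屋朝向</span>南 北</li>"
        "</ul>")
sel = Selector(text=html)
generalXpath = "//span[text()='{}']/../text()"

print(sel.xpath(generalXpath.format("房屋朝向")).extract_first("").strip())  # -> 南 北
print(sel.xpath(generalXpath.format("房屋户型")).extract_first("").strip())  # -> 2室1厅1厨1卫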
コード例 #43
0
ファイル: fishc2.py プロジェクト: uba888/uba_python
def get_hxs(url):
	text=requests.get(url).text
	hxs=Selector(text=text)
	return hxs
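A short usage sketch for the helper above; the URL and XPaths here are placeholders, not from the original project (the function is repeated so the sketch runs on its own):

import requests
from scrapy import Selector


def get_hxs(url):
    text = requests.get(url).text
    return Selector(text=text)


hxs = get_hxs('https://example.com/')                 # placeholder URL
print(hxs.xpath('//title/text()').extract_first())    # page <title>
print(len(hxs.xpath('//a')))                          # number of links on the page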
コード例 #44
0
ファイル: Spiders.py プロジェクト: jimist/yelp_crawler
    def fetch_userdata(self, url):
        user = YelpUser()
        response = requests.get(url)
        page = Selector(response)
        user.yelp_id = url[url.rfind('=') + 1:]
        user.name = page.xpath(
            '//div[@class="user-profile_info arrange_unit"]/h1/text()'
        ).extract_first()
        user.location = page.xpath(
            '//div[@class="user-profile_info arrange_unit"]/h3/text()'
        ).extract_first()
        user.tagline = page.xpath(
            '//p[@class="user-tagline"]/text()').extract_first()
        user.friends_count = page.xpath(
            '//li[@class="friend-count"]/strong/text()').extract_first()
        user.reviews_count = page.xpath(
            '//li[@class="review-count"]/strong/text()').extract_first()
        user.photos_count = page.xpath(
            '//li[@class="photo-count"]/strong/text()').extract_first()
        user.image_url = page.xpath(
            '//div[@class="user-profile_avatar"]//img/@src').extract_first()

        if (MUST_DOWNLOAD_USER_IMAGE):
            if (os.path.exists(BASE_DIR + '/UserImages') == False):
                os.mkdir(BASE_DIR + '/UserImages')
            with open(BASE_DIR + '/UserImages/' + user.yelp_id + '.jpg',
                      'wb') as f:
                f.write(requests.get(user.image_url).content)  # write the raw image bytes, not the Response object
            user.image_path = BASE_DIR + '/UserImages/' + user.yelp_id + '.jpg'

        sidebar = page.xpath('//div[@class="user-details-overview_sidebar"]')
        extra_data = {}
        for ysection in sidebar.xpath('.//div[@class="ysection"]'):
            key = ysection.xpath('.//h4/text()').extract_first()
            if (key == 'Rating Distribution'):
                starts_distribution = ysection.xpath(
                    './/td[@class="histogram_count"]/text()').extract()
                extra_data[key] = dict()
                extra_data[key]['5 stars'] = starts_distribution[0]
                extra_data[key]['4 stars'] = starts_distribution[1]
                extra_data[key]['3 stars'] = starts_distribution[2]
                extra_data[key]['2 stars'] = starts_distribution[3]
                extra_data[key]['1 stars'] = starts_distribution[4]
            elif (key == 'Review Votes' or key == 'Stats'):
                items = ysection.xpath('.//ul/li')
                items_title = ysection.xpath(
                    './/ul/li/text()[not(normalize-space(.)="")]').extract()
                # strip the surrounding whitespace from each title
                items_title = [title.strip() for title in items_title]
                extra_data[key] = dict()
                for title, item in dict(zip(items_title, items)).items():
                    extra_data[key][title.strip()] = item.xpath(
                        './/strong/text()').extract_first()
            elif (key.find('Compliments') != -1):
                items = ysection.xpath('.//li')
                extra_data['Compliments'] = dict()
                for item in items:
                    compliment = item.xpath('.//span/@class').extract_first()
                    extra_data['Compliments'][
                        self.compliments[compliment]] = item.xpath(
                            './/small/text()').extract_first()
        user.meta = json.dumps(extra_data)
        return user
コード例 #45
0
ファイル: pages.py プロジェクト: tandalf/fara_principals
 def _all_exhibit_rows(self):
     return Selector(text=self._content).xpath(
         '//table[@class="apexir_WORKSHEET_DATA"]/tr[@class="even"] | ' + \
         '//table[@class="apexir_WORKSHEET_DATA"]/tr[@class="odd"]')
コード例 #46
0
    def parse_details(self, response):
        product = response.meta["product"]
        hxs = Selector(response)

        # Get standard shipping fee
        shipping_fee = hxs.xpath(
            './/div[@class="shpp_opt"]/p[@name="delivery_option_no" and text()[contains(.,"Qxpress")]]/em/text()').extract_first()
        if not shipping_fee:
            shipping_fee = hxs.xpath(
                './/div[@class="shpp_opt"]/ul/li/label[text()[contains(.,"Qxpress")]]/em/text()').extract_first()
        if not shipping_fee:
            shipping_fee = hxs.xpath(
                './/div[@class="shpp_opt"]/ul/li/label/em/text()').extract_first()
        if shipping_fee:
            product['shipping_fee'] = shipping_fee

        # Get number of reviews
        review = hxs.xpath('.//a[@tab_name="CustomerReview"]/em/text()').extract_first()
        if review:
            product['review'] = review

        # Get seller rating
        # Format 4.1 / 5
        seller_rating = hxs.xpath('//span[@class="on"]/text()').extract_first()
        if seller_rating:
            product["seller_rating"] = seller_rating.split(" ")[-1]

        # Get oversea location
        location = hxs.xpath('//dl[@name="shipping_panel_area"]/dd/text()').extract_first()
        if location:
            product["local_overseas"] = location

        # Get sku category
        category_list = hxs.xpath('//span[@itemprop="name"]/text()').extract()
        if category_list:
            for level in range(0, len(category_list)):
                if level > 2:
                    break
                product["category_level%s" % str(level + 1)] = category_list[level]

        # Get variations

        variation_list = hxs.xpath(
            '//div[@id="inventory_layer_0"]/div[@class="innerWrap"]/div[@class="select_inner"]/ul/li/a/span/text()').extract()
        if not variation_list:
            variation_list = hxs.xpath(
                '//div[@id="opt_layer_0"]/div[@class="innerWrap"]/div[@class="select_inner"]/ul/li/a/span/text()').extract()
        if variation_list:
            max_variations = 10
            for i in range(0, min(max_variations, len(variation_list))):
                if '----' not in variation_list[i]:
                    product["V%s" % str(i + 1)] = variation_list[i]
                    quantity = re.search(r'(Qty\s\:\s)([0-9]+)([\w]*)', variation_list[i])
                    if quantity:
                        product["Q%s" % str(i + 1)] = quantity.group(2)
                    price = re.search(r'(.+\()([+-]\$[0-9]+\.[0-9]+)(\)\w*)', variation_list[i])
                    if price:
                        product["P%s" % str(i + 1)] = price.group(2)

        yield product
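The two regular expressions at the end of parse_details pull the quantity and the price adjustment out of a variation label. A standalone illustration on a made-up label in the format those patterns expect:

import re

variation = "Red / XL (+$2.50) Qty : 37"   # made-up example label

quantity = re.search(r'(Qty\s\:\s)([0-9]+)([\w]*)', variation)
if quantity:
    print(quantity.group(2))   # '37'     -> stored in the Q<i> field

price = re.search(r'(.+\()([+-]\$[0-9]+\.[0-9]+)(\)\w*)', variation)
if price:
    print(price.group(2))      # '+$2.50' -> stored in the P<i> field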
コード例 #47
0
        )

driver.get("https://www.matrimonio.com/wedding-planner")
time.sleep(2)

try:
  cookiesBtnElem = driver.find_element_by_xpath("//button[text()='Accetta']")
  driver.execute_script("arguments[0].click()", cookiesBtnElem)
  time.sleep(1)
except:
  pass

while True:
  pageCntr += 1
  html = driver.page_source
  respObj = Selector(text=html)

  #if pageCntr > 27:
  cards = respObj.xpath("//div[@data-list-type='Catalog']/div[@id]")
  for card in cards:
    urlList.append(card.xpath(".//a[contains(@id, 'app_lnk')]/@href").get())

  nextPageType1 = respObj.xpath(f"//a[@data-page and text()='{pageCntr}']")
  nextPageType2 = respObj.xpath(f"//span[contains(@class, 'pagination') and text()='{pageCntr}']")
  
  if nextPageType1:
    nextBtnElem = driver.find_element_by_xpath(f"//a[@data-page and text()='{pageCntr}']")
    driver.execute_script("arguments[0].click()", nextBtnElem)
    time.sleep(2)
    print(f"\n\n PAGE-{pageCntr}")
  elif nextPageType2:
コード例 #48
0
ファイル: pages.py プロジェクト: tandalf/fara_principals
 def _all_principal_td(self):
     page_selector = Selector(text=self._content)
     return page_selector.xpath(
         '//td[starts-with(@headers, "LINK BREAK_COUNTRY_NAME")]')
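starts-with() in the XPath above keeps any cell whose headers attribute begins with the given prefix, which is useful when the suffix varies per row. A tiny illustration (the cell markup is made up):

from scrapy import Selector

html = ('<table><tr>'
        '<td headers="LINK BREAK_COUNTRY_NAME_01">kept</td>'
        '<td headers="LINK BREAK_COUNTRY_NAME_02">kept</td>'
        '<td headers="SOMETHING_ELSE">filtered out</td>'
        '</tr></table>')
sel = Selector(text=html)

cells = sel.xpath('//td[starts-with(@headers, "LINK BREAK_COUNTRY_NAME")]')
print(len(cells))   # 2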
コード例 #49
0
# Beautiful Soup, pyquery and regular expressions can all be used to extract data from web pages.
# Scrapy ships its own extraction mechanism: Selector. It is built on top of lxml and supports XPath selectors, CSS selectors and regular expressions, with very fast and accurate parsing.
# 1. Direct use: Selector is an independent module, so a selector object can be built directly from the Selector class and methods such as xpath and css called on it to extract data.
# For a piece of HTML, a Selector object can be built and used to extract data as follows:
from scrapy import Selector

body = '<html><head><title>Hello World</title></head><body></body></html>'
selector = Selector(text=body)
title = selector.xpath('//title/text()').extract_first()    # select the text inside <title>; appending text() to the XPath extracts the text content
print(title)
# This does not run inside the Scrapy framework: the Selector from Scrapy is used on its own. Passing the text parameter when constructing it
# creates a Selector object, and extraction then works just like parsing in Scrapy, by calling the xpath or css methods.

# 2. Scrapy shell: Selector is mainly used together with Scrapy. In a Scrapy callback the response object exposes xpath and css
# methods directly, so the Scrapy shell can be used to simulate the request/response cycle and explore these extraction methods.
# Using the sample page from the official documentation: http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
# Start the Scrapy shell from the command line with:
#   scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
# This enters Scrapy shell mode: Scrapy issues a request to the URL given on the command line and hands over the usable
# request and response variables; commands typed at the prompt call methods on these objects and the results are shown immediately.
# The demonstrations below all target this page's source, which looks like:
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
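The notes above mention that Selector supports CSS selectors and regular expressions as well as XPath; a short sketch against markup shaped like the sample page (the fragment below is abbreviated, not the full page):

from scrapy import Selector

body = ("<div id='images'>"
        "<a href='image1.html'>Name: My image 1 <img src='image1_thumb.jpg' /></a>"
        "<a href='image2.html'>Name: My image 2 <img src='image2_thumb.jpg' /></a>"
        "</div>")
sel = Selector(text=body)

print(sel.css('a::attr(href)').extract())            # CSS selector: ['image1.html', 'image2.html']
print(sel.xpath('//a/text()').re(r'Name:\s*(.*)'))    # regex over XPath results: ['My image 1 ', 'My image 2 ']
print(sel.css('a::text').re_first(r'Name:\s*(.*)'))   # first match only: 'My image 1 '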
コード例 #50
0
 def parse(self, response):
     """
     Parse the company detail page.
     """
     sel = Selector(text=response.body)
     print len(sel.xpath(u"//b[text()='单位名称']")) != 0, "parse condition"
     log.msg("parse condition=%s" % str(len(sel.xpath(u"//b[text()='单位名称']")) != 0), level=log.INFO)
     if (len(sel.xpath(u"//b[text()='单位名称']")) != 0):  # check whether the page is asking for a captcha
         pass
     else:
         log.msg("code=%s,  %s"%(str(response.status),response.body), level=log.INFO)
         raise UnknownResponseError
     #========================================================
     """
     Part 1: enterprise credit profile
     """
     item = DetailInformation()
     item['basic_info'] = fundation_info_extract(response)
     #========================================================
     #========================================================
     """
     Part 2: government regulatory information
     """
     item['regulatory_info'] = extract_combine_JCXX(response)
     #========================================================
     #========================================================
     """
     Part 3: industry evaluation information
     """
     keywords_list = ['2-1.体系/产品/行业认证信息',
         '2-2.行业协会(社会组织)评价信息',\
         '2-3.水电气通讯等公共事业单位评价']
     item['envaluated_info'] = block_info_extract(response,\
         keywords_list)
     #========================================================
     """
     Part 4: media evaluation information
     """
     keywords_list = ['3-1.媒体评价信息']
     item['media_env'] = block_info_extract(response, keywords_list)
     #========================================================
     """
     Part 5: finance and credit information
     """
     #url = 'http://www.11315.com/\
     #getTradeLendingCount?companyId=%s'%response.url[7:15]
     #header = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36",
     #  'Referer':response.url}
     #req = urllib2.Request(url=url, headers=header)
     #xtml = urllib2.urlopen(req)
     #Nums = xtml.read()
     #print Nums, "this is Nums"
     #Nums = eval(Nums).split(",")
     #print Nums, "this is anothor Nums"
     #total = str(sum([int(i) for i in Nums]))
     #Nums.insert(0, total)  # insert the total at the front
     #if total == '0':
     #    t_url = ""
     #else:
     #    t_url = sel.xpath(u"//script").re(ur"html\(\'<a href=\"([\w\W]*?)\"")[0]
     #Nums.append(t_url)
     #Nums_re = "|".join(Nums)
     keywords_list = ['4-2.民间借贷评价信息']
     item["credit_fin"] = block_info_extract(response, keywords_list)
     #=======================================================
     """
     Part 6: business operation information
     """
     #keywords_list = ['5-3.水电煤气电话费信息',
     #'5-4.纳税信息']                          # would need JS execution or extra simulated requests just for two rows of data; skipped
     #item['operation_info'] = block_info_extract(response, keywords_list)
     #========================================================
     """
     Part 7: market feedback information
     """
     keywords_list = ['6-1.消费者评价信息',
     '6-2.企业之间履约评价','6-3.员工评价信息',
     '6-4.其他']
     item['feedback_info'] = block_info_extract(response, keywords_list)
     #========================================================
     return item
コード例 #51
0
ファイル: jd_info.py プロジェクト: edwardlg/CrawlMan
def main():
    adsl = ADSL()
    result = []
    df_input = pd.read_excel('sku.xlsx')
    sku_list = df_input['sku'].values
    start = 0
    length = len(sku_list)

    while 1:

        if start == length:
            break
        print('正在爬取第{}条'.format(start + 1))
        sku = sku_list[start]
        options = webdriver.ChromeOptions()
        options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 999999.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        )

        options.add_argument('--headless')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--disable-gpu')

        driver = webdriver.Chrome(executable_path=r'chromedriver.exe',
                                  chrome_options=options)
        wait = WebDriverWait(driver, TIMEOUT)  # maximum time to wait for the page elements to load

        url = 'https://item.jd.com/{}.html'.format(sku)
        try:
            driver.get(url)
        except Exception as e:
            print(e)
            start += 1
            continue

        try:
            wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, '//a[@id="InitCartUrl"]')))
        except:
            print('访问超时,重试')
            start += 1
            continue

        text = driver.page_source
        resp = Selector(text=text)
        title = resp.xpath('//div[@class="sku-name"]/text()').extract()
        if len(title) > 1:
            title = title[1].strip()
        else:
            title = title[0].strip()
        price = resp.xpath(
            '//span[@class="p-price"]/span[2]/text()').extract_first()
        comment = resp.xpath(
            '//div[@id="comment-count"]/a/text()').extract_first()

        try:
            activity_type = resp.xpath(
                '//div[@class="activity-type"]/strong/text()').extract_first()
        except:
            activity_type = None

        area = resp.xpath(
            '//div[@class="ui-area-text"]/text()').extract_first()
        store = resp.xpath(
            '//div[@id="store-prompt"]/strong/text()').extract_first()
        d = {}

        d['title'] = title
        d['price'] = price
        d['comment'] = comment
        d['activity_type'] = activity_type
        d['area'] = area
        d['store'] = store
        d['sku'] = str(sku)
        d['url'] = url

        result.append(d)
        time.sleep(2 * random.randint(2, 6))
        driver.quit()  # quit() also terminates the chromedriver process started for this SKU
        start += 1

        adsl.reconnect()

        # append only the newly scraped row; writing the whole `result` list here would
        # re-append every earlier row to the CSV on each pass through the loop
        pd.DataFrame([d]).to_csv(output_filename, encoding='gbk', mode='a', header=False)

    print('爬取结束,共爬取了{}条'.format(length))
コード例 #52
0
# -*- coding: utf-8 -*-
from scrapy import Selector
import requests
response = requests.get("https://www.baidu.com").text
select = Selector(text=response)
title = select.xpath("//title/text()").extract_first()
print(title)
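# requests guesses the text encoding from the HTTP headers; if the title above comes out garbled, a hedged
# tweak (assumption -- it depends on the server's headers) is to set the encoding before reading .text:
resp = requests.get("https://www.baidu.com")
resp.encoding = resp.apparent_encoding    # or simply 'utf-8'
print(Selector(text=resp.text).xpath("//title/text()").extract_first())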
コード例 #53
0
    def parse(self, response):

        pagesource = Selector(response)

        tax_rate = .01
        interest = 0.0435
        loan_term = 30
        insurance = .5
        dp_percentage = 0.25

        total_page = re.findall(
            r"\d+",
            response.xpath('//span[@class="pageText"]//text()').extract()
            [0])[1]
        current_page = re.findall(
            r"\d+",
            response.xpath('//span[@class="pageText"]//text()').extract()
            [0])[0]

        search_results = pagesource.xpath(
            "//div[@class='MapHomeCardReact HomeCard']")

        for search in search_results:
            entry = RedfinTestItem()
            entry['price'] = float(''.join(
                re.findall(
                    r"\d+",
                    search.xpath(
                        './/span[@data-rf-test-name="homecard-price"]//text()'
                    ).extract()[0])))
            entry['street'] = search.xpath(
                './/span[@data-rf-test-id="abp-streetLine"]//text()').extract(
                )[0]
            entry['citystatezip'] = search.xpath(
                './/span[@data-rf-test-id="abp-cityStateZip"]//text()'
            ).extract()[0]
            entry['zipcode'] = re.findall(
                r"\d+",
                search.xpath(
                    './/span[@data-rf-test-id="abp-cityStateZip"]//text()').
                extract()[0])
            entry['HOA'] = ''.join(
                re.findall(
                    r"\d+",
                    search.xpath(
                        './/span[@data-rf-test-name="homecard-amenities-hoa"]//text()'
                    ).extract()[0]))
            entry['Beds'] = ''.join(
                search.xpath('.//div[@class="value"]//text()').extract()[0])
            entry['Baths'] = ''.join(
                search.xpath('.//div[@class="value"]//text()').extract()[1])
            entry['SQFT'] = ''.join(
                search.xpath('.//div[@class="value"]//text()').extract()[2])

            entry['year_built'] = search.xpath(
                './/span[@data-rf-test-name="homecard-amenities-year-built"]//text()'
            ).extract()[0]
            entry['rent'] = get_rent(str(entry['street']),
                                     str(entry['zipcode']))
            # principal is the price minus the 25% down payment; the original
            # `price * 1 - (dp_percentage)` only subtracted 0.25 dollars from the price
            entry['mortgage_pmt'] = float(
                Loan(entry['price'] * (1 - dp_percentage), interest,
                     loan_term).monthly_payment)
            entry['insurance'] = insurance * make_float(entry['SQFT'])
            if entry['insurance'] == 0:
                entry['insurance'] = 60
            entry['tax'] = entry['price'] * tax_rate / 12
            entry['total_pmt'] = make_float(
                entry['HOA']
            ) + entry['mortgage_pmt'] + entry['insurance'] + entry['tax']
            entry['cashflow'] = get_cashflow(entry['rent'], entry['total_pmt'])
            #, entry['price_estimate']
            yield entry

        if int(total_page) > int(current_page):
            if int(current_page) == 1:
                next_page = response.url + "/page-2"
            else:
                next_page = re.sub(r"[page-][\d]+",
                                   "-" + str(int(current_page) + 1),
                                   response.url)
            yield Request(next_page, callback=self.parse)
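# A hedged sketch of the amortization formula that Loan(...).monthly_payment presumably evaluates
# (the Loan class is imported elsewhere in the project and not shown here, so this is an assumption):
def monthly_payment(principal, annual_rate, years):
    r = annual_rate / 12.0    # monthly interest rate
    n = years * 12            # number of monthly payments
    if r == 0:
        return principal / n
    return principal * r * (1 + r) ** n / ((1 + r) ** n - 1)

# e.g. a 75% loan on a $400,000 listing at 4.35% over 30 years:
# monthly_payment(400000 * 0.75, 0.0435, 30)  ->  about 1493 per month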
コード例 #54
0
def amica(report_label, product, model):
    from globals import file_path

    if product[7].startswith('http'):
        page_address = product[7]
        driver.get(product[7])
        # Selector(text=...) expects a str, so use the decoded .text rather than raw bytes
        html = requests.get(product[7]).text
        sel = Selector(text=html)
    else:
        search = product[1][product[1].lower().find('amica') + len('amica') +
                            1:]
        amica_link = f'https://www.amica.pl/szukaj/{search}'
        driver.get(amica_link)
        html = requests.get(amica_link).text
        sel = Selector(text=html)

        # Find the matching model on the Amica site
        try:
            for i in range(len(sel.xpath('//div[@class="container"]'))):
                if driver.find_element_by_xpath(
                        f'//h3[@class="prodSymbol"][{i + 1}]').text == model:
                    page_address = driver.find_element_by_xpath(
                        f'//h3[@class="prodSymbol"][{i + 1}]/a').get_attribute(
                            'href')
                    break

        except NoSuchElementException:
            report_label[
                'text'] += f"Nie znaleziono {model} na stronie Amica. Pomijam go."
            return -1

        driver.find_element_by_css_selector(
            '#produkty > div.moreProducts > div > div > div > div > div > div > div.image > a'
        ).click()
    sleep(1)
    driver.find_element_by_css_selector(
        '#menu01 > div > div.product-view__media > img').click()

    first = driver.find_element_by_css_selector(
        '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper > '
        'div > div > img').get_attribute('src')

    # Save the product images and scale them down to thumbnails
    i = 0
    while i < 15:
        if i == 0:
            res = requests.get(first)
        else:
            desc_img = driver.find_element_by_css_selector(
                '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper '
                '> div > div > img').get_attribute('src')
            if desc_img == first:
                break
            res = requests.get(desc_img)
        with open(f'{file_path}/{model}/obrazki_produktu/{i}.jpg',
                  'wb') as file_format:
            file_format.write(res.content)
        try:
            driver.find_element_by_xpath(
                '//*[@id="prod_app"]/div[4]/div/div[2]/div[2]/button[2]/div'
            ).click()
        except ElementNotInteractableException:
            pass

        sleep(1)
        i = i + 1

    for y in range(i):
        im = Image.open(f'{file_path}/{model}/obrazki_produktu/{y}.jpg')
        file_format = im.format
        width, height = im.size
        if width > height:
            ratio = width / 600
        else:
            ratio = height / 600
        new_width = round(width / ratio)
        new_height = round(height / ratio)
        im = im.resize((new_width, new_height))
        if file_format == 'PNG':
            im.save(f'{file_path}/{model}/obrazki_produktu/{y}.jpg', 'PNG')
        elif file_format == 'JPEG':
            im.save(f'{file_path}/{model}/obrazki_produktu/{y}.jpg', 'JPEG')
        else:
            print(f"Nie umiem zrobić zdjęcia nr {y} :'( (typ {file_format})")
    driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE)

    html = requests.get(page_address).text
    sel = Selector(text=html)

    raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract()

    for i in range(len(raw)):
        raw[i] = raw[i].replace('\n', '')
        raw[i] = raw[i].replace('\t', '')
        raw[i] = raw[i].replace('\xa0', '')
        raw[i] = raw[i].replace('\r', '')
        raw[i] = raw[i].replace('  ', '')

    t = raw[0]
    t = t[t.find('"descTitle":'):]
    t = t[:t.find('}]}')]
    desc = []
    imgs = []
    while t.find('"descTitle":') != -1:
        t = t[t.find('"descTitle":') + 13:]
        desc.append(t[:t.find('"')])
        t = t[t.find('"descIconUrl":') + 15:]
        imgs.append(t[:t.find('"')])
        t = t[t.find('"descText":') + 12:]
        desc.append(t[:t.find('"')])

    for i in range(len(imgs)):
        imgs[i] = imgs[i].replace('\\', '')

    # download the description images to the local disk
    for i, img in enumerate(imgs):
        res = requests.get(img)
        with open(f'{file_path}/{model}/obrazki_opisu/{i}.jpg',
                  'wb') as file_format:
            file_format.write(res.content)

    for i in range(len(desc)):
        desc[i] = desc[i].replace('\\u0105', 'ą')
        desc[i] = desc[i].replace('\\u0119', 'ę')
        desc[i] = desc[i].replace('\\u0107', 'ć')
        desc[i] = desc[i].replace('\\u0144', 'ń')
        desc[i] = desc[i].replace('\\u015b', 'ś')
        desc[i] = desc[i].replace('\\u015a', 'Ś')
        desc[i] = desc[i].replace('\\u00f3', 'ó')
        desc[i] = desc[i].replace('\\u0141', 'Ł')
        desc[i] = desc[i].replace('\\u0142', 'ł')
        desc[i] = desc[i].replace('\\u017a', 'ź')
        desc[i] = desc[i].replace('\\u017b', 'Ż')
        desc[i] = desc[i].replace('\\u017c', 'ż')
        desc[i] = desc[i].replace('\\u017', 'Ź')
        desc[i] = desc[i].replace('\\u00ae', '®')
        desc[i] = desc[i].replace('\\u00b0', '°')
        desc[i] = desc[i].replace('\u00b0', '°')
        desc[i] = desc[i].replace('\u2070', '°')
        desc[i] = desc[i].replace('\\u2070', '°')
        desc[i] = desc[i].replace('\\u2013', '-')
        desc[i] = desc[i].replace('\u2013', '-')
        desc[i] = desc[i].replace('\\u2026', '...')
        desc[i] = desc[i].replace('\u2026', '...')
        desc[i] = desc[i].replace('\\n', '')
        desc[i] = desc[i].replace('\\/', '/')

    j = 0
    fin = ['<div class="product-description-section">']
    for i in range(0, len(desc), 6):
        fin.append('<div class="three-col-equaly">')
        try:
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j}.jpg"/><br/><h2 class="important-header">{desc[i]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 1]}</p></div>')
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j + 1}.jpg"/><br/><h2 class="important-header"> {desc[i + 2]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 3]}</p></div>')
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j + 2}.jpg"/><br/><h2 class="important-header"> {desc[i + 4]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 5]}</p></div>')
        except IndexError:
            pass
        finally:
            fin.append('</div>')
        j = j + 3
    fin.append('</div>')

    reg = ''.join(fin)
    reg = reg.replace(
        '*Zdjęcie ma charakter poglądowy i może nie przedstawiać dokładnego modelu produktu.',
        '')
    print("------------ OPIS GRAFICZNY ------------")
    print(reg + '\n\n')
    """ OPIS TECHNICZNY """
    html = requests.get(page_address).content
    sel = Selector(text=html)

    tech_raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract()
    tech_raw2 = tech_raw[0]
    tech_d = tech_raw2[tech_raw2.find('"attrGroupData"'):tech_raw2.
                       find('"docFilesDataList"')]

    tech_desc_1 = []
    while tech_d.find('"attrName":') != -1:
        tech_d = tech_d[tech_d.find('"attrName":') + 12:]
        tech_desc_1.append(tech_d[:tech_d.find('"')])
        tech_d = tech_d[tech_d.find('"attrValue":') + 13:]
        tech_desc_1.append(tech_d[:tech_d.find('"')])

    tech_d2 = tech_d[tech_d.find(tech_desc_1[-1]):]

    tech_desc_2 = []
    while tech_d2.find('"attrValue":') != -1:
        tech_d2 = tech_d2[tech_d2.find('"attrValue":') + 13:]
        tech_desc_2.append(tech_d2[:tech_d2.find('"')])

    tech_desc = [
        '<table id="plan_b" class="data-table"><tbody><tr class="specs_category"><td '
        'colspan="2">Specyfikacja</td></tr>'
    ]
    for i in range(0, len(tech_desc_1), 2):
        tech_desc.append(f'<tr><td class="c_left">{tech_desc_1[i]}</td>')
        tech_desc.append(f'<td class="c_left">{tech_desc_1[i + 1]}</td></tr>')

    for i in range(len(tech_desc_2)):
        if i == 0:
            tech_desc.append(f'<tr><td class="c_left">Funkcje</td>')
            tech_desc.append(f'<td class="c_left">{tech_desc_2[i]}</td></tr>')
        else:
            tech_desc.append(f'<tr><td class="c_left"></td>')
            tech_desc.append(f'<td class="c_left">{tech_desc_2[i]}</td></tr>')
    tech_desc.append('</tbody></table>')

    for i in range(len(tech_desc)):
        tech_desc[i] = tech_desc[i].replace('\\u0105', 'ą')
        tech_desc[i] = tech_desc[i].replace('\\u0119', 'ę')
        tech_desc[i] = tech_desc[i].replace('\\u0107', 'ć')
        tech_desc[i] = tech_desc[i].replace('\\u0144', 'ń')
        tech_desc[i] = tech_desc[i].replace('\\u015b', 'ś')
        tech_desc[i] = tech_desc[i].replace('\\u015a', 'Ś')
        tech_desc[i] = tech_desc[i].replace('\\u00f3', 'ó')
        tech_desc[i] = tech_desc[i].replace('\\u0141', 'Ł')
        tech_desc[i] = tech_desc[i].replace('\\u0142', 'ł')
        tech_desc[i] = tech_desc[i].replace('\\u017a', 'ź')
        tech_desc[i] = tech_desc[i].replace('\\u017b', 'Ż')
        tech_desc[i] = tech_desc[i].replace('\\u017c', 'ż')
        tech_desc[i] = tech_desc[i].replace('\\u017', 'Ź')
        tech_desc[i] = tech_desc[i].replace('\\u00ae', '®')
        tech_desc[i] = tech_desc[i].replace('\\u00b0', '°')
        tech_desc[i] = tech_desc[i].replace('\u00b0', '°')
        tech_desc[i] = tech_desc[i].replace('\u2070', '°')
        tech_desc[i] = tech_desc[i].replace('\\u2070', '°')
        tech_desc[i] = tech_desc[i].replace('\\u2013', '-')
        tech_desc[i] = tech_desc[i].replace('\u2013', '-')
        tech_desc[i] = tech_desc[i].replace('\\u2026', '...')
        tech_desc[i] = tech_desc[i].replace('\u2026', '...')
        tech_desc[i] = tech_desc[i].replace('\\n', '')
        tech_desc[i] = tech_desc[i].replace('\\/', '/')
        tech_desc[i] = tech_desc[i].replace(':', '')

    tech = ''.join(tech_desc)
    print('------------ OPIS TECHNICZNY ------------')
    print(tech + '\n\n')
    """ OPIS KRÓTKI """
    for i in range(len(tech_desc_1)):
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0105', 'ą')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0119', 'ę')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0107', 'ć')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0144', 'ń')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u015b', 'ś')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u015a', 'Ś')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u00f3', 'ó')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0141', 'Ł')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0142', 'ł')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u017a', 'ź')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u017b', 'Ż')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u017c', 'ż')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u017', 'Ź')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u00ae', '®')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u00b0', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\u00b0', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\u2070', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u2070', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u2013', '-')
        tech_desc_1[i] = tech_desc_1[i].replace('\u2013', '-')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u2026', '...')
        tech_desc_1[i] = tech_desc_1[i].replace('\u2026', '...')
        tech_desc_1[i] = tech_desc_1[i].replace('\\n', '')
        tech_desc_1[i] = tech_desc_1[i].replace('\\/', '/')
        tech_desc_1[i] = tech_desc_1[i].replace(':', '')

    if len(tech_desc_1) < 12:
        n = len(tech_desc_1)
    else:
        n = 12

    short = ['<ul>']
    for i in range(0, n, 2):
        short.append(f'<li>{tech_desc_1[i]}: {tech_desc_1[i + 1]}</li>')
    short.append('</ul>')

    short = '\n'.join(short)
    print('------------ OPIS KRÓTKI ------------')
    print(short + '\n\n')

    return [reg, short, tech]
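# The three .replace() chains above decode JSON-style \uXXXX escapes by hand (and the '\\u017' entry looks
# like a truncated '\\u0179'). A sketch of a roughly equivalent helper -- `unescape` is not part of the
# original project -- that decodes any such escape in one pass:
import re

def unescape(s: str) -> str:
    # turn literal \uXXXX sequences into the corresponding characters, then clean up the
    # escaped slashes and newlines left over from the embedded JSON
    s = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), s)
    return s.replace('\\/', '/').replace('\\n', '')

# usage: desc = [unescape(d) for d in desc]
# (the original additionally maps already-decoded en dashes, ellipses and degree signs to ASCII,
#  and strips ':' from the technical-table entries)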
コード例 #55
0
info_transit = []
info_school = []
info_hospital = []
info_food = []
info_shopping = []
info_environment = []
id = 3211228186

wait = WebDriverWait(browser, 10)

browser.get(
    "http://esf.cd.fang.com/newsecond/map/NewHouse/NewProjMap.aspx?newcode={0}"
    .format(id))
time.sleep(1)
t_selector = Selector(text=browser.page_source)


def get_transit_detail():
    distance_lists = t_selector.xpath("//td/text()").extract()[1:]
    tag_names_list = t_selector.xpath("//th/text()").extract()
    for tag_name, distance in zip(tag_names_list, distance_lists):
        match_tag_name = re.match('【(.*)】(.*)', tag_name)
        if match_tag_name:
            tag = match_tag_name.group(1).encode('gbk', 'ignore').decode('gbk')
            nearname = match_tag_name.group(2).encode('gbk',
                                                      'ignore').decode('gbk')
        else:
            tag = ""
            nearname = ""
        match_distance = re.match(r'.*?(\d+)米', distance)  # raw string so \d is not treated as an invalid escape
コード例 #56
0
ファイル: date_pages_spider.py プロジェクト: biatov/top
 def parse(self, response):
     root = Selector(response)
     item = HorseRacingItem()
     for each in root.xpath('//select[@id="raceDateSelect"]'):
         item['date_pages'] = each.xpath('.//option/@value').extract()
         return item
コード例 #57
0
    def postman(self, threadID, date, class_start=None, class_end=None):

        url = "https://bases-marques.inpi.fr/Typo3_INPI_Marques/marques_resultats_liste.html"

        payload = self.forge_payload(date, class_start, class_end)
        headers = {
            'accept':
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'accept-encoding':
            "gzip, deflate, br",
            'accept-language':
            "en-US,en;q=0.9",
            'Cache-Control':
            "no-cache",
            'connection':
            "keep-alive",
            'content-length':
            "183",
            'content-type':
            "application/x-www-form-urlencoded",
            'host':
            "bases-marques.inpi.fr",
            'origin':
            "https://bases-marques.inpi.fr",
            'referer':
            "https://bases-marques.inpi.fr/Typo3_INPI_Marques/marques_recherche_avancee.html",
            'upgrade-insecure-requests':
            "1",
            'user-agent':
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu \
            Chromium/64.0.3282.167 Chrome/64.0.3282.167 Safari/537.36",
        }

        response = self.s.request("POST", url, data=payload, headers=headers)
        text_response = response.text

        soup = BeautifulSoup(text_response, features='html.parser')
        html_response = soup.prettify()

        with open(self.url_directory + date + '.txt', 'wb') as \
                my_file:
            my_file.write(html_response.encode('utf-8'))

        sel = Selector(text=html_response)
        company_number = 0
        try:
            company_number = sel.css(
                "div.csc-default:nth-child(2) div.tx-pitrechercheinpi-pi1:nth-child(1) form:nth-child(1) \
                div.txtresultats:nth-child(3) p:nth-child(1) > strong:nth-child(1)::text"
            ).extract()[0].strip()
            company_number = self.is_int(company_number)
        except:
            pass
        print(str(date) + " : spider-" + str(threadID))

        if company_number > 500:
            self.split_postman(date, self.threadID, 0, 10)
            self.split_postman(date, self.threadID, 11, 19)
            self.split_postman(date, self.threadID, 21, 27)
            self.split_postman(date, self.threadID, 28, 35)
            self.split_postman(date, self.threadID, 36, 45)
            return

        for i in range(1, company_number + 1):
            if i % 5 == threadID:
                self.detail_annonce(i, date)
            else:
                pass
コード例 #58
-1
    def parse(self, response):
        sel = Selector(response=response)
        comment_tables = sel.xpath('//div[@class="sub_ins"]/table')
        movie_id = re.findall(u'/subject/(\d+?)/', response.url)[0]
        for comment_table in comment_tables:
            user_info = dict()
            user_info['movie_id'] = movie_id
            comment_user_img_ele = comment_table.xpath('.//img')
            if comment_user_img_ele:
                comment_user_img = comment_user_img_ele.xpath('@src')
                user_info['img'] = comment_user_img.extract()[0]

            comment_username = comment_table.xpath('.//div[@class="pl2"]/a')
            if comment_username:
                username_str = comment_username.xpath('text()').extract()[0]
                user_info['name'] = username_str.strip()

                username_href = comment_username.xpath('@href').extract()[0]
                user_info['url'] = username_href.strip()

                comment_user_addr = comment_username.xpath('.//span')
                if comment_user_addr:
                    user_addr_str = comment_user_addr.xpath('text()').extract()[0]
                    user_info['address'] = user_addr_str.strip()[1:-1]

            comment_date = comment_table.xpath('.//p[@class="pl"]')
            if comment_date:
                user_info['date'] = comment_date.xpath('text()').extract()[0].strip()

            comment_content = comment_table.xpath('.//tr/td/p')
            if len(comment_content) == 2:
                p_values = comment_content.xpath('text()').extract()
                user_info['comment_content'] = p_values[len(p_values) - 1]
コード例 #59
-1
ファイル: spider.py プロジェクト: feilaoda/easyspider
    def on_detail_page(self, response):
        if response.url == response.old_url:
            try:
                hxs = Selector(text=response.content)

                summary = hxs.xpath('//div[@class="card-summary-content"]/*').extract()
                content = []
                for ctx in summary:
                    text = clean_html_text(ctx)
                    content.append(text)
                content_text = " ".join(content)
                content_text=content_text.replace("[1]","")
                content_text=content_text.replace("[2]","")
                
                item_dict={}
                items = hxs.xpath('//div[@class="baseInfoWrap"]/div/div/*')
                
                for item in items:
                    title = item.xpath('./span/text()').extract()
                    title_value = item.xpath('./div/text()').extract()
                    print("key:value", to_value(title), to_value(title_value))
                    item_dict[to_value(title)] = to_value(title_value)
                
                item_dict['summary'] = content_text
                imgs = hxs.xpath('//div[@class="lemma-picture summary-pic"]/a/img/@src').extract()
                item_dict['logo'] = to_value(imgs)
                print(item_dict)
                # save_content(self.site.name, url, json.dumps(item_dict))
                # update_url(self.site.name, url, 200)
                return item_dict
            except Exception as e:
                # update_url(self.site.name, url, 500)
                logging.error(e)
コード例 #60
-1
ファイル: doubanmovie.py プロジェクト: Suluo/spider-Scrapy
    def reviews_parse(self,response):
        hxs = Selector(response)
        # print 11111111
        item = reviewsItem()

        sites = hxs.xpath('//*[@class="article"]/div[2]/div[@class="item"]/div[@class="info"]/ul')
        # sites = hxs.xpath('//*[@class="article"]/div[2]/div[@class="item"]/div[@class="info"]')
        for site in sites:
            item['userID'] = re.findall('people/(.+)/collect',response.url)
            # print response.url
            item['moviename'] = site.xpath('li[@class="title"]/a/em/text()').extract()
            item['movieID'] = site.xpath('li[@class="title"]/a/@href').re('subject/(.+)/$')

            moviesUrl =site.xpath('li[@class="title"]/a/@href').extract()[0]
            yield Request(url=moviesUrl,callback=self.movie_parse)

            item['ratingdate'] = site.xpath('li[3]/span[@class="date"]/text()').extract()
            if re.findall(r'rating\d+-t', site.xpath('li[3]/span[1]/@class').extract()[0]):
                item['rating'] = site.xpath('li[3]/span[1]/@class').re(r'\d+')
            else:
                item['rating'] = [u'']
            if site.xpath('li[4]/span[@class="comment"]/text()').extract():
                item['comment'] = site.xpath('li[4]/span[@class="comment"]/text()').extract()
            else:
                item['comment'] = [u'']
            yield item
            # print item

        if hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract():
            nextreviewsUrl = hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract()[0]
            # print nextreviewsUrl
            yield Request(url=nextreviewsUrl, callback=self.reviews_parse)
        pass