Code Example #1
File: filmla.py Project: joelwilson/filmscraper
    def parse_finalized_permits(self, response):
        """Extracts the download URLs for all finalized permits."""
        self.logger.info("Parsing finalized permits at " + response.url)
        urls = []

        if self.pop_shell:
            from scrapy.shell import inspect_response

            inspect_response(response, self)

        permitViewHrefs = response.xpath('//a[text()="view"]/@href').extract()
        for href in permitViewHrefs:
            url = response.urljoin(href)
            self.logger.debug("Found URL: " + url)
            urls.append(url)

        yield FinalizedPermitsPage(file_urls=urls)

        next_page_href = response.xpath('//a[text()="next >"]/@href').extract_first()
        if not next_page_href:
            self.logger.info("No pages left!")
            return
        else:
            self.logger.info("Moving on to the next page at " + next_page_href)
            yield scrapy.Request(url=response.urljoin(next_page_href), callback=self.parse_finalized_permits)
Code Example #2
File: att_spider.py Project: craigkerstiens/simplerm
 def parse_tastypage(self, response):
     hxs = HtmlXPathSelector(response)
     print '--------------------------------------'
     print hxs.select('//title').extract()
     print '--------------------------------------'
     from scrapy.shell import inspect_response
     inspect_response(response)
Code Example #3
    def parse_details(self, response):
        item = response.meta.get('item', None)
        if item:
            # populate more `item` fields
            return item
        else:
            from scrapy.shell import inspect_response
            inspect_response(response, self)
Code Example #4
File: crawlFoody.py Project: dongchirua/FoodyCrawl
 def parse_items(self, response):
     """
     Second step: yield item
     """
     page_number = response.meta['page']
     response_ = json.loads(response.body)
     if self.is_debug:
         inspect_response(response, self)
     for i in response_['searchItems']:
         item = FoodyItem()
         item['city'] = i['City']
         item['district_id'] = i['DistrictId']
         item['main_url'] = self.allowed_domains[0] + i['DetailUrl']
         item['addr'] = i['Address']
         item['mobile_pic'] = re.sub('^//', '', i['MobilePicturePath'])
         item['large_pic'] = re.sub('^//', '', i['PicturePathLarge'])
         item['main_category_id'] = i['MainCategoryId']
         item['lat'] = i['Latitude']
         item['long'] = i['Longitude']
         item['views'] = i['TotalView']
         item['cuisines'] = dict()
         item['cuisines']['id'] = list()
         item['cuisines']['name'] = list()
         for j in i['Cuisines']:
             item['cuisines']['id'].append(j['Id'])
             item['cuisines']['name'].append(j['Name'])
         item['rating'] = i['AvgRating']
         item['id'] = i['Id']
         item['category'] = i['MainCategoryId']
         yield item
Code Example #5
File: CT_DOI.py Project: mcialini/tsne
    def dostuff(self, response):
        from scrapy.shell import inspect_response
        inspect_response(response)
        sel = Selector(response)
        rowSelector = '//table[@id="GridView1"]//tr'
        rows = sel.xpath(rowSelector)

        logging.info('---------CALLED PARSE*********')
        for r in range(1, len(rows)-2):
            # The last row is the footer
            item = default(FILEITEM)
            group = default(FILEGROUP)
            cols = rows[r].xpath('td')
            name = cols[0].xpath('text()').extract()[0].encode('ascii', 'ignore').upper()
            disp = cols[1].xpath('text()').extract()[0].encode('ascii', 'ignore')
            date = cols[2].xpath('text()').extract()[0].encode('ascii', 'ignore')
            y = re.compile('\d\d\d\d')
            year = y.findall(date)[0].encode('ascii', 'ignore')
            type = cols[3].xpath('text()').extract()[0].encode('ascii', 'ignore')
            link = cols[4].xpath('a')
            url = link.xpath('@href').extract()[0]
            title = 'Disposition'

            item['source'] = url
            item['name'] = name
            item['state'] = state
            item['year'] = year

            group['items'].append(item)
            logging.info(item['name'])
            item = default(FILEITEM)
Code Example #6
File: 400kb.py Project: patrick-lu/auto
    def shareimage_post(self, response):
        #register_openers()
        params = {}
        basedir = settings['IMAGES_STORE']
        key = response.meta['key']
        path_comps = key.split('/')
        filename = os.path.join(basedir, *path_comps)
        print filename
        inspect_response(response)
        params["userfile[]"] = open(filename, "rb")
        params["private_upload"] = 0
        datagen, headers = multipart_encode(params)
        upload_url = "http://shareimage.org/upload.php"
        req = urllib2.Request(upload_url, datagen, headers)
        print response.meta['cookie']
        req.add_header('Cookie', response.meta['cookie'])
        result = urllib2.urlopen(req)
        content = result.read()
        print content
        response.meta['tttt'] = content
        inspect_response(response)
        p = re.compile(r'value="(http://\S+)"')
        match = p.search(content)
        if match:
            print match.group(1)
            self.tt['publish']['imgs'].append(match.group(1))
            saveItem(self.tt)
        else:
            self.log("ERROR: failed to upload image")
        #inspect_response(response)
        return
Code Example #7
File: spiderreviews.py Project: mart2010/brd
    def parse_reviews(self, response):
        widx = response.meta['work_index']
        item = response.meta['pass_item']

        if response.meta.get('main_page'):
            # it may happen that uid taken from search result is different (request was redirected)
            if response.meta.get('redirect_urls'):
                item['work_uid'] = response.url[response.url.index('/livres/')+8:]
            # fetch tags from main page
            tags_sel = response.xpath('//p[@class="tags"]/a[@rel="tag"]')
            tags_t = []
            tags_n = []
            # only way to get approx. frequency is through font-size
            for tag_s in tags_sel:
                tags_t.append(tag_s.xpath('./text()').extract_first().strip())
                # "tag_t17 tc0 ..."
                tag_n_c = tag_s.xpath('./@class').extract_first()
                tags_n.append(tag_n_c[5:tag_n_c.index(u' ')])

            item['tags_t'] = u"__&__".join(tags_t)
            item['tags_n'] = u";".join(tags_n)

            item['tags_lang'] = u'fre'

            # request first review page
            yield scrapy.Request((self.url_main + self.param_review) % (item['work_uid'], 1),
                                 meta={'work_index': widx, 'pass_item': item},
                                 callback=self.parse_reviews)
        # collect from reviews page
        else:
            last_page = response.meta.get('last_page')
            if last_page is None:
                page_row = response.xpath('//div[@class="pagination row"]')
                if len(page_row) == 0:
                    last_page = 1
                else:
                    last_page = int(page_row.xpath('./a[last()-1]/text()').extract_first())
            # used for debugging... could be removed
            if response.url.find('?pageN=') == -1 or response.url.find('&tri=') == -1:
                from scrapy.shell import inspect_response
                inspect_response(response, self)

            cur_page = int(response.url[response.url.index('?pageN=') + 7:response.url.index('&tri=')])
            found_older = False
            reviews_sel = response.xpath('//div[@class="post_con"]')
            for rev in reviews_sel:
                new_item = self.extract_onereview(item, rev)
                comp = self.within_harvestperiod(new_item, self.works_to_harvest[widx]['last_harvest_date'])
                if comp == -1:
                    found_older = True
                    break
                elif comp == 0:
                    yield new_item
            if cur_page < last_page and not found_older:
                yield scrapy.Request((self.url_main + self.param_review) % (item['work_uid'], cur_page + 1),
                                     meta={'work_index': widx, 'last_page': last_page, 'pass_item': item},
                                     callback=self.parse_reviews)
Code Example #8
File: login.py Project: StrongZhu/scrapy-webtools
 def pre_login(self, response):
     if self.login_debug:
         inspect_response(response, self)
     formdata = dict()
     if self.login_data:
         formdata.update(self.login_data)
     formdata[self.username_field] = self.username or ""
     formdata[self.passwd_field] = self.passwd or ""
     self.verbose("final: %s" % formdata)
     yield scrapy.FormRequest.from_response(response, formdata=formdata, callback=self.submit_login, dont_filter=True)
Code Example #9
File: example.py Project: ShriramK/tutorial
    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        from scrapy.shell import inspect_response
        inspect_response(response)

        hxs = HtmlXPathSelector(response)
        item = Item()
        item['id'] = hxs.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = hxs.select('//td[@id="item_name"]/text()').extract()
        item['description'] = hxs.select('//td[@id="item_description"]/text()').extract()
        return item
Code Example #10
File: userColumn.py Project: heamon7/zhUser
    def parsePage(self,response):

        if response.status != 200:
            yield FormRequest(url =response.request.url,
                                              #headers = self.headers,
                                              meta={'params':response.meta['params']
                                                  ,'xsrfValue':response.meta['xsrfValue']
                                                  ,'userDataId':response.meta['userDataId']
                                                  ,'offset':response.meta['offset']},
                                              formdata={
                                                  'method':'next',
                                                  'params':response.meta['params'],
                                                  '_xsrf':response.meta['xsrfValue'],

                                              },
                                              dont_filter = True,
                                              callback = self.parsePage
                                              )
        else:
            item =  UserColumnItem()
            data = json.loads(response.body)
            columnList = data['msg']
            inspect_response(response,self)
            item['spiderName'] = self.name
            # logging.warning('response.meta[params]: %s \n response.body: %s',response.meta['params'],response.body)
            # note: handle the case where the list contains anonymous users
            if columnList:

                res = Selector(text = ''.join(columnList))
                item['userDataId'] = response.meta['userDataId']
                item['offset'] = response.meta['offset']

                for sel in res.xpath('//div[contains(@class,"zm-profile-section-item")]'):
                    item['columnLinkId'] = sel.xpath('a[@class="zm-list-avatar-link"]/@href').re(r'http://zhuanlan.zhihu.com/(.*)')[0]
                    item['columnImgLink'] = sel.xpath('a[@class="zm-list-avatar-link"]/img/@src').extract()[0]
                    item['columnId'] = sel.xpath('div[contains(@class,"zm-profile-section-main")]/button/@id').extract()[0]

                    try:
                        item['columnDescription'] = sel.xpath('div[contains(@class,"zm-profile-section-main")]/div[contains(@class,"description")]/text()').extract()[0]
                    except:
                        # logging.warning('item[columnLinkId]: %s',item['columnLinkId'])
                        item['columnDescription'] = ''
                    item['columnPostCount'] = sel.xpath('div[contains(@class,"zm-profile-section-main")]/div[contains(@class,"meta")]/span/text()').re(r'(\d+)')[0]

                    # note: userLinkId may contain Chinese characters

                    
                    yield item


            else:
                # no users
                item['userDataId']=''
                yield item
Code Example #11
File: dianping.py Project: babykick/dphunter
 def parse_board_list(self, response):
     self.log("parse board list from %s" % response.url)
     inspect_response(response, self)
     for dt in response.xpath("//div/dl[contains(@class, 'perList')]"):
        link = dt.xpath("./dt/span/a/@href").extract_first()
        title = dt.xpath("./dt/span/a/img/@title").extract_first()
        intro = dt.xpath("./dd/div/p/text()").extract_first()
        url = urlparse.urljoin(self.root_domain, link)
        yield Request(url=url, callback=self.parse_board, meta={'board':{'url':url,
                                                                         'title': title,
                                                                         'intro': intro
                                                                         }})
Code Example #12
    def after_login(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response)

        # check login succeed before going on
        if "Marketplace" not in response.body:
            scrapy.log.msg("Login failed", level=log.ERROR)
            return

        for idx, sel in enumerate(response.xpath('//table[contains(@class, "mpitems")]/tbody/tr')):
            try:
                record = DiscogsRecord()
                descXpath = sel.xpath('td[@class="item_description"]')
                record['title'] = descXpath.xpath('span[@class="br_item_title"]/a/text()').extract()[0].strip()

                info = sel.xpath('td[@class="item_description"]/text()').extract()
                record['catNum'] = info[6].strip()
                record['mediaCondition'] = info[8].strip()
                record['sleeveCondition'] = info[10].strip()
                record['sellerNotes'] = info[11].strip()
                record['label'] = sel.xpath('td[@class="item_description"]/a/text()').extract()[0].strip()

                sellerXpath = sel.xpath('.//td[@class=" seller_info"]')
                discogsSeller = DiscogsSeller()

                sellerInfo = sellerXpath.xpath('ul/li/b/a/text()').extract()

                discogsSeller['name'] = record['seller'] = sellerInfo[0].strip()

                if len(sellerInfo) > 1:
                    p = re.compile(r'.*(\d+).*')
                    m = p.match(sellerInfo[1].strip())
                    discogsSeller['numItems'] = int(m.group(1))
                else:
                    discogsSeller['numItems'] = 0

                discogsSeller['country'] = filter(lambda x : len(x) != 0, [x.strip() for x in sellerXpath.xpath('ul/li/text()').extract()])[0]

                priceXpath = sel.xpath('.//td[@align="center"]')
                record['price'] = priceXpath.xpath('span[@class="price"]/text()').extract()[0]
                record['shipping'] = priceXpath.xpath('span[@style="color:#555"]/text()').extract()[0].strip()

                yield discogsSeller
                yield record
            except:
                print "index %d" % idx
                print sel

                from scrapy.shell import inspect_response
                inspect_response(response)

                raise
Code Example #13
    def parse(self, response):
        inspect_response(response)
        all_provinces_value = response.xpath(
            '//select[@name="propinsi"]/option[contains(., "All")]/@value'
        ).extract()[0]

        return FormRequest.from_response(
            response,
            formdata={
                'propinsi': all_provinces_value,
                'keyword': '',
                'submit': 'search!',
            },
            callback=self.parse_list
        )
Code Example #14
File: ZBJ_crawler.py Project: naifan/ZBJ
    def parse_item(self, response):
        #inspect one response
        if  response.url:
            from scrapy.shell import inspect_response
            inspect_response(response, self)
            
        tasks = response.xpath('//div[@class="success-task-list clearfix" ]/ul')

        for task in tasks:
            self.logger.info('parse '+response.url)
            item = ZBJItem()
            item['title'] = task.xpath('li[@class="task-item-title-li"]/a/text()').extract()[0]
            item['url'] = task.xpath('li[@class="task-item-title-li"]/a/@href').extract()[0]
            print item
            yield item
Code Example #15
File: yesebbs.py Project: patrick-lu/auto
    def parse_thread(self, response):
        self.log("thread url:%s" % response.url)
        hxs = HtmlXPathSelector(response)
        first_floor = hxs.select('//div[starts-with(normalize-space(@id),"post_")]')[0]
        content = first_floor.select('.//td[starts-with(normalize-space(@id),"postmessage_")]')[0]
        imgs = content.select('.//img/@src').extract()
        if len(imgs) == 0:
            self.threads_db.remove({"url": response.url})
            return
        #items = []
        #for img in imgs:
        item = AutobtItem()
        item['name'] = response.url
        item['image_urls'] = imgs
        inspect_response(response)

        all = content.extract()
        con = {"name": "", "size": "", "format": ""}
        colon = u'\uff1a'
        tags = {
            u'\u5f71\u7247\u540d\u7a31': "name",
            u'\u5f71\u7247\u540d\u79f0': "name",
            u'\u5f71\u7247\u683c\u5f0f': "format",
            u'\u5f71\u7247\u5927\u5c0f': "size",
        }

        for key in tags:
            index = all.find(key)
            if index > 0:
                name = all[index:]
                index = name.find("<br>")
                if index < 0:
                    index = name.find("\t")
                if index > 0:
                    name = name[0:index]
                    index = name.find(colon)
                    if index > 0:
                        con[tags[key]] = name[(index + 1):]
                    else:
                        index = name.find(u":")
                        if index > 0:
                            con[tags[key]] = name[(index + 1):]

        tt = self.threads_db.find_one({"url": response.url})
        tt["content"] = con
        tt['raw_content'] = all
        self.threads_db.save(tt)
        return item
Code Example #16
File: xmly.py Project: pysteven/sxs_spider
    def parse(self, response):
        audios  = response.css('.album_soundlist ul li')
        visitedIdSet = response.meta['audioId']
        allAudioNotVisited = True
        for audio in audios:
            sound_id = audio.xpath('@sound_id').extract()[0]
            if sound_id in visitedIdSet:
                allAudioNotVisited = False
            else:
                # visit the addresses that have not been visited yet
                pass

        print response.meta

        from scrapy.shell import inspect_response
        inspect_response(response,self)
        pass
Code Example #17
    def parse_item(self, response):
        #Debugger:
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        
        #if response =[]:
        #    self.start_urls = the_
        item = RedditItem()

        item['dates'] = response.xpath('//div[@class="search-result-meta"]/span[@class="search-time"]/time/@title').extract()
        item['authors'] = response.xpath('//div[@class="search-result-meta"]/span[@class="search-author"]//a/text()').extract()
        item['votes'] = response.xpath('//div[@class="search-result-meta"]/span[@class="search-score"]/text()').extract()
        
        #self.last_date = item['dates'][-1]

        yield item
Code Example #18
File: weibo.py Project: dragonly/WBPC
    def parse_fans_page(self, response):
        html = response.body.replace(r'\"','').replace(r'\/','/').replace(r'\t','\t').replace(r'\r\n','\n')
        start = re.search('html":([^}]+)follow_box', html).start(1)
        end   = html.find('"})', start)
        posts_cleaned = html[start:end]
        sel = Selector(text=posts_cleaned)

        response.sel = sel
        inspect_response(response, self)
        
        user_node_list = sel.css('.follow_item')
        for user_node in user_node_list:
            user_item = UserItem()
            
            name_node = user_node.css('.info_name')
            user_item['name'] = name_node.xpath('a[1]/text()').extract()
            user_item['name'] = self._extract_or_empty_string(user_item['name'])
            user_item['uid'] = name_node.xpath('a[1]/@usercard').extract()
            user_item['uid'] = self._extract_or_empty_string(user_item['uid'])
            if user_item['uid'] != '':
                user_item['uid'] = user_item['uid'].split('=')[1]
            is_male = name_node.css('.icon_male')
            is_female = name_node.css('.icon_female')
            if is_male:
                user_item['gender'] = 'm'
            elif is_female:
                user_item['gender'] = 'f'
            verified = name_node.css('[href="http://verified.weibo.com/verify"]')
            user_item['verified'] = True if verified else False
            club = name_node.css('[href="http://club.weibo.com/intro"]')
            user_item['club'] = True if club else False

            intro_node = user_node.css('.info_intro')
            user_item['intro'] = intro_node.xpath('span')

            tag_node = user_node.css('.person_label')
            user_item['tags'] = list(tag_node.xpath('./a/text()').extract())

            yield user_item
#            for page in range(NUM_POSTS_PAGE):
            page=0
            for rqst in self._make_request_posts(user_item['uid'], page):
                yield rqst
            
            for rqst in self._make_request_fans(user_item['uid']):
                yield rqst
Code Example #19
    def parse(self, response):
        currency_code = re.compile("[A-Z]{3}")
        unit = re.compile("\A[10]+\Z")
        price = re.compile("\A\d+[.,]{1}\d+\Z")

        bank_name = bank_from_url[response.url]
        try:
            table = bs(response.xpath(banks[bank_name][
                "selector"]).extract()[0], "lxml")
        except:
            inspect_response(response, self)
        rows = [row for row in table.findAll("tr")]
        total_n_cols = 0
        for row in rows:
            total_n_cols += len(row.findAll("td"))
        # If number of columns is less than average, ignore row
        try:
            avg_cols = int(ceil(total_n_cols / len(rows)))
        except:
            inspect_response(response, self)
        for row in rows:
            columns = row.findAll("td")
            if len(columns) < avg_cols:
                continue
            item = CurrencyItem()
            item["bank"] = bank_name
            item["date"] = datetime.today().date()
            for column in columns:
                col_content = column.text.strip()
                if price.match(col_content) is not None:
                    match = float(price.match(
                        col_content).group().replace(",", "."))
                    item["buy_price"] = match
                    try:
                        if match > item["sell_price"]:
                            item["buy_price"] = item["sell_price"]
                            item["sell_price"] = match
                    except KeyError as e:
                        item["sell_price"] = match
                if currency_code.match(col_content) is not None:
                    item["currency_code"] = currency_code.match(
                        col_content).group()
                if unit.match(col_content) is not None:
                    item["unit"] = unit.match(col_content).group()
            yield item
Code Example #20
File: nba_spider.py Project: dysn/nbadata
    def parse(self, response):
        from scrapy.shell import inspect_response

        inspect_response(response, self)
        # for sel in response.xpath('//a[@class="active"]'):
        #    item = PlayerLinkItem()
        #   item['url']= sel.xpath('@href').extract()
        #  item['name']= sel.xpath('text()').extract()
        # item['status']=['active']
        # yield item
        for sel in [response.xpath('//a[@class="historic"]')[0]]:
            item = PlayerLinkItem()
            item["url"] = [urljoin("http://stats.nba.com/", sel.xpath("@href").extract()[0] + "career/")]
            item["name"] = sel.xpath("text()").extract()
            item["status"] = ["historic"]
            url = item["url"][0]
            item["uid"] = int(url.split("/")[-3])
            yield item
Code Example #21
File: spiderreviews.py Project: mart2010/brd
 def parse_search_resp(self, response):
     widx = response.meta['work_index']
     isbns = self.works_to_harvest[widx]['isbns']
     nb_try = response.meta['nb_try']
     titre_xp = '//td[@class="titre_livre"]'
     res_sel = response.xpath(titre_xp + '/a[@class="titre_v2"]')
     uid_txt = res_sel.xpath('./@href').extract_first()  # u'/livres/Levy-Rien-de-grave/9229'
     # found it
     if uid_txt:
         uid = uid_txt[uid_txt.index(u'/livres/') + 8:]
         title = res_sel.xpath('./text()').extract_first().strip()
         author = response.xpath('//td[@class="auteur"]/a/text()').extract_first().strip()
         pass_item = self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'],
                                            work_uid=uid,
                                            title=title,
                                            authors=author)
         nb_t = response.xpath(titre_xp + '/a[contains(@href,"#critiques")]/span/text()').extract_first()
         if not nb_t or int(nb_t) == 0:
             logger.info("No reviews found for work-refid=%s in site %s (uid=%s)"
                     % (pass_item['work_refid'], self.name, pass_item['work_uid']))
             yield pass_item
         # DATA-ISSUES: may indicate 1-review, when none exist (ex. Ballard-La-Course-au-Paradis/230360)
         # the impact: try many times to harvest same work as no reviews will be yielded
         else:
             yield scrapy.Request(self.url_main % uid,
                                  meta={'work_index': widx, 'pass_item': pass_item, 'main_page': True},
                                  callback=self.parse_reviews)
     else:
         n_found = response.xpath('//div[@class="content row"]//div[@class="titre"]/text()').extract_first()
         # found no book
         if n_found and n_found.find(u'(0)') != -1:
             if nb_try < len(isbns):
                 yield scrapy.FormRequest(self.form_search,
                                          formdata={'Recherche': str(isbns[nb_try]), 'item_recherche': 'isbn'},
                                          meta={'work_index': widx, 'nb_try': nb_try + 1},
                                          callback=self.parse_search_resp)
             else:
                 logger.info("Nothing found for isbn=%s in site %s" % (isbns[nb_try - 1], self.name))
                 yield self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'], work_uid='-1')
         else:
             logger.error("Unexpected result page after %d try (search isbn=%s)" % (nb_try, isbns[nb_try - 1]))
             # interactively debug page
             from scrapy.shell import inspect_response
             inspect_response(response, self)
Code Example #22
File: demo_spider.py Project: hackrole/scrapy-utils
    def parse_list(self, response):
        # the selector build and the data meta
        hxs = HtmlXPathSelector(response)
        meta = response.meta
        # log msg
        self.log('msg', loglevel=Error)

        # for debug and first xpath write
        open_in_browser(response)
        inspect_response(response, self)

        # selector choose and join or strip
        ''.join(hxs.select('').extract()).strip()

        # url join
        base_url = get_base_url(response)
        n_url = urljoin_rfc(base_url, 'url')

        return item
Code Example #23
File: spider.py Project: Augustles/scrapydemo
    def parse(self, response):  # parse the returned URL data
        # download the images that were collected directly
        # with open(pre+'/pic_links.txt','r') as f:
        #     for line in f:
        #         item = DoubanItem()
        #         item['image_urls'] = [line.strip()]
        #         item['images'] = line[line.rfind('/')+1:].strip()
        #         yield item

        inspect_response(response, self)
        soup = BeautifulSoup(response.body)  # parse with BeautifulSoup
        # print soup
        for y in soup.find_all('div', attrs={'class': 'doulist-item'}):
            item = DoubanItem()
            item['title'] = y.find('div', attrs={'class': 'title'}).a.text
            item['link'] = y.find('div', attrs={'class': 'title'}).a['href']
            item['rating'] = y.find(
                'span', attrs={'class': 'rating_nums'}).text
            item['major'] = y.find('div', attrs={'class': 'abstract'}).text
            yield item  # the generator yields each matched item
Code Example #24
File: spiderreviews.py Project: mart2010/brd
 def parse_search_resp(self, response):
     widx = response.meta['work_index']
     isbns = self.works_to_harvest[widx]['isbns']
     nb_try = response.meta['nb_try']
     # found, map work_uid and request reviews page
     if response.url.find('/book/show/') != -1:
         gr_work_id = response.url[response.url.index('/book/show/') + 11:]
         # add title/authors (only done for initial QA checks)
         title = response.xpath('//h1[@class="bookTitle"]/text()').extract_first().strip()
         a_raw = response.xpath('//a[@class="authorName"]/child::*/text()').extract()
         authors = ",".join(a_raw)
         pass_item = self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'],
                                            work_uid=gr_work_id,
                                            authors=authors,
                                            title=title)
         nb_rev = response.xpath('//a[@class="actionLinkLite"]/span[@class="count"]/span[@class="value-title"]/text()').extract_first()
         if int(nb_rev.replace(',', '')) == 0:
             logger.info("No reviews found for work-refid=%s (uid=%s of site %s)"
                         % (pass_item['work_refid'], pass_item['work_uid'], self.name))
             yield pass_item
         # map gr's id and trigger new Request to have reviews ordered correctly
         else:
             self.works_to_harvest[widx]['last_harvest_date'] = self.min_harvest_date
             # For popular review, GR has partial list, so must order by oldest
             yield scrapy.Request(self.url_review % (gr_work_id, 1, 'oldest'),
                                  meta={'work_index': widx, 'item': pass_item},
                                  callback=self.parse_reviews)
     # not found page
     elif 'Looking for a book?' in response.body:
         if nb_try < len(isbns):
             yield scrapy.Request(self.url_search + str(isbns[nb_try]),
                                  meta={'work_index': widx, 'nb_try': nb_try + 1},
                                  callback=self.parse_search_resp)
         else:
             logger.info("Nothing found for wid: %s, isbns: %s" % (self.works_to_harvest[widx]['work_refid'], str(isbns)))
             yield self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'], work_uid='-1')
     else:
         logger.error("Unexpected result page after %d try (search isbn=%s)" % (nb_try, isbns[nb_try - 1]))
         # interactively debug page
         from scrapy.shell import inspect_response
         inspect_response(response, self)
Code Example #25
File: login.py Project: StrongZhu/scrapy-webtools
 def submit_login(self, response):
     if self.logged_in:
         self.verbose("already logged in")
     elif self.login_check in response.body_as_unicode():
         self.verbose("logged in successfully")
         self.logged_in = True
         for req in self.start_requests():
             yield req
         self.verbose("bye")
     else:
         if self.allow_second_login and self.in_first_attempt:
             self.verbose("need another attempt..")
         else:
             self.verbose("login failed")
         if self.login_debug:
             inspect_response(response, self)
         if self.allow_second_login and self.in_first_attempt:
             self.in_first_attempt = False
             yield self.get_login_request()
         else:
             raise LoginFailed(response)
Code Example #26
    def parse(self, response):
        inspect_response(response, self)
        round = ItemLoader(item=ResultsItem(), response=response)
        round.add_xpath("home_team", '//td[@class="cThree"]', re=">.*<")
        round.add_xpath("away_team", '//td[@class="cFive"]', re=">.*<")
        round.add_xpath("home_score", '//td[@class="cFour"]/text()')
        round.add_xpath("away_score", '//td[@class="cFour"]/text()')
        round.add_xpath("date", '//td[@class="cOne first"]/text()')
        round.add_xpath("replay", '//td[@class="cTwo"]')
        round.add_xpath("match_id", '//td[@class="last"]/a/@rel')
        round = round.load_item()
        print round

        match_count = len(round.values()[0])

        for i in range(match_count):
            if round["home_score"][i] == "NaN":  # changed this to pick up missing scorelines
                match = {}  # create new item for each match
                for key, value in round.items():
                    match[key] = value[i]
                    match["round_id"] = response.meta["round_id"]
                yield match
Code Example #27
  def directory_parser(self, response):
    """
    Given a http://wesconnect.wesleyan.edu/directory search form, search and
    parse.
    """
    print "On a directory page, sending serach data"


    if "<b>Search Operator&nbsp;/&nbsp;Search Value</b>" in response.body:
      print "This page has a search form"

      x = HtmlXPathSelector(response)
      viewstate = x.select("//input[@name='__VIEWSTATE']").select('@value').extract()[0]
      eventvalidation = x.select("//input[@name='__EVENTVALIDATION']").select('@value').extract()[0]

      for year in xrange(1950, 2013):
        year = str(year)

        yield FormRequest.from_response(response,
          formdata = {
            # LastName Comparison method
            #'cid_41$SearchGUI$sc285$ddComparison_285' : "Contains",
            #'cid_41$SearchGUI$sc285$mf_285' : last_name,

            # FirstName Comparison method
            #'cid_41$SearchGUI$sc284$ddComparison_284' : "Contains",
            #'cid_41$SearchGUI$sc284$mf_284' : "",

            ## Year should be filled in or non-existant, NEVER blank
            'cid_41$SearchGUI$sc36$mf_36' : year,
            },
          callback=self.listing_parser)
    else:
      print "This is a result from a search"

      from scrapy.shell import inspect_response
      inspect_response(response)
      print "Got to results listing"
Code Example #28
    def parse_items(self, response):
        for i, sel in enumerate(response.xpath("//*[@id='the-list']/tr[./td]")):
            mil = MasjidItemLoader(selector=sel)
            mil.add_xpath('id_masjid', "./td[5]/text()")
            mil.add_xpath('nama_masjid', "./td[4]/a/text()")
            mil.add_xpath('link_detail', "./td[4]/a/@href")
            mil.add_xpath('kabupaten_kota', "./td[2]/text()")
            mil.add_xpath('kecamatan', "./td[3]/text()")
            mil.add_xpath('tipologi', "./td[6]/text()")
            mil.add_xpath('alamat', "./td[7]/text()")
            mil.add_xpath('luas_tanah', "./td[8]/text()")
            mil.add_xpath('status_tanah', "./td[9]/text()")
            mil.add_xpath('luas_bangunan', "./td[10]/text()")
            mil.add_xpath('tahun_berdiri', "./td[11]/text()")
            mil.add_xpath('jamaah', "./td[12]/text()")
            mil.add_xpath('imam', "./td[13]/text()")
            mil.add_xpath('khatib', "./td[14]/text()")
            mil.add_xpath('muazin', "./td[15]/text()")
            mil.add_xpath('remaja', "./td[16]/text()")
            mil.add_xpath('no_telepon', "./td[17]/text()")
            mil.add_xpath('keterangan', "./td[18]/text()")

            long_lat = sel.xpath("./comment()[2]").re(r'align="center">(-?[0-9.]+)</')

            try:
                mil.add_value('longitude', long_lat[0])
                mil.add_value('latitude', long_lat[1])
            except IndexError:
                self.logger.error(
                    "Can't get long-lat on %(url)s , element index = %(index)s",
                    {"url": response.url, "index": i},
                )
                from scrapy.shell import inspect_response
                inspect_response(response, self)

            yield mil.load_item()
Code Example #29
File: shell-spider.py Project: clonegod/technology
 def parse(self, response):
     # We want to inspect one specific response.
     if ".org" in response.url:
         from scrapy.shell import inspect_response
         inspect_response(response, self)
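
All of these snippets use the same debugging hook: calling scrapy.shell.inspect_response(response, self) (or, in older Scrapy versions, just inspect_response(response)) pauses the crawl and opens an interactive Scrapy shell with the current response already loaded. As a rough sketch (assuming the Scrapy-era API used throughout these examples; the exact shortcuts vary by version), a session started this way might look like:

    # hypothetical shell session opened by inspect_response; output is illustrative only
    >>> response.url                                        # the response being inspected
    'http://example.org/some-page.html'
    >>> view(response)                                      # open the response body in a local browser
    >>> response.xpath('//title/text()').extract_first()    # try out selectors interactively
    u'Some page title'
    >>> # press Ctrl-D (Ctrl-Z on Windows) to leave the shell and resume the crawl

Several of the examples guard the call with a flag such as self.pop_shell, self.is_debug, or self.login_debug, so the shell only opens when debugging is explicitly enabled and unattended crawls are not blocked.
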
Code Example #30
    def parse_statement(self, response):
        from scrapy.shell import inspect_response

        results = []
        app_id_code_reestri_db = urlparse.parse_qs(
            urlparse.urlparse(response.request.url)[4])['app_id'][0]

        soup = BeautifulSoup(response.body, "html5lib", from_encoding="utf-8")

        # First table: "Prepared documents" -- scrape details into CorpDoc item
        # and then grab the doc too; they are usually PDFs.

        prepared_table = soup.find("caption", text=u"მომზადებული დოკუმენტები")
        if prepared_table is not None:
            prepared_table = prepared_table.parent
            for row in prepared_table.find_all("tr"):
                # First cell contains link
                # Second contains title, date
                # Third is blank
                cells = row.find_all("td")
                link = ""
                if cells[0] is not None:
                    link = cells[0].a["href"]

                spans = cells[1].find_all("span")
                title = spans[0].string
                date = spans[1].string

                results.append(
                    StatementDocument(
                        fk_corp_id_code=response.meta['corp_id_code'],
                        fk_stmnt_id_code_reestri_db=app_id_code_reestri_db,
                        link=link,
                        title=title,
                        date=date))

                results.append(
                    Request(url=link,
                            callback=self.parse_stmnt_prepared_doc,
                            meta={
                                'cookiejar': response.meta['cookiejar'],
                                'corp_id_code': response.meta['corp_id_code']
                            }))

        # Second table: Status Documents. Scrape details into CorpDocs, and
        # grab the docs too, they are usually PDFs.
        status_table = soup.find("caption", text=u"სტატუსი / გადაწყვეტილება")
        if status_table is not None:
            status_table = status_table.parent

            for row in status_table.find_all("tr"):
                cells = row.find_all("td")
                link = ""
                if cells[0] is not None:
                    link = cells[0].a["href"]
                registration_num = cells[1].find(class_="maintxt").string
                date = cells[1].find(class_="smalltxt").string
                title = cells[2].find(style=True).string

                results.append(
                    StatementDocument(
                        fk_corp_id_code=response.meta['corp_id_code'],
                        fk_stmnt_id_code_reestri_db=app_id_code_reestri_db,
                        link=link,
                        title=title,
                        date=date,
                        registration_num=registration_num))

                # Probably don't actually need to parse these.
                #results.append(Request(url=link,
                #            callback=self.parse_stmnt_status_pdf,
                #            meta={'cookiejar':response.meta['cookiejar'],
                #                  'id_code_reestri_db':response.meta['id_code_reestri_db']}))
        # Third table: Scanned Documents. Scrape details into CorpDocs, and
        # grab the docs if they are PDFs.
        scanned_table = soup.find("caption", text=u"სკანირებული დოკუმენტები")
        if scanned_table is not None:
            scanned_table = scanned_table.parent

            for row in scanned_table.find_all("tr"):
                cells = row.find_all("td")
                link = ""
                if cells[0] is not None:
                    link = cells[0].a["href"]
                doc_info = cells[1].find_all(class_="maintxt")
                if (len(doc_info) == 2):
                    title = doc_info[0].string
                    date = doc_info[1].string
                else:
                    date = doc_info[0].string
                    title = None
                filename = cells[2].find("a").find("span").string

                doc = StatementDocument(
                    fk_corp_id_code=response.meta['corp_id_code'],
                    fk_stmnt_id_code_reestri_db=app_id_code_reestri_db,
                    link=link,
                    date=date,
                    filename=filename)
                if (title):
                    doc['title'] = title

                results.append(doc)

                #TODO: Check whether it's a PDF and if so, return
                # a Request to the document.

        # Fourth table: Statement details. Scrape details into RegistryStatement.
        statement = RegistryStatement()
        # First block of info, starting with statement number.
        regx = re.compile(u"^\s+განცხადება.+$")
        caption = soup.find("caption", text=regx)
        if caption is None:
            inspect_response(response)
        statement['statement_num'] = caption.string.split('#')[1]
        table = caption.parent

        statement['registration_num'] = self._get_header_sib(
            table, u"\n\s*რეგისტრაციის ნომერი\s*").span.string
        statement['statement_type'] = self._get_header_sib(
            table, u"\n\s*მომსახურების სახე\s*").span.string
        statement['service_cost'] = self._get_header_sib(
            table, u"\n\s*მომსახურების ღირებულება\s*").span.string
        pay_debt = self._get_header_sib(
            table, u"\n\s*გადასახდელი თანხა/ბალანსი\s*").span.string
        statement['payment'] = pay_debt.split("/")[0]
        statement['outstanding'] = pay_debt.split("/")[1]
        statement['id_reestri_db'] = response.meta['stmnt_id_reestri_db']

        # Second block of info, starting after payment details.
        # Find the correct table
        table = soup.find("div", id="application_tab").table
        # Grab the relevant parts
        statement['id_code_legal'] = self._get_header_sib(
            table, u"საიდენტიფიკაციო ნომერი").strong.string
        statement['name'] = self._get_header_sib(
            table, u"სუბიექტის დასახელება ").string
        statement['classification'] = self._get_header_sib(
            table, u"სამართლებრივი ფორმა").string
        statement['reorganization_type'] = self._get_header_sib(
            table, u"რეორგანიზაციის ტიპი ").string
        statement['quantity'] = self._get_header_sib(table,
                                                     u"რაოდენობა").string
        statement['changed_info'] = self._get_header_sib(
            table, u"შესაცვლელი რეკვიზიტი: ").string

        # Attached docs description is a <ul>
        attached = self._get_header_sib(table,
                                        u"\n\s*თანდართული დოკუმენტაცია\s")
        attached_desc = []
        for li in attached.ul.contents:
            attached_desc.append(li.string)
        statement['attached_docs_desc'] = attached_desc

        # Additional docs is a <div>, don't know what the format looks like yet
        addtl_td = self._get_header_sib(table,
                                        u"\n\s*დამატებით წარმოდგენილი\s*")
        statement['additional_docs'] = addtl_td.find(
            id="additional_docs_container").string

        # Issued docs also a ul
        issued = self._get_header_sib(table,
                                      u"\n\s*გასაცემი დოკუმენტები\s*").ul
        issued_desc = []
        for li in issued.contents:
            issued_desc.append(li.string)
        statement['issued_docs'] = issued_desc

        # Don't know the format of notes yet either.
        notes_td = self._get_header_sib(table, u"\n\s*შენიშვნა\s*")
        statement['notes'] = notes_td.string
        results.append(statement)

        # Cells containing people require a bit more intelligence
        representative_td = self._get_header_sib(table, u" წარმომადგენელი  ")
        rv_pers = self._person_from_statement_cell(representative_td)
        if len(rv_pers) > 0:
            results.append(
                PersonCorpRelation(
                    person=rv_pers,
                    fk_corp_id_code=response.meta['corp_id_code'],
                    relation_type=[u"წარმომადგენელი"],
                    cite_type="statement",
                    cite_link=response.request.url))

        representee_td = self._get_header_sib(table, u" წარმომდგენი  ")
        re_pers = self._person_from_statement_cell(representee_td)
        if len(re_pers) > 0:
            results.append(
                PersonCorpRelation(
                    person=re_pers,
                    fk_corp_id_code=response.meta['corp_id_code'],
                    relation_type=[u"წარმომდგენი"],
                    cite_type="statement",
                    cite_link=response.request.url))

        ganmcxadebeli_td = self._get_header_sib(table, u"განმცხადებელი  ")
        g_pers = self._person_from_statement_cell(ganmcxadebeli_td)
        if len(g_pers) > 0:
            results.append(
                PersonCorpRelation(
                    person=g_pers,
                    fk_corp_id_code=response.meta['corp_id_code'],
                    relation_type=[u"განმცხადებელი"],
                    cite_type="statement",
                    cite_link=response.request.url))

        return results
Code Example #31
 def inspect(self, response):
     from scrapy.shell import inspect_response
     inspect_response(response, self)
Code Example #32
    def parse(self,response):
        from scrapy.shell import inspect_response
        inspect_response(response,self)

        print response.xpath('//div[@class="content"]').extract()
Code Example #33
    def parse_news_page(self, response):
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        stop_scrape_flag = False
        news_list = self.exchange.get_news_list(response)
        if not news_list:
            raise Exception('Error: Website Structure Has Been Changed!' +
                            ' Maintenance Needed!')
        for i, news_row in enumerate(news_list):
            # has to assign new dict every loop
            # otherwise mongodb raises dup key (Id) error
            item = {
                'mkt': self.exchange.uptick_name,
                'mkt_id': self.mkt_id,
                'tzinfo': self.exchange.tzinfo,
                'error': True
            }
            try:  # news row won't have error
                date_time, url, title, misc_fields_dict = self.exchange.get_news_fields(
                    news_row)

                # database has previous news and scraped news is older than database
                if self.latest_date and date_time < self.latest_date:
                    stop_scrape_flag = True
                    break

                # generate file name by date and number of events on that date
                # todo: change uptick_name to col_name
                # if exchange has multi news sources
                # assign key 'website_url' to misc_fields_dict
                website_url = ''
                if self.exchange.is_multi_source_exchange:
                    website_url = misc_fields_dict.get('website_url')
                filename = du.get_filename(date_time, self.exchange.col_name,
                                           website_url)

                # insert record to mongodb
                item['date_time'] = date_time
                item['title'] = title
                item['url'] = url
                item['unique_id'] = filename
                item['error'] = False
                item.update(misc_fields_dict)
                yield item

                utils.save_pdf_url_or_chrome(url, self.pdfs_dir + filename)

            except Exception as e:  # not news row, skip
                item['error'] = {
                    'news_row_html': news_row.extract(),
                    'error_message': '%s: %s' % (e.__class__, str(e)),
                    'row_no': i,
                    'traceback': traceback.format_exc(),
                    'url': response.url
                }
                yield item
                continue

        # todo: test without keep_follow_page flag
        if not stop_scrape_flag:
            for url, meta in self.exchange.get_pagination_urls(response):
                yield scrapy.Request(url,
                                     callback=self.parse_news_page,
                                     meta=meta)
Code Example #34
File: baidu.py Project: usernamehcx/ScrapyTutorial
    def parse(self, response):
        # print response.text
        keyword = response.meta['keyword']
        results = response.xpath('//div[@class="result c-container "]')
        #print results
        time = datetime.datetime.now(self.tz)

        from scrapy.shell import inspect_response
        inspect_response(response, self)

        for res in results:
            #print res.extract()
            url = res.xpath(
                './/h3[contains(@class,"t")]/a/@href').extract_first()
            # print keyword,url
            bfdata = str(keyword) + str(url)

            item = ScrapyBaiduItem()
            item['url'] = url
            print url
            item['title'] = res.xpath('.//h3[contains(@class,"t")]/a').xpath(
                'string(.)').extract_first()
            timestr = res.xpath(
                './/span[contains(@class," newTimeFactor_before_abs m")]'
            ).xpath('string(.)').extract_first()
            # print timestr
            if timestr == None:
                item['time'] = time.strftime('%Y_%m_%d_%H_%M_%S')
            else:
                if str(timestr).find('天前') != -1:
                    time_num = int(timestr[:str(timestr).find('天前')])
                    delta = datetime.timedelta(days=time_num)
                    new_time = time - delta
                    item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S')
                elif str(timestr).find('小时前') != -1:
                    time_num = int(timestr[:str(timestr).find('小时前')])
                    delta = datetime.timedelta(hours=time_num)
                    new_time = time - delta
                    item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S')
                elif str(timestr).find('分钟前') != -1:
                    time_num = int(timestr[:str(timestr).find('分钟前')])
                    delta = datetime.timedelta(minutes=time_num)
                    new_time = time - delta
                    item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S')
                elif str(timestr).find('秒钟前') != -1:
                    time_num = int(timestr[:str(timestr).find('秒钟前')])
                    delta = datetime.timedelta(seconds=time_num)
                    new_time = time - delta
                    item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S')
            #print title
            abstract = res.xpath('.//div[contains(@class,"c-abstract")]'
                                 ).xpath('string(.)').extract_first()
            if abstract == None:
                #print res.extract()
                abstract = res.xpath(
                    './/div[@class="c-span18 c-span-last"]/font/p').xpath(
                        'string(.)').extract()
                abstract = ' '.join(abstract)
                #print abstract
                #print abstract
            s = abstract.find('-')
            if s > 0:
                abstract = abstract[s + 2:]
            item['abstract'] = abstract
            item['keyword'] = unicode(keyword)
            item['create_time'] = time.strftime('%Y_%m_%d_%H_%M_%S')
            yield item
Code Example #35
    def parse_page(self, response):

        # scrape dynamically generated HTML
        self.browser.get(response.url)
        hxs = Selector(text=self.browser.page_source)
        item = ScraperItem()

        # use scrapy shell to find xpath
        from scrapy.shell import inspect_response
        inspect_response(response)

        try:
            divs = hxs.xpath(
                '//div[@id="contentArea"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/descendant-or-self::*/text()'
            ).extract()
            text = u" ".join(divs[1:])
            no_text = len(divs) == 0
        except IndexError:
            no_text = True

        if no_text:
            try:
                text = " ".join(
                    hxs.xpath(
                        '//span[@class="hasCaption"]/child::node()').extract())
            except IndexError:
                text = ""

        item['url'] = response.url
        item['text'] = text
        item['title'] = hxs.xpath('//title/text()').extract()
        item['date'] = hxs.xpath(
            '//span[@class="timestampContent"]/text()').extract()

        comments = float(hxs.xpath('count(//abbr)').extract()[0]) - 1

        try:
            likes = hxs.xpath(
                '//div[@class="UFILikeSentenceText"]/span/span/text()'
            ).extract()[0]

            if "likes" in likes:
                like_count = 1.0
            else:
                try:
                    like_count = len(likes.split(", "))
                    if "others" in likes:
                        like_count += float(
                            likes.split("and ")[1].split(" others")[0].replace(
                                ",", ""))
                    elif "and" in likes:
                        like_count += 1.0
                except IndexError:
                    like_count = 2.0
        except IndexError:
            like_count = 0.0
        # print "like count: "+str(like_count)

        try:
            shares = hxs.xpath(
                '//a[@class="UFIShareLink"]/text()').extract()[0]

            share_count = float(shares.split(" share")[0].replace(",", ""))
        except IndexError:
            share_count = 0.0

        print like_count, share_count, comments

        item['comment_count'] = [like_count, share_count, comments]

        yield item
Code Example #36
    def parse_three(self, response):
        meta = response.meta
        unique_form = meta['cbSearchResultsUniqueId']
        bbb = json.loads(response.text)['responseText']
        __response = HtmlResponse(response.url, body=str.encode(bbb))
        business_name = __response.xpath(
            "//td[starts-with(text(),'Business:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        if not business_name:
            inspect_response(__response, self)
        f_name = __response.xpath(
            "//td[starts-with(text(),'FName:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        l_name = __response.xpath(
            "//td[starts-with(text(),'LName')]/following::tr/td[1]/span/text()"
        ).extract_first()
        phone = __response.xpath(
            "//td[starts-with(text(),'Phone:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        county = __response.xpath(
            "//td[starts-with(text(),'County:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        add = __response.xpath(
            "//td[starts-with(text(),'Address:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        city = __response.xpath(
            "//td[starts-with(text(),'City:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        state = __response.xpath(
            "//td[starts-with(text(),'State:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        zip_code = __response.xpath(
            "//td[starts-with(text(),'Zip:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        print("-------------------0999999999999", business_name, f_name,
              l_name, phone, county, add, city, state, zip_code)
        date_certified = __response.xpath(
            "//td[starts-with(text(),'Date Certified:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        expire_date = __response.xpath(
            "//td[starts-with(text(),'Date of Expiration:')]/following::tr/td[1]/span/text()"
        ).extract_first()
        person_name = f_name + ' ' + l_name
        location_address_string = self.format__address_4(
            add, city, state, zip_code)

        next_page = __response.xpath(
            '//a[@data-cb-name="JumpToNext"]/@href').extract_first()
        if next_page:

            app = next_page[next_page.rfind('?appSession=') +
                            12:next_page.rfind('&RecordID=')]
            RecordID = next_page[next_page.rfind('&RecordID=') +
                                 10:next_page.rfind('&cpipage=')]
            cpipage = next_page[next_page.rfind('&cpipage=') +
                                9:next_page.rfind('&PageID')]
            PageID = next_page[next_page.rfind('&PageID=') +
                               8:next_page.rfind('&PrevPageID')]
            PrevPageID = next_page[next_page.rfind('&PrevPageID=') +
                                   12:next_page.rfind('&CPISortType=')]
            cbCurrentPageSize = next_page[
                next_page.rfind('&cbCurrentPageSize') +
                19:next_page.rfind('&cbRandomSortKey=')]
            cbRandomSortKey = next_page[next_page.rfind('&cbRandomSortKey') +
                                        17:next_page.rfind('&cbRecordPosition'
                                                           )]
            cbCurrentRecordPosition = next_page[next_page.
                                                rfind('&cbRecordPosition=') +
                                                18:]

            a = int(time() * 1000)
            url1 = 'https://c0bkr159.caspio.com/dp/31cf1000eccbd58b888d45ff8350?rnd=' + str(
                a)

            print("---form------------------------------>", url1)
            form_data = {
                'AjaxAction': 'JumpToNext',
                'GridMode': 'False',
                'cbUniqueFormId': unique_form,
                'ClientQueryString': '',
                'appSession': app,
                'RecordID': RecordID,
                'cpipage': cpipage,
                'PageID': PageID,
                'PrevPageID': PrevPageID,
                'CPISortType': '',
                'CPIorderBy': '',
                'cbCurrentPageSize': cbCurrentPageSize,
                'cbRandomSortKey': cbRandomSortKey,
                'cbRecordPosition': cbCurrentRecordPosition,
                'AjaxActionHostName': 'https://c0bkr159.caspio.com',
            }
            print("---form------------------------------>", form_data)

            headerr = {
                'Origin':
                'https://c0bkr159.caspio.com',
                'Referer':
                'https://c0bkr159.caspio.com/dp/31cf1000eccbd58b888d45ff8350',
                'Sec-Fetch-Mode':
                'cors',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
            }

            yield scrapy.FormRequest(url=url1,
                                     callback=self.parse_three,
                                     method='POST',
                                     dont_filter=True,
                                     formdata=form_data,
                                     meta=meta,
                                     headers=headerr)

        il = ItemLoader(item=SdSeptictankLicensesSpiderItem(),
                        response=response)
        # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'SD_SepticTank_Licenses')
        il.add_value('url', 'http://denr.sd.gov/des/sw/SepticInstallers.aspx')
        il.add_value('permit_lic_eff_date', date_certified)
        il.add_value('City', city)
        il.add_value('State', state)
        il.add_value('Zip', zip_code)
        company_name = self._getDBA(business_name)[0]
        if len(company_name) < 2:
            company_name = person_name
        il.add_value('company_name', company_name)
        il.add_value('permit_lic_desc',
                     'Waste Transporter Licenses for ' + company_name)
        il.add_value('dba_name', self._getDBA(business_name)[1])
        il.add_value('county', county)
        il.add_value('permit_lic_exp_date', expire_date)
        il.add_value('location_address_string', location_address_string)
        il.add_value('company_phone', phone)
        il.add_value('person_name', self._getDBA(person_name)[0])
        il.add_value('permit_type', 'waste_transporter_license')
        yield il.load_item()
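
A sketch of the same pagination-parameter extraction using urllib.parse instead of the chained rfind() slicing, assuming the "JumpToNext" href carries these values as ordinary query-string parameters:

from urllib.parse import urlparse, parse_qs

def next_page_params(next_page):
    # One-pass replacement for the rfind() slicing above; note that parse_qs
    # also URL-decodes the values, which the raw slicing does not.
    query = parse_qs(urlparse(next_page).query)

    def pick(key):
        return query.get(key, [''])[0]

    return {
        'appSession': pick('appSession'),
        'RecordID': pick('RecordID'),
        'cpipage': pick('cpipage'),
        'PageID': pick('PageID'),
        'PrevPageID': pick('PrevPageID'),
        'cbCurrentPageSize': pick('cbCurrentPageSize'),
        'cbRandomSortKey': pick('cbRandomSortKey'),
        'cbRecordPosition': pick('cbRecordPosition'),
    }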
コード例 #37
0
ファイル: scraper.py プロジェクト: gobfink/Groceries
    def parse(self, response):
        # This callback determines if the selected menu is
        # at the top of the list, if it is then it adds the urls
        # to the list and keeps going
        # if its not, then it calls the lua to prepare the page
        # for scraping, and then scrapes it
        url = response.url

        menu = response.css(".category-filter__link")
        #submenu = response.css("")
        #print ("self.urls - " +str(self.urls))
        print("processing response.url - " + response.url)

        #print ("menu: ")
        #print (menu.getall())
        #print ("len(menu): " + str(len(menu)))
        #print ("menu[0] : " + menu.get())
        #print("name - " + menu[0].css('.category-filter__text ::text').get())
        #inspect_response(response,self)

        if (len(menu) > 0 and menu[0].css('[aria-current="page"]')):
            print(f"inside menu page for url - {url}")
            # The top page is active
            #print ("menu[0] : [aria-current=page] " + menu[0].css('[aria-current="page"]').get())
            # therefore we need to scrape the links, and continue searching
            # we then need to loop through each other page.
            # call parse, and scrape it is not
            menu_url = menu[0].css('::attr(href)').get()

            menu_name = menu[0].css('.category-filter__text ::text').get()
            for item in menu:
                heading = item.css('.category-filter__text ::text').get()
                scraped_url = item.css('::attr(href)').get()
                scraped_url = self.base_url + scraped_url
                section = menu_name
                subsection = heading
                category = lookup_category("", section, subsection)
                store_url(self.conn, scraped_url, self.store_id, category,
                          section, subsection)

                #self.section_dict[url]=(menu_name, heading)
                #if self.urls.count(url) == 0:
                #    self.urls.append(url)

            #urls=menu.css('::attr(href)').getall()
            # Remove the the first(this) page from list to parse
            #urls.pop()
            #self.urls.extend(urls)
            #print("urls to scrape - " + str(self.urls))
            #print("local urls - " + str(urls))
            """
            while len(self.urls) != 0:
                url = self.urls.pop()
                self.processedUrls.append(url)
                #url = self.base_url + url_suffix
                #print ("urls - " + str(self.urls))
                #print ("pulling from url - " + url)
                #print ("urls lengths - " + str(len(self.urls)))
                yield SplashRequest(url,
                                self.parse,
                                endpoint='execute',
                                args={'lua_source': self.expand_and_scroll_lua})
            """

        elif (len(menu) == 0):
            inspect_response(response, self)

        else:
            #we are on a subpage, so now we can start scraping
            #

            GROCERY_SELECTOR = '.grid-item'
            NAME_SELECTOR = '.small-type.detail-card-description ::text'
            PRICE_SELECTOR = '.price ::text'
            PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text'

            metadata = get_url_metadata(self.cursor, url)
            section = metadata[0]
            subsection = metadata[1]
            print("subpage - scraping " + url + ", from section - " + section)
            for grocery in response.css(GROCERY_SELECTOR):
                # keep these as locals; stashing them on self would clobber the
                # spider's name attribute and race across concurrent responses
                name = grocery.css(NAME_SELECTOR).extract_first()
                price = grocery.css(PRICE_SELECTOR).extract_first()
                if price is not None:
                    price = price.replace('*', '').replace('$', '')
                ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()
                if ppu is not None:
                    ppu = convert_ppu(ppu)
                #inspect_response(response, self)
                #parse the ounces off of the name
                yield {
                    'name': name,
                    'price': price,
                    'price-per-unit': ppu,
                    'section': section,
                    'subsection': subsection,
                    'url': response.url
                }
        finish_url(self.conn, self.store_id, url)
        print("finishing url - " + url)
        next_url = get_next_url(self.cursor, 1)
        if next_url is not None:
            print("got next_url - " + next_url)
            yield SplashRequest(
                next_url,
                self.parse,
                endpoint='execute',
                dont_filter=True,
                args={'lua_source': self.expand_and_scroll_lua})
        else:
            print("Next url is none therefore we must be finished ! ")
コード例 #38
0
 def parse(self, response):
     inspect_response(response, self)
コード例 #39
0
ファイル: digikey.py プロジェクト: fivol/sandbox
 def parse(self, response):
     from scrapy.shell import inspect_response
     inspect_response(response, self)
コード例 #40
0
 def after_login(self, response):
     inspect_response(response, self)  # opens terminal
     return
コード例 #41
0
 def test(self, response):
     from scrapy.shell import inspect_response
     inspect_response(response, self)
コード例 #42
0
 def parse_tongshi(self, response):
     courses = response.xpath('//table[@id="gridMain"]/tbody/tr[re:test(@class, "tdcolour\d$")]')
     from scrapy.shell import inspect_response
     inspect_response(response, self)
コード例 #43
0
    def parse(self, response):
        '''get scrapy shell in ipython

        '''
        inspect_response(response, self)
コード例 #44
0
    def parsePage(self, response):

        if response.status != 200:
            yield FormRequest(
                url=response.request.url,
                #headers = self.headers,
                meta={
                    'params': response.meta['params'],
                    'xsrfValue': response.meta['xsrfValue'],
                    'userDataId': response.meta['userDataId'],
                    'offset': response.meta['offset']
                },
                formdata={
                    'method': 'next',
                    'params': response.meta['params'],
                    '_xsrf': response.meta['xsrfValue'],
                },
                dont_filter=True,
                callback=self.parsePage)
        else:
            item = UserColumnItem()
            data = json.loads(response.body)
            columnList = data['msg']
            inspect_response(response, self)
            item['spiderName'] = self.name
            # logging.warning('response.meta[params]: %s \n response.body: %s',response.meta['params'],response.body)
            # note: handle the case where the list contains anonymous users
            if columnList:

                res = Selector(text=''.join(columnList))
                item['userDataId'] = response.meta['userDataId']
                item['offset'] = response.meta['offset']

                for sel in res.xpath(
                        '//div[contains(@class,"zm-profile-section-item")]'):
                    item['columnLinkId'] = sel.xpath(
                        'a[@class="zm-list-avatar-link"]/@href').re(
                            r'http://zhuanlan.zhihu.com/(.*)')[0]
                    item['columnImgLink'] = sel.xpath(
                        'a[@class="zm-list-avatar-link"]/img/@src').extract(
                        )[0]
                    item['columnId'] = sel.xpath(
                        'div[contains(@class,"zm-profile-section-main")]/button/@id'
                    ).extract()[0]

                    try:
                        item['columnDescription'] = sel.xpath(
                            'div[contains(@class,"zm-profile-section-main")]/div[contains(@class,"description")]/text()'
                        ).extract()[0]
                    except:
                        # logging.warning('item[columnLinkId]: %s',item['columnLinkId'])
                        item['columnDescription'] = ''
                    item['columnPostCount'] = sel.xpath(
                        'div[contains(@class,"zm-profile-section-main")]/div[contains(@class,"meta")]/span/text()'
                    ).re(r'(\d+)')[0]

                    # note: userLinkId may contain Chinese characters

                    yield item

            else:
                # no users
                item['userDataId'] = ''
                yield item
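
The endpoint answers with HTML fragments inside the JSON 'msg' field, which the spider re-parses by joining them into a Selector. A self-contained illustration of that pattern with a made-up fragment (not Zhihu's real payload):

from scrapy.selector import Selector

fake_msg = [
    '<div class="zm-profile-section-item">'
    '<a class="zm-list-avatar-link" href="http://zhuanlan.zhihu.com/demo">'
    '<img src="http://example.com/avatar.jpg"></a>'
    '</div>',
]
res = Selector(text=''.join(fake_msg))
print(res.xpath('//a[@class="zm-list-avatar-link"]/@href')
         .re(r'http://zhuanlan.zhihu.com/(.*)'))   # ['demo']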
コード例 #45
0
 def parse(self, response):
     if "c5game" in response.url:
         from scrapy.shell import inspect_response
         inspect_response(response, self)
コード例 #46
0
 def next(self, response):
     print("此时已经登录完成并爬取了个人中心的数据")
     title = response.xpath("/html/head/title/text()").extract()
     print(title[0])
     inspect_response(response, self)
コード例 #47
0
    def parse_item(self, response):
        '''
        this part for debug
        '''

        from scrapy.shell import inspect_response
        inspect_response(response, self)

        item = ImdbItem()
        soup = BeautifulSoup(response.text, "html.parser")

        try:
            genre = list(
                map(lambda x: x.text,
                    soup.find("div", {
                        "class": "subtext"
                    }).findAll("a")))
        except:
            genre = None
        item["genre"] = genre
        #
        try:
            name = soup.find("div", {
                "class": "title_wrapper"
            }).find("h1").contents[0]
        except:
            name = None
        item['name'] = name
        #
        try:
            year = soup.find("div", {
                "class": "title_wrapper"
            }).find("h1").span.text.strip("(").strip(")")
        except:
            year = None
        item['year'] = year
        #
        try:
            director = list(
                map(
                    lambda x: x.text,
                    soup.find("div", {
                        "class": "credit_summary_item"
                    }).findAll("a")))
        except:
            director = None
        item["director"] = director
        #
        try:
            score = float(
                soup.find("div", {
                    "class": "ratingValue"
                }).contents[1].text)
        except:
            score = None
        item["score"] = score
        #
        try:
            stars = list(
                map(
                    lambda x: x.text,
                    soup.findAll(
                        "div",
                        {"class": "credit_summary_item"})[-1].findAll("a")))
        except:
            stars = None
        item["stars"] = stars

        yield item
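
The callback above pulls fields out with BeautifulSoup even though the Scrapy response already carries selectors. A sketch of two of the lookups rewritten with response.css, assuming the same IMDb markup the BeautifulSoup calls target:

# Sketch only: equivalent lookups with Scrapy's built-in selectors,
# assuming the div.subtext / div.title_wrapper markup used above.
genre = response.css('div.subtext a::text').getall() or None
name = response.css('div.title_wrapper h1::text').get()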
コード例 #48
0
 def parse_detail(self, response):
     inspect_response(response, self)
コード例 #49
0
 def parse_test(self, response):
     print("####IN TEST####")
     from scrapy.shell import inspect_response
     inspect_response(response, self)
     yield {'Test': "Passed"}
コード例 #50
0
ファイル: scrapy_shell.py プロジェクト: shiratori3/MySpider
def scrapy_shell_called(response, self):
    # for test
    # please use an external system terminal, otherwise an error will be raised
    from scrapy.shell import inspect_response
    inspect_response(response, self)
コード例 #51
0
 def test(self, response):
     '''Test method: the callback drops into an interactive shell'''
     from scrapy.shell import inspect_response
     inspect_response(response, self)
コード例 #52
0
 def debug(self, response):
     from scrapy.shell import inspect_response
     inspect_response(response, self)
     raise CloseSpider('debug stop')
コード例 #53
0
def inspect_spider_response(response, spider):

    if spider.settings.get('DEBUG', True):
        return inspect_response(response, spider)
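
A brief usage sketch for the settings-gated helper above (the spider and the import path are hypothetical), showing how the DEBUG flag would be flipped from custom_settings:

import scrapy
from myproject.utils import inspect_spider_response  # hypothetical module path for the helper above

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com']
    custom_settings = {'DEBUG': True}   # set to False to skip the shell drop-in

    def parse(self, response):
        inspect_spider_response(response, self)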
コード例 #54
0
 def parse_country(self, response):
     inspect_response(response, self)
コード例 #55
0
ファイル: utils.py プロジェクト: nyov/scrapyext
def inspect(response):
	inspect_response(response)
	raise CloseSpider('Done')
コード例 #56
0
 def parse_nodes(self, response, nodes):
     inspect_response(response, self)
コード例 #57
0
    def parse_series(self, response):
        sel = Selector(response)

        from scrapy.shell import inspect_response
        inspect_response(response, self)
コード例 #58
0
ファイル: mydomain.py プロジェクト: aqy2013/tutorial
 def parseInfo(self, response):
     inspect_response(response, self)
     title = response.css('body > h1::text').extract()