Example #1
    def parse_item(self, response: Response, selector: Selector):
        try:
            param_names = selector.xpath('.//dt/text()').extract()
            param_values = selector.xpath('.//dd/text()').extract()
            params = {n.strip(': '): v.strip(' ') for n, v in zip(param_names, param_values) if '\n' not in v}

            loader = ProductLoader(item=ProductItem(), response=response, selector=selector)

            loader.add_xpath('name', './/*[@itemprop="name"]/text()')
            loader.add_xpath('category', '//*[@id="main"]/h1/text()')
            loader.add_xpath('link', './/*[@itemprop="name"]/../@href')
            loader.add_xpath('price', './/*[@class="price" or @class="price action_special"]/text()')
            loader.add_xpath('price_old', './/*[@class="price-old"]/text()')
            loader.add_xpath('rating', './/*[@class="oh-rating"]/text()')

            loader.add_value('params', params)

            loader.add_value('where_found', response.request.url)
            loader.add_value('project', self.settings.get('BOT_NAME'))
            loader.add_value('spider', self.name)
            loader.add_value('server', socket.gethostname())
            loader.add_value('parse_datetime', datetime.datetime.now())

            return loader.load_item()

        except Exception as e:
            print(e)
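ProductLoader is defined elsewhere in the project above. For reference, a minimal sketch of what such an ItemLoader subclass typically looks like; the specific processors are assumptions, not the original project's code:

    from scrapy.loader import ItemLoader
    from itemloaders.processors import MapCompose, TakeFirst  # scrapy.loader.processors in older Scrapy

    class ProductLoader(ItemLoader):
        # Hypothetical processors: strip whitespace on input and collapse
        # each field to its first non-empty value on output.
        default_input_processor = MapCompose(str.strip)
        default_output_processor = TakeFirst()

With TakeFirst as the output processor, the add_xpath/add_value calls above produce scalar fields rather than lists when load_item() runs.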
Example #2
    def parse_showdesk_members_treat(self, resp):
        hxs = Selector(resp)
        next_page_nodes = hxs.xpath('//a[@class="next_page"]')
        meta = resp.meta
        if next_page_nodes and meta['page'] == 1:
            next_page_node = next_page_nodes[0]
            total_page = next_page_node.xpath('./parent::li/preceding-sibling::li')[-1].xpath('a/child::text()').extract()[0].strip()
            for i in xrange(2, int(total_page) + 1):
                new_meta = dict(meta)
                new_meta['page'] = i
                self.log('%s yield member list page %d' % (self.name, i))
                yield FormRequest(url="http://vip6.sentree.com.cn/shair/timesItem!initTreat.action", formdata={
                             'page.currNum' : str(i),
                             'page.rpp' : '30',
                             'r' : str(meta['r']),
                             'set' : 'manage'
                             }, callback=self.parse_showdesk_members_treat, meta=new_meta)
        treat_info_tabs = hxs.xpath('//div[@class="page_main"]//div[@class="table-responsive"]/table')
        if not treat_info_tabs:
            yield None
            return
        treat_info_tab = treat_info_tabs[0]
        ths = str_list_strip_replace(treat_info_tab.xpath('./thead/tr/th/child::text()').extract(), [' ', '\t', '\n', ' '])

        info_nodes = treat_info_tab.xpath('./tbody/tr')
        for i_n in info_nodes:
            infos = []
            info_tds = i_n.xpath('./td')
            for i_t in info_tds:
                info = ''.join(str_list_strip_replace(i_t.xpath('.//child::text()').extract(), [' ', '\t', '\n', ' ']))
                infos.append(info)
            item = SentreeMemberTreatItem()
            item['hs'] = ths
            item['vals'] = infos
            yield item
Example #3
    def parse_showdesk_membercards(self, resp):
        hxs = Selector(resp)
        headers = hxs.xpath('//form[@id="cardTypeForm"]//table/thead/tr/th/child::text()').extract()
        if not headers:
            self.log('%s can not find table headers.' % self.name, level=log.ERROR)
            yield None
            return
        employee_nodes = hxs.xpath('//form[@id="cardTypeForm"]//table/tbody/tr')
        if not employee_nodes:
            self.log('%s can not find member card info' % self.name, level=log.ERROR)
            yield None
            return
        for e_n in employee_nodes:
            info_nodes = e_n.xpath('td')
            info = OrderedDict({})
            for idx, i_n in enumerate(info_nodes):
                if idx == 0 or idx == len(info_nodes) - 2:
                    continue
                if idx == len(info_nodes) - 1:
                    info[headers[idx]] = ' | '.join(str_list_strip_replace(i_n.xpath('./child::text()').extract(), [' ', '\t', '\n', ' ']))
                    continue
                sep = ' | '
                if idx == 3:
                    sep = ''
                info[headers[idx]] = sep.join(str_list_strip_replace(str_list_strip(i_n.xpath('descendant::text()').extract()), [' ', '\t', '\n', ' ']))

            item = SentreeMemberCardItem()
            item['info'] = info
#             items.append(info)
            yield item
Example #4
 def parse(self, response):
     sel = Selector(response)
     locations = Locations()
     locations["restaurantIDs"] = sel.xpath('//a/@data-id').extract()
     locations["coordinates"] = {}
     locations["coordinates"]["longitude"] = self.coordinatesURLTranslator.getLongitude(response.url)
     locations["coordinates"]["latitude"] = self.coordinatesURLTranslator.getLatitude(response.url)
     return locations
Example #5
 def parse(self, response):
     sel = Selector(response)
     restaurants = sel.xpath('//a[contains(@id, "establecimiento")]')
     for restaurant in restaurants:
         locationCsv = LocationCsv()
         locationCsv["id_restaurante"] = restaurant.css("a::attr(data-id)").extract()
         locationCsv["nombre_restaurante"] = restaurant.css("a .result-info h4::text").extract()
         locationCsv["latitud"] = self.coordinatesURLTranslator.getLatitude(response.url)
         locationCsv["longitud"] = self.coordinatesURLTranslator.getLongitude(response.url)
         yield locationCsv
Example #6
 def parse_member_overdraft(self, resp):
     hxs = Selector(resp)
     mem_item = resp.meta['item']
     overdraft_click_nodes = hxs.xpath('//ul[@class="tab-nav"]//a[@href="#tab7"]/@onclick')
     if not overdraft_click_nodes:
         mem_item['overdraft'] = '0.0'
         yield mem_item
     else:
         click_str = overdraft_click_nodes.extract()[0]
         ids = re.findall(r'\d+', click_str)
         yield FormRequest(url='http://vip6.sentree.com.cn/shair/memberArchives!debtlist.action', formdata={'id' : ids[0], 'shopid' : ids[1]}, callback=self.parse_member_overdraft2, meta=resp.meta)
Example #7
    def parse_showdesk_members2(self, resp):
        hxs = Selector(resp)
        next_page_nodes = hxs.xpath('//a[@class="next_page"]')
        meta = resp.meta
        if next_page_nodes and meta['page'] == 1:
            next_page_node = next_page_nodes[0]
            total_page = next_page_node.xpath('./parent::li/preceding-sibling::li')[-1].xpath('a/child::text()').extract()[0].strip()
            for i in xrange(2, int(total_page) + 1):
                new_meta = dict(meta)
                new_meta['page'] = i
                self.log('%s yield member list page %d' % (self.name, i))
                yield FormRequest(url="http://vip6.sentree.com.cn/shair/memberInfo!memberlist.action", formdata={
                             'page.currNum' : str(i),
                             'page.rpp' : '30',
                             'r' : str(meta['r']),
                             'set' : 'manage'
                             }, callback=self.parse_showdesk_members2, meta=new_meta)

        member_nodes = hxs.xpath('//form[@id="delForm"]//table/tbody/tr')
        if member_nodes:
            for m_n in member_nodes:
                member_tds = m_n.xpath('td')
                info_query_str = None
                try:
                    phone = member_tds[1].xpath('a/child::text()').extract()[0].replace(' ', '').strip()
                    name = member_tds[2].xpath('span/child::text()').extract()[0].replace(' ', '').strip()
                    card_no = member_tds[6].xpath('table/tr/td[1]/a/child::text()').extract()[0].replace(' ', '').strip()
                    info_query_str = member_tds[6].xpath('table/tr/td[1]/a/@onclick').extract()[0]
                    info_query_str = info_query_str[info_query_str.find('?') + 1:]
                    info_query_str = info_query_str[:info_query_str.find("'")]
                    card_name = member_tds[6].xpath('table/tr/td[2]/child::text()').extract()[0].replace(' ', '').strip()
                    card_type = member_tds[6].xpath('table/tr/td[3]//child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                    discont = member_tds[6].xpath('table/tr/td[4]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                    timeout = member_tds[6].xpath('table/tr/td[9]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                    overage = str_list_strip_replace(member_tds[6].xpath('table/tr/td[7]//child::text()').extract(), [' ', ' ', '\t', '\n'])
                except:
                    self.log(traceback.format_exc())
                    continue
                mem_item = SentreeMembersSimpleItem()
                mem_item[u'phone'] = phone
                mem_item[u'name'] = name
                mem_item[u'card_no'] = card_no
                mem_item[u'card_name'] = card_name
                mem_item[u'card_type'] = card_type
                mem_item[u'discont'] = discont
                mem_item[u'timeout'] = timeout
                mem_item[u'overage'] = overage
                if info_query_str:
                    new_meta = dict(meta)
                    new_meta['item'] = mem_item
                    yield Request(url='http://vip6.sentree.com.cn/shair/memberArchives!editMember.action?%s%d' % (info_query_str, time.time()), callback=self.parse_member_overdraft, meta=new_meta)
                else:
                    mem_item['overdraft'] = '0.0'
                    yield mem_item
Example #8
 def parse(self, response):
     sel = Selector(response)
     sites = sel.xpath('//ul[@class="directory-url"]/li')
     items = []
     for site in sites:
         item = Dmozitem()
         item['title'] = site.xpath('a/text()').extract()
         item['link'] = site.xpath('a/@href').extract()
         item['desc'] = site.xpath('text()').extract()
         items.append(item)
     return items
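The same extraction in the current Scrapy API, where the response exposes .xpath() directly and .get()/.getall() replace .extract_first()/.extract(); a sketch that yields plain dicts instead of the original Dmozitem:

    def parse(self, response):
        # One dict per directory entry; .get() returns the first match or None.
        for site in response.xpath('//ul[@class="directory-url"]/li'):
            yield {
                'title': site.xpath('a/text()').get(),
                'link': site.xpath('a/@href').get(),
                'desc': site.xpath('text()').getall(),
            }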
Example #9
    def parse(self, response):
        topic_id = response.meta[ 'topic_id' ]
        sel = Selector(text=response.body, type="html")
        
        topic_lists = sel.xpath('//div[re:test(@class,"result.*?")]')
        for topic in topic_lists:
            topic_item = Topic_Item()
            temp_sel = Selector(text=topic.extract())
            
            title = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/a/text()')[0].extract()
            print title
            topic_item['topic_title']=title
            
            board = temp_sel.xpath('//div[re:test(@class,"meta")]/a/text()')[0].extract()
            print board
            poster = temp_sel.xpath('//div[re:test(@class,"meta")]/a/text()')[1].extract().strip()
            print poster
            topic_item['topic_author']=poster
            
            main_con = temp_sel.xpath('//div[re:test(@class,"meta")]')[0].extract().strip()
            post_time_ = re.findall(self.post_time_pa,main_con)[0]
            post_time_str = '20'+post_time_[0]+' '+post_time_[2]+':00'
            post_time = time.strptime(post_time_str, '%Y-%m-%d %H:%M:%S')
#             print post_time
            
            if '4e0b' in post_time_[1].__repr__():      # u'\u4e0b' = 下 (PM marker)
                print u'下午'
                post_time = self.time_to_datetime(post_time)+ datetime.timedelta(hours=12)
            elif '4e0a' in post_time_[1].__repr__():    # u'\u4e0a' = 上 (AM marker)
                print u'上午'
                post_time = post_time_str
            
            print post_time
            topic_item['topic_post_time']=post_time
            
            content = temp_sel.xpath('//div[re:test(@class,"content")]/text()')[0].extract().strip()
            print content
            topic_item['topic_content']=content
            
            url = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/a/@href').extract()[0]
            url = 'http://www.battlenet.com.cn'+url
            print url
            topic_item['topic_url']=url
            
            reply_num = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/span[re:test(@class,"small")]/text()').extract()
            reply_num = reply_num[len(reply_num)-1]
            reply_num = re.findall(self.dig_pattern,reply_num)[0]
            print reply_num
            topic_item['topic_reply']=reply_num
            
            print '+++++++++++++++++++++++++++++++++++++'
            yield  scrapy.Request(url,callback=self.parse_torrent,meta={'topic_item':topic_item})  
Example #10
    def parse(self, response):
        topic_id = response.meta[ 'topic_id' ]
        sel = Selector(text=response.body, type="html")
        
        topic_lists = sel.xpath('//div[re:test(@class,"result f s3")]')
        for topic in topic_lists:
            topic_item = Topic_Item()
            temp_sel = Selector(text=topic.extract())
        
            title = temp_sel.xpath('//h3[re:test(@class,"c-title")]/a/text()').extract()[0].strip()
#             print title
            topic_item['topic_title']=title
            
            content = temp_sel.xpath('//div[re:test(@class,"c-abstract")]/text()').extract()[0].strip()
            print content
            topic_item['topic_content']=content
            
            post_time = temp_sel.xpath('//div[re:test(@class,"c-summary-1")]/span')[2].extract()
            post_time = re.findall(self.post_pa,post_time)[0]+' 00:00:00'
            print post_time
            topic_item['topic_post_time']=post_time
            
            author = temp_sel.xpath('//div[re:test(@class,"c-summary-1")]/span/text()')[1].extract()
            print author
            topic_item['topic_author']=author
            
            url = temp_sel.xpath('//h3[re:test(@class,"c-title")]/a/@href').extract()[0]
            print url
            topic_item['topic_url']=url
            topic_item['topic_reply']=0
            
            print '+++++++++++++++++++++++++++++++++++++'
            yield  scrapy.Request(url,callback=self.parse_torrent,meta={'topic_item':topic_item})  
Example #11
	def parse_item(self, response):
		item = BuscapeItem()
		sel = Selector(response)
		title = sel.xpath('//h1[@class="name"]/text()').extract()[0]
		item["title"] = title
		item["url"] = response.url
		attributes = []
		pares = sel.xpath('//*[@class="product-details"]/ul/li')
		for par in pares:
			key = par.xpath('span[@class="name"]/text()').extract()
			value = par.xpath('span[@class="value"]/text()').extract()
			# guard against rows without a name span, which would raise IndexError
			if key:
				attributes.append({"key": key[0], "value": value})
		item["attributes"] = attributes
		return item
Example #12
 def parse(self, response):
     sel = Selector(response)
     restaurants = sel.xpath('//a[contains(@id, "establecimiento")]')
     for restaurant in restaurants:
         locationCsv = LocationCsv()
         locationCsv["id_restaurante"] = restaurant.css(
             "a::attr(data-id)").extract()
         locationCsv["nombre_restaurante"] = restaurant.css(
             "a .result-info h4::text").extract()
         locationCsv["latitud"] = self.coordinatesURLTranslator.getLatitude(
             response.url)
         locationCsv[
             "longitud"] = self.coordinatesURLTranslator.getLongitude(
                 response.url)
         yield locationCsv
Example #13
 def __check_detail_title_valid(detail_title: Selector) -> bool:
     if not isinstance(detail_title, Selector):
         raise TypeError("detail_title must be a Selector")
     loc_title = detail_title.extract()
     # custom validation rules go here
     return True
Example #14
        def extract_text(row_sel: Selector, query: str) -> List[str]:
            vals = row_sel.css(query).getall()

            if len(vals) > 0:
                return [v.strip() for v in vals]
            else:
                return vals
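Since a comprehension over an empty list is itself empty, the branch is equivalent to always returning the stripped list. A self-contained call with hypothetical markup (assuming List is imported from typing where the helper is defined):

    from scrapy.selector import Selector

    row = Selector(text="<ul><li> a </li><li> b </li></ul>")
    extract_text(row, "li::text")  # -> ['a', 'b']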
Example #15
    def parse(self, response):
        topic_kws = response.meta[ 'topic_kws' ]

        all_content = BeautifulSoup(response.body,'html5lib')
        topic_lists = all_content.find_all('li',class_="pbw")
        for topic in topic_lists:
            topic_item = Topic_Item()
            topic_item['topic_db_message'] = topic_kws
            temp_sel = Selector(text=topic.prettify(), type="html")

            title = topic.find_all("a")[0].get_text()
#             print title
            topic_item['topic_title']=title
            
            url = topic.find_all("a")[0].get('href')
            print url
            topic_item['topic_url']=url   
             
            topic_content = topic.find_all("p")[1].get_text()
            print topic_content
            topic_item['topic_content']=topic_content  
            
            post_time = temp_sel.xpath('//p/span/text()')[0].extract().strip()
            print post_time
            topic_item['topic_post_time']=post_time   
            
            author = temp_sel.xpath('//p/span/a/text()')[0].extract().strip()
#             print author
            topic_item['topic_author']=author  

            reply_msg = topic.find_all('p',class_='xg1')[0]
            msg = re.findall(self.reply_pattern,reply_msg.get_text())[0]
            print msg
            reply_num = msg[0]
            read_num = msg[1]
            topic_item['topic_reply']=reply_num    
            
            homepage = temp_sel.xpath('//p/span/a/@href').extract()[0]
            user_id = re.findall(self.userid_pa,homepage)[0]
            print user_id
            topic_item['poster_id']=user_id    
            
            topic_item['homepage'] = homepage 
            
            print '+++++++++++++++++++++++++++++++++++++++++'
            yield  scrapy.Request(url,callback=self.parse_torrent,meta={'topic_item':topic_item})  
Example #16
    def parse(self, response):
        topic_kws = response.meta["topic_kws"]

        all_content = BeautifulSoup(response.body, "html5lib")
        topic_lists = all_content.find_all("li", class_="pbw")
        for topic in topic_lists:
            topic_item = Topic_Item()
            topic_item["topic_db_message"] = topic_kws
            temp_sel = Selector(text=topic.prettify(), type="html")

            title = topic.find_all("a")[0].get_text()
            #             print title
            topic_item["topic_title"] = title

            url = topic.find_all("a")[0].get("href")
            print url
            topic_item["topic_url"] = url

            topic_content = topic.find_all("p")[1].get_text()
            #             print topic_content
            topic_item["topic_content"] = topic_content

            post_time = temp_sel.xpath("//p/span/text()")[0].extract().strip()
            print post_time
            topic_item["topic_post_time"] = post_time

            author = temp_sel.xpath("//p/span/a/text()")[0].extract().strip()
            #             print author
            topic_item["topic_author"] = author

            reply_msg = topic.find_all("p", class_="xg1")[0]
            msg = re.findall(self.reply_pattern, reply_msg.get_text())[0]
            #             print msg
            reply_num = msg[0]
            read_num = msg[1]
            topic_item["topic_reply"] = reply_num

            homepage = temp_sel.xpath("//p/span/a/@href").extract()[0]
            user_id = re.findall(self.userid_pa, homepage)[0]
            #             print user_id
            topic_item["poster_id"] = user_id

            topic_item["homepage"] = homepage

            print "+++++++++++++++++++++++++++++++++++++++++"
            yield scrapy.Request(url, callback=self.parse_torrent, meta={"topic_item": topic_item})
Example #17
    def parse_quick_facts(self, selector: Selector, quest: Quest):
        """
        parses the quick facts section on a wowhead quest page

        :param selector: selector of the quick facts section
        :param quest: quest item to store gathered info in
        :return:
        """
        result = selector.re(r"Start:\s(.*</a>)")
        if result:
            element = Selector(text=result[0])
            quest["npc"] = element.xpath("//a/text()").get()
            quest["npc_link"] = self.base_url + element.xpath(
                "//a/@href").get()
        else:
            quest["npc"] = "Unknown"
            quest["npc_link"] = "Unknown"
Example #18
    def parse_item(self, response):
        index = response.meta['index']
        if index == 1:
            index_count = response.selector.xpath('//*[@id="m-page"]/span/text()').extract()
            index_count = [x.strip() for x in index_count if x.strip()]
            index, count = [int(x) for x in index_count[0].split('/')]
            for i in range(index + 1, count + 1):
                yield Request(url=self.get_gn_url(i), headers=TONGHUASHUN_GN_HEADER,
                              meta={'index': i},
                              callback=self.parse_item)

        trs = response.xpath('/html/body/table/tbody//tr').extract()

        try:
            for tr in trs:
                row = Selector(text=tr)
                start_date = row.xpath('//td[1]/text()').extract_first()
                name = row.xpath('//td[2]/a/text()').extract_first()
                link = row.xpath('//td[2]/a/@href').extract_first()
                news_title = row.xpath('//td[3]/a/text()').extract_first()
                news_link = row.xpath('//td[3]/a/@href').extract_first()
                # note: the original read trs[0] here, re-parsing the first row
                # on every iteration; the current row is almost certainly meant
                leadings = [x.rsplit('/')[-2] for x in row.xpath('//td[4]/a/@href').extract()]
                count = row.xpath('//td[5]/text()').extract()
                yield SectorItem(id='{}_{}_{}'.format('10jqka', 'gn', name), start_date=start_date, name=name,
                                 link=link, news_title=news_title, news_link=news_link, leadings=leadings, count=count,
                                 producer='10jqka', type='gn')
        except Exception as e:
            self.logger.error('error parse 10jqka gainian sector url:{} {}'.format(response.url, e))
Example #19
class RestaurantIDsGetter(object):
    def __init__(self, response):
        self.sel = Selector(response)

    def getID(self, url):
        xpathQuery = '//a[contains(@href, "' + url + '")]/@data-id'
        queryResults = self.sel.xpath(xpathQuery).extract()
        if len(queryResults) == 0:
            return "NoID"
        return queryResults[0]
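Concatenating url into the query breaks as soon as the URL contains a quote character. parsel also supports XPath variables passed as keyword arguments, which sidesteps quoting entirely; a sketch of the same lookup under that assumption:

    def getID(self, url):
        # $url is bound as an XPath variable, so no manual escaping is needed.
        results = self.sel.xpath('//a[contains(@href, $url)]/@data-id', url=url).extract()
        return results[0] if results else "NoID"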
Example #20
    def detail_parse(self, response):
        page = response.meta['page']
        token = json.loads(requests.post(self.token_url,
                                         headers=self.header).text,
                           strict=False).get('d', '')
        data = copy.deepcopy(self.data)
        data.update({'Token': token, 'PageIndex': str(page)})
        list_content = json.loads(requests.post(self.list_url,
                                                headers=self.header,
                                                json=data).text,
                                  strict=False).get('d', '')
        cont_list = json.loads(list_content).get('Table', [])
        for cont in cont_list:
            result_dict = {}
            info_id = cont.get('InfoID', '')
            post_data = {
                "Token": json.loads(
                    requests.post(self.token_url, headers=self.header).text,
                    strict=False).get('d', ''),
                "PageIndex": "1",
                "PageSize": "1",
                "InfoID": info_id,
            }
            detail_content = json.loads(requests.post(self.detail_url,
                                                      headers=self.header,
                                                      json=post_data).text,
                                        strict=False).get('d', '')
            detail = json.loads(detail_content, strict=False).get('Table',
                                                                  [])[0]

            result_dict['punish_code'] = detail.get('name1', '')
            result_dict['case_name'] = detail.get('name2', '')
            result_dict['punish_category_one'] = detail.get('name3', '')
            result_dict['punish_category_two'] = detail.get('name4', '')
            result_dict['punish_type'] = detail.get('name5', '')
            result_dict['punish_basis'] = detail.get('name6', '')
            result_dict['company_name'] = detail.get('name7', '')
            result_dict['credit_code'] = detail.get('name8', '')
            result_dict['organization_code'] = detail.get('name9', '')
            result_dict['regno'] = detail.get('name10', '')
            result_dict['tax_code'] = detail.get('name11', '')
            result_dict['id_number'] = detail.get('name12', '')
            result_dict['frname'] = detail.get('name13', '')
            result_dict['punish_content'] = detail.get('name14', '')
            result_dict['public_date'] = detail.get('name15', '')
            result_dict['punish_org'] = detail.get('name16', '')
            result_dict['update'] = detail.get('infodate', '')
            for key, value in result_dict.items():
                result_dict[key] = ''.join(Selector(text=value).xpath('//p//text()').extract()).strip()\
                    if '<p style' in value else value
            yield self.handle_result(response, result_dict, info_id)
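The final loop uses Selector(text=...) as a plain HTML-to-text stripper for fields that arrive as markup. A minimal illustration of that idiom:

    from scrapy.selector import Selector

    value = '<p style="margin:0">some &amp; text</p>'
    ''.join(Selector(text=value).xpath('//p//text()').extract()).strip()
    # -> 'some & text' (the parser decodes HTML entities)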
Example #21
    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="mainleft"]')
        itemlist= []
        
        for site in sites:
            item = CnkispiderItem()
            
            title = site.xpath('//*[@id="chTitle"]/text()').extract()
            # fill the extracted values into the matching item fields
            item['title'] = [t.encode('utf-8') for t in title] 
            author = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[1]/a/text()').extract()
            if not author:  # extract() returns a list, never None, so test for emptiness
                author = site.xpath('//*[@id="content"]/div[1]/div[2]/p[1]/a/text()').extract()
            item['author'] = [a.encode('utf-8') for a in author]
            institution = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[3]/a/text()').extract()
            item['institution'] = [i.encode('utf-8') for i in institution]
            abstract = site.xpath('//*[@id="ChDivSummary"]/text()').extract()
            item['abstract'] = [a.encode('utf-8') for a in abstract]
            keyWord = site.xpath('//*[@id="ChDivKeyWord"]/a/text()').extract()
            item['keyWord'] = [k.encode('utf-8') for k in keyWord]
            downloadFreq = site.xpath('//*[@id="content"]/div[1]/div[5]/ul/li/text()').re(u'\s*【下载频次】(.*)')
            item['downloadFreq'] = [d.encode('utf-8') for d in downloadFreq]
            quoteFreq = site.xpath('//*[@id="rc3"]/text()').re('\W(\d+)\W')
            item['quoteFreq'] = [q.encode('utf-8') for q in quoteFreq]
            
            itemlist.append(item)
            
            # log each appended item at INFO level
            log.msg("Appending item...", level=log.INFO)
        # final log message
        log.msg("Append done.", level=log.INFO)
        return itemlist
Example #22
 def parse_member_overdraft2(self, resp):
     mem_item = resp.meta['item']
     hxs = Selector(resp)
     total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
     if not total_overdraft_nodes:
         overdraft = '0.0'
     else:
         overdrafts = str_list_strip_replace(total_overdraft_nodes.extract(), ['&nbsp;', ' ', '\t', '\n'])
         overdraft_statuss = str_list_strip_replace(hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[5]/font/child::text()').extract(), ['&nbsp;', ' ', '\t', '\n'])
         overdraft = float(0)
         for i, s_overdraft in enumerate(overdrafts):
             f_overdraft = float(s_overdraft)
              if u'已还清' in overdraft_statuss[i]:  # "paid off": subtract
                  overdraft = overdraft - f_overdraft
                  continue
              if u'未还清' in overdraft_statuss[i]:  # "outstanding": add
                  overdraft = overdraft + f_overdraft
         if overdraft < 0:
             overdraft = float(0)
         overdraft = '%.1f' % overdraft
     mem_item['overdraft'] = overdraft
     yield mem_item
Example #23
 def parse_store(self, response, js):
     props = {}
     props["addr_full"] = Selector(text=js["address"]).xpath("//p/text()").get()
     props["ref"] = js["url_title"]
     props["lat"] = js["coordinates"][0]
     props["lon"] = js["coordinates"][1]
     props["city"] = js["city"]
     props["state"] = js["state"]
     props["postcode"] = js["zip"]
     props["phone"] = js["phone_number"]
     hours = response.css(".hours p:not(:empty)").xpath("text()").get()
     props["opening_hours"] = hours
     return GeojsonPointItem(**props)
Example #24
    def parse_showdesk_services(self, resp):
        hxs = Selector(resp)
        headers = hxs.xpath('//table[@id="itemset"]/thead/tr/th/child::text()').extract()
        if not headers:
            self.log('%s can not find table headers.' % self.name, level=log.ERROR)
            yield None
            return
        service_nodes = hxs.xpath('//table[@id="itemset"]/tbody/tr')
        if not service_nodes:
            self.log('%s can not find services info' % self.name, level=log.ERROR)
            yield None
            return
        for s_n in service_nodes:
            info_nodes = s_n.xpath('td')
            info = OrderedDict({})
            no = None
            for idx, i_n in enumerate(info_nodes):
                if idx == 0 or idx == len(info_nodes) - 1:
                    continue
                if idx == 8:
                    info[headers[idx]] = str_list_strip_replace(str_list_strip(hxs.xpath('//span[@id="pricespan%s"]' % no).xpath('child::text()').extract()), [' ', '\t', '\n'])
                    continue
                if idx == 9:
                    discount_nodes = i_n.xpath('.//div[starts-with(@id, "icddiv")]')
                    discounts = []
                    if discount_nodes:
                        for d_n in discount_nodes:
                            discounts.append(' | '.join(str_list_strip_replace(str_list_strip(d_n.xpath('./child::text()').extract()), [' ', '\t', '\n'])))
                    info[headers[idx]] = ' ||| '.join(discounts)
                    continue
                info[headers[idx]] = ' | '.join(str_list_strip_replace(str_list_strip(i_n.xpath('descendant::text()').extract()), [' ', '\t', '\n']))
                if idx == 1:
                    no = info[headers[idx]]

            item = SentreeServiceItem()
            item['info'] = info
#             items.append(info)
            yield item
Example #25
    def _validate_response(self, response: Union[Response, str]) -> bool:
        """

        :param response:
        :type response: Response
        :return:
        :rtype: bool
        """
        if isinstance(response, str):
            response: Selector = Selector(text=response)

        response: Union[Response, Selector]
        names_in_meta: List[str] = response.xpath("/html/head/meta").xpath(
            "@name").extract()

        return "ROBOTS" not in names_in_meta
Example #26
    def parse(self, response):
        topic_id = response.meta[ 'topic_id' ]
        sel = Selector(text=response.body, type="html")
        print 'starting'
        topic_lists = sel.xpath('//ul[re:test(@id,"results")]/li')
        for topic in topic_lists:
            topic_item = Topic_Item()
            temp_sel = Selector(text=topic.extract())
            topic_item['topic_id'] = topic_id
    
            title = temp_sel.xpath('//h3/a')[0].extract()
            title = self.parse_html_content(title)
            print title
            topic_item['topic_title']=title
            
            content = temp_sel.xpath('//p')[0].extract()
            content = self.parse_html_content(content).strip()
            print type(content)
            print content.encode('gbk','ignore')
            topic_item['topic_content']=content
    
            ttime = temp_sel.xpath('//span[re:test(@class,"green stat")]/text()').extract()[0]
            tt = ttime.split()[1].__repr__()
            print tt
            now = datetime.datetime.now()
            if '5e74' in tt:        # u'\u5e74' = 年: absolute date like "YYYY年MM月DD日"
                time_pa = re.findall(self.time_1_pa,ttime.split()[1])[0]
                new_time = str(time_pa[0])+'-'+str(time_pa[1])+'-'+str(time_pa[2])+' '+'00:00:00'
                print time_pa
            elif '5206' in tt:      # u'\u5206' = 分: relative "N minutes ago"
                time_pa = re.findall(self.time_2_pa,ttime.split()[1])[0]
                new_time = now - datetime.timedelta(minutes=int(time_pa))
                print time_pa
            elif '5c0f' in tt:      # u'\u5c0f' = 小(时): relative "N hours ago"
                time_pa = re.findall(self.time_2_pa,ttime.split()[1])[0]
                new_time = now - datetime.timedelta(hours=int(time_pa))
                print time_pa
            print new_time    
            topic_item['topic_post_time']= new_time  
            poster =  ttime.split()[0]
            topic_item['topic_author'] = poster
            
            url = temp_sel.xpath('//h3/a/@href').extract()[0]
            print url
            topic_item['topic_url']=url
            yield scrapy.Request(url, callback=self.parse_torrent, meta={'topic_item': topic_item})
            print '++++++++++++++++++++++++++++++'
Example #27
class RestaurantIDsGetter(object):

    def __init__(self, response):
        self.sel = Selector(response)

    def getID(self, url):
        xpathQuery = '//a[contains(@href, "' + url + '")]/@data-id'
        queryResults = self.sel.xpath(xpathQuery).extract()
        if len(queryResults) == 0:
            return "NoID"
        return queryResults[0]
Example #28
    def parse_XML(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
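This mirrors the node-iteration logic of Scrapy's XMLFeedSpider. A minimal spider built on that machinery might look like the following; the feed URL and field names are hypothetical:

    from scrapy.spiders import XMLFeedSpider

    class ExampleFeedSpider(XMLFeedSpider):
        name = 'example_feed'
        start_urls = ['http://example.com/feed.xml']  # hypothetical feed
        iterator = 'xml'  # forces the Selector(response, type='xml') branch above
        itertag = 'item'  # every //item node is passed to parse_node

        def parse_node(self, response, node):
            # node is a Selector positioned on one <item> element
            yield {'title': node.xpath('title/text()').get()}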
Example #29
 def test_parsel_parse_and_extract(self):
     for i in range(ITERATIONS):
         for name, page in ibl_pages.items():
             s = Selector(text=page.body)
             extract(parsel_extractors[name], s)
Example #30

schema = FakeContainer(descriptors['#default'])
validate = schema._validate_and_adapt_item
_names_map = {'daft_ie': 'daft', 'patchofland': 'pol'}
ibl_extractors = {}
ibl_pages = {}
selector_pages = {}
for template_name in ('daft_ie', 'hn', 'patchofland'):
    with open('%s/data/templates/%s.html' % (_PATH, template_name)) as f:
        html_page = HtmlPage(body=f.read().decode('utf-8'))
        name = _names_map.get(template_name, template_name)
        ibl_pages[name] = html_page
        ibl_extractors[name] = SlybotIBLExtractor([(html_page, descriptors,
                                                    '0.13.0')])
        selector_pages[name] = Selector(text=html_page.body)


class TestExtractionSpeed(TestCase):
    def test_parsel_parse_and_extract(self):
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                s = Selector(text=page.body)
                extract(parsel_extractors[name], s)

    def test_slybot_parse_and_extract(self):
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                extraction_page = HtmlPage(body=page.body)
                ibl_extractors[name].extract(extraction_page)
Example #31
    def parse_consumer_bill_stream_validate(self, resp):
        hxs = Selector(resp)
        menu = [u'营业记录', u'水单记录', u'水单审查']  # roughly: business log / ticket records / ticket review
        bill_headers = []
        head_nodes = hxs.xpath('//tbody[@id="billBody"]/parent::table/thead/tr/th')
        if not head_nodes:
            self.log('in %s.parse_consumer_bill_stream_validate, can not get table headers.' % self.name, level=log.ERROR)
            yield None
            return
        for idx, hd in enumerate(head_nodes):
            if idx == len(head_nodes) - 1:
                break
            txts = hd.xpath('child::text()').extract()
            bill_headers.append('/'.join(txts))

        bill_nodes = hxs.xpath('//tbody[@id="billBody"]/tr')
        if bill_nodes:
            for bn in bill_nodes:
                item = SentreeShuiDanShenChaItem()
                item['menu'] = menu
                headers = []
                item['data'] = OrderedDict({})
                data_nodes = bn.xpath('td')
                for idx, dn in enumerate(data_nodes):
                    if idx == 6:
                        break
                    h = bill_headers[idx]
                    if idx == 0 or idx == 4:
                        headers.append(h)
                        item['data'][h] = [str_list_strip(dn.xpath('descendant::text()').extract())[0], True]
                        continue
                    if idx == 1 or idx == 2 or idx == 3:
                        headers.append(h)
                        item['data'][h] = [str_list_strip(dn.xpath('descendant::text()').extract()), True]
                        continue
                    if idx == 5:
                        detail = []
                        subtrs = dn.xpath('table/tr')
                        recoded_headers = False
                        for tr in subtrs:
                            empperfors = []
                            subdetail = OrderedDict({})
                            subtds = tr.xpath('td')
                            h = bill_headers[idx + 0]
                            if not recoded_headers:
                                headers.append(h)
                            subdetail[h] = [str_list_strip(subtds[0].xpath('descendant::text()').extract()), True]
                            h = bill_headers[idx + 1]
                            if not recoded_headers:
                                headers.append(h)
                            subdetail[h] = [str_list_strip(subtds[1].xpath('descendant::text()').extract())[0], True]

                            subtrs2 = subtds[2].xpath('table/tr')
                            for kdx, tr2 in enumerate(subtrs2):
                                if kdx == len(subtrs2) - 1:
                                    break
                                empperfor = OrderedDict({})
                                subtds2 = tr2.xpath('td')
                                h = bill_headers[idx + 2 + 0]
                                if not recoded_headers:
                                    headers.append(h)
                                if h not in empperfor:
                                    empperfor[h] = []
                                empperfor[h].append([str_list_strip(subtds2[0].xpath('descendant::text()').extract()), True])
                                h = bill_headers[idx + 2 + 1]
                                if not recoded_headers:
                                    headers.append(h)
                                if h not in empperfor:
                                    empperfor[h] = []
                                empperfor[h].append([str_list_strip(subtds2[1].xpath('descendant::text()').extract())[0], True])
                                h = bill_headers[idx + 2 + 2]
                                h = u'员工' + h  # prefix the header with u'员工' ("employee")
                                if not recoded_headers:
                                    headers.append(h)
                                if h not in empperfor:
                                    empperfor[h] = []
                                empperfor[h].append([str_list_strip(subtds2[2].xpath('descendant::text()').extract())[0], True])
                                empperfors.append(empperfor)
                                recoded_headers = True
                            subdetail[u'员工业绩'] = [empperfors, False]  # u'员工业绩' = employee performance
                            detail.append([subdetail, False])
                            recoded_headers = True
                        item['headers'] = headers
                        item['data'][u'详情'] = [detail, False]
#                 items.append(item)
                yield item
Example #32
 def __init__(self, response):
     self.sel = Selector(response)
Example #33
            item['vals'] = infos
            yield item

items = []

if __name__ == '__main__':
    f = open('e:\\1.html')

    html = ""
    for l in f:
        html += l
    f.close()

    resp = TextResponse(url="", body=html)
    if 1:
        hxs = Selector(resp)
        total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]')
        total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
        if not total_overdraft_nodes:
            overdraft = '0'
        else:
            overdraft = str_list_strip_replace(total_overdraft_nodes.extract(), ['&nbsp;', ' ', '\t', '\n'])[0]
        print overdraft
    sys.exit(0)

    s = SentreeSpider()
    try:
        s.parse_showdesk_services(resp)
    except:
        print traceback.format_exc()
Example #34
 def extractData(self, body, xpath):
     if isinstance(body, str):
         return Selector(text=body).xpath(xpath).extract()
     return Selector(response=body).xpath(xpath).extract()
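Both construction forms are part of the Selector API: text= parses a raw string, while response= wraps an already-downloaded page. A small demonstration of the string branch:

    from scrapy.selector import Selector

    html = '<div><p>hi</p></div>'
    Selector(text=html).xpath('//p/text()').extract()  # ['hi']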
Example #35
 def parse_page(self,response):
     item=CnbetaItem()
     sel=Selector(response)
     item["title"]=sel.xpath('//title/text()').extract()
     item['url']=response.url
     return item
Example #36
 def __init__(self, response):
     self.sel = Selector(response)