Example #1
0
 def parse_detail(self, response):
     """Scrape a detail page and yield a single ``ToolsItem``.

     Every table field is read from a fixed row/column position of the
     table that contains a ``td[@colspan]`` cell. Only the first matching
     text node is kept for each field, and the page URL is stored in
     ``url``.
     """
     # (item field, XPath of the table cell holding its value)
     cell_queries = (
         ('name', '//td[@colspan]/../../tr[1]/td[2]/text()'),
         ('suoyin', '//td[@colspan]/../../tr[2]/td[2]/text()'),
         ('wenhao', '//td[@colspan]/../../tr[4]/td[2]/text()'),
         ('start_time', '//td[@colspan]/../../tr[3]/td[4]/text()'),
     )
     item = ToolsItem()
     for field, query in cell_queries:
         item[field] = response.xpath(query).get()
     item['url'] = response.url
     yield item
    def parse_kexueyuan_detail(self, response):
        """Scrape the biography page of an Academy of Sciences academician.

        Yields one ``ToolsItem`` whose ``name`` is a fixed category label,
        ``title`` is the first ``<h1>`` text node and ``content`` is the
        text of all ``<p>`` descendants joined together with every
        whitespace run removed.
        """
        heading = response.xpath('//h1/text()').get()
        paragraph_text = "".join(response.xpath('//p//text()').getall())

        item = ToolsItem()
        item['name'] = "科学院院士"
        item['title'] = heading
        # Strip all whitespace, including line breaks inside paragraphs.
        item['content'] = re.sub(r'\s+', '', paragraph_text)
        item['url'] = response.url

        yield item
 def parse_detail(self, response):
     """Scrape a standard's detail page and yield a single ``ToolsItem``.

     The title comes from the ``<h1>`` inside ``#detail_leftcontent``;
     the remaining fields are located through the label text of their
     table row. Only the first matching text node is kept for each field.
     """
     # (item field, XPath locating its value), in assignment order.
     field_queries = (
         ('name', '//div[@id="detail_leftcontent"]/div/h1/text()'),
         ('organization', '//t[text()="发布单位"]/../../td/text()'),
         ('public_date', '//t[text()="发布日期"]/../../td/text()'),
         ('used_date', '//t[text()="实施日期"]/../../td/text()'),
         ('standard_num', '//t[text()="标准编号"]/../../td/text()'),
         ('country', '//t[text()="国别"]/../../td/text()'),
     )
     item = ToolsItem()
     for field, query in field_queries:
         item[field] = response.xpath(query).get()
     item['url'] = response.url
     yield item
Example #4
0
    def parse_gongchengyuan_detail(self, response):
        """Scrape the biography page of an Academy of Engineering academician.

        Yields one ``ToolsItem`` whose ``name`` is a fixed category label,
        ``title`` is the text of the ``right_md_name`` div and ``content``
        is the joined paragraph text of the ``intro`` div with
        non-breaking spaces (``\\xa0``) and en spaces (``\\u2002``) removed.
        """
        person_name = response.xpath(
            '//div[@class="right_md_name"]/text()').get()
        intro_text = "".join(
            response.xpath('//div[@class="intro"]/p/text()').getall())

        item = ToolsItem()
        item['name'] = "工程院院士"
        item['title'] = person_name
        item['content'] = re.sub(r'\xa0|\u2002', "", intro_text)
        item['url'] = response.url

        yield item
 def parse_detail(self, response):
     """Scrape a report's detail page and yield a single ``ToolsItem``.

     Keywords, authors and author organizations are stored as lists of
     every matching link text; the other fields keep only the first
     matching text node. Fields are located through the label ``<div>``
     that precedes their value.
     """
     item = ToolsItem()
     item['name'] = response.xpath('//head/title/text()').get()

     # Fields that may hold several values: keep every match.
     list_queries = (
         ('key_words', '//div[text()="关键词:"]/../div[2]/a[@title]/text()'),
         ('author', '//div[text()="作者:"]/../div[2]/a/text()'),
         ('author_organ', '//div[text()="作者单位:"]/../div[2]/a/text()'),
     )
     for field, query in list_queries:
         item[field] = response.xpath(query).getall()

     # Single-valued fields: keep the first match only.
     scalar_queries = (
         ('report_type', '//div[text()="报告类型:"]/../div[2]/text()'),
         ('plan_name', '//div[text()="计划名称:"]/../div[2]/text()'),
         ('plan_year', '//div[text()="立项批准年:"]/../div[2]/text()'),
         ('id_number', '//div[text()="馆藏号:"]/../div[2]/text()'),
     )
     for field, query in scalar_queries:
         item[field] = response.xpath(query).get()

     item['url'] = response.url
     yield item
Example #6
0
 def parse_detail(self, response):
     """Scrape a thesis detail page and yield a single ``ToolsItem``.

     Fields are located through the label ``<div>`` that precedes their
     value. ``key_words`` is stored as a list of every matching link
     text; all other fields keep only the first matching text node.
     """
     # (item field, XPath, keep every match?) in assignment order.
     rules = (
         ('name', '//head/title/text()', False),
         ('doi', '//div[text()="doi:"]/../div[2]/a/text()', False),
         ('key_words',
          '//div[text()="关键词:"]/../div[2]/a[@title]/text()', True),
         ('author', '//div[text()="作者:"]/../div[2]/a/text()', False),
         ('organ', '//div[text()="学位授予单位:"]/../div[2]/a/text()', False),
         ('degree', '//div[text()="授予学位:"]/../div[2]/text()', False),
         ('subject', '//div[text()="学科专业:"]/../div[2]/a/text()', False),
         ('instructor',
          '//div[text()="导师姓名:"]/../div[2]/a/text()', False),
         ('time', '//div[text()="学位年度:"]/../div[2]/text()', False),
     )
     item = ToolsItem()
     for field, query, keep_all in rules:
         matches = response.xpath(query)
         item[field] = matches.getall() if keep_all else matches.get()
     item['url'] = response.url
     yield item
 def parse_detail(self, response):
     """Scrape a standard's detail page and yield a single ``ToolsItem``.

     Fields set on the item:
         st_num:   first text node of the ``<h1>`` in the 560px-wide cell.
         e_name:   text of the second table row relative to the first
                   ``<b>`` element.
         c_name:   first ``<b>`` text node.
         beg_time: digits of the date text found at a fixed absolute
                   position in the page, or ``None`` when that node is
                   missing.
         url:      URL of the scraped page.

     A missing ``beg_time`` node used to pass ``None`` to ``re.sub`` and
     raise ``TypeError``, which discarded the whole item. It is now stored
     as ``None``, consistent with how the other fields behave when their
     XPath matches nothing.
     """
     item = ToolsItem()
     item['st_num'] = response.xpath('//td[@width="560"]/h1/text()').get()
     item['e_name'] = response.xpath(
         '//b/text()/../../../../tr[2]/td/text()').get()
     item['c_name'] = response.xpath('//b/text()').get()

     # The absolute path is brittle, so tolerate pages where it is absent.
     raw_begin = response.xpath(
         '/html/body/div[3]/div/div/div/div/div[3]/div[4]/text()').get()
     if raw_begin is None:
         item['beg_time'] = None
     else:
         # Keep only the digits of the date text.
         item['beg_time'] = re.sub(r'\D', "", raw_begin)

     item['url'] = response.url
     yield item
 def parse_detail(self, response):
     """Scrape a patent detail page and yield a single ``ToolsItem``.

     The title and abstract are read from ``#detail_leftcontent``; the
     remaining fields are read from fixed row positions of the table that
     contains ``<th>`` cells. Only the first matching text node is kept
     for each field.
     """
     # (item field, XPath locating its value), in assignment order.
     field_queries = (
         ('name', '//div[@id="detail_leftcontent"]/div/h1/text()'),
         ('apply_num', '//th/../../tr[2]/td/text()'),
         ('apply_date', '//th/../../tr[3]/td/text()'),
         ('public_num', '//th/../../tr[5]/td/text()'),
         ('public_date', '//th/../../tr[4]/td/text()'),
         ('apply_man', '//th/../../tr[8]/td/text()'),
         ('invent_man', '//th/../../tr[9]/td/text()'),
         ('abstract', '//*[@id="detail_leftcontent"]/div/div[2]/text()'),
     )
     item = ToolsItem()
     for field, query in field_queries:
         item[field] = response.xpath(query).get()
     item['url'] = response.url
     yield item
 def parse_detail(self, response):
     """Scrape a conference paper's detail page and yield a ``ToolsItem``.

     Fields are located through the label ``<div>`` that precedes their
     value. ``key_words`` and ``author`` are lists of every matching link
     text; the other fields keep only the first matching text node.
     ``conference_time`` is reduced to its digits.

     A page without a conference-time entry used to pass ``None`` to
     ``re.sub`` and raise ``TypeError``, which discarded the whole item.
     ``conference_time`` is now ``None`` in that case, consistent with
     how the other single-valued fields behave when nothing matches.
     """
     item = ToolsItem()
     item['name'] = response.xpath('//head/title/text()').get()
     item['key_words'] = response.xpath(
         '//div[text()="关键词:"]/../div[2]/a[@title]/text()').getall()
     item['author'] = response.xpath(
         '//div[text()="作者:"]/../div[2]/a/text()').getall()
     item['conference_name'] = response.xpath(
         '//div[text()="会议名称:"]/../div[2]/a[@onclick]/text()').get()

     raw_time = response.xpath(
         '//div[text()="会议时间:"]/../div[2]/text()').get()
     if raw_time is None:
         item['conference_time'] = None
     else:
         # Keep only the digits of the date text.
         item['conference_time'] = re.sub(r'\D', "", raw_time)

     item['conference_local'] = response.xpath(
         '//div[text()="会议地点:"]/../div[2]/text()').get()
     item['conference_organ'] = response.xpath(
         '//div[text()="主办单位:"]/../div[2]/a/text()').get()
     item['url'] = response.url
     yield item
 def parse_detail(self, response):
     """Scrape a journal article's detail page and yield a ``ToolsItem``.

     Fields are located through the label ``<div>`` that precedes their
     value. ``key_words``, ``author`` and ``author_organ`` are lists of
     every matching text node; the other fields keep only the first
     match.
     """
     # (item field, XPath, keep every match?) in assignment order.
     rules = (
         ('name', '//head/title/text()', False),
         ('key_words',
          '//div[text()="关键词:"]/../div[2]/a[@title]/text()', True),
         ('author', '//div[text()="作者:"]/../div[2]/a/text()', True),
         ('author_organ',
          '//div[text()="作者单位:"]/../div[2]/*/text()', True),
         ('periodical_name',
          '//div[text()="刊名:"]/../div[2]/a[@onclick]/text()', False),
         ('periodical_section',
          '//div[text()="所属期刊栏目:"]/../div[2]/a[@onclick]/text()', False),
         ('periodical_year_issue',
          '//div[text()="年,卷(期):"]/../div[2]/a/text()', False),
         ('fund_project',
          '//div[text()="基金项目:"]/../div[2]/a[@onclick]/text()', False),
         ('page_number', '//div[text()="页码:"]/../div[2]/text()', False),
     )
     item = ToolsItem()
     for field, query, keep_all in rules:
         matches = response.xpath(query)
         item[field] = matches.getall() if keep_all else matches.get()
     item['url'] = response.url
     yield item
 def parse_detail(self, response):
     """Scrape a technical-achievement detail page and yield a ``ToolsItem``.

     Besides the plain text fields, the completing organizations are
     recovered from the ``onclick`` attribute of their links, and are
     stored in ``organs`` from the last link to the first.
     """
     item = ToolsItem()
     item['name'] = response.xpath('//font/text()').get()
     item['project_num'] = response.xpath(
         '//div[text()="项目年度编号:"]/../div[2]/text()').get()
     item['public_year'] = response.xpath(
         '//div[text()="公布年份:"]/../div[2]/text()').get()

     # Each onclick value looks like
     #   wfAnalysis('tech_result','<organization>','unit_name_teachResult')
     # so dropping 26 characters from both ends leaves the organization
     # name.
     onclick_calls = response.xpath(
         '//div[text()="完成单位:"]/../div[2]/a/@onclick').getall()
     item['organs'] = [call[26:-26] for call in reversed(onclick_calls)]

     item['key_words'] = response.xpath(
         '//div[text()="关键词:"]/../div[2]/a[@title]/text()').getall()
     item['author'] = response.xpath(
         '//div[text()="完成人:"]/../div[2]/a[@id]/text()').getall()
     item['url'] = response.url
     yield item
Example #12
0
 def parse_detail(self, response):
     """Scrape a law/regulation detail page and yield a ``ToolsItem``.

     Fields are located through the label ``<div>`` that precedes their
     value, and only the first matching text node is kept.
     ``release_date`` and ``start_date`` are reduced to their digits.

     A page lacking either date used to pass ``None`` to ``re.sub`` and
     raise ``TypeError``, which discarded the whole item. A missing date
     is now stored as ``None``, consistent with how the other fields
     behave when nothing matches.
     """
     item = ToolsItem()
     item['name'] = response.xpath('//head/title/text()').get()
     item['document_number'] = response.xpath(
         '//div[text()="发文文号:"]/../div[2]/text()').get()
     item['type'] = response.xpath(
         '//div[text()="库别名称:"]/../div[2]/text()').get()
     item['issuing_department'] = response.xpath(
         '//div[text()="颁布部门:"]/../div[2]/text()').get()
     item['effective_level'] = response.xpath(
         '//div[text()="效力级别:"]/../div[2]/text()').get()
     item['timeliness'] = response.xpath(
         '//div[text()="时效性:"]/../div[2]/text()').get()

     # Date fields: keep only the digits, tolerating absent entries.
     date_queries = (
         ('release_date', '//div[text()="颁布日期:"]/../div[2]/text()'),
         ('start_date', '//div[text()="实施日期:"]/../div[2]/text()'),
     )
     for field, query in date_queries:
         raw_date = response.xpath(query).get()
         if raw_date is None:
             item[field] = None
         else:
             item[field] = re.sub(r'\D', "", raw_date)

     item['content_type'] = response.xpath(
         '//div[text()="内容分类:"]/../div[2]/text()').get()
     item['url'] = response.url
     yield item