Python ToolsItem Examples, task2.items.ToolsItem Python Examples

Example #1

0

Show file

 def parse_detail(self, response):
     """Scrape a detail page and yield one populated ToolsItem.

     Every field except ``url`` is the first text node matched by its
     XPath. The XPaths address fixed row/cell positions in the table
     that contains a ``td`` with a ``colspan`` attribute.
     """
     # (item field, XPath) pairs, kept in the order the fields are stored.
     cell_queries = (
         ('name', '//td[@colspan]/../../tr[1]/td[2]/text()'),
         ('suoyin', '//td[@colspan]/../../tr[2]/td[2]/text()'),
         ('wenhao', '//td[@colspan]/../../tr[4]/td[2]/text()'),
         ('start_time', '//td[@colspan]/../../tr[3]/td[4]/text()'),
     )
     record = ToolsItem()
     for field, query in cell_queries:
         record[field] = response.xpath(query).get()
     record['url'] = response.url
     yield record

Example #2

0

Show file

File: kexueyuan.py Project: Sliuyi/data-governance-based-on-tech_info

    def parse_kexueyuan_detail(self, response):
        """Scrape an Academy of Sciences academician résumé page.

        Yields one ToolsItem whose ``name`` is a fixed category label,
        ``title`` is the first ``<h1>`` text, and ``content`` is all text
        found under ``<p>`` elements with every whitespace run removed.
        """
        record = ToolsItem()
        record['name'] = "科学院院士"
        record['title'] = response.xpath('//h1/text()').get()

        # Concatenate every text node under <p>, then strip all whitespace.
        paragraph_text = "".join(response.xpath('//p//text()').getall())
        record['content'] = re.sub(r'\s+', '', paragraph_text)

        record['url'] = response.url
        yield record

Example #3

0

Show file

File: wf_standard_wrjs.py Project: Sliuyi/data-governance-based-on-tech_info

 def parse_detail(self, response):
     """Scrape a standard's detail page and yield one ToolsItem.

     The title comes from the ``detail_leftcontent`` heading. The other
     fields are located by their label text (issuing organization,
     publication date, implementation date, standard number, country)
     and take the first ``td`` text two levels above the label element.
     """
     # (item field, XPath) pairs, kept in the order the fields are stored.
     labelled_queries = (
         ('name', '//div[@id="detail_leftcontent"]/div/h1/text()'),
         ('organization', '//t[text()="发布单位"]/../../td/text()'),
         ('public_date', '//t[text()="发布日期"]/../../td/text()'),
         ('used_date', '//t[text()="实施日期"]/../../td/text()'),
         ('standard_num', '//t[text()="标准编号"]/../../td/text()'),
         ('country', '//t[text()="国别"]/../../td/text()'),
     )
     record = ToolsItem()
     for field, query in labelled_queries:
         record[field] = response.xpath(query).get()
     record['url'] = response.url
     yield record

Example #4

0

Show file

    def parse_gongchengyuan_detail(self, response):
        """Scrape an Academy of Engineering academician résumé page.

        Yields one ToolsItem whose ``name`` is a fixed category label,
        ``title`` is the text of the ``right_md_name`` div, and
        ``content`` is the joined paragraph text of the ``intro`` div
        with U+00A0 and U+2002 space characters removed.
        """
        record = ToolsItem()
        record['name'] = "工程院院士"
        record['title'] = response.xpath(
            '//div[@class="right_md_name"]/text()').get()

        intro_paragraphs = response.xpath(
            '//div[@class="intro"]/p/text()').getall()
        intro_text = "".join(intro_paragraphs)
        # Drop the no-break and en spaces left in the extracted text.
        record['content'] = re.sub(r'\xa0|\u2002', "", intro_text)

        record['url'] = response.url
        yield record

Example #5

0

Show file

File: wf_report_xny.py Project: Sliuyi/data-governance-based-on-tech_info

 def parse_detail(self, response):
     """Scrape a report detail page and yield one ToolsItem.

     Fields are located by their label ``div`` (keywords, author, author
     affiliation, report type, plan name, approval year, collection
     number) and read from the second ``div`` beside it. Keywords,
     authors and affiliations are stored as lists; the rest are the
     first matching text node.
     """

     def first(query):
         # First match only.
         return response.xpath(query).get()

     def every(query):
         # All matches, as a list.
         return response.xpath(query).getall()

     record = ToolsItem()
     record['name'] = first('//head/title/text()')
     record['key_words'] = every('//div[text()="关键词："]/../div[2]/a[@title]/text()')
     record['author'] = every('//div[text()="作者："]/../div[2]/a/text()')
     record['author_organ'] = every('//div[text()="作者单位："]/../div[2]/a/text()')
     record['report_type'] = first('//div[text()="报告类型："]/../div[2]/text()')
     record['plan_name'] = first('//div[text()="计划名称："]/../div[2]/text()')
     record['plan_year'] = first('//div[text()="立项批准年："]/../div[2]/text()')
     record['id_number'] = first('//div[text()="馆藏号："]/../div[2]/text()')
     record['url'] = response.url
     yield record

Example #6

0

Show file

 def parse_detail(self, response):
     """Scrape a thesis detail page and yield one ToolsItem.

     Fields are located by their label ``div`` (doi, keywords, author,
     degree-granting institution, degree, discipline, supervisor, degree
     year) and read from the second ``div`` beside it. Only
     ``key_words`` is stored as a list; every other field is the first
     matching text node.
     """
     # (item field, XPath, take_all): take_all picks .getall() over .get().
     field_specs = (
         ('name', '//head/title/text()', False),
         ('doi', '//div[text()="doi："]/../div[2]/a/text()', False),
         ('key_words', '//div[text()="关键词："]/../div[2]/a[@title]/text()', True),
         ('author', '//div[text()="作者："]/../div[2]/a/text()', False),
         ('organ', '//div[text()="学位授予单位："]/../div[2]/a/text()', False),
         ('degree', '//div[text()="授予学位："]/../div[2]/text()', False),
         ('subject', '//div[text()="学科专业："]/../div[2]/a/text()', False),
         ('instructor', '//div[text()="导师姓名："]/../div[2]/a/text()', False),
         ('time', '//div[text()="学位年度："]/../div[2]/text()', False),
     )
     record = ToolsItem()
     for field, query, take_all in field_specs:
         matches = response.xpath(query)
         record[field] = matches.getall() if take_all else matches.get()
     record['url'] = response.url
     yield record

Example #7

0

Show file

File: standard.py Project: Sliuyi/data-governance-based-on-tech_info

 def parse_detail(self, response):
     """Scrape a standard's detail page and yield one ToolsItem.

     Fields stored:
       * ``st_num``   - first ``<h1>`` text inside ``td[@width="560"]``.
       * ``e_name``   - first ``td`` text of row 2, three levels above
                        the first ``<b>`` element.
       * ``c_name``   - first ``<b>`` text.
       * ``beg_time`` - digits only of the text at a fixed absolute
                        path, or ``None`` when that path matches nothing.
       * ``url``      - the response URL.
     """
     item = ToolsItem()
     item['st_num'] = response.xpath('//td[@width="560"]/h1/text()').get()
     item['e_name'] = response.xpath(
         '//b/text()/../../../../tr[2]/td/text()').get()
     item['c_name'] = response.xpath('//b/text()').get()

     beg_time_text = response.xpath(
         '/html/body/div[3]/div/div/div/div/div[3]/div[4]/text()').get()
     # .get() yields None when nothing matches (per Scrapy's selector
     # API); re.sub() would raise TypeError on None and lose the whole
     # item, so a missing value is stored as None like the other fields.
     if beg_time_text is None:
         item['beg_time'] = None
     else:
         item['beg_time'] = re.sub(r'\D', "", beg_time_text)

     item['url'] = response.url
     yield item

Example #8

0

Show file

File: wf_patents_xny.py Project: Sliuyi/data-governance-based-on-tech_info

 def parse_detail(self, response):
     """Scrape a patent detail page and yield one ToolsItem.

     The title and abstract come from the ``detail_leftcontent``
     container. The remaining fields are read from fixed row positions
     of the table that contains ``th`` cells. Every field is the first
     matching text node.
     """
     # (item field, XPath) pairs, kept in the order the fields are stored.
     field_queries = (
         ('name', '//div[@id="detail_leftcontent"]/div/h1/text()'),
         ('apply_num', '//th/../../tr[2]/td/text()'),
         ('apply_date', '//th/../../tr[3]/td/text()'),
         ('public_num', '//th/../../tr[5]/td/text()'),
         ('public_date', '//th/../../tr[4]/td/text()'),
         ('apply_man', '//th/../../tr[8]/td/text()'),
         ('invent_man', '//th/../../tr[9]/td/text()'),
         ('abstract', '//*[@id="detail_leftcontent"]/div/div[2]/text()'),
     )
     record = ToolsItem()
     for field, query in field_queries:
         record[field] = response.xpath(query).get()
     record['url'] = response.url
     yield record

Example #9

0

Show file

File: wf_conference_wrjs.py Project: Sliuyi/data-governance-based-on-tech_info

 def parse_detail(self, response):
     """Scrape a conference-paper detail page and yield one ToolsItem.

     Fields are located by their label ``div`` (keywords, author,
     conference name, conference time, venue, organizer) and read from
     the second ``div`` beside it. ``key_words`` and ``author`` are
     lists; the rest are the first matching text node.
     ``conference_time`` is reduced to its digits, or ``None`` when the
     page has no such field.
     """
     item = ToolsItem()
     item['name'] = response.xpath('//head/title/text()').get()
     item['key_words'] = response.xpath(
         '//div[text()="关键词："]/../div[2]/a[@title]/text()').getall()
     item['author'] = response.xpath(
         '//div[text()="作者："]/../div[2]/a/text()').getall()
     item['conference_name'] = response.xpath(
         '//div[text()="会议名称："]/../div[2]/a[@onclick]/text()').get()

     conference_time_text = response.xpath(
         '//div[text()="会议时间："]/../div[2]/text()').get()
     # .get() yields None when nothing matches (per Scrapy's selector
     # API); re.sub() would raise TypeError on None and lose the whole
     # item, so a missing value is stored as None like the other fields.
     if conference_time_text is None:
         item['conference_time'] = None
     else:
         item['conference_time'] = re.sub(r'\D', "", conference_time_text)

     item['conference_local'] = response.xpath(
         '//div[text()="会议地点："]/../div[2]/text()').get()
     item['conference_organ'] = response.xpath(
         '//div[text()="主办单位："]/../div[2]/a/text()').get()
     item['url'] = response.url
     yield item

Example #10

0

Show file

File: wf_periodical_wrjs.py Project: Sliuyi/data-governance-based-on-tech_info

 def parse_detail(self, response):
     """Scrape a journal-article detail page and yield one ToolsItem.

     Fields are located by their label ``div`` (keywords, author, author
     affiliation, journal name, journal section, year/volume/issue, fund
     project, page numbers) and read from the second ``div`` beside it.
     ``key_words``, ``author`` and ``author_organ`` are stored as lists;
     the rest are the first matching text node.
     """
     # (item field, extractor method name, XPath), in storage order.
     field_specs = (
         ('name', 'get', '//head/title/text()'),
         ('key_words', 'getall',
          '//div[text()="关键词："]/../div[2]/a[@title]/text()'),
         ('author', 'getall',
          '//div[text()="作者："]/../div[2]/a/text()'),
         ('author_organ', 'getall',
          '//div[text()="作者单位："]/../div[2]/*/text()'),
         ('periodical_name', 'get',
          '//div[text()="刊名："]/../div[2]/a[@onclick]/text()'),
         ('periodical_section', 'get',
          '//div[text()="所属期刊栏目："]/../div[2]/a[@onclick]/text()'),
         ('periodical_year_issue', 'get',
          '//div[text()="年，卷(期)："]/../div[2]/a/text()'),
         ('fund_project', 'get',
          '//div[text()="基金项目："]/../div[2]/a[@onclick]/text()'),
         ('page_number', 'get',
          '//div[text()="页码："]/../div[2]/text()'),
     )
     record = ToolsItem()
     for field, extractor, query in field_specs:
         # Call .get() or .getall() on the selection, as the spec names.
         record[field] = getattr(response.xpath(query), extractor)()
     record['url'] = response.url
     yield record

Example #11

0

Show file

File: wf_achievement_wrjs.py Project: Sliuyi/data-governance-based-on-tech_info

 def parse_detail(self, response):
     """Scrape an achievement detail page and yield one ToolsItem.

     Fields are located by their label ``div`` (project annual number,
     publication year, completing organizations, keywords, contributors)
     and read from the second ``div`` beside it. Organization names are
     cut out of each link's ``onclick`` attribute, not its text.
     """
     record = ToolsItem()
     record['name'] = response.xpath('//font/text()').get()
     record['project_num'] = response.xpath(
         '//div[text()="项目年度编号："]/../div[2]/text()').get()
     record['public_year'] = response.xpath(
         '//div[text()="公布年份："]/../div[2]/text()').get()

     # Each onclick value looks like
     #   wfAnalysis('tech_result','<organization>','unit_name_teachResult')
     # so dropping 26 characters from each end leaves the organization
     # name. Handlers are consumed last-to-first, which stores the names
     # in reverse page order.
     onclick_handlers = response.xpath(
         '//div[text()="完成单位："]/../div[2]/a/@onclick').getall()
     record['organs'] = [
         handler[26:-26] for handler in reversed(onclick_handlers)]

     record['key_words'] = response.xpath(
         '//div[text()="关键词："]/../div[2]/a[@title]/text()').getall()
     record['author'] = response.xpath(
         '//div[text()="完成人："]/../div[2]/a[@id]/text()').getall()
     record['url'] = response.url
     yield record

Example #12

0

Show file

 def parse_detail(self, response):
     """Scrape a regulation detail page and yield one ToolsItem.

     Fields are located by their label ``div`` (document number,
     database name, issuing department, level of authority, timeliness,
     promulgation date, effective date, content category) and read from
     the second ``div`` beside it. The two date fields are reduced to
     their digits, or ``None`` when the page has no such field.
     """

     def digits_or_none(query):
         """Return the digits of the first match for *query*, or None."""
         text = response.xpath(query).get()
         # .get() yields None when nothing matches (per Scrapy's selector
         # API); re.sub() would raise TypeError on None and lose the
         # whole item, so a missing value is passed through as None.
         return None if text is None else re.sub(r'\D', "", text)

     item = ToolsItem()
     item['name'] = response.xpath('//head/title/text()').get()
     item['document_number'] = response.xpath(
         '//div[text()="发文文号："]/../div[2]/text()').get()
     item['type'] = response.xpath(
         '//div[text()="库别名称："]/../div[2]/text()').get()
     item['issuing_department'] = response.xpath(
         '//div[text()="颁布部门："]/../div[2]/text()').get()
     item['effective_level'] = response.xpath(
         '//div[text()="效力级别："]/../div[2]/text()').get()
     item['timeliness'] = response.xpath(
         '//div[text()="时效性："]/../div[2]/text()').get()
     item['release_date'] = digits_or_none(
         '//div[text()="颁布日期："]/../div[2]/text()')
     item['start_date'] = digits_or_none(
         '//div[text()="实施日期："]/../div[2]/text()')
     item['content_type'] = response.xpath(
         '//div[text()="内容分类："]/../div[2]/text()').get()
     item['url'] = response.url
     yield item