def parse_detail(self, response):
    """Scrape a detail page into a single ``ToolsItem``.

    Every value is taken from a cell of the table that contains a
    ``td[@colspan]`` element; the row/column position selects the field.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem`` with ``name``, ``suoyin``, ``wenhao``,
        ``start_time`` and ``url`` set.
    """
    # (item field, XPath) pairs, listed in the order the fields are filled.
    cell_queries = (
        ('name', '//td[@colspan]/../../tr[1]/td[2]/text()'),
        ('suoyin', '//td[@colspan]/../../tr[2]/td[2]/text()'),
        ('wenhao', '//td[@colspan]/../../tr[4]/td[2]/text()'),
        ('start_time', '//td[@colspan]/../../tr[3]/td[4]/text()'),
    )

    item = ToolsItem()
    for field, query in cell_queries:
        item[field] = response.xpath(query).get()
    item['url'] = response.url
    yield item
def parse_kexueyuan_detail(self, response):
    """Scrape the résumé page of an Academy of Sciences academician.

    Args:
        response: The downloaded résumé page.

    Yields:
        One ``ToolsItem`` whose ``name`` is a fixed category label,
        ``title`` is the ``<h1>`` text, ``content`` is the text found
        under all ``<p>`` elements with whitespace removed, and ``url``
        is the page URL.
    """
    # Concatenate every text node under any <p>, then strip all whitespace.
    paragraph_text = "".join(response.xpath('//p//text()').getall())
    heading = response.xpath('//h1/text()').get()

    item = ToolsItem()
    item['name'] = "科学院院士"
    item['title'] = heading
    item['content'] = re.sub(r'\s+', '', paragraph_text)
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a detail page into a single ``ToolsItem``.

    The title comes from the ``<h1>`` inside ``#detail_leftcontent``; the
    remaining fields are looked up by the text of their label element.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem`` with ``name``, ``organization``, ``public_date``,
        ``used_date``, ``standard_num``, ``country`` and ``url`` set.
    """
    # (item field, XPath) pairs, listed in the order the fields are filled.
    field_queries = (
        ('name', '//div[@id="detail_leftcontent"]/div/h1/text()'),
        ('organization', '//t[text()="发布单位"]/../../td/text()'),
        ('public_date', '//t[text()="发布日期"]/../../td/text()'),
        ('used_date', '//t[text()="实施日期"]/../../td/text()'),
        ('standard_num', '//t[text()="标准编号"]/../../td/text()'),
        ('country', '//t[text()="国别"]/../../td/text()'),
    )

    item = ToolsItem()
    for field, query in field_queries:
        item[field] = response.xpath(query).get()
    item['url'] = response.url
    yield item
def parse_gongchengyuan_detail(self, response):
    """Scrape the résumé page of an Academy of Engineering academician.

    Args:
        response: The downloaded résumé page.

    Yields:
        One ``ToolsItem`` whose ``name`` is a fixed category label,
        ``title`` is the text of ``div.right_md_name``, ``content`` is the
        joined paragraph text of ``div.intro`` with non-breaking and en
        spaces removed, and ``url`` is the page URL.
    """
    intro_paragraphs = response.xpath('//div[@class="intro"]/p/text()').getall()
    # Only U+00A0 and U+2002 are stripped; other whitespace is kept as-is.
    cleaned_intro = re.sub(r'\xa0|\u2002', "", "".join(intro_paragraphs))
    display_name = response.xpath('//div[@class="right_md_name"]/text()').get()

    item = ToolsItem()
    item['name'] = "工程院院士"
    item['title'] = display_name
    item['content'] = cleaned_intro
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a report detail page into a single ``ToolsItem``.

    Apart from the page title, each field is located through the label
    ``<div>`` that precedes its value.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem``. ``key_words``, ``author`` and ``author_organ``
        are lists of every match; the other fields hold a single match.
    """
    # (item field, XPath, collect every match?) in assignment order.
    extraction_plan = (
        ('name', '//head/title/text()', False),
        ('key_words', '//div[text()="关键词:"]/../div[2]/a[@title]/text()', True),
        ('author', '//div[text()="作者:"]/../div[2]/a/text()', True),
        ('author_organ', '//div[text()="作者单位:"]/../div[2]/a/text()', True),
        ('report_type', '//div[text()="报告类型:"]/../div[2]/text()', False),
        ('plan_name', '//div[text()="计划名称:"]/../div[2]/text()', False),
        ('plan_year', '//div[text()="立项批准年:"]/../div[2]/text()', False),
        ('id_number', '//div[text()="馆藏号:"]/../div[2]/text()', False),
    )

    item = ToolsItem()
    for field, query, collect_all in extraction_plan:
        matches = response.xpath(query)
        item[field] = matches.getall() if collect_all else matches.get()
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a thesis detail page into a single ``ToolsItem``.

    Apart from the page title, each field is located through the label
    ``<div>`` that precedes its value.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem``. ``key_words`` is a list of every match; all
        other fields hold a single match.
    """

    def first_match(query):
        # Single-valued field.
        return response.xpath(query).get()

    def all_matches(query):
        # Multi-valued field.
        return response.xpath(query).getall()

    item = ToolsItem()
    item['name'] = first_match('//head/title/text()')
    item['doi'] = first_match('//div[text()="doi:"]/../div[2]/a/text()')
    item['key_words'] = all_matches(
        '//div[text()="关键词:"]/../div[2]/a[@title]/text()')
    item['author'] = first_match('//div[text()="作者:"]/../div[2]/a/text()')
    item['organ'] = first_match(
        '//div[text()="学位授予单位:"]/../div[2]/a/text()')
    item['degree'] = first_match('//div[text()="授予学位:"]/../div[2]/text()')
    item['subject'] = first_match(
        '//div[text()="学科专业:"]/../div[2]/a/text()')
    item['instructor'] = first_match(
        '//div[text()="导师姓名:"]/../div[2]/a/text()')
    item['time'] = first_match('//div[text()="学位年度:"]/../div[2]/text()')
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a detail page into a single ``ToolsItem``.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem`` with ``st_num``, ``e_name``, ``c_name``,
        ``beg_time`` and ``url`` set. ``beg_time`` keeps only the digits
        of the matched text, or is ``None`` when nothing matched.
    """
    item = ToolsItem()
    item['st_num'] = response.xpath('//td[@width="560"]/h1/text()').get()
    item['e_name'] = response.xpath(
        '//b/text()/../../../../tr[2]/td/text()').get()
    item['c_name'] = response.xpath('//b/text()').get()

    # The absolute path below is tied to the exact page layout, so it can
    # match nothing. ``.get()`` then returns None, and passing None to
    # ``re.sub`` raises TypeError, which would lose the whole item. Keep
    # the field as None instead, like every other unmatched field here.
    raw_beg_time = response.xpath(
        '/html/body/div[3]/div/div/div/div/div[3]/div[4]/text()').get()
    if raw_beg_time is None:
        item['beg_time'] = None
    else:
        item['beg_time'] = re.sub(r'\D', "", raw_beg_time)

    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a detail page into a single ``ToolsItem``.

    The title and abstract come from ``#detail_leftcontent``; the other
    fields are read by row position from the table that contains ``<th>``
    cells.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem`` with ``name``, ``apply_num``, ``apply_date``,
        ``public_num``, ``public_date``, ``apply_man``, ``invent_man``,
        ``abstract`` and ``url`` set.
    """
    # (item field, XPath) pairs, listed in the order the fields are filled.
    # Note that public_num (row 5) is read before public_date (row 4).
    field_queries = (
        ('name', '//div[@id="detail_leftcontent"]/div/h1/text()'),
        ('apply_num', '//th/../../tr[2]/td/text()'),
        ('apply_date', '//th/../../tr[3]/td/text()'),
        ('public_num', '//th/../../tr[5]/td/text()'),
        ('public_date', '//th/../../tr[4]/td/text()'),
        ('apply_man', '//th/../../tr[8]/td/text()'),
        ('invent_man', '//th/../../tr[9]/td/text()'),
        ('abstract', '//*[@id="detail_leftcontent"]/div/div[2]/text()'),
    )

    item = ToolsItem()
    for field, query in field_queries:
        item[field] = response.xpath(query).get()
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a conference-paper detail page into a single ``ToolsItem``.

    Apart from the page title, each field is located through the label
    ``<div>`` that precedes its value.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem``. ``key_words`` and ``author`` are lists of every
        match. ``conference_time`` keeps only the digits of the matched
        text, or is ``None`` when the page has no such field. The other
        fields hold a single match.
    """
    item = ToolsItem()
    item['name'] = response.xpath('//head/title/text()').get()
    item['key_words'] = response.xpath(
        '//div[text()="关键词:"]/../div[2]/a[@title]/text()').getall()
    item['author'] = response.xpath(
        '//div[text()="作者:"]/../div[2]/a/text()').getall()
    item['conference_name'] = response.xpath(
        '//div[text()="会议名称:"]/../div[2]/a[@onclick]/text()').get()

    # ``.get()`` returns None when the label is absent, and ``re.sub``
    # raises TypeError on None, which would lose the whole item. Leave
    # the field as None instead, like the other unmatched fields.
    raw_time = response.xpath(
        '//div[text()="会议时间:"]/../div[2]/text()').get()
    if raw_time is None:
        item['conference_time'] = None
    else:
        item['conference_time'] = re.sub(r'\D', "", raw_time)

    item['conference_local'] = response.xpath(
        '//div[text()="会议地点:"]/../div[2]/text()').get()
    item['conference_organ'] = response.xpath(
        '//div[text()="主办单位:"]/../div[2]/a/text()').get()
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a periodical-article detail page into a single ``ToolsItem``.

    Apart from the page title, each field is located through the label
    ``<div>`` that precedes its value.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem``. ``key_words``, ``author`` and ``author_organ``
        are lists of every match; the other fields hold a single match.
    """

    def first_match(query):
        # Single-valued field.
        return response.xpath(query).get()

    def all_matches(query):
        # Multi-valued field.
        return response.xpath(query).getall()

    item = ToolsItem()
    item['name'] = first_match('//head/title/text()')
    item['key_words'] = all_matches(
        '//div[text()="关键词:"]/../div[2]/a[@title]/text()')
    item['author'] = all_matches('//div[text()="作者:"]/../div[2]/a/text()')
    # Author organisations may sit in any child element, hence the ``*``.
    item['author_organ'] = all_matches(
        '//div[text()="作者单位:"]/../div[2]/*/text()')
    item['periodical_name'] = first_match(
        '//div[text()="刊名:"]/../div[2]/a[@onclick]/text()')
    item['periodical_section'] = first_match(
        '//div[text()="所属期刊栏目:"]/../div[2]/a[@onclick]/text()')
    item['periodical_year_issue'] = first_match(
        '//div[text()="年,卷(期):"]/../div[2]/a/text()')
    item['fund_project'] = first_match(
        '//div[text()="基金项目:"]/../div[2]/a[@onclick]/text()')
    item['page_number'] = first_match(
        '//div[text()="页码:"]/../div[2]/text()')
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a technical-result detail page into a single ``ToolsItem``.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem``. ``organs``, ``key_words`` and ``author`` are
        lists; the other fields hold a single match. ``organs`` is in
        last-to-first order relative to the links on the page.
    """
    item = ToolsItem()
    item['name'] = response.xpath('//font/text()').get()
    item['project_num'] = response.xpath(
        '//div[text()="项目年度编号:"]/../div[2]/text()').get()
    item['public_year'] = response.xpath(
        '//div[text()="公布年份:"]/../div[2]/text()').get()

    # Each onclick attribute looks like
    #   wfAnalysis('tech_result','<organisation>','unit_name_teachResult')
    # so dropping 26 characters from each end leaves the organisation name.
    onclick_calls = response.xpath(
        '//div[text()="完成单位:"]/../div[2]/a/@onclick').getall()
    # Walk the matches back to front, so the names end up in reverse
    # page order.
    item['organs'] = [call[26:-26] for call in reversed(onclick_calls)]

    item['key_words'] = response.xpath(
        '//div[text()="关键词:"]/../div[2]/a[@title]/text()').getall()
    item['author'] = response.xpath(
        '//div[text()="完成人:"]/../div[2]/a[@id]/text()').getall()
    item['url'] = response.url
    yield item
def parse_detail(self, response):
    """Scrape a law/regulation detail page into a single ``ToolsItem``.

    Apart from the page title, each field is located through the label
    ``<div>`` that precedes its value.

    Args:
        response: The downloaded detail page.

    Yields:
        One ``ToolsItem``. ``release_date`` and ``start_date`` keep only
        the digits of the matched text, or are ``None`` when the page has
        no such field. The other fields hold a single match.
    """

    def digits_or_none(query):
        # ``.get()`` returns None when the label is absent, and ``re.sub``
        # raises TypeError on None, which would lose the whole item.
        # Return None instead, like the other unmatched fields.
        raw = response.xpath(query).get()
        if raw is None:
            return None
        return re.sub(r'\D', "", raw)

    item = ToolsItem()
    item['name'] = response.xpath('//head/title/text()').get()
    item['document_number'] = response.xpath(
        '//div[text()="发文文号:"]/../div[2]/text()').get()
    item['type'] = response.xpath(
        '//div[text()="库别名称:"]/../div[2]/text()').get()
    item['issuing_department'] = response.xpath(
        '//div[text()="颁布部门:"]/../div[2]/text()').get()
    item['effective_level'] = response.xpath(
        '//div[text()="效力级别:"]/../div[2]/text()').get()
    item['timeliness'] = response.xpath(
        '//div[text()="时效性:"]/../div[2]/text()').get()
    item['release_date'] = digits_or_none(
        '//div[text()="颁布日期:"]/../div[2]/text()')
    item['start_date'] = digits_or_none(
        '//div[text()="实施日期:"]/../div[2]/text()')
    item['content_type'] = response.xpath(
        '//div[text()="内容分类:"]/../div[2]/text()').get()
    item['url'] = response.url
    yield item