def get_info(self, rc_info_li, com_id, page):
    """Parse the recruitment rows of one page and insert each into ``com_recruit``.

    Args:
        rc_info_li: Iterable of table-row nodes; each must support ``xpath``.
        com_id: Company ID written into every inserted row.
        page: 1-based page number. The running record index starts at
            ``(page - 1) * 10``.

    Returns:
        The running record index after the last row, i.e.
        ``(page - 1) * 10`` plus the number of rows processed.
    """
    count = (page - 1) * 10

    def first(node, path):
        # First XPath match of ``path``; the [0] lookup fails when nothing matches.
        return node.xpath(path)[0]

    # ``count`` keeps its start value when there are no rows at all.
    for count, info in enumerate(rc_info_li, count + 1):
        job_id = first(info, 'td[3]/a/@href').split('jobdetail_')[1].strip()
        rc_num = first(info, 'td[1]/text()').strip()
        pub_date = first(info, 'td[2]/text()').strip()
        rc_job = first(info, 'td[3]/a/text()').strip()
        salary = first(info, 'td[4]/text()').strip()
        education = first(info, 'td[5]/text()').strip()
        we = first(info, 'td[6]/text()').strip()
        city = first(info, 'td[7]/text()').strip()

        print('\n{0}--总第{1}条----第{2}页----{0}\n'.format(
            '-' * 9, count, page))
        create_time = tm().get_localtime()  # current time
        print(f'当前时间:{create_time}')
        print(
            f'公司ID:{com_id}\n序号:{rc_num}\n岗位ID:{job_id}\n岗位名称:{rc_job}\n发布时间:{pub_date}\n'
            f'薪资:{salary}\n学历:{education}\n工作经历:{we}\n城市:{city}\n')

        ins = f"""
        INSERT INTO
        com_recruit
        (com_id,job_id,rc_num,pub_date,rc_job,
        salary,education,we,city,create_time)
        VALUES
        ("{com_id}","{job_id}","{rc_num}","{pub_date}","{rc_job}",
        "{salary}","{education}","{we}","{city}","{create_time}");
        """
        db().inssts(ins)
    return count
def parse_info(self, com_id, tree):
    """Parse the main-member table of a company page and store every row.

    Args:
        com_id: Company ID written into every inserted row.
        tree: Parsed page supporting ``xpath``, or ``None`` when there is
            nothing to parse (a notice is printed and nothing is stored).

    Each row of the ``Mainmember`` section is printed and inserted into
    ``com_main_member`` through ``self.db.inssts``.
    """
    if tree is None:
        print('无相关数据!')
        return

    rows = tree.xpath(
        '//section[@id="Mainmember"]/table[contains(@class,"ntable ntable-odd")]/tr[position()>1]'
    )
    for count, member_info in enumerate(rows, 1):
        member_num = member_info.xpath('td[1]/text()')[0].strip()
        member_name = member_info.xpath(
            'td[2]//*[@class="seo font-14"]/text()')[0].strip()
        member_post = member_info.xpath('td[3]/text()')[0].strip()
        create_time = tm().get_localtime()  # current time

        print('\n{0}--总第{1}条----{0}\n'.format('-' * 9, count))
        print(f'当前时间:{create_time}')
        print(
            f'公司ID:{com_id}\n序号:{member_num}\n姓名:{member_name}\n职务:{member_post}\n'
        )

        ins = f"""
        INSERT INTO
        `com_main_member`
        (com_id,member_num,member_name,member_post,create_time)
        VALUES
        ("{com_id}","{member_num}","{member_name}","{member_post}","{create_time}");
        """
        self.db.inssts(ins)
def __init__(self):
    """Create the helper objects used by this class and set the site root URL."""
    # The helpers are constructed left to right: db, dk, gh, tm, gm.
    self.db, self.dk, self.gh, self.tm, self.gm = db(), dk(), gh(), tm(), gm()
    self.index_url = 'https://www.qcc.com'
def cc_judge(self):
    """Pull companies until one shows a non-zero "[信用中国]" record count.

    Candidates come from ``AdmLicenseCc().adm_license_judge()``. For each
    one the company page is downloaded, the number next to the "[信用中国]"
    caption is read (0 when it cannot be read) and passed to
    ``cd.upd_status``. The loop ends at the first non-zero count.

    Side effects:
        Rebinds the module-level names ``com_id`` and ``com_name``.

    Returns:
        Tuple ``(com_id, com_name, count_cc)``.
    """
    global com_id, com_name
    alb = AdmLicenseCc()
    count_cc = 0
    count = 0
    while count_cc == 0:
        result = alb.adm_license_judge()
        com_id, com_name = result[0], result[1]
        # The result is not used below; the call is kept as it was.
        key = dk().search_key(com_name)
        if com_id is None:
            continue

        count += 1
        com_url = f'https://www.qcc.com/firm_{com_id}.html'
        hds = gh().header()
        time.sleep(random.randint(3, 5))
        res = requests.get(com_url, headers=hds).text

        # Block pages: report, wait for the operator, then try the next company.
        if '<script>window.location.href' in res:
            print('访问频繁,需验证!{cc_judge}')
            input('暂停')
            continue
        if '<script>location.href="/user_login"</script>' in res:
            print('Cookie失效,需更换!{cc_judge}')
            input('程序暂停运行!')
            continue
        if '您的账号访问超频,请稍后访问或联系客服人员' in res:
            print('账号访问超频,请更换账号!{cc_judge}')
            input('程序暂停运行!')
            continue

        tree = etree.HTML(res)
        try:
            count_cc = int(
                tree.xpath(
                    '//div[@class="tcaption"]/h3[contains(text(),"[信用中国]")]/following-sibling::span[1]/text()'
                )[0])
        except:
            # Any failure to read the number counts as "no records".
            count_cc = 0

        print(tm().get_localtime())  # current time
        if count_cc == 0:
            print(f'计数器:{count}\n公司ID:{com_id}\n行政许可信息[工商局]条数:无')
        else:
            print(
                f'计数器:{count}\n公司ID:{com_id}\n行政许可信息[工商局]条数:{count_cc}'
            )

        status_column = 'status_credit_adm_license_cc'  # table column name
        count_column = 'count_credit_adm_license_cc'  # table column name
        # NOTE(review): `cd` is not bound in this function; presumably a
        # module-level object — confirm.
        cd.upd_status(com_id, status_column, count_column, count_cc)
    return com_id, com_name, count_cc
def parse_info(self,tree,com_id,com_name,page,sh_page_count):
    """Parse one page of the stockholder table and store every row.

    Args:
        tree: Parsed page supporting ``xpath``, or ``None`` when there is
            nothing to parse (a notice is printed and nothing is stored).
        com_id: Company ID written into every inserted row.
        com_name: Company name, used for console output only.
        page: 1-based page number; the running record index starts at
            ``(page - 1) * 50``.
        sh_page_count: Total page count, used for console output only.

    Columns 1-5 are mandatory. Columns 6-8 fall back to ``'--'`` when they
    cannot be read; column 8 is also tried through a nested ``<a>`` element.
    """
    count = (page - 1) * 50
    if tree is None:
        print('无相关数据!\n')
        return

    def optional(row, *paths):
        # Try each path in turn; '--' when none yields a value.
        for path in paths:
            try:
                return row.xpath(path)[0].strip()
            except:
                continue
        return '--'

    rows = tree.xpath('//table[contains(@class,"ntable ntable-odd npth")]/tr[position()>1]')
    for count, stockholder_info in enumerate(rows, count + 1):
        stockholder_num = stockholder_info.xpath('td[1]/text()')[0].strip()
        stockholder_name = stockholder_info.xpath('td[2]//*[@class="seo font-14"]/text()')[0].strip()
        stockholder_rate = stockholder_info.xpath('td[3]/text()')[0].strip()
        subscribed_capital_amount = stockholder_info.xpath('td[4]/text()')[0].strip()
        subscribed_capital_date = stockholder_info.xpath('td[5]/text()')[0].strip()
        contributed_capital_amount = optional(stockholder_info, 'td[6]/text()')
        contributed_capital_date = optional(stockholder_info, 'td[7]/text()')
        relation_product = optional(stockholder_info, 'td[8]/text()', 'td[8]/a/text()')

        create_time = tm().get_localtime()  # current time
        print('\n{0}--总第{1}条----第{2}/{3}页----{0}\n'.format('-' * 9, count, page,sh_page_count))
        print(f'公司ID:{com_id}\n公司名称:{com_name}')
        print(f'序号:{stockholder_num}\n股东:{stockholder_name}\n持股比例:{stockholder_rate}\n认缴出资额:{subscribed_capital_amount}\n认缴出资日期:{subscribed_capital_date}\n'
              f'实缴出资额:{contributed_capital_amount}\n实缴出资日期:{contributed_capital_date}\n关联产品/机构:{relation_product}\n')

        ins = f"""
        INSERT INTO
        `com_stockholder`
        (com_id,stockholder_num,stockholder_name,stockholder_rate,subscribed_capital_amount,
        subscribed_capital_date,contributed_capital_amount,contributed_capital_date,relation_product,create_time)
        VALUES
        ("{com_id}","{stockholder_num}","{stockholder_name}","{stockholder_rate}","{subscribed_capital_amount}",
        "{subscribed_capital_date}","{contributed_capital_amount}","{contributed_capital_date}","{relation_product}","{create_time}");
        """
        db().inssts(ins)
def adm_license_judge(self):
    """Pull companies until one shows administrative-licence records.

    Company IDs come from ``AdmLicense().get_com_id()``. For each one the
    company page is downloaded and the licence count is read from the
    navigation bar; ``-1`` is used when it cannot be read. The count is
    passed to ``al.upd_status`` and the loop ends at the first count that
    is neither 0 nor -1.

    Side effects:
        Rebinds the module-level names ``com_id`` and ``com_name``.

    Returns:
        Tuple ``(com_id, com_name, count_adm_license)``.
    """
    global com_id, com_name
    al = AdmLicense()
    count_adm_license = 0
    count = 0
    while count_adm_license in (0, -1):
        result = al.get_com_id()
        com_id, com_name = result[0], result[1]
        if com_id is None:
            continue

        count += 1
        com_url = f'https://www.qcc.com/firm_{com_id}.html'
        hds = gh().header()
        time.sleep(random.randint(3, 5))
        res = requests.get(com_url, headers=hds).text

        # Block pages: report, wait for the operator, then try the next company.
        if '<script>window.location.href' in res:
            print('访问频繁,需验证!{adm_license_judge}')
            input('暂停')
            continue
        if '<script>location.href="/user_login"</script>' in res:
            print('Cookie失效,需更换!{adm_license_judge}')
            input('程序暂停运行!')
            continue
        if '您的账号访问超频,请稍后访问或联系客服人员' in res:
            print('账号访问超频,请更换账号!{adm_license_judge}')
            input('程序暂停运行!')
            continue

        tree = etree.HTML(res)
        try:
            count_adm_license = int(
                tree.xpath(
                    '//div[@class="company-nav-items"]/span[contains(text(),"行政许可")]/span/text()|//div[@class="company-nav-items"]/a[@data-pos="licenslist"]/span/text()'
                )[0])
        except:
            # -1 marks "count could not be read".
            count_adm_license = -1

        print(tm().get_localtime())  # current time
        if count_adm_license in (0, -1):
            print(f'计数器:{count}\n公司ID:{com_id}\n行政许可信息条数:无')
        else:
            print(
                f'计数器:{count}\n公司ID:{com_id}\n行政许可信息条数:{count_adm_license}'
            )

        status_column = 'status_credit_adm_license'  # table column name
        count_column = 'count_credit_adm_license'  # table column name
        al.upd_status(com_id, status_column, count_column,
                      count_adm_license)
    return com_id, com_name, count_adm_license
def faith_execued_judge(self):
    """Pull companies until one shows dishonest-judgment-debtor records.

    Company IDs come from ``FaithExecued().get_com_id()``. For each one the
    company page is downloaded and the record count is read from the
    navigation bar; ``-1`` is used when it cannot be read. The count is
    passed to ``cd.upd_status_execued`` and the loop ends at the first count
    that is neither 0 nor -1.

    Side effects:
        Rebinds the module-level names ``com_id`` and ``com_name``.

    Returns:
        Tuple ``(com_id, com_name, count_breach_of_faith_execued)``.
    """
    global com_id, com_name
    fe = FaithExecued()
    count_breach_of_faith_execued = 0
    count = 0
    while count_breach_of_faith_execued in (0, -1):
        result = fe.get_com_id()
        com_id = result[0]
        com_name = result[1]
        if com_id is None:
            continue

        count += 1
        com_url = f'https://www.qichacha.com/firm_{com_id}.html'
        hds = gh().header()
        time.sleep(random.randint(3, 5))
        res = requests.get(com_url, headers=hds).text
        if '<script>window.location.href' in res:
            print('访问频繁,需验证!{faith_execued_judge}')
            input('暂停')
        elif '<script>location.href="/user_login"</script>' in res:
            print('Cookie失效,需更换!{faith_execued_judge}')
            input('程序暂停运行!')
        elif '您的账号访问超频,请稍后访问或联系客服人员' in res:
            print('账号访问超频,请更换账号!{faith_execued_judge}')
            input('程序暂停运行!')
        else:
            tree = etree.HTML(res)
            try:
                count_breach_of_faith_execued = tree.xpath(
                    '//div[@class="company-nav-items"]/span[contains(text(),"失信信息")]/span/text()|//div[@class="company-nav-items"]/a[@data-pos="shixinlist"]/span/text()'
                )[0]
                count_breach_of_faith_execued = int(
                    count_breach_of_faith_execued)
            except Exception:
                # -1 marks "count could not be read".
                count_breach_of_faith_execued = -1
            localtime = tm().get_localtime()  # current time
            print(localtime)
            # The second operand used to be a misspelled, undefined name, so
            # every non-zero count raised NameError here.
            if count_breach_of_faith_execued in (0, -1):
                print(f'计数器:{count}\n公司ID:{com_id}\n失信被执行人信息条数:无')
            else:
                print(
                    f'计数器:{count}\n公司ID:{com_id}\n失信被执行人信息条数:{count_breach_of_faith_execued}'
                )
            # NOTE(review): `cd` is not bound in this function; presumably a
            # module-level object — confirm.
            cd.upd_status_execued(com_id, count_breach_of_faith_execued)
    return com_id, com_name, count_breach_of_faith_execued
def get_count_rc(self, count_rc, key, count):
    """Resolve the exact number of recruitment postings for the current company.

    Args:
        count_rc: Approximate posting count. When it is positive the
            operations tab is downloaded and the exact number is read from
            the "#joblist" link text.
        key: Company-name search key placed into the request URL.
        count: Running company counter, used for console output only.

    The company is identified by the module-level name ``com_id``. The final
    count is printed and passed to ``gm.upd_status``.

    Returns:
        Tuple ``(count_rc, res)`` where ``res`` is the module-level response
        text (only assigned here when ``count_rc`` was positive).
    """
    global res
    if count_rc > 0:
        info_url = f'https://www.qichacha.com/company_getinfos?unique={com_id}&companyname={key}&tab=run'
        hds = gh().header()
        time.sleep(random.randint(3, 5))
        res = requests.get(info_url, headers=hds).text
        # NOTE(review): `gm` is used here without being instantiated, unlike
        # the `gm()` instances seen elsewhere — confirm this is intended.
        tree = gm.verify(res)
        link_text = tree.xpath('//a[contains(@onclick,"#joblist")]/text()')[0]
        count_rc = int(link_text.split('招聘')[1].strip())

    print(tm().get_localtime())  # current time
    print(f'计数器:{count}\n公司ID:{com_id}\n招聘岗位数:{count_rc}')
    status_column = 'status_recruit'  # table column name
    count_column = 'count_recruit'  # table column name
    gm.upd_status(com_id, status_column, count_column, count_rc)
    # NOTE(review): when count_rc was not positive, `res` is whatever the
    # module-level name currently holds — confirm it is always set by then.
    return count_rc, res
def get_page_info(self):
    """Print the "[信用中国]" administrative-licence records of one company.

    The company is chosen by ``AdmLicenseCc().cc_judge()``. The operations
    tab is downloaded, every row of the "[信用中国]" table is read, and the
    row's details are fetched with a POST to ``company_xzxukeView``. Each
    record is printed, then the function waits for the operator
    (``input('Pause')``).

    If a row cannot be parsed, or the detail request does not answer with
    status 200, the affected fields are reported as ``None`` rather than
    carrying over values from an earlier record.

    Side effects:
        Rebinds the module-level names ``project_name``, ``license_status``,
        ``license_content``, ``expire_time``, ``approval_category`` and
        ``area``.
    """
    global project_name, license_status, license_content, expire_time, approval_category, area
    alb = AdmLicenseCc()
    value = alb.cc_judge()
    com_id = value[0]
    com_name = value[1]
    key = dk().search_key(com_name)
    count = 0
    index_url = 'https://www.qcc.com'
    # NOTE(review): `page` (and `count_page` further down) are not assigned
    # in this function; they are read as module-level names — confirm they
    # exist.
    page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=run'
    hds = gh().header()
    hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
    time.sleep(random.randint(3, 5))
    res = requests.get(page_url, headers=hds).text
    if '<script>window.location.href' in res:
        print('访问频繁,需验证!{cc_judge}')
        input('暂停')
    elif '<script>location.href="/user_login"</script>' in res:
        print('Cookie失效,需更换!{cc_judge}')
        input('程序暂停运行!')
    elif '您的账号访问超频,请稍后访问或联系客服人员' in res:
        print('账号访问超频,请更换账号!{cc_judge}')
        input('程序暂停运行!')
    else:
        tree = etree.HTML(res)
        content_li = tree.xpath(
            '//div[@class="tcaption"]/span[contains(text(),"[信用中国]")]/parent::div/following-sibling::table[@class="ntable ntable-odd"]/tr[position()>2]'
        )
        for content in content_li:
            count += 1
            try:
                license_num = content.xpath('td[1]/text()')[0]
                dec_book_num = content.xpath('td[2]/text()')[0]
                license_office = content.xpath('td[3]/text()')[0]
                dec_date = content.xpath('td[4]/text()')[0]
                time.sleep(random.randint(1, 2))
                dt_id = content.xpath(
                    'td[5]/a[@class="xzxukeView"]/@onclick')[0].split(
                        'xzxukeView("')[1].split('")')[0]
                dt_url = 'https://www.qcc.com/company_xzxukeView'
                para = {'id': f'{dt_id}'}
                res_info = requests.post(dt_url, headers=hds,
                                         data=para).text
                payload = json.loads(res_info)  # parse the response once
                status = payload['status']
                if status == 200:
                    data = payload['data']
                    project_name = data['name']
                    license_status = data['status']
                    license_content = data['content']
                    expire_time = data['expire_time']
                    approval_category = data['type']
                    area = data['province']
                else:
                    print(f'响应失败!\n状态码:{status}')
                    input('程序暂停运行!')
                    # No detail data for this record: clear the fields so
                    # values from an earlier record are not reported.
                    project_name = None
                    license_status = None
                    license_content = None
                    expire_time = None
                    approval_category = None
                    area = None
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C at the
                # pause prompt above is not swallowed.
                license_num = None
                dec_book_num = None
                license_office = None
                dec_date = None
                dt_id = None
                project_name = None
                license_status = None
                license_content = None
                expire_time = None
                approval_category = None
                area = None  # was missing: stale value or NameError below
            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            localtime = tm().get_localtime()  # current time
            create_time = localtime
            print(f'当前时间:{create_time}')
            print(
                f'公司ID:{com_id}\n序号:{license_num}\n决定文书号:{dec_book_num}\n许可机关:{license_office}\n详情ID:{dt_id}\n'
                f'决定日期:{dec_date}\n项目名称:{project_name}\n许可状态:{license_status}\n许可内容:{license_content}\n截止时间:{expire_time}\n'
                f'审批类别:{approval_category}\n地域:{area}\n创建/入库时间:{create_time}'
            )
            input('Pause')
def get_page_info(self):
    """Scrape all administrative-licence pages of one company into the database.

    The company and its page count come from
    ``AdmLicenseBc().get_page_count()``. Every table row of every page is
    printed and inserted into ``com_credit_adm_license_bc``; a row that
    cannot be parsed is stored as an all-NULL placeholder. Afterwards
    ``status_credit_adm_license_bc`` is set to 1 for the company in
    ``com_info``.
    """
    alb = AdmLicenseBc()
    value = alb.get_page_count()
    com_id, com_name, count_page, count_record = (value[0], value[1],
                                                  value[2], value[3])
    key = dk().search_key(com_name)
    index_url = 'https://www.qcc.com'
    # (marker in the response, console notice, pause prompt)
    block_pages = (
        ('<script>window.location.href',
         '访问频繁,需验证!{get_page_info[2]}', '暂停'),
        ('<script>location.href="/user_login"</script>',
         'Cookie失效,需更换!{get_page_info[2]}', '程序暂停运行!'),
        ('您的账号访问超频,请稍后访问或联系客服人员',
         '账号访问超频,请更换账号!{get_page_info[2]}', '程序暂停运行!'),
    )

    count = 0
    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=run&box=licens'
        hds = gh().header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        time.sleep(random.randint(1, 2))
        res = requests.get(page_url, headers=hds).text

        for marker, notice, prompt in block_pages:
            if marker in res:
                print(notice)
                input(prompt)
                break
        else:
            # Regular page: walk the data rows.
            tree = etree.HTML(res)
            for content in tree.xpath(
                    '//table[@class="ntable ntable-odd"]/tr[position()>2]'):
                count += 1
                try:
                    (license_num, license_doc_num, license_doc_name,
                     valid_period_from, valid_period_to, license_office,
                     license_content) = [
                         content.xpath(f'td[{col}]/text()')[0]
                         for col in range(1, 8)
                     ]
                except:
                    # One unreadable cell blanks the whole record.
                    license_num = license_doc_num = license_doc_name = None
                    valid_period_from = valid_period_to = None
                    license_office = license_content = None

                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                create_time = tm().get_localtime()  # current time
                print(f'当前时间:{create_time}')
                print(
                    f'公司ID:{com_id}\n序号:{license_num}\n许可文件编号:{license_doc_num}\n许可文件名称:{license_doc_name}\n有效期自:{valid_period_from}\n'
                    f'有效期至:{valid_period_to}\n许可机关:{license_office}\n许可内容:{license_content}'
                )

                if license_num is None:
                    ins = """
                    INSERT INTO
                    `com_credit_adm_license_bc`
                    (`com_id`,`license_num`,`license_doc_num`,`license_doc_name`,`valid_period_from`,
                    `valid_period_to`,`license_office`,`license_content`,`create_time`)
                    VALUES
                    (NULL,NULL,NULL,NULL,NULL,
                    NULL,NULL,NULL,NULL);
                    """
                else:
                    ins = f"""
                    INSERT INTO
                    `com_credit_adm_license_bc`
                    (`com_id`,`license_num`,`license_doc_num`,`license_doc_name`,`valid_period_from`,
                    `valid_period_to`,`license_office`,`license_content`,`create_time`)
                    VALUES
                    ("{com_id}","{license_num}","{license_doc_num}","{license_doc_name}","{valid_period_from}",
                    "{valid_period_to}","{license_office}","{license_content}","{create_time}");
                    """
                db().inssts(ins)

    upd = f"""
    UPDATE
    `com_info`
    SET
    `status_credit_adm_license_bc` = 1
    WHERE
    `com_id` = "{com_id}" ;
    """
    db().updsts(upd)
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)
def get_page_info(self):
    """Scrape all patents of one company, including each patent's detail page.

    The company, page count and site root come from
    ``PatentInfo().get_page_count()``; nothing happens when the company ID
    is ``None``. For every row of every list page the patent's detail page
    is downloaded and the record is printed and inserted into
    ``com_patent``. Afterwards ``status_patent`` is set to 1 for the company
    in ``com_info``.

    A list or detail page recognised as a block page (verification, login
    or rate-limit notice) is reported, the function waits for the operator,
    and that page or patent is skipped.
    """
    pt = PatentInfo()
    value = pt.get_page_count()
    com_id = value[0]
    com_name = value[1]
    count_page = value[2]
    if com_id is None:
        return

    key = dk().search_key(com_name)
    index_url = value[3]
    count = 0

    # XPath for "the cell right after the label cell" on a detail page;
    # the second slot selects what to take from that cell.
    cell_xpath = ('//table[@class="ntable"]/tbody/tr/td[contains(text(),"{}")]'
                  '/following-sibling::td[1]{}')

    def detail(tree, label, tail='/text()'):
        # First match, stripped; the [0] lookup fails when the cell is absent.
        return tree.xpath(cell_xpath.format(label, tail))[0].strip()

    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=zhuanli'
        hds = gh().header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        time.sleep(random.randint(1, 2))
        res_pg = requests.get(page_url, headers=hds).text
        if '<script>window.location.href' in res_pg:
            print('访问频繁,需验证!{get_page_info[1]}')
            input('暂停')
            continue
        if '<script>location.href="/user_login"</script>' in res_pg:
            print('Cookie失效,需更换!{get_page_info[1]}')
            input('程序暂停运行!')
            continue
        if '您的账号访问超频,请稍后访问或联系客服人员' in res_pg:
            print('账号访问超频,请更换账号!{get_page_info[1]}')
            input('程序暂停运行!')
            continue

        tree_pg = etree.HTML(res_pg)
        content_li = tree_pg.xpath('//table/tr[position()>1]')
        for content in content_li:
            count += 1
            patent_num = content.xpath('td[1]/text()')[0]
            patent_type = content.xpath('td[2]/text()')[0]
            patent_pub_num = content.xpath('td[3]/text()')[0]
            patent_pub_date = content.xpath('td[4]/text()')[0]
            patent_name = content.xpath('td[5]/a/text()')[0].strip()
            patent_link = content.xpath('td[5]/a/@href')[0]
            patent_id = patent_link.split('_com_')[1]
            patent_url = ''.join((index_url, patent_link))
            time.sleep(random.randint(1, 3))
            res_dt = requests.get(patent_url, headers=hds).text
            if '<script>window.location.href' in res_dt:
                print('访问频繁,需验证!{get_page_info[2]}')
                input('暂停')
                continue
            if '<script>location.href="/user_login"</script>' in res_dt:
                print('Cookie失效,需更换!{get_page_info[2]}')
                input('程序暂停运行!')
                continue
            if '您的账号访问超频,请稍后访问或联系客服人员' in res_dt:
                print('账号访问超频,请更换账号!{get_page_info[2]}')
                input('程序暂停运行!')
                continue

            tree_dt = etree.HTML(res_dt)
            app_num = detail(tree_dt, '申请号')
            app_date = detail(tree_dt, '申请日')
            prio_date = detail(tree_dt, '优先权日')
            prio_num = detail(tree_dt, '优先权号')
            inventor = detail(tree_dt, '发明人')
            try:
                applicant = detail(tree_dt, '申请(专利权)人', '/a/text()')
            except Exception:
                # The plain-text fallback used to select the <td> element
                # itself (no /text()), so .strip() always failed on it.
                applicant = detail(tree_dt, '申请(专利权)人')
            try:
                agency = detail(tree_dt, '代理机构', '/a/text()')
            except Exception:
                agency = detail(tree_dt, '代理机构')
            agent = detail(tree_dt, '代理人')
            ipc = detail(tree_dt, 'IPC分类号').replace(' ', '').replace(
                '\n', '')
            cpc = detail(tree_dt, 'CPC分类号')
            app_address = detail(tree_dt, '申请人地址')
            app_zip_code = detail(tree_dt, '申请人邮编')
            try:
                abstract = detail(tree_dt, '摘要')
            except Exception:
                # No direct text node: take the cell's full string value.
                abstract = tree_dt.xpath(
                    'string(//table[@class="ntable"]/tbody/tr/td[contains(text(),"摘要")]/following-sibling::td)'
                ).strip()
            try:
                abstract_photo = detail(tree_dt, '摘要附图', '/img/@src')
            except Exception:
                abstract_photo = '-'
            try:
                claim = tree_dt.xpath(
                    '//table[@class="ntable"]/tr/td[@class="ea_instructions" and position()=1]/p/text()'
                )
                # Double quotes would end the SQL string literal below.
                claim = ''.join(claim).replace('"', "'")
            except Exception:
                claim = '-'
            try:
                instr_cell = (
                    '//div[@class="tcaption"]/h3[text()="说明书"]/parent::div'
                    '/following-sibling::table[@class="ntable"]/tr'
                    '/td[@class="ea_instructions"]')
                instructions = tree_dt.xpath('|'.join(
                    f'{instr_cell}/{tag}/text()'
                    for tag in ('h1', 'h2', 'p')))
                instructions = ''.join(instructions)
            except Exception:
                instructions = '-'

            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            localtime = tm().get_localtime()  # current time
            create_time = localtime
            print(f'公司ID:{com_id} 当前时间:{localtime}')
            print(f'公司名称:{com_name}\n专利ID:{patent_id}')
            print(
                f'序号:{patent_num}\n专利类型:{patent_type}\n公开(公告)号:{patent_pub_num}\n公开(公告)日期:{patent_pub_date}\n专利名称:{patent_name}\n'
                f'专利页URL:{patent_url}\n申请号:{app_num}\n申请日期:{app_date}\n优先权日:{prio_date}\n优先权号:{prio_num}\n'
                f'发明人:{inventor}\n申请(专利权)人:{applicant}\n代理机构:{agency}\n代理人:{agent}\nIPC分类号:{ipc}\n'
                f'CPC分类号:{cpc}\n申请人地址:{app_address}\n申请人邮编:{app_zip_code}\n摘要:{abstract}\n摘要附图:{abstract_photo}\n'
                f'权利要求:{claim}\n说明书:{instructions}\n')
            ins = f"""
            INSERT INTO
            `com_patent`
            (`com_id`,`patent_num`,`patent_type`,`patent_pub_num`,`patent_pub_date`,
            `patent_name`,`patent_url`,`app_num`,`app_date`,`prio_date`,
            `prio_num`,`inventor`,`applicant`,`agency`,`agent`,
            `ipc`,`cpc`,`app_address`,`app_zip_code`,`abstract`,`abstract_photo`,
            `claim`,`instructions`,`create_time`,`patent_id`)
            VALUES
            ("{com_id}","{patent_num}","{patent_type}","{patent_pub_num}","{patent_pub_date}",
            "{patent_name}","{patent_url}","{app_num}","{app_date}","{prio_date}",
            "{prio_num}","{inventor}","{applicant}","{agency}","{agent}",
            "{ipc}","{cpc}","{app_address}","{app_zip_code}","{abstract}","{abstract_photo}",
            "{claim}","{instructions}","{create_time}","{patent_id}");
            """
            db().inssts(ins)

    upd = f"""
    UPDATE
    `com_info`
    SET
    `status_patent` = 1
    WHERE
    `com_id` = "{com_id}" ;
    """
    db().updsts(upd)
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)
def get_page_info(self):
    """Scrape all enforcement records of one company into ``com_credit_execued``.

    The company and its page count come from ``Credit().get_page_count()``.
    For every table row of every page the related-case page is downloaded to
    read the executed person and the ID / organisation code. Each record is
    printed and inserted; a row that cannot be parsed is stored as an
    all-NULL placeholder. Afterwards ``status_credit_execued`` is set to 1
    for the company in ``com_info``.

    When the related-case request hits a block page, the executed person
    and ID code of that record are stored as ``None``.
    """
    cd = Credit()
    value = cd.get_page_count()
    com_id = value[0]
    com_name = value[1]
    count_page = value[2]
    key = dk().search_key(com_name)
    count = 0
    for page in range(1, count_page + 1):
        index_url = 'https://www.qichacha.com'
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=susong&box=zhixing'
        hds = gh().header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        time.sleep(random.randint(1, 2))
        res_pg = requests.get(page_url, headers=hds).text
        if '<script>window.location.href' in res_pg:
            print('访问频繁,需验证!{get_page_info}')
            input('暂停')
        elif '<script>location.href="/user_login"</script>' in res_pg:
            print('Cookie失效,需更换!{get_page_info}')
            input('程序暂停运行!')
        elif '您的账号访问超频,请稍后访问或联系客服人员' in res_pg:
            print('账号访问超频,请更换账号!{get_page_info}')
            input('程序暂停运行!')
        else:
            tree_pg = etree.HTML(res_pg)
            content_li = tree_pg.xpath(
                '//table[@class="ntable ntable-odd"]/tr[position()>2]')
            for content in content_li:
                count += 1
                # Bound up front: the block-page branches below assign
                # neither, which used to leave them undefined or stale.
                exec_person = None
                occ = None
                try:
                    exec_num = content.xpath('td[1]/text()')[0]
                    case_num = content.xpath('td[2]/a/text()')[0]
                    case_id = content.xpath(
                        'td[2]/a[contains(@onclick,"showRelatModal")]/@onclick'
                    )[0].split('zhixing",')[1].split('"')[1]
                    case_url = 'id='.join(
                        ('https://www.qichacha.com/company_zhixingRelat?',
                         case_id))
                    filing_time = content.xpath('td[3]/text()')[0]
                    court_of_exec = content.xpath('td[4]/text()')[0]
                    exec_obj = content.xpath('td[5]/text()')[0]
                    time.sleep(random.randint(1, 2))
                    res_info = requests.get(case_url, headers=hds).text
                    if '<script>window.location.href' in res_info:
                        print('访问频繁,需验证!{get_page_info}')
                        input('暂停')
                    elif '<script>location.href="/user_login"</script>' in res_info:
                        print('Cookie失效,需更换!{get_page_info}')
                        input('程序暂停运行!')
                    elif '您的账号访问超频,请稍后访问或联系客服人员' in res_info:
                        print('账号访问超频,请更换账号!{get_page_info}')
                        input('程序暂停运行!')
                    else:
                        tree_info = etree.HTML(res_info)
                        exec_person = tree_info.xpath(
                            '//table/tbody/tr[1]/td[2]/text()')[0]
                        occ = tree_info.xpath(
                            '//table/tbody/tr[1]/td[4]/text()')[0]
                except Exception:
                    # `Exception` rather than a bare except, so Ctrl-C at a
                    # pause prompt above is not swallowed.
                    exec_num = None
                    case_num = None
                    case_id = None
                    case_url = None
                    filing_time = None
                    court_of_exec = None
                    exec_obj = None
                    exec_person = None
                    occ = None
                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                localtime = tm().get_localtime()  # current time
                create_time = localtime
                print(f'当前时间:{create_time}')
                print(
                    f'公司ID:{com_id}\n序号:{exec_num}\n案号:{case_num}\n案例ID:{case_id}\n案例链接:{case_url}\n'
                    f'立案时间:{filing_time}\n执行法院:{court_of_exec}\n执行标的:{exec_obj}\n被执行人:{exec_person}\n身份证号/组织机构代码:{occ}\n'
                )
                if exec_num is None:
                    # Ten columns need ten values; the list used to hold
                    # nine NULLs, a column/value count mismatch.
                    ins = """
                    INSERT INTO
                    `com_credit_execued`
                    (`com_id`,`exec_num`,`case_num`,`case_id`,`filing_time`,
                    `court_of_exec`,`exec_obj`,`exec_person`,`occ`,`create_time`)
                    VALUES
                    (NULL,NULL,NULL,NULL,NULL,
                    NULL,NULL,NULL,NULL,NULL);
                    """
                else:
                    ins = f"""
                    INSERT INTO
                    `com_credit_execued`
                    (`com_id`,`exec_num`,`case_num`,`case_id`,`filing_time`,
                    `court_of_exec`,`exec_obj`,`exec_person`,`occ`,`create_time`)
                    VALUES
                    ("{com_id}","{exec_num}","{case_num}","{case_id}","{filing_time}",
                    "{court_of_exec}","{exec_obj}","{exec_person}","{occ}","{create_time}");
                    """
                db().inssts(ins)

    upd = f"""
    UPDATE
    `com_info`
    SET
    `status_credit_execued` = 1
    WHERE
    `com_id` = "{com_id}" ;
    """
    db().updsts(upd)
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)
def get_page_info(self):
    """Scrape all website/ICP records of one company into ``com_web``.

    The company, page count and site root come from
    ``WebSite().get_page_count()``; nothing happens when the company ID is
    ``None``. Every table row of every page is printed and inserted.
    Afterwards ``status_web`` is set to 1 for the company in ``com_info``.

    Field shapes:
        * ``web_site``: ``'-'`` when the row has no link, the link text for
          a single link, the list of link texts for several.
        * ``domain_name``: a list of the non-blank lines when the cell text
          spans more than two lines, otherwise the stripped last line.
    """
    ws = WebSite()
    value = ws.get_page_count()
    com_id = value[0]
    com_name = value[1]
    count_page = value[2]
    if com_id is None:
        return

    key = ws.dk.search_key(com_name)
    index_url = value[3]
    count = 0
    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=website'
        hds = ws.gh.header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        time.sleep(random.randint(1, 2))
        res_pg = requests.get(page_url, headers=hds).text
        tree_pg = ws.gm.verify(res_pg)
        content_li = tree_pg.xpath('//table/tr[position()>1]')
        for content in content_li:
            count += 1
            web_num = content.xpath('td[1]/text()')[0]
            web_name = content.xpath('td[2]/text()')[0]

            web_site = content.xpath('td[3]/a/text()')
            if not web_site:
                web_site = '-'
            elif len(web_site) == 1:
                web_site = web_site[0]
            # otherwise keep the list of all link texts

            domain_lines = content.xpath('td[4]/text()')[0].split('\n')
            if len(domain_lines) > 2:
                # Strip before filtering: whitespace-only lines used to
                # slip through and were stored as empty entries.
                domain_name = [
                    line for line in (raw.strip() for raw in domain_lines)
                    if line
                ]
            else:
                # [-1] equals the old [1] for two lines and no longer
                # fails when the text has no line break at all.
                domain_name = domain_lines[-1].strip()

            icp = content.xpath('td[5]/text()')[0].strip()
            approved_date = content.xpath('td[6]/text()')[0]

            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            localtime = tm().get_localtime()  # current time
            create_time = localtime
            print(f'公司ID:{com_id} 当前时间:{localtime}')
            print(f'公司名称:{com_name}\n序号:{web_num}')
            print(
                f'网站名称:{web_name}\n网址:{web_site}\n域名:{domain_name}\n网站备案/许可证号:{icp}\n审核日期:{approved_date}\n'
            )
            ins = f"""
            INSERT INTO
            `com_web`
            (`com_id`,`web_num`,`web_name`,`web_site`,`domain_name`,
            `icp`,`approved_date`,`create_time`)
            VALUES
            ("{com_id}","{web_num}","{web_name}","{web_site}","{domain_name}",
            "{icp}","{approved_date}","{create_time}");
            """
            db().inssts(ins)

    upd = f"""
    UPDATE
    `com_info`
    SET
    `status_web` = 1
    WHERE
    `com_id` = "{com_id}" ;
    """
    db().updsts(upd)
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)
def __init__(self):
    """Create the helper objects used by this class."""
    # The helpers are constructed left to right: db, dk, gh, gm, tm.
    self.db, self.dk, self.gh, self.gm, self.tm = db(), dk(), gh(), gm(), tm()
def get_page_info(self):
    """Scrape all trademarks of one company, including each detail page.

    The company, page count and site root come from
    ``TradeMarkInfo().get_page_count()``; nothing happens when the company
    ID is ``None``. For every row of every list page the trademark's detail
    page is downloaded, and the record is printed and inserted into
    ``com_trademark``. Afterwards ``status_tm`` is set to 1 for the company
    in ``com_info``.
    """
    tmi = TradeMarkInfo()
    value = tmi.get_page_count()
    com_id, com_name, count_page = value[0], value[1], value[2]
    if com_id is None:
        return

    key = tmi.dk.search_key(com_name)
    index_url = value[3]

    # XPath for "the cell right after the label cell" on a detail page;
    # the second slot selects what to take from that cell.
    cell_xpath = ('//table[@class="ntable"]/tbody/tr/td[contains(text(),"{}")]'
                  '/following-sibling::td[1]{}')

    def detail(tree, label, tail='/text()'):
        # First match, stripped; the [0] lookup fails when the cell is absent.
        return tree.xpath(cell_xpath.format(label, tail))[0].strip()

    count = 0
    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=shangbiao'
        hds = tmi.gh.header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        time.sleep(random.randint(1, 2))
        res_tmi = requests.get(page_url, headers=hds).text
        tree_tmi = etree.HTML(res_tmi)
        for content in tree_tmi.xpath('//table/tr[position()>1]'):
            count += 1
            tm_num = content.xpath('td[1]/text()')[0]
            tm_logo_url = content.xpath('td[2]/img/@src')[0]
            tm_name = content.xpath('td[3]/text()')[0]
            tm_status = content.xpath('td[4]/text()')[0]
            app_date = content.xpath('td[5]/text()')[0]
            tm_regno = content.xpath('td[6]/text()')[0]
            tm_int_type = content.xpath('td[7]/text()')[0]
            trademark_link = content.xpath('td[8]/a/@href')[0]
            trademark_url = ''.join((index_url, trademark_link))

            time.sleep(random.randint(1, 3))
            res_dt = requests.get(trademark_url, headers=hds).text
            tree_dt = etree.HTML(res_dt)

            sim_groups = detail(tree_dt, '类似群')
            app_cn = detail(tree_dt, '申请人名称(中文)')
            app_en = detail(tree_dt, '申请人名称(英文)')
            app_addr_cn = detail(tree_dt, '申请人地址(中文)')
            app_addr_en = detail(tree_dt, '申请人地址(英文)')
            first_trial_no = detail(tree_dt, '初审公告期号')
            first_trial_date = detail(tree_dt, '初审公告日期').replace(
                ' ', '').replace('\n', '')
            reg_not_peri_no = detail(tree_dt, '注册公告期号')
            reg_not_peri_date = detail(tree_dt, '注册公告日期')
            is_comm_tm = detail(tree_dt, '是否共有商标')
            tm_type = detail(tree_dt, '商标类型')
            exclu_right_limit = detail(tree_dt, '专用权期限')
            tm_form = detail(tree_dt, '商标形式')
            int_reg_date = detail(tree_dt, '国际注册日期')
            later_scheduled_date = detail(tree_dt, '后期指定日期')
            prio_date = detail(tree_dt, '优先权日期')
            try:
                # Prefer the linked agency name, else the plain cell text.
                agency = detail(tree_dt, '代理/办理机构', '/a/text()')
            except:
                agency = detail(tree_dt, '代理/办理机构')
            service = detail(tree_dt, '商品/服务')

            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            localtime = tm().get_localtime()  # current time
            create_time = localtime
            print(f'当前时间:{localtime}')
            print(f'公司ID:{com_id}\n公司名称:{com_name}')
            print(
                f'序号:{tm_num}\n商标LOGO URL:{tm_logo_url}\n商标名称:{tm_name}\n商标状态:{tm_status}\n申请时间:{app_date}\n'
                f'申请/注册号:{tm_regno}\n国际类型:{tm_int_type}\n类似群:{sim_groups}\n申请人名称(中文):{app_cn}\n申请人名称(英文):{app_en}\n'
                f'申请人地址(中文):{app_addr_cn}\n申请人地址(英文):{app_addr_en}\n初审公告期号:{first_trial_no}\n初审公告日期:{first_trial_date}\n注册公告期号:{reg_not_peri_no}\n'
                f'注册公告日期:{reg_not_peri_date}\n是否共有商标:{is_comm_tm}\n商标类型:{tm_type}\n专用权期限:{exclu_right_limit}\n商标形式:{tm_form}\n'
                f'国际注册日期:{int_reg_date}\n后期指定日期:{later_scheduled_date}\n优先权日期:{prio_date}\n代理机构:{agency}\n商品/服务:{service}'
            )
            ins = f"""
            INSERT INTO
            `com_trademark`
            (`com_id`,`tm_num`,`tm_logo_url`,`tm_name`,`tm_status`,
            `app_date`,`tm_regno`,`tm_int_type`,`sim_groups`,`app_cn`,
            `app_en`,`app_addr_cn`,`app_addr_en`,`first_trial_no`,`first_trial_date`,
            `reg_not_peri_no`,`reg_not_peri_date`,`is_comm_tm`,`tm_type`,`exclu_right_limit`,
            `tm_form`,`int_reg_date`,`later_scheduled_date`,`prio_date`,`agency`,
            `service`,`create_time`)
            VALUES
            ("{com_id}","{tm_num}","{tm_logo_url}","{tm_name}","{tm_status}",
            "{app_date}","{tm_regno}","{tm_int_type}","{sim_groups}","{app_cn}",
            "{app_en}","{app_addr_cn}","{app_addr_en}","{first_trial_no}","{first_trial_date}",
            "{reg_not_peri_no}","{reg_not_peri_date}","{is_comm_tm}","{tm_type}","{exclu_right_limit}",
            "{tm_form}","{int_reg_date}","{later_scheduled_date}","{prio_date}","{agency}",
            "{service}","{create_time}");
            """
            db().inssts(ins)

    upd = f"""
    UPDATE
    `com_info`
    SET
    `status_tm` = 1
    WHERE
    `com_id` = "{com_id}" ;
    """
    db().updsts(upd)
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}')
def get_page_info(self):
    """Scrape one company's software-copyright listing and store every row.

    Reads ``(com_id, com_name, count_page, index_url)`` from
    ``CprOfSoft().get_page_count()``, requests pages ``1..count_page`` of the
    ``box=rjzzq`` listing, inserts one ``com_cpr_of_soft`` row per table row
    and then sets ``com_info.status_cpr_of_soft`` to 1 for that company.

    Returns None. Nothing is requested or written when ``com_id`` is None.

    Raises:
        IndexError: if a table row lacks one of the mandatory cells
            (every column except the version number, which falls back
            to ``'-'``).
    """

    def sql_text(value):
        # Backslash-escape ``\`` and ``"`` so scraped text cannot terminate
        # the double-quoted SQL literal early.
        # NOTE(review): assumes MySQL's default backslash escaping is active
        # (NO_BACKSLASH_ESCAPES off) -- confirm against the server config.
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    cos = CprOfSoft()
    value = cos.get_page_count()
    com_id = value[0]
    com_name = value[1]
    count_page = value[2]
    # For a one-off re-scrape, com_id / com_name / count_page can be
    # hard-coded here instead of being taken from get_page_count().
    if com_id is None:
        return

    key = cos.dk.search_key(com_name)
    index_url = value[3]
    count = 0  # running row number across all pages, used only for logging
    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=rjzzq'
        hds = cos.gh.header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        # Random 1-2 second pause before every listing request.
        time.sleep(random.randint(1, 2))
        res_pg = requests.get(page_url, headers=hds).text
        # presumably checks the response and returns a parsed tree -- TODO confirm
        tree_pg = cos.gm.verify(res_pg)
        # Skip the first <tr> of each table (the header row).
        content_li = tree_pg.xpath('//table/tr[position()>1]')
        for content in content_li:
            count += 1
            soft_num = content.xpath('td[1]/text()')[0]
            soft_name = content.xpath('td[2]/text()')[0]
            try:
                soft_ver_no = content.xpath('td[3]/text()')[0]
            except IndexError:
                # The version cell has no text node.
                soft_ver_no = '-'
            soft_pub_date = content.xpath('td[4]/text()')[0].strip()
            soft_short_name = content.xpath('td[5]/text()')[0].strip()
            soft_reg_no = content.xpath('td[6]/text()')[0]
            reg_approval_date = content.xpath('td[7]/text()')[0]

            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            localtime = tm().get_localtime()  # current time
            create_time = localtime
            print(f'公司ID:{com_id} 当前时间:{localtime}')
            print(f'公司名称:{com_name}')
            print(
                f'序号:{soft_num}\n软件名称:{soft_name}\n版本号:{soft_ver_no}\n发布日期:{soft_pub_date}\n软件简称:{soft_short_name}\n'
                f'登记号:{soft_reg_no}\n登记批准号:{reg_approval_date}\n')

            row = (com_id, soft_num, soft_name, soft_ver_no, soft_pub_date,
                   soft_short_name, soft_reg_no, reg_approval_date,
                   create_time)
            values = ','.join(f'"{sql_text(item)}"' for item in row)
            ins = f"""
            INSERT INTO
            `com_cpr_of_soft`
            (`com_id`,`soft_num`,`soft_name`,`soft_ver_no`,`soft_pub_date`,
            `soft_short_name`,`soft_reg_no`,`reg_approval_date`,`create_time`)
            VALUES
            ({values});
            """
            cos.db.inssts(ins)

    # NOTE(review): the original indentation was lost; the status update,
    # the completion message and the final pause are placed after the page
    # loop and are skipped when com_id is None -- confirm.
    upd = f"""
    UPDATE
    `com_info`
    SET
    `status_cpr_of_soft` = 1
    WHERE
    `com_id` = "{sql_text(com_id)}" ;
    """
    cos.db.updsts(upd)
    localtime = cos.tm.get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)
def get_page_info(self):
    """Scrape one company's patent listing plus each patent's detail page.

    Reads ``(com_id, com_name, count_page, index_url)`` from
    ``PatentInfo().get_page_count()``, requests pages ``1..count_page`` of
    the ``box=zhuanli`` listing, follows every patent link to its detail
    page, inserts one ``com_patent`` row per patent and finally sets
    ``com_info.status`` to 1 for that company.

    Returns None. Nothing is requested or written when ``com_id`` is None.

    Raises:
        IndexError: if a listing row or a detail page lacks one of the
            mandatory fields. Only the abstract image, the claims and the
            description fall back to ``'-'``.
    """
    # Detail pages are label/value tables: the value is the first <td>
    # following the <td> whose text contains the label.
    detail_cell = ('//table[@class="ntable"]/tbody/tr/td[contains(text(),"{}")]'
                   '/following-sibling::td[1]{}')
    # Rows of the table that follows the "说明书" caption.
    manual_cell = ('//div[@class="tcaption"]/h3[text()="说明书"]/parent::div'
                   '/following-sibling::table[@class="ntable"]/tr'
                   '/td[@class="ea_instructions"]')

    def detail(tree, label, tail='/text()'):
        # First match for the labelled value cell, stripped; raises
        # IndexError when the cell (or the requested child) is absent.
        return tree.xpath(detail_cell.format(label, tail))[0].strip()

    def sql_text(value):
        # Backslash-escape ``\`` and ``"`` so scraped text cannot terminate
        # the double-quoted SQL literal early.
        # NOTE(review): assumes MySQL's default backslash escaping is active
        # (NO_BACKSLASH_ESCAPES off) -- confirm against the server config.
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    pt = PatentInfo()
    value = pt.get_page_count()
    com_id = value[0]
    com_name = value[1]
    count_page = value[2]
    if com_id is None:
        return

    key = pt.search_key(com_name)
    index_url = value[3]
    count = 0  # running row number across all pages, used only for logging
    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=zhuanli'
        hds = gh().header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        # Random 1-2 second pause before every listing request.
        time.sleep(random.randint(1, 2))
        res_pg = requests.get(page_url, headers=hds).text
        tree_pg = etree.HTML(res_pg)
        # Skip the first <tr> of each table (the header row).
        content_li = tree_pg.xpath('//table/tr[position()>1]')
        for content in content_li:
            count += 1
            patent_num = content.xpath('td[1]/text()')[0]
            patent_type = content.xpath('td[2]/text()')[0]
            patent_pub_num = content.xpath('td[3]/text()')[0]
            patent_pub_date = content.xpath('td[4]/text()')[0]
            patent_name = content.xpath('td[5]/a/text()')[0].strip()
            patent_link = content.xpath('td[5]/a/@href')[0]
            patent_url = ''.join((index_url, patent_link))

            # Random 1-3 second pause before every detail request.
            time.sleep(random.randint(1, 3))
            res_dt = requests.get(patent_url, headers=hds).text
            tree_dt = etree.HTML(res_dt)

            app_num = detail(tree_dt, '申请号')
            app_date = detail(tree_dt, '申请日')
            prio_date = detail(tree_dt, '优先权日')
            prio_num = detail(tree_dt, '优先权号')
            inventor = detail(tree_dt, '发明人')
            # Applicant and agency are sometimes links, sometimes plain text.
            try:
                applicant = detail(tree_dt, '申请(专利权)人', '/a/text()')
            except IndexError:
                # The original fallback selected the <td> element itself and
                # called .strip() on it; select its text node instead.
                applicant = detail(tree_dt, '申请(专利权)人')
            try:
                agency = detail(tree_dt, '代理机构', '/a/text()')
            except IndexError:
                agency = detail(tree_dt, '代理机构')
            agent = detail(tree_dt, '代理人')
            ipc = detail(tree_dt, 'IPC分类号').replace(' ', '').replace('\n', '')
            cpc = detail(tree_dt, 'CPC分类号')
            app_address = detail(tree_dt, '申请人地址')
            app_zip_code = detail(tree_dt, '申请人邮编')
            abstract = detail(tree_dt, '摘要')
            try:
                abstract_photo = detail(tree_dt, '摘要附图', '/img/@src')
            except IndexError:
                abstract_photo = '-'

            # Joining an empty result never raises, so the '-' default has
            # to be applied explicitly when nothing was found.
            claim = ''.join(tree_dt.xpath(
                '//table[@class="ntable"]/tr/td[@class="ea_instructions" and position()=1]/p/text()'
            )) or '-'
            instructions = ''.join(tree_dt.xpath('|'.join(
                manual_cell + tail
                for tail in ('/h1/text()', '/h2/text()', '/p/text()')
            ))) or '-'

            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            localtime = tm().get_localtime()  # current time
            print(f'公司ID:{com_id} 当前时间:{localtime}')
            print(
                f'序号:{patent_num}\n专利类型:{patent_type}\n公开(公告)号:{patent_pub_num}\n公开(公告)日期:{patent_pub_date}\n专利名称:{patent_name}\n'
                f'专利页URL:{patent_url}\n申请号:{app_num}\n申请日期:{app_date}\n优先权日:{prio_date}\n优先权号:{prio_num}\n'
                f'发明人:{inventor}\n申请(专利权)人:{applicant}\n代理机构:{agency}\n代理人:{agent}\nIPC分类号:{ipc}\n'
                f'CPC分类号:{cpc}\n申请人地址:{app_address}\n申请人邮编:{app_zip_code}\n摘要:{abstract}\n摘要附图:{abstract_photo}\n'
                f'权利要求:{claim}\n说明书:{instructions}')

            row = (com_id, patent_num, patent_type, patent_pub_num,
                   patent_pub_date, patent_name, patent_url, app_num,
                   app_date, prio_date, prio_num, inventor, applicant,
                   agency, agent, ipc, cpc, app_address, app_zip_code,
                   abstract, abstract_photo, claim, instructions)
            values = ','.join(f'"{sql_text(item)}"' for item in row)
            ins = f"""
            INSERT INTO
            `com_patent`
            (`com_id`,`patent_num`,`patent_type`,`patent_pub_num`,`patent_pub_date`,
            `patent_name`,`patent_url`,`app_num`,`app_date`,`prio_date`,
            `prio_num`,`inventor`,`applicant`,`agency`,`agent`,
            `ipc`,`cpc`,`app_address`,`app_zip_code`,`abstract`,`abstract_photo`,
            `claim`,`instructions`)
            VALUES
            ({values});
            """
            db().inssts(ins)

    # NOTE(review): the original indentation was lost; the status update and
    # the completion message are placed after the page loop and are skipped
    # when com_id is None -- confirm.
    upd = f"""
    UPDATE
    `com_info`
    SET
    `status` = 1
    WHERE
    `com_id` = "{sql_text(com_id)}" ;
    """
    db().updsts(upd)
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}')
def parse_info(self, tree, com_id, com_name, page, sh_page_count):
    """Parse one page of the shareholder table and insert a row per holder.

    Args:
        tree: parsed page; must offer ``.xpath()``. ``None`` means the page
            had no data, in which case only a message is printed.
        com_id: company identifier written to every inserted row.
        com_name: company name, used only in the log output.
        page: 1-based page number; row numbering in the log starts at
            ``(page - 1) * 50 + 1``.
        sh_page_count: total number of pages, used only in the log output.

    Returns None. Each table row is written to ``com_stockholder`` through
    ``self.db.inssts``.

    Raises:
        IndexError: if a row lacks a cell (or fallback child) the layout
            says should be there.
    """

    def sql_text(value):
        # Backslash-escape ``\`` and ``"`` so scraped text cannot terminate
        # the double-quoted SQL literal early.
        # NOTE(review): assumes MySQL's default backslash escaping is active
        # (NO_BACKSLASH_ESCAPES off) -- confirm against the server config.
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    def first_text(row, path):
        # First node matched by ``path`` relative to the row, stripped.
        return row.xpath(path)[0].strip()

    def text_or_child(row, col, child):
        # Cell text of column ``col``; when that is blank the value sits in
        # a nested ``child`` element (<span> or <a>) instead.
        direct = first_text(row, f'td[{col}]/text()')
        if direct == '':
            return first_text(row, f'td[{col}]/{child}/text()')
        return direct

    sh = StockHolder()
    count = (page - 1) * 50
    if tree is None:
        print('无相关数据!\n')
        return

    # Added 2019-11-26: verify_stockholder_args supplies the headings that
    # are tested below to work out which optional columns this table has.
    stockholder_args = sh.verify_stockholder_args(tree)
    stockholder_li = tree.xpath(
        '//table[contains(@class,"ntable ntable-odd npth")]/tr[position()>1]|//table[contains(@class,"ntable ntable-odd npth")]/tbody/tr[position()>1]'
    )
    has_benefit = '最终受益股份' in stockholder_args
    has_paid_in = '实缴出资额' in stockholder_args
    has_relation = '关联产品/机构' in stockholder_args
    # An extra "最终受益股份" column after the share rate pushes every later
    # column one position to the right.
    shift = 1 if has_benefit else 0

    for stockholder_info in stockholder_li:
        count += 1
        stockholder_num = first_text(stockholder_info, 'td[1]/text()')
        stockholder_name = first_text(
            stockholder_info, 'td[2]//*[@class="seo font-14"]/text()')
        stockholder_rate = text_or_child(stockholder_info, 3, 'span')
        subscribed_capital_amount = text_or_child(
            stockholder_info, 4 + shift, 'span')
        subscribed_capital_date = text_or_child(
            stockholder_info, 5 + shift, 'span')

        if not has_paid_in:
            contributed_capital_amount = '--'
            contributed_capital_date = '--'
        else:
            paid_col = 6 + shift
            # Only the amount cell is probed; the date is read the same way
            # (plain text or <span>) as the amount turned out to be stored.
            if first_text(stockholder_info, f'td[{paid_col}]/text()') == '':
                nested = '/span'
            else:
                nested = ''
            contributed_capital_amount = first_text(
                stockholder_info, f'td[{paid_col}]{nested}/text()')
            contributed_capital_date = first_text(
                stockholder_info, f'td[{paid_col + 1}]{nested}/text()')

        if has_relation:
            # Column 6 in the minimal layout, +1 for the benefit column and
            # +2 for the paid-in amount/date pair.
            relation_col = 6 + shift + (2 if has_paid_in else 0)
            relation_product = text_or_child(
                stockholder_info, relation_col, 'a')
        else:
            relation_product = '--'

        localtime = tm().get_localtime()  # current time
        create_time = localtime
        print('\n{0}--总第{1}条----第{2}/{3}页----{0}\n'.format(
            '-' * 9, count, page, sh_page_count))
        print(f'当前时间:{create_time}')
        print(f'公司ID:{com_id}\n公司名称:{com_name}')
        print(
            f'序号:{stockholder_num}\n股东:{stockholder_name}\n持股比例:{stockholder_rate}\n认缴出资额:{subscribed_capital_amount}\n认缴出资日期:{subscribed_capital_date}\n'
            f'实缴出资额:{contributed_capital_amount}\n实缴出资日期:{contributed_capital_date}\n关联产品/机构:{relation_product}\n'
        )

        row = (com_id, stockholder_num, stockholder_name, stockholder_rate,
               subscribed_capital_amount, subscribed_capital_date,
               contributed_capital_amount, contributed_capital_date,
               relation_product, create_time)
        values = ','.join(f'"{sql_text(item)}"' for item in row)
        ins = f"""
        INSERT INTO
        `com_stockholder`
        (com_id,stockholder_num,stockholder_name,stockholder_rate,subscribed_capital_amount,
        subscribed_capital_date,contributed_capital_amount,contributed_capital_date,relation_product,create_time)
        VALUES
        ({values});
        """
        self.db.inssts(ins)