def get_com_id(self):
    """Pick one random company that still has to be processed.

    Queries ``com_info`` for a single random row whose ``origin`` is set,
    whose ``com_id`` is longer than five characters and whose
    ``status_credit_execued`` is still NULL.

    Returns:
        The first row handed back by ``db().selsts`` (``com_id`` followed by
        ``com_name``), or ``[None, None]`` when the query yields nothing.
    """
    sel = """
        SELECT `com_id`,`com_name`
        FROM `com_info`
        WHERE `origin` IS NOT NULL
        AND LENGTH(`com_id`) > 5
        AND `status_credit_execued` IS NULL
        ORDER BY RAND() LIMIT 1;
        """

    # Debug query (pins one known company instead of a random one):
    # sel = """
    #     SELECT `com_id`, `com_name`
    #     FROM `com_info`
    #     WHERE com_id = '299eee201318f0283f086b4847d69fc7';
    #     """

    result = db().selsts(sel)
    # Truthiness covers the empty tuple as well as an empty list or None,
    # any of which would otherwise blow up on result[0].
    if not result:
        return [None, None]
    return result[0]
def get_page_info(self):
    """Scrape one company's patents and store each of them in ``com_patent``.

    The company (id, name, number of list pages, site base URL) comes from
    ``PatentInfo().get_page_count()``. For every list page the patent rows
    are read, each patent's detail page is fetched and parsed, the record is
    printed and inserted, and the company's ``status`` in ``com_info`` is set
    to 1. Nothing is scraped when the returned company id is ``None``.

    Raises:
        IndexError: when a mandatory cell is missing from a list row or from
            the detail table (only the applicant, agency and abstract-image
            cells have fallbacks).
    """

    def cell_text(tree, label, tail='/text()'):
        """Return the stripped first XPath hit in the cell following *label*."""
        hits = tree.xpath(
            f'//table[@class="ntable"]/tbody/tr/td[contains(text(),"{label}")]'
            f'/following-sibling::td[1]{tail}')
        return hits[0].strip()

    def linked_cell_text(tree, label):
        """Return the cell's link text, or its plain text when it has no link."""
        try:
            return cell_text(tree, label, '/a/text()')
        except IndexError:
            # The fallback must select text nodes: stripping the bare <td>
            # element itself is not possible.
            return cell_text(tree, label)

    pt = PatentInfo()
    value = pt.get_page_count()
    com_id, com_name, count_page = value[0], value[1], value[2]

    if com_id is not None:
        key = pt.search_key(com_name)
        index_url = value[3]
        count = 0  # running number of patents seen across all pages
        for page in range(1, count_page + 1):
            # e.g. https://www.qichacha.com/company_getinfos?unique=&companyname=&p=2&tab=assets&box=zhuanli&zlpublicationyear=&zlipclist=&zlkindcode=&zllegalstatus=
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=zhuanli'
            hds = gh().header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))
            res_pg = requests.get(page_url, headers=hds).text
            tree_pg = etree.HTML(res_pg)

            # The first table row is skipped by the XPath itself.
            for content in tree_pg.xpath('//table/tr[position()>1]'):
                count += 1
                patent_num = content.xpath('td[1]/text()')[0]
                patent_type = content.xpath('td[2]/text()')[0]
                patent_pub_num = content.xpath('td[3]/text()')[0]
                patent_pub_date = content.xpath('td[4]/text()')[0]
                patent_name = content.xpath('td[5]/a/text()')[0].strip()
                patent_link = content.xpath('td[5]/a/@href')[0]
                patent_url = ''.join((index_url, patent_link))

                time.sleep(random.randint(1, 3))
                res_dt = requests.get(patent_url, headers=hds).text
                tree_dt = etree.HTML(res_dt)

                app_num = cell_text(tree_dt, '申请号')
                app_date = cell_text(tree_dt, '申请日')
                prio_date = cell_text(tree_dt, '优先权日')
                prio_num = cell_text(tree_dt, '优先权号')
                inventor = cell_text(tree_dt, '发明人')
                applicant = linked_cell_text(tree_dt, '申请(专利权)人')
                agency = linked_cell_text(tree_dt, '代理机构')
                agent = cell_text(tree_dt, '代理人')
                ipc = cell_text(tree_dt, 'IPC分类号').replace(' ', '').replace('\n', '')
                cpc = cell_text(tree_dt, 'CPC分类号')
                app_address = cell_text(tree_dt, '申请人地址')
                app_zip_code = cell_text(tree_dt, '申请人邮编')
                abstract = cell_text(tree_dt, '摘要')
                try:
                    abstract_photo = cell_text(tree_dt, '摘要附图', '/img/@src')
                except IndexError:
                    abstract_photo = '-'  # no abstract drawing on the page

                # Joining an empty XPath result simply yields '', so no
                # exception handling is needed for the two long-text fields.
                claim = ''.join(tree_dt.xpath(
                    '//table[@class="ntable"]/tr/td[@class="ea_instructions" and position()=1]/p/text()'
                ))
                spec_cell = ('//div[@class="tcaption"]/h3[text()="说明书"]/parent::div'
                             '/following-sibling::table[@class="ntable"]/tr/td[@class="ea_instructions"]')
                instructions = ''.join(tree_dt.xpath(
                    f'{spec_cell}/h1/text()|{spec_cell}/h2/text()|{spec_cell}/p/text()'
                ))

                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                localtime = tm().get_localtime()  # current time
                print(f'公司ID:{com_id} 当前时间:{localtime}')
                print(
                    f'序号:{patent_num}\n专利类型:{patent_type}\n公开(公告)号:{patent_pub_num}\n公开(公告)日期:{patent_pub_date}\n专利名称:{patent_name}\n'
                    f'专利页URL:{patent_url}\n申请号:{app_num}\n申请日期:{app_date}\n优先权日:{prio_date}\n优先权号:{prio_num}\n'
                    f'发明人:{inventor}\n申请(专利权)人:{applicant}\n代理机构:{agency}\n代理人:{agent}\nIPC分类号:{ipc}\n'
                    f'CPC分类号:{cpc}\n申请人地址:{app_address}\n申请人邮编:{app_zip_code}\n摘要:{abstract}\n摘要附图:{abstract_photo}\n'
                    f'权利要求:{claim}\n说明书:{instructions}')

                # NOTE(review): scraped text is interpolated straight into the
                # statement; a double quote in any field breaks (or alters) the
                # SQL. Switch to bound parameters if the db helper supports them.
                ins = f"""
                    INSERT INTO
                    `com_patent`
                    (`com_id`,`patent_num`,`patent_type`,`patent_pub_num`,`patent_pub_date`,
                    `patent_name`,`patent_url`,`app_num`,`app_date`,`prio_date`,
                    `prio_num`,`inventor`,`applicant`,`agency`,`agent`,
                    `ipc`,`cpc`,`app_address`,`app_zip_code`,`abstract`,`abstract_photo`,
                    `claim`,`instructions`)
                    VALUES
                    ("{com_id}","{patent_num}","{patent_type}","{patent_pub_num}","{patent_pub_date}",
                    "{patent_name}","{patent_url}","{app_num}","{app_date}","{prio_date}",
                    "{prio_num}","{inventor}","{applicant}","{agency}","{agent}",
                    "{ipc}","{cpc}","{app_address}","{app_zip_code}","{abstract}","{abstract_photo}",
                    "{claim}","{instructions}");
                    """
                db().inssts(ins)

                # NOTE(review): the company is flagged right after each stored
                # record; confirm per-record (rather than once per company)
                # is the intended placement.
                upd = f"""
                    UPDATE
                    `com_info`
                    SET
                    `status` = 1
                    WHERE
                    `com_id` = "{com_id}" ;
                    """
                db().updsts(upd)

    # NOTE(review): the closing message is printed even when no company was
    # available; confirm that is wanted.
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}')
def get_com_name(self, sql):
    """Execute *sql* via ``db().inssts`` and hand back its result unchanged.

    NOTE(review): the method name suggests a lookup, yet it goes through
    ``inssts``, the helper the rest of this file uses for INSERT statements.
    Confirm this is the intended helper.
    """
    return db().inssts(sql)
def get_page_info(self):
    """Scrape one company's enforcement records into ``com_credit_execued``.

    The company (id, name, number of list pages) comes from
    ``Credit().get_page_count()``. Each list page is fetched; for every row
    the related case page is fetched as well to read the person subject to
    enforcement and their ID / organisation code. Every row is printed and
    inserted, and the company's ``status_credit_execued`` is set to 1. A row
    that cannot be parsed is stored as an all-NULL record. The method ends
    with a three second pause.
    """

    def is_blocked(html):
        """Report an anti-scraping response, wait for the operator, return True."""
        if '<script>window.location.href' in html:
            print('访问频繁,需验证!{get_page_info}')
            input('暂停')
        elif '<script>location.href="/user_login"</script>' in html:
            print('Cookie失效,需更换!{get_page_info}')
            input('程序暂停运行!')
        elif '您的账号访问超频,请稍后访问或联系客服人员' in html:
            print('账号访问超频,请更换账号!{get_page_info}')
            input('程序暂停运行!')
        else:
            return False
        return True

    cd = Credit()
    value = cd.get_page_count()
    com_id, com_name, count_page = value[0], value[1], value[2]
    key = dk().search_key(com_name)
    index_url = 'https://www.qichacha.com'
    count = 0  # running number of rows seen across all pages

    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=susong&box=zhixing'
        hds = gh().header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        time.sleep(random.randint(1, 2))
        res_pg = requests.get(page_url, headers=hds).text
        if is_blocked(res_pg):
            # The page is not retried after the operator resumes.
            continue

        tree_pg = etree.HTML(res_pg)
        # The first two table rows are skipped by the XPath itself.
        for content in tree_pg.xpath(
                '//table[@class="ntable ntable-odd"]/tr[position()>2]'):
            count += 1
            # Bound up front so a blocked case page cannot leave them
            # undefined or carrying the previous row's values.
            exec_person = occ = None
            try:
                exec_num = content.xpath('td[1]/text()')[0]
                case_num = content.xpath('td[2]/a/text()')[0]
                # The case id is the quoted argument after 'zhixing",' in
                # the link's onclick handler.
                case_id = content.xpath(
                    'td[2]/a[contains(@onclick,"showRelatModal")]/@onclick'
                )[0].split('zhixing",')[1].split('"')[1]
                case_url = f'https://www.qichacha.com/company_zhixingRelat?id={case_id}'
                filing_time = content.xpath('td[3]/text()')[0]
                court_of_exec = content.xpath('td[4]/text()')[0]
                exec_obj = content.xpath('td[5]/text()')[0]

                time.sleep(random.randint(1, 2))
                res_info = requests.get(case_url, headers=hds).text
                if not is_blocked(res_info):
                    tree_info = etree.HTML(res_info)
                    exec_person = tree_info.xpath(
                        '//table/tbody/tr[1]/td[2]/text()')[0]
                    occ = tree_info.xpath(
                        '//table/tbody/tr[1]/td[4]/text()')[0]
            except Exception:
                # Best effort: any parsing or request failure turns the row
                # into an all-NULL record. Ctrl-C is no longer swallowed.
                exec_num = case_num = case_id = case_url = None
                filing_time = court_of_exec = exec_obj = None
                exec_person = occ = None

            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            create_time = tm().get_localtime()  # current time
            print(f'当前时间:{create_time}')
            print(
                f'公司ID:{com_id}\n序号:{exec_num}\n案号:{case_num}\n案例ID:{case_id}\n案例链接:{case_url}\n'
                f'立案时间:{filing_time}\n执行法院:{court_of_exec}\n执行标的:{exec_obj}\n被执行人:{exec_person}\n身份证号/组织机构代码:{occ}\n'
            )

            if exec_num is None:
                # Ten columns need ten values.
                ins = """
                    INSERT INTO
                    `com_credit_execued`
                    (`com_id`,`exec_num`,`case_num`,`case_id`,`filing_time`,
                    `court_of_exec`,`exec_obj`,`exec_person`,`occ`,`create_time`)
                    VALUES
                    (NULL,NULL,NULL,NULL,NULL,
                    NULL,NULL,NULL,NULL,NULL);
                    """
            else:
                # NOTE(review): scraped text is interpolated straight into
                # the statement; a double quote in any field breaks the SQL.
                # Switch to bound parameters if the db helper supports them.
                ins = f"""
                    INSERT INTO
                    `com_credit_execued`
                    (`com_id`,`exec_num`,`case_num`,`case_id`,`filing_time`,
                    `court_of_exec`,`exec_obj`,`exec_person`,`occ`,`create_time`)
                    VALUES
                    ("{com_id}","{exec_num}","{case_num}","{case_id}","{filing_time}",
                    "{court_of_exec}","{exec_obj}","{exec_person}","{occ}","{create_time}");
                    """
            db().inssts(ins)

            # NOTE(review): the company is flagged right after each stored
            # row; confirm per-row (rather than once per company) is the
            # intended placement.
            upd = f"""
                UPDATE
                `com_info`
                SET
                `status_credit_execued` = 1
                WHERE
                `com_id` = "{com_id}" ;
                """
            db().updsts(upd)

    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)
def __init__(self):
    """Set the site base URL and create the helper objects used by this class."""
    # Base URL of the scraped site.
    self.index_url = 'https://www.qichacha.com'

    # NOTE(review): if this constructor belongs to RecruitInfo itself, the
    # next line recurses without end. Confirm the enclosing class.
    self.rc = RecruitInfo()
    # Project helpers, one instance each.
    self.db = db()
    self.gh = gh()
    self.gm = gm()
def __init__(self):
    """Set the site base URL and create the helper objects used by this class."""
    # Base URL of the scraped site.
    self.index_url = 'https://www.qichacha.com'

    # Project helpers, one instance each.
    self.db = db()
    self.gh = gh()
    self.tm = tm()
    self.gm = gm()
def __init__(self):
    """Create the ``db`` helper instance and keep it on ``self.db``."""
    helper = db()
    self.db = helper
def get_page_info(self):
    """Scrape one company's patents and store each of them in ``com_patent``.

    ``PatentInfo().get_page_count()`` supplies the company and the site base
    URL, but the company id, company name and page count are currently
    overwritten by a hard-coded one-off backfill (marked [001] below) that
    also starts at list page 148.

    For every list page the patent rows are read, each patent's detail page
    is fetched and parsed, the record is printed and inserted, and the
    company's ``status_patent`` in ``com_info`` is set to 1. Pages or patents
    answered with an anti-scraping response are reported on the console,
    wait for the operator and are then skipped. The method ends with a three
    second pause.

    Raises:
        IndexError: when a mandatory cell is missing from a list row or from
            the detail table (only the applicant, agency and abstract-image
            cells have fallbacks).
    """

    def is_blocked(html, tag):
        """Report an anti-scraping response, wait for the operator, return True."""
        if '<script>window.location.href' in html:
            print('访问频繁,需验证!' + tag)
            input('暂停')
        elif '<script>location.href="/user_login"</script>' in html:
            print('Cookie失效,需更换!' + tag)
            input('程序暂停运行!')
        elif '您的账号访问超频,请稍后访问或联系客服人员' in html:
            print('账号访问超频,请更换账号!' + tag)
            input('程序暂停运行!')
        else:
            return False
        return True

    def cell_text(tree, label, tail='/text()'):
        """Return the stripped first XPath hit in the cell following *label*."""
        hits = tree.xpath(
            f'//table[@class="ntable"]/tbody/tr/td[contains(text(),"{label}")]'
            f'/following-sibling::td[1]{tail}')
        return hits[0].strip()

    def linked_cell_text(tree, label):
        """Return the cell's link text, or its plain text when it has no link."""
        try:
            return cell_text(tree, label, '/a/text()')
        except IndexError:
            # The fallback must select text nodes: stripping the bare <td>
            # element itself is not possible.
            return cell_text(tree, label)

    pt = PatentInfo()
    value = pt.get_page_count()
    com_id, com_name, count_page = value[0], value[1], value[2]

    # TEMPORARY one-off backfill [001]: overrides the company picked above.
    # TODO: remove together with the start page 148 below once it has run.
    com_id = '6129f29192de208800c7b5d23486a154'
    com_name = '乐融致新电子科技(天津)有限公司'
    count_page = 298

    if com_id is not None:
        key = dk().search_key(com_name)
        index_url = value[3]
        count = 0  # running number of patents seen across all pages
        # TEMPORARY [001]: the regular loop is range(1, count_page + 1).
        for page in range(148, count_page + 1):
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=zhuanli'
            hds = gh().header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))
            res_pg = requests.get(page_url, headers=hds).text
            if is_blocked(res_pg, '{get_page_info[1]}'):
                # The page is not retried after the operator resumes.
                continue

            tree_pg = etree.HTML(res_pg)
            # The first table row is skipped by the XPath itself.
            for content in tree_pg.xpath('//table/tr[position()>1]'):
                count += 1
                patent_num = content.xpath('td[1]/text()')[0]
                patent_type = content.xpath('td[2]/text()')[0]
                patent_pub_num = content.xpath('td[3]/text()')[0]
                patent_pub_date = content.xpath('td[4]/text()')[0]
                patent_name = content.xpath('td[5]/a/text()')[0].strip()
                patent_link = content.xpath('td[5]/a/@href')[0]
                patent_id = patent_link.split('_com_')[1]
                patent_url = ''.join((index_url, patent_link))

                time.sleep(random.randint(1, 3))
                res_dt = requests.get(patent_url, headers=hds).text
                if is_blocked(res_dt, '{get_page_info[2]}'):
                    # NOTE(review): a blocked detail page stores nothing for
                    # this patent; confirm that skipping it is intended.
                    continue

                tree_dt = etree.HTML(res_dt)
                app_num = cell_text(tree_dt, '申请号')
                app_date = cell_text(tree_dt, '申请日')
                prio_date = cell_text(tree_dt, '优先权日')
                prio_num = cell_text(tree_dt, '优先权号')
                inventor = cell_text(tree_dt, '发明人')
                applicant = linked_cell_text(tree_dt, '申请(专利权)人')
                agency = linked_cell_text(tree_dt, '代理机构')
                agent = cell_text(tree_dt, '代理人')
                ipc = cell_text(tree_dt, 'IPC分类号').replace(' ', '').replace('\n', '')
                cpc = cell_text(tree_dt, 'CPC分类号')
                app_address = cell_text(tree_dt, '申请人地址')
                app_zip_code = cell_text(tree_dt, '申请人邮编')
                abstract = cell_text(tree_dt, '摘要')
                try:
                    abstract_photo = cell_text(tree_dt, '摘要附图', '/img/@src')
                except IndexError:
                    abstract_photo = '-'  # no abstract drawing on the page

                # Joining an empty XPath result simply yields '', so no
                # exception handling is needed for the two long-text fields.
                claim = ''.join(tree_dt.xpath(
                    '//table[@class="ntable"]/tr/td[@class="ea_instructions" and position()=1]/p/text()'
                ))
                spec_cell = ('//div[@class="tcaption"]/h3[text()="说明书"]/parent::div'
                             '/following-sibling::table[@class="ntable"]/tr/td[@class="ea_instructions"]')
                instructions = ''.join(tree_dt.xpath(
                    f'{spec_cell}/h1/text()|{spec_cell}/h2/text()|{spec_cell}/p/text()'
                ))

                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                localtime = tm().get_localtime()  # current time
                create_time = localtime
                print(f'公司ID:{com_id} 当前时间:{localtime}')
                print(f'公司名称:{com_name}\n专利ID:{patent_id}')
                print(
                    f'序号:{patent_num}\n专利类型:{patent_type}\n公开(公告)号:{patent_pub_num}\n公开(公告)日期:{patent_pub_date}\n专利名称:{patent_name}\n'
                    f'专利页URL:{patent_url}\n申请号:{app_num}\n申请日期:{app_date}\n优先权日:{prio_date}\n优先权号:{prio_num}\n'
                    f'发明人:{inventor}\n申请(专利权)人:{applicant}\n代理机构:{agency}\n代理人:{agent}\nIPC分类号:{ipc}\n'
                    f'CPC分类号:{cpc}\n申请人地址:{app_address}\n申请人邮编:{app_zip_code}\n摘要:{abstract}\n摘要附图:{abstract_photo}\n'
                    f'权利要求:{claim}\n说明书:{instructions}\n')

                # NOTE(review): scraped text is interpolated straight into the
                # statement; a double quote in any field breaks (or alters) the
                # SQL. Switch to bound parameters if the db helper supports them.
                ins = f"""
                    INSERT INTO
                    `com_patent`
                    (`com_id`,`patent_num`,`patent_type`,`patent_pub_num`,`patent_pub_date`,
                    `patent_name`,`patent_url`,`app_num`,`app_date`,`prio_date`,
                    `prio_num`,`inventor`,`applicant`,`agency`,`agent`,
                    `ipc`,`cpc`,`app_address`,`app_zip_code`,`abstract`,`abstract_photo`,
                    `claim`,`instructions`,`create_time`,`patent_id`)
                    VALUES
                    ("{com_id}","{patent_num}","{patent_type}","{patent_pub_num}","{patent_pub_date}",
                    "{patent_name}","{patent_url}","{app_num}","{app_date}","{prio_date}",
                    "{prio_num}","{inventor}","{applicant}","{agency}","{agent}",
                    "{ipc}","{cpc}","{app_address}","{app_zip_code}","{abstract}","{abstract_photo}",
                    "{claim}","{instructions}","{create_time}","{patent_id}");
                    """
                db().inssts(ins)

                # NOTE(review): the company is flagged right after each stored
                # record; confirm per-record (rather than once per company)
                # is the intended placement.
                upd = f"""
                    UPDATE
                    `com_info`
                    SET
                    `status_patent` = 1
                    WHERE
                    `com_id` = "{com_id}" ;
                    """
                db().updsts(upd)

    # Runs whether or not a company was available, so a caller looping over
    # this method is always throttled by the final pause.
    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)
def get_page_info(self):
    """Scrape one company's administrative licences into
    ``com_credit_adm_license_bc``.

    The company (id, name, number of list pages) comes from
    ``AdmLicenseBc().get_page_count()``. Each list page is fetched and every
    licence row is printed and inserted; the company's
    ``status_credit_adm_license_bc`` is set to 1. A row with a missing cell
    is stored as an all-NULL record. Pages answered with an anti-scraping
    response are reported on the console, wait for the operator and are then
    skipped. The method ends with a three second pause.
    """

    def is_blocked(html):
        """Report an anti-scraping response, wait for the operator, return True."""
        if '<script>window.location.href' in html:
            print('访问频繁,需验证!{get_page_info[2]}')
            input('暂停')
        elif '<script>location.href="/user_login"</script>' in html:
            print('Cookie失效,需更换!{get_page_info[2]}')
            input('程序暂停运行!')
        elif '您的账号访问超频,请稍后访问或联系客服人员' in html:
            print('账号访问超频,请更换账号!{get_page_info[2]}')
            input('程序暂停运行!')
        else:
            return False
        return True

    alb = AdmLicenseBc()
    value = alb.get_page_count()
    com_id, com_name, count_page = value[0], value[1], value[2]
    key = dk().search_key(com_name)
    index_url = 'https://www.qichacha.com'
    count = 0  # running number of rows seen across all pages

    for page in range(1, count_page + 1):
        page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=run&box=licens'
        hds = gh().header()
        hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
        time.sleep(random.randint(1, 2))
        res = requests.get(page_url, headers=hds).text
        if is_blocked(res):
            # The page is not retried after the operator resumes.
            continue

        tree = etree.HTML(res)
        # The first two table rows are skipped by the XPath itself.
        for content in tree.xpath(
                '//table[@class="ntable ntable-odd"]/tr[position()>2]'):
            count += 1
            try:
                license_num = content.xpath('td[1]/text()')[0]
                license_doc_num = content.xpath('td[2]/text()')[0]
                license_doc_name = content.xpath('td[3]/text()')[0]
                valid_period_from = content.xpath('td[4]/text()')[0]
                valid_period_to = content.xpath('td[5]/text()')[0]
                license_office = content.xpath('td[6]/text()')[0]
                license_content = content.xpath('td[7]/text()')[0]
            except IndexError:
                # A cell without text: the whole row is stored as NULLs.
                license_num = license_doc_num = license_doc_name = None
                valid_period_from = valid_period_to = None
                license_office = license_content = None

            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                '-' * 9, count, page, count_page))
            create_time = tm().get_localtime()  # current time
            print(f'当前时间:{create_time}')
            print(
                f'公司ID:{com_id}\n序号:{license_num}\n许可文件编号:{license_doc_num}\n许可文件名称:{license_doc_name}\n有效期自:{valid_period_from}\n'
                f'有效期至:{valid_period_to}\n许可机关:{license_office}\n许可内容:{license_content}'
            )

            if license_num is None:
                ins = """
                    INSERT INTO
                    `com_credit_adm_license_bc`
                    (`com_id`,`license_num`,`license_doc_num`,`license_doc_name`,`valid_period_from`,
                    `valid_period_to`,`license_office`,`license_content`,`create_time`)
                    VALUES
                    (NULL,NULL,NULL,NULL,NULL,
                    NULL,NULL,NULL,NULL);
                    """
            else:
                # NOTE(review): scraped text is interpolated straight into
                # the statement; a double quote in any field breaks the SQL.
                # Switch to bound parameters if the db helper supports them.
                ins = f"""
                    INSERT INTO
                    `com_credit_adm_license_bc`
                    (`com_id`,`license_num`,`license_doc_num`,`license_doc_name`,`valid_period_from`,
                    `valid_period_to`,`license_office`,`license_content`,`create_time`)
                    VALUES
                    ("{com_id}","{license_num}","{license_doc_num}","{license_doc_name}","{valid_period_from}",
                    "{valid_period_to}","{license_office}","{license_content}","{create_time}");
                    """
            db().inssts(ins)

            # NOTE(review): the company is flagged right after each stored
            # row; confirm per-row (rather than once per company) is the
            # intended placement.
            upd = f"""
                UPDATE
                `com_info`
                SET
                `status_credit_adm_license_bc` = 1
                WHERE
                `com_id` = "{com_id}" ;
                """
            db().updsts(upd)

    localtime = tm().get_localtime()  # current time
    print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
    print(f'当前时间:{localtime}\n')
    time.sleep(3)