def insert_data(self):
    """Download one qichacha.com firm page through a rotating proxy and print it.

    The firm key comes from ``self.get_uniques()``.  The request is retried
    without limit: whenever it fails with a proxy, timeout, SSL or connection
    error, another proxy is taken from ``proxy_pool`` and the request is sent
    again.  The database write-back that used to follow is disabled, so the
    method only prints the page text and then pauses for 1.5 seconds.
    """
    # The proxy in use is published both on the instance and as a
    # module-level name.
    global proxy

    recoverable = (
        requests.exceptions.ProxyError,
        requests.exceptions.ConnectTimeout,
        requests.exceptions.ReadTimeout,
        requests.exceptions.SSLError,
        requests.exceptions.ConnectionError,
    )

    firm_key = self.get_uniques()
    started_at = time.time()  # only needed by the disabled write-back
    rotation_count = 1

    while True:
        try:
            self.proxy = proxy_pool.change_proxy()
            proxy = self.proxy
            page = requests.get(
                'https://www.qichacha.com/firm_%s' % firm_key,
                headers=headers_pool.requests_headers(),
                proxies=proxy,
                timeout=2,
            )
        except recoverable:
            # Pick another proxy, report it, and go round again.
            self.proxy = proxy_pool.change_proxy()
            proxy = self.proxy
            print('changing proxy...%s...%s' % (rotation_count, self.proxy))
            rotation_count += 1
        else:
            break

    body = page.text
    print(body)
    time.sleep(1.5)
def insert_data(self):
    """Fill in the detail columns of the pending ``company_branch`` rows.

    For every branch key returned by ``self.get_uniques()`` the company
    fields are fetched with ``get_all_fields.get_fields`` through a rotating
    proxy, written to the matching ``company_branch`` row (together with the
    current timestamp and ``status = 1``), and committed.  ``None`` keys are
    skipped.  The method pauses 1.5 seconds after each row.

    Raises:
        NameError: when the 800th row of one call is reached.  Presumably
            the caller catches this to switch to a fresh API token --
            confirm against the caller.
    """
    # The proxy in use is published both on the instance and as a
    # module-level name, so the declaration has to precede every assignment.
    global proxy

    uniques = self.get_uniques()
    if not uniques:
        print('table company_branch is ok')
        return

    print('正在向company_branch插入数据...')

    retryable_errors = (
        requests.exceptions.ProxyError,
        requests.exceptions.ConnectTimeout,
        requests.exceptions.ReadTimeout,
        requests.exceptions.SSLError,
        requests.exceptions.ConnectionError,
    )

    n = 1
    for unique in uniques:
        if unique is None:
            continue

        # Stop once the token has been used for 799 rows in this call.
        if n == 800:
            raise NameError('token usage limit reached')

        create_time = time.time()

        # Fetch the field tuple, switching proxy until a request succeeds.
        p_num = 1
        while True:
            try:
                self.proxy = proxy_pool.change_proxy()
                proxy = self.proxy
                fields, _ = get_all_fields.get_fields(unique, token, proxy)
            except retryable_errors:
                self.proxy = proxy_pool.change_proxy()
                proxy = self.proxy
                print('changing proxy...%s...%s' % (p_num, self.proxy))
                p_num += 1
                continue
            break

        # Placeholder order: the fetched fields, then create_time, status
        # and finally the branch key used in the WHERE clause.
        company_fields = list(fields)
        unique = json.dumps(unique, encoding="utf-8", ensure_ascii=False)
        company_fields.append(create_time)
        company_fields.append(1)
        company_fields.append(unique)
        company_fields = tuple(company_fields)

        # NOTE(review): the statement is assembled by string interpolation,
        # so it relies on every value arriving already quoted (presumably by
        # get_all_fields.get_fields -- confirm).  Driver-side parameter
        # binding would be safer for scraped data.
        cursor.execute(
            'update company_branch set name=%s, phone=%s, website=%s, '
            'email=%s, province=%s, city=%s, county=%s, address=%s, '
            'intro=%s, registered_capital=%s, actual_capital=%s, '
            'operating_state=%s, establishment_date=%s, uscc=%s, '
            'taxpayer_number=%s, registration_number=%s, '
            'organization_code=%s, type=%s, industry=%s, approval_date=%s, '
            'registration_authority=%s, area=%s, english_name=%s, '
            'used_name=%s, insurancer_count=%s, staff_count=%s, '
            'operation_period=%s, operation_scope=%s, create_time=%s, '
            'status=%s where branch_no=%s' % company_fields)
        db.commit()

        cursor.execute(
            'select id from company_branch where branch_no=%s' % unique)
        branch_id = cursor.fetchone()[0]
        print('第%s条插入成功,已插入%s条,剩余%s条'
              % (branch_id, n, len(uniques) - n))
        n += 1
        time.sleep(1.5)
def insert_keyno(self):
    """Look up the remote ``KeyNo`` of each pending company and store it.

    ``self.get_id_name()`` supplies a mapping of ``source_company`` row id to
    a collection of name strings.  For each entry the names are joined with
    commas and sent to the advanced-search endpoint; the ``KeyNo`` of the
    first hit is written to the row together with the current timestamp, and
    the change is committed.  The method pauses 2.5 seconds after each search.

    Rows are matched by their own id while iterating the mapping, so two rows
    that share the same names are both updated.
    """
    company_dict = self.get_id_name()
    if not company_dict:
        print('table source_company is ok')
        return

    print('正在向source_company插入数据...')

    for company_id, com in company_dict.items():
        # Called before every lookup; per the original author's note this
        # swaps the API token once it has been used more than 1000 times.
        config.change_token()

        search_key = ','.join(com)
        headers = headers_pool.requests_headers()
        proxy = proxy_pool.change_proxy()
        response = requests.get(
            'https://xcx.qichacha.com/wxa/v1/base/advancedSearchNew?searchKey=%s&token=%s'
            % (search_key, token),
            headers=headers,
            proxies=proxy)
        payload = json.loads(response.text)

        # Only the first search hit is used.
        first_hit = payload.get('result').get('Result')[0]
        keyno = first_hit.get('KeyNo')
        time.sleep(2.5)

        update_time = time.time()
        # NOTE(review): keyno comes from the remote service and is
        # interpolated straight into the SQL text; driver-side parameter
        # binding would be safer.
        cursor.execute(
            "update source_company set key_no='%s',update_time=%s where id='%s'"
            % (keyno, update_time, company_id))
        db.commit()
def insert_data(self):
    """Insert each pending company with its branches and investments.

    For every key returned by ``self.get_uniques()``:

    1. fetch the company fields with ``get_all_fields.get_fields`` through a
       rotating proxy and insert one ``company`` row;
    2. insert one ``company_branch`` row per entry of the result's
       ``Branches`` list;
    3. download the investment list (20 entries per page) and insert one
       ``company_investment`` row per entry;
    4. set ``status = 1`` on the matching ``source_company`` row and commit.

    The commit happens once per company, after all three tables have been
    written.  Per the original author's note, an interrupted run can then
    resume from the rows whose status is still 0.
    """
    # proxy and token_num are module-level names that this method rebinds.
    global proxy, token_num

    uniques = self.get_uniques()
    if not uniques:
        print('table company is ok')
        return

    print('正在向company插入数据...')

    retryable_errors = (
        requests.exceptions.ProxyError,
        requests.exceptions.ConnectTimeout,
        requests.exceptions.ReadTimeout,
        requests.exceptions.SSLError,
        requests.exceptions.ConnectionError,
    )
    investments_url = (
        'https://xcx.qichacha.com/wxa/v1/base/getInvestments?unique=%s&token=%s')

    def fetch_json(url):
        """GET *url* via the current proxy, rotating proxies until it works."""
        while True:
            try:
                reply = requests.get(url, headers=config.headers,
                                     proxies=self.proxy, verify=False,
                                     timeout=2)
            except requests.exceptions.RequestException:
                # Only request failures are retried; anything else (Ctrl-C,
                # programming errors) is allowed to propagate.
                self.proxy = proxy_pool.change_proxy()
                continue
            return json.loads(reply.text)

    def store_investments(company_id, investments):
        """Insert one company_investment row per entry of *investments*."""
        for item in investments:
            keyno = json.dumps(item.get('KeyNo'), encoding="utf-8",
                               ensure_ascii=False)
            name = json.dumps(item.get('Name'), encoding="utf-8",
                              ensure_ascii=False)
            cursor.execute(
                'insert into company_investment(investment_no,company_id,name) values(%s,%s,%s)'
                % (keyno, company_id, name))

    # Progress counter; it must live outside the loop to keep counting.
    n = 1
    for unique in uniques:
        create_time = time.time()
        # Called before every company; per the original author's note this
        # swaps the API token once it has been used more than 1000 times.
        config.change_token()

        # Fetch the field tuple, switching proxy until a request succeeds.
        p_num = 1
        while True:
            try:
                self.proxy = proxy_pool.change_proxy()
                proxy = self.proxy
                (fields, result) = get_all_fields.get_fields(
                    unique, token, proxy)
            except retryable_errors:
                self.proxy = proxy_pool.change_proxy()
                proxy = self.proxy
                print('changing proxy...%s...%s' % (p_num, self.proxy))
                p_num += 1
                continue
            break

        # Placeholder order: company key, the fetched fields, create_time
        # and status.
        quoted_unique = json.dumps(unique, encoding="utf-8",
                                   ensure_ascii=False)
        company_fields = list(fields)
        company_fields.insert(0, quoted_unique)
        company_fields.append(create_time)
        company_fields.append(1)
        company_fields = tuple(company_fields)

        # NOTE(review): every statement below is assembled by string
        # interpolation, with json.dumps standing in for SQL quoting.
        # Driver-side parameter binding would be safer for scraped data.
        cursor.execute(
            'insert into company(company_no,name,phone,website,email,'
            'province,city,county,address,intro,registered_capital,'
            'actual_capital,operating_state,establishment_date,uscc,'
            'taxpayer_number,registration_number,organization_code,type,'
            'industry,approval_date,registration_authority,area,'
            'english_name,used_name,insurancer_count,staff_count,'
            'operation_period,operation_scope,create_time,status) '
            'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,'
            '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,'
            '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' % company_fields)

        # Branches: branch_no, company_id, vestin_company and name go into
        # company_branch.  A missing list is treated as "no branches".
        branches = result.get('Branches') or []
        cursor.execute(
            'select id from company where company_no=%s' % quoted_unique)
        company_id = cursor.fetchone()[0]
        for branch in branches:
            keyno = json.dumps(branch.get('KeyNo'), encoding="utf-8",
                               ensure_ascii=False)
            name = json.dumps(branch.get('Name'), encoding="utf-8",
                              ensure_ascii=False)
            cursor.execute(
                'insert into company_branch(branch_no,company_id,vestin_company,name) values(%s,%s,%s,%s)'
                % (keyno, company_id, 1, name))

        # Investments: the first page is requested without a page index.
        plain_unique = json.loads(quoted_unique)
        token_num += 1
        js = fetch_json(investments_url % (plain_unique, token))
        store_investments(company_id, js.get('result').get('Result') or [])

        # The listing is paginated, 20 companies per page; the total from
        # the first page decides how many further pages to request.
        total_investment = js.get('result').get('Paging').get('TotalRecords')
        extra_pages = (total_investment - 1) // 20
        for page_index in range(2, extra_pages + 2):
            token_num += 1
            js = fetch_json(
                (investments_url + '&pageIndex=%s')
                % (plain_unique, token, page_index))
            store_investments(company_id,
                              js.get('result').get('Result') or [])
            # Pause between page requests.
            time.sleep(2)

        # Mark the source row as done, then commit all writes together.
        done_key = json.dumps(plain_unique, encoding="utf-8",
                              ensure_ascii=False)
        cursor.execute(
            'update source_company set status=1 where key_no=%s' % done_key)
        db.commit()
        time.sleep(2)

        print('已插入%s条,剩余%s条' % (n, len(uniques) - n))
        n += 1
def insert_data(self):
    """Download one firm page and print its registered-capital field.

    The firm key comes from ``self.get_uniques()``.  The page is requested
    with a fixed session cookie through a rotating proxy and retried without
    limit on proxy, timeout, SSL or connection errors.  The registered
    capital is read from the ``Cominfo`` table by XPath, with ``'-'`` mapped
    to ``None``, then JSON-encoded and printed.  The database write-back is
    disabled, so nothing is stored.  The method pauses one second at the end.

    Raises:
        UnboundLocalError: when the reply is a JavaScript redirect stub and
            not the firm page.  Per the original author's note this is meant
            to trigger the caller's exception handling.
    """
    # The proxy in use is published both on the instance and as a
    # module-level name.
    global proxy

    recoverable = (
        requests.exceptions.ProxyError,
        requests.exceptions.ConnectTimeout,
        requests.exceptions.ReadTimeout,
        requests.exceptions.SSLError,
        requests.exceptions.ConnectionError,
    )

    firm_key = self.get_uniques()
    print('unique = ' + firm_key)
    started_at = time.time()  # only needed by the disabled write-back
    rotation_count = 1

    while True:
        try:
            session_cookie = {
                'Hm_lpvt_3456bee468c83cc63fb5147f119f1075': str(int(time.time())),
                'QCCSESSID': 'kh50oeettqgbbphg0k57p37t40'
            }
            self.proxy = proxy_pool.change_proxy()
            proxy = self.proxy
            page = requests.get(
                'https://www.qichacha.com/firm_%s' % firm_key,
                headers=headers_pool.requests_headers(),
                proxies=proxy,
                cookies=session_cookie,
                timeout=2,
            )
        except recoverable:
            # Pick another proxy, report it, and go round again.
            self.proxy = proxy_pool.change_proxy()
            proxy = self.proxy
            print('changing proxy...%s...%s' % (rotation_count, self.proxy))
            rotation_count += 1
        else:
            break

    body = page.content
    print(body)

    # Sometimes a redirect stub is served in place of the real page.
    if '<script>window.location.href' in body:
        raise UnboundLocalError

    tree = etree.HTML(body)

    # Registered capital; a lone dash means "not available".
    capital_cell = tree.xpath(
        '//*[@id="Cominfo"]/table[2]/tr[1]/td[2]//text()')
    capital_text = capital_cell[0].strip().encode('utf-8')
    capital_value = None if capital_text == '-' else capital_text
    registered_capital = json.dumps(capital_value, encoding="utf-8",
                                    ensure_ascii=False)

    probe = tree.xpath(
        '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[1]/span[2]/span//text()'
    )
    print(probe)
    print(registered_capital)
    time.sleep(1)