def __init__(self, userId, password):
    """Create the site client and load the text collection.

    Args:
        userId: Account identifier, passed through to ``QianzhanClient``.
        password: Account password, passed through to ``QianzhanClient``.
    """
    self._qianzhan_client = QianzhanClient(userId, password)
    # Starts empty; nothing in this method fills it.
    self._company_detail_url_list = []
    # Whatever get_1000_txt() returns must support len() (used just below).
    self._txt = get_1000_txt()
    # Pass the value as a logging argument so the message is only formatted
    # when the record is actually emitted.
    logging.info("txt len:->%d", len(self._txt))
def __init__(self, userId, password):
    """Bind a ``QianzhanClient`` built from the given credentials.

    Args:
        userId: Account identifier handed to ``QianzhanClient``.
        password: Account password handed to ``QianzhanClient``.
    """
    client = QianzhanClient(userId, password)
    self._qianzhan_client = client
class Spider(object):
    """Fetch a company detail page and turn it into a flat ``dict``.

    All HTTP traffic goes through a ``QianzhanClient`` created from the
    credentials given to the constructor.
    """

    # Keys assigned, in order, to the ``span.info`` elements found under
    # ``ul.art-basic`` (index 0 -> first key, index 9 -> last key).
    _BASIC_INFO_FIELDS = (
        'organization_registration_code',
        'registration_number',
        'legal_representative',
        'business_status',
        'registered_capital',
        'business_type',
        'register_date',
        'operating_period',
        'business_address',
        'business_scope',
    )

    def __init__(self, userId, password):
        """Create the client used for every request.

        Args:
            userId: Account identifier, passed through to ``QianzhanClient``.
            password: Account password, passed through to ``QianzhanClient``.
        """
        self._qianzhan_client = QianzhanClient(userId, password)

    def _get_company(self, url):
        """Download and parse one company detail page.

        Args:
            url: Address of the company page, forwarded to
                ``QianzhanClient.get_company``.

        Returns:
            dict: Parsed fields. ``'url'`` is present only when the page has
            an ``a.url`` element, and ``'searchitemnb'`` only when
            ``post_searchitemnbinfo`` returned a non-empty result.

        Raises:
            AttributeError, TypeError, IndexError: If a mandatory element
                (company name, info spans, hidden inputs) is missing from
                the page.
        """
        client = self._qianzhan_client
        response = client.get_company(url)
        soup = BeautifulSoup(response.text, 'lxml')

        company = {
            'company_name': soup.select_one('h1[class="ct_name"]').contents[0],
        }

        # The website link is optional: skip it explicitly when absent
        # instead of hiding every possible error behind a blanket except.
        link = soup.select_one('a[class="url"]')
        if link is not None:
            company['url'] = link.text

        # strftime() without a time tuple formats the current local time.
        company['item_update_time'] = time.strftime('%Y-%m-%d')

        basic_spans = soup.select(
            'ul[class="art-basic"] li span[class="info"]')
        for index, field in enumerate(self._BASIC_INFO_FIELDS):
            company[field] = basic_spans[index].text

        # The selector previously read span[class=""info], which is not a
        # valid attribute selector; it now mirrors the art-basic query.
        org_spans = soup.select('ul[class="art-org"] li span[class="info"]')
        company['province'] = org_spans[1].text
        company['registration_authority'] = org_spans[4].text

        # Hidden form inputs whose values are needed by the follow-up calls.
        encrypt_code = soup.select_one('input[id="hdencryptCode"]')['value']
        oc_area = soup.select_one('input[id="hdoc_area"]')['value']
        company['hdencryptCode'] = encrypt_code
        company['hdoc_area'] = oc_area

        # NOTE: calls to post_getcommentlist, post_SearchItemCCXX,
        # post_searchitemdftz and post_searchitemsite were commented out in
        # the previous revision and are intentionally not made here.
        nb_info = client.post_searchitemnbinfo(encrypt_code, oc_area)
        company['searchitemnbinfo'] = nb_info
        if nb_info:
            # Only the first entry's 'year' is used for the detail request.
            company['searchitemnb'] = client.post_searchitemnb(
                encrypt_code, oc_area, nb_info[0].get('year'))

        return company