Example #1
0
 def insert_data(self):
     """Download the qichacha firm page for one company and print it.

     The value returned by ``self.get_uniques()`` is interpolated into
     ``https://www.qichacha.com/firm_<unique>``.  The request goes through
     a proxy obtained from ``proxy_pool``; on a connection-level failure a
     new proxy is taken and the request is retried without any upper bound.

     Nothing is written to the database here: the ``company_branch``
     UPDATE that used to follow the download is disabled, so the response
     body is only printed.
     """
     # The ``global`` statement applies to the whole function wherever it
     # is written.  It used to sit inside the ``except`` branch, after the
     # first assignment to ``proxy``, which Python 2 reports as a
     # SyntaxWarning and Python 3 rejects as a SyntaxError.
     global proxy

     unique = self.get_uniques()
     p_num = 1  # number of proxy switches so far, for the log line only
     while True:
         try:
             self.proxy = proxy_pool.change_proxy()
             proxy = self.proxy
             html = requests.get(
                 'https://www.qichacha.com/firm_%s' % unique,
                 headers=headers_pool.requests_headers(),
                 proxies=proxy,
                 timeout=2)
         except (requests.exceptions.ProxyError,
                 requests.exceptions.ConnectTimeout,
                 requests.exceptions.ReadTimeout,
                 requests.exceptions.SSLError,
                 requests.exceptions.ConnectionError):
             # NOTE(review): the next loop iteration asks the pool for a
             # proxy again, so the one fetched here is only logged.
             self.proxy = proxy_pool.change_proxy()
             proxy = self.proxy
             print('changing proxy...%s...%s' % (p_num, self.proxy))
             p_num += 1
             continue
         break
     response = html.text
     print(response)
     # Pause between page downloads.
     time.sleep(1.5)
Example #2
0
    def insert_data(self):
        """Fill in ``company_branch`` rows with the details of each branch.

        For every entry returned by ``self.get_uniques()`` (``None`` entries
        are skipped) the field tuple is fetched with
        ``get_all_fields.get_fields`` through a rotating proxy, the matching
        ``company_branch`` row (``branch_no`` = JSON-encoded entry) is
        updated with those fields plus ``create_time`` and ``status=1``, and
        the transaction is committed.  The method then sleeps 1.5 seconds
        before the next entry.

        Raises:
            NameError: when the internal counter reaches 800, i.e. before
                the 800th non-``None`` entry is processed.  Presumably the
                caller catches this to switch to another token -- confirm
                against the call site.
        """
        # Declared up front; the statement used to appear after the first
        # assignment to ``proxy`` (SyntaxWarning in Python 2, SyntaxError in
        # Python 3).  Its effect on the whole function is unchanged.
        global proxy

        uniques = self.get_uniques()
        if not uniques:
            print('table company_branch is ok')
        else:
            print('正在向company_branch插入数据...')

        n = 1
        for unique in uniques:
            if unique is None:
                continue

            # Token usage guard: stop once the counter hits 800 so a
            # different token can be used.
            if n == 800:
                raise NameError

            create_time = time.time()
            p_num = 1  # number of proxy switches for this entry (log only)
            # Fetch the tuple holding every field; retry on a new proxy for
            # as long as connection-level errors keep occurring.
            while True:
                try:
                    self.proxy = proxy_pool.change_proxy()
                    proxy = self.proxy
                    (fields,
                     result) = get_all_fields.get_fields(unique, token, proxy)
                except (requests.exceptions.ProxyError,
                        requests.exceptions.ConnectTimeout,
                        requests.exceptions.ReadTimeout,
                        requests.exceptions.SSLError,
                        requests.exceptions.ConnectionError):
                    self.proxy = proxy_pool.change_proxy()
                    proxy = self.proxy
                    print('changing proxy...%s...%s' % (p_num, self.proxy))
                    p_num += 1
                    continue
                break

            # JSON encoding wraps the identifier in double quotes so that it
            # can be pasted into the SQL text as a string literal.
            branch_no = json.dumps(unique, encoding="utf-8",
                                   ensure_ascii=False)
            # Placeholder order: all scraped fields, create_time, status,
            # and finally the branch_no used in the WHERE clause.
            company_fields = tuple(fields) + (create_time, 1, branch_no)

            # NOTE(review): the statement is assembled with ``%`` string
            # formatting, so correct quoting depends entirely on how the
            # values were encoded.  Switching to driver-side parameters
            # needs the DB driver's paramstyle, which is not visible here.
            cursor.execute(
                'update company_branch set name=%s, phone=%s, website=%s, email=%s, province=%s, city=%s, county=%s, address=%s, intro=%s, registered_capital=%s, actual_capital=%s, operating_state=%s, establishment_date=%s, uscc=%s, taxpayer_number=%s, registration_number=%s, organization_code=%s, type=%s, industry=%s, approval_date=%s, registration_authority=%s, area=%s, english_name=%s, used_name=%s, insurancer_count=%s, staff_count=%s, operation_period=%s, operation_scope=%s, create_time=%s, status=%s where branch_no=%s'
                % company_fields)
            db.commit()

            # Look the row id up again purely for the progress message.
            cursor.execute('select id from company_branch where branch_no=%s' %
                           branch_no)
            branch_id = cursor.fetchone()[0]
            print('第%s条插入成功,已插入%s条,剩余%s条' % (branch_id, n, len(uniques) - n))
            n += 1
            time.sleep(1.5)
Example #3
0
    def insert_keyno(self):
        """Look up the KeyNo of every source company and store it.

        ``self.get_id_name()`` is expected to return a dict whose keys are
        ``source_company.id`` values and whose values are iterables of
        strings (they are comma-joined into the search key).  For each entry
        the advancedSearchNew endpoint is queried through a proxy, the
        ``KeyNo`` of the first search hit is taken, and the row is updated
        with it and the current timestamp; each update is committed on its
        own.

        A response without ``result`` or with an empty ``Result`` list makes
        the lookup raise (``AttributeError`` / ``TypeError`` /
        ``IndexError``); that is left to the caller.
        """
        company_dict = self.get_id_name()
        if not company_dict:
            print('table source_company is ok')
        else:
            print('正在向source_company插入数据...')

        # Iterate key and value together.  The row id used to be recovered
        # with ``list(values()).index(com)``, which is quadratic and picks
        # the wrong id whenever two rows carry equal values.
        for company_id, com in company_dict.items():
            # Token usage guard: switch to another token after 1000 uses
            # (the counting itself happens inside config.change_token).
            config.change_token()
            search_key = ','.join(com)  # set -> comma separated string
            headers = headers_pool.requests_headers()
            proxy = proxy_pool.change_proxy()
            response = requests.get(
                'https://xcx.qichacha.com/wxa/v1/base/advancedSearchNew?searchKey=%s&token=%s'
                % (search_key, token),
                headers=headers,
                proxies=proxy)
            payload = json.loads(response.text)
            first_hit = payload.get('result').get('Result')[0]
            keyno = first_hit.get('KeyNo')

            time.sleep(2.5)
            update_time = time.time()

            # NOTE(review): ``keyno`` comes from a remote response and is
            # pasted straight into the SQL text; a driver-side parameterised
            # query would be safer once the driver's paramstyle is confirmed.
            cursor.execute(
                "update source_company set key_no='%s',update_time=%s where id='%s'"
                % (keyno, update_time, company_id))
            db.commit()
Example #4
0
    def insert_data(self):
        """Insert each company with its branches and outbound investments.

        For every identifier returned by ``self.get_uniques()``:

        1. fetch the field tuple and raw result with
           ``get_all_fields.get_fields`` (rotating proxies on connection
           errors) and insert a ``company`` row;
        2. insert one ``company_branch`` row per entry of
           ``result['Branches']``;
        3. download every page of the getInvestments endpoint (20 records
           per page) and insert one ``company_investment`` row per entry;
        4. mark the ``source_company`` row as ``status=1`` and commit, so an
           interrupted run can resume from the rows still at ``status=0``.

        The module-level ``token_num`` counter is incremented once per
        getInvestments page requested.
        """
        # Both declarations apply to the whole function.  ``global proxy``
        # used to follow the first assignment to ``proxy`` (SyntaxWarning in
        # Python 2, SyntaxError in Python 3).
        global proxy
        global token_num

        uniques = self.get_uniques()
        if not uniques:
            print('table company is ok')
        else:
            print('正在向company插入数据...')

        # Progress counter.  It used to be reset to 1 inside the loop, so
        # the progress line never advanced.
        n = 1
        for unique in uniques:
            create_time = time.time()
            # Token usage guard: switch to another token after 1000 uses
            # (the counting itself happens inside config.change_token).
            config.change_token()

            p_num = 1  # number of proxy switches for this company (log only)
            # Fetch the tuple holding every field; retry on a new proxy for
            # as long as connection-level errors keep occurring.
            while True:
                try:
                    self.proxy = proxy_pool.change_proxy()
                    proxy = self.proxy
                    (fields,
                     result) = get_all_fields.get_fields(unique, token, proxy)
                except (requests.exceptions.ProxyError,
                        requests.exceptions.ConnectTimeout,
                        requests.exceptions.ReadTimeout,
                        requests.exceptions.SSLError,
                        requests.exceptions.ConnectionError):
                    self.proxy = proxy_pool.change_proxy()
                    proxy = self.proxy
                    print('changing proxy...%s...%s' % (p_num, self.proxy))
                    p_num += 1
                    continue
                break

            # JSON encoding wraps the identifier in double quotes so that it
            # can be pasted into the SQL text as a string literal.
            company_no = json.dumps(unique, encoding="utf-8",
                                    ensure_ascii=False)
            # Placeholder order: company_no, scraped fields, create_time,
            # status.
            company_fields = (company_no,) + tuple(fields) + (create_time, 1)

            # NOTE(review): every statement below is assembled with ``%``
            # string formatting from scraped data; driver-side parameters
            # would be safer once the driver's paramstyle is confirmed.
            cursor.execute(
                'insert into company(company_no,name,phone,website,email,province,city,county,address,intro,registered_capital,actual_capital,operating_state,establishment_date,uscc,taxpayer_number,registration_number,organization_code,type,industry,approval_date,registration_authority,area,english_name,used_name,insurancer_count,staff_count,operation_period,operation_scope,create_time,status) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
                % company_fields)

            # Insert branch_no, company_id, vestin_company and name into
            # company_branch.
            branches = result.get('Branches')
            cursor.execute('select id from company where company_no=%s' %
                           company_no)
            company_id = cursor.fetchone()[0]
            for branch in branches:
                keyno = json.dumps(branch.get('KeyNo'),
                                   encoding="utf-8",
                                   ensure_ascii=False)
                name = json.dumps(branch.get('Name'),
                                  encoding="utf-8",
                                  ensure_ascii=False)
                cursor.execute(
                    'insert into company_branch(branch_no,company_id,vestin_company,name) values(%s,%s,%s,%s)'
                    % (keyno, company_id, 1, name))

            # Insert investment_no, company_id and name into
            # company_investment, starting with the first page.
            token_num += 1
            js = self._fetch_investments(unique)
            self._insert_investments(company_id,
                                     js.get('result').get('Result'))

            # The investment listing is paginated at 20 companies per page,
            # so the total record count tells how many more pages to fetch.
            total_investment = js.get('result').get('Paging').get(
                'TotalRecords')
            extra_pages = (total_investment - 1) // 20
            for page_index in range(2, extra_pages + 2):
                token_num += 1
                js = self._fetch_investments(unique, page_index)
                self._insert_investments(company_id,
                                         js.get('result').get('Result'))
                time.sleep(2)

            # Once one company is fully inserted, set its source_company
            # status to 1; after an interruption the run restarts from the
            # rows whose status is still 0.
            cursor.execute(
                'update source_company set status=1 where key_no=%s' %
                company_no)
            # Commit only after all three tables have been written for this
            # company.
            db.commit()
            time.sleep(2)
            print('已插入%s条,剩余%s条' % (n, len(uniques) - n))
            n += 1

    def _fetch_investments(self, unique, page_index=None):
        """Return the parsed JSON of one getInvestments page.

        The request is repeated with a fresh proxy until it succeeds.  When
        ``page_index`` is ``None`` the ``pageIndex`` query parameter is left
        out, which is how the first page is requested.
        """
        url = ('https://xcx.qichacha.com/wxa/v1/base/getInvestments?unique=%s&token=%s'
               % (unique, token))
        if page_index is not None:
            url += '&pageIndex=%s' % page_index
        while True:
            try:
                response = requests.get(url,
                                        headers=config.headers,
                                        proxies=self.proxy,
                                        verify=False,
                                        timeout=2)
            except requests.exceptions.RequestException:
                # Only request failures trigger a retry; a bare ``except``
                # here would also swallow KeyboardInterrupt and make the
                # loop impossible to stop.
                self.proxy = proxy_pool.change_proxy()
                continue
            return json.loads(response.text)

    def _insert_investments(self, company_id, investments):
        """Insert one ``company_investment`` row per entry (no commit)."""
        for investment in investments:
            keyno = json.dumps(investment.get('KeyNo'),
                               encoding="utf-8",
                               ensure_ascii=False)
            name = json.dumps(investment.get('Name'),
                              encoding="utf-8",
                              ensure_ascii=False)
            cursor.execute(
                'insert into company_investment(investment_no,company_id,name) values(%s,%s,%s)'
                % (keyno, company_id, name))
Example #5
0
    def insert_data(self):
        """Download one firm page and print its registered capital.

        The value returned by ``self.get_uniques()`` is interpolated into
        ``https://www.qichacha.com/firm_<unique>``.  The page is requested
        with a session cookie through a rotating proxy and retried without
        an upper bound on connection-level errors.  The registered-capital
        cell is then extracted with XPath, JSON-encoded (``'-'`` is treated
        as missing and becomes ``null``) and printed together with a second
        debug XPath result.  The database update that would store the value
        is disabled, so nothing is persisted.

        Raises:
            UnboundLocalError: when the response contains a
                ``window.location.href`` redirect script instead of the
                page.  Presumably the caller relies on this exception type
                -- confirm against the call site.
            IndexError: when the registered-capital XPath matches nothing.
        """
        # Declared up front; the statement used to appear after the first
        # assignment to ``proxy`` (SyntaxWarning in Python 2, SyntaxError in
        # Python 3).  Its effect on the whole function is unchanged.
        global proxy

        unique = self.get_uniques()
        print('unique = ' + unique)

        p_num = 1  # number of proxy switches so far, for the log line only
        while True:
            try:
                # NOTE(review): the session id is hard-coded; it will stop
                # working once that session expires.
                cookie = {
                    'Hm_lpvt_3456bee468c83cc63fb5147f119f1075':
                    str(int(time.time())),
                    'QCCSESSID':
                    'kh50oeettqgbbphg0k57p37t40'
                }

                self.proxy = proxy_pool.change_proxy()
                proxy = self.proxy
                html = requests.get('https://www.qichacha.com/firm_%s' %
                                    unique,
                                    headers=headers_pool.requests_headers(),
                                    proxies=proxy,
                                    cookies=cookie,
                                    timeout=2)
            except (requests.exceptions.ProxyError,
                    requests.exceptions.ConnectTimeout,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.SSLError,
                    requests.exceptions.ConnectionError):
                self.proxy = proxy_pool.change_proxy()
                proxy = self.proxy
                print('changing proxy...%s...%s' % (p_num, self.proxy))
                p_num += 1
                continue
            break
        response = html.content
        print(response)

        # Sometimes the real page is not returned; signal that by raising.
        if '<script>window.location.href' in response:
            raise UnboundLocalError

        s1 = etree.HTML(response)

        # Registered capital: first text node of the matching table cell.
        registered_capital = s1.xpath(
            '//*[@id="Cominfo"]/table[2]/tr[1]/td[2]//text()')[0].strip(
            ).encode('utf-8')
        if registered_capital == '-':
            registered_capital = None
        registered_capital = json.dumps(registered_capital,
                                        encoding="utf-8",
                                        ensure_ascii=False)

        # Debug output of a second XPath probe.
        test = s1.xpath(
            '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[1]/span[2]/span//text()'
        )
        print(test)
        print(registered_capital)

        time.sleep(1)