def basicinfo_execute(self):
    url = ('http://%s/aiccips/GSpublicity/GSpublicityList'
           '.html?service=entInfo') % self.host
    response = url_requests.post(url=url, data=self.data,
                                 headers=self.headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'lxml')
    # Keep the soup on the instance so the business licence, key members and
    # branch sections can be parsed from it later.
    self.soup = soup
    body = soup.find('div', {'style': 'padding-left: 14px'})
    info = {}
    labels = body.find_all('span', class_='label')
    for label in labels:
        key = label.text.replace(':', '').strip()
        value = label.next_sibling.text.strip()
        info[key] = value
    info = common.basicinfo_dict(info, '广东省')
    self.qyxx_basicinfo.append(info)
def verifyCode(checkCode, token):
    """Check whether the captcha was recognised correctly.

    :param checkCode: recognised captcha text
    :param token: session token issued together with the captcha
    :return: response body of the verification request
    """
    url = 'http://gsxt.ynaic.gov.cn/notice/security/verify_captcha'
    data = {'captcha': checkCode, 'session.token': token}
    headers = {
        'Host': 'gsxt.ynaic.gov.cn',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/54.0.2840.71 Safari/537.36'),
        'Referer': 'http://gsxt.ynaic.gov.cn/notice/'
    }
    response = url_requests.post(url, data=data, headers=headers)
    return response.text
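# Hedged usage sketch for verifyCode (not part of the crawler flow). The token
# and captcha text below are hypothetical placeholders; in practice the token
# is issued with the captcha image and the text comes from the local recogniser.
#
#     token = 'abcdef0123456789'      # hypothetical session token
#     checkCode = '1234'              # hypothetical recognised captcha text
#     body = verifyCode(checkCode, token)
#     # Retry with a fresh captcha if the response body does not signal success.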
def black_info_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?queryYzwf=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of serious-violation (blacklist) records
    key_list = [
        'reason_in', 'date_in', 'authority_in',
        'reason_out', 'date_out', 'authority'
    ]
    # Fields: reason listed, date listed, listing authority,
    # reason removed, date removed, removing authority.
    for each_data in data:
        info = {}
        info['reason_in'] = each_data.get('FACT_REASON', '')
        info['date_in'] = each_data.get('MARK_DATE', '')
        info['authority_in'] = each_data.get('MARK_ORG', '')
        # 'REMOVE_REASON' is assumed here by analogy with abnormal_execute.
        info['reason_out'] = each_data.get('REMOVE_REASON', '')
        info['date_out'] = each_data.get('REMOVE_DATE', '')
        info['authority'] = each_data.get('REMOVE_ORG', '')
        self.qyxx_black_info.append(info)
def stock_freeze_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?querySfxz=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of judicial equity-freeze records
    key_list = [
        'person', 'stock', 'court', 'notice_number', 'status', 'details'
    ]
    # Fields: person subject to enforcement, amount of equity, enforcing court,
    # assistance-publication notice number, status, details.
    for each_data in data:
        info = {}
        info['person'] = each_data.get('ASSIST_NAME', '')
        info['stock'] = each_data.get('FREEZE_AMOUNT', '')
        info['court'] = each_data.get('EXECUTE_COURT', '')
        info['notice_number'] = each_data.get('NOTICE_NO', '')
        info['status'] = each_data.get('FREEZE_STATUS', '')
        # Keeps the parameters needed to request the detail page later.
        info['details'] = each_data
        self.qyxx_stock_freeze.append(info)
def mortgage_basic_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?queryDcdy=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of chattel mortgage registrations
    key_list = [
        'mortgage_reg_num', 'date_reg', 'authority', 'amount', 'status',
        'details'
    ]
    # Fields: registration number, registration date, registration authority,
    # amount of secured debt, status, details.
    for each_data in data:
        info = {}
        info['mortgage_reg_num'] = each_data.get('GUARANTY_REG_NO', '')
        info['date_reg'] = each_data.get('START_DATE', '')
        info['authority'] = each_data.get('CREATE_ORG', '')
        info['amount'] = each_data.get('ASSURE_CAPI', '')
        info['status'] = each_data.get('STATUS', '')
        info['details'] = each_data
        self.qyxx_mortgage_basic.append(info)
def abnormal_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?queryJyyc=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of abnormal-operation records
    key_list = [
        'xuhao', 'reason', 'date_occurred', 'juedinglierujiguan',
        'reason_out', 'date_out', 'authority'
    ]
    # Fields: serial number, reason listed as abnormal, date listed,
    # authority that decided the listing, reason removed, date removed,
    # authority that decided the removal.
    for each_data in data:
        info = {}
        info['xuhao'] = each_data.get('RN', '')
        info['reason'] = each_data.get('FACT_REASON', '')
        info['date_occurred'] = each_data.get('MARK_DATE', '')
        info['juedinglierujiguan'] = each_data.get('CREATE_ORG', '')
        info['reason_out'] = each_data.get('REMOVE_REASON', '')
        info['date_out'] = each_data.get('CREATE_DATE', '')
        info['authority'] = each_data.get('YICHU_ORG', '')
        self.qyxx_abnormal.append(info)
def verify(cookies, geetest_data):
    """Submit the three geetest parameters produced by the slider captcha for validation."""
    url = 'http://www.hebscztxyxx.gov.cn/notice/pc-geetest/validate'
    headers = {
        'Host': 'www.hebscztxyxx.gov.cn',
        'Referer': 'http://www.hebscztxyxx.gov.cn/notice/',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/54.0.2840.99 Safari/537.36')
    }
    resp = url_requests.post(url=url, data=geetest_data, headers=headers,
                             cookies=cookies)
    try:
        js = json.loads(resp.content)
        status = js.get('status')
        return status
    except (ValueError, AttributeError):  # body not valid JSON, or not a dict
        return None
def get_result(url_result, company, cookies, geetest_data):
    '''
    Query the search page and return the list of company detail links.

    The main values passed are:
    params: urlflag, challenge
    data: urlflag, nowNum, keyword, clear
    :return: search result (list of detail-page links)
    '''
    # Only these headers are actually required by the site:
    # Host, User-Agent, Accept, Referer, Cookie and Content-Length.
    headers = {
        'Host': 'gx.gsxt.gov.cn',
        'Origin': 'http://gx.gsxt.gov.cn',
        'Referer': 'http://gx.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml',
        'Cookie': cookies,
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Accept': ('text/html,application/xhtml+xml,application/xml;'
                   'q=0.9,image/webp,*/*;q=0.8'),
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/54.0.2840.99 Safari/537.36'),
        'Content-Length': '250',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    params = dict(urlflag=0, challenge=geetest_data["geetest_challenge"])
    data = dict(urlflag='', nowNum='', keyword=company, clear='true')
    print cookies
    # Errors have also shown up in this log, so the cookie string should be
    # validated before splitting it, e.g.:
    # Cookie = dict(insert_cookie=cookies.split(";")[0].split("=")[1],
    #               JSESSIONID=cookies.split(";")[1].split("=")[1])
    resp = url_requests.post(url=url_result, params=params, data=data,
                             headers=headers, proxies=proxies)
    html = resp.content
    soup = BeautifulSoup(html, 'lxml')
    if not soup:
        raise Exception('Query failed. Please try it again.')
    elif soup.find('p', {'class': 'p01'}) is not None:
        # The page shows the "contact technical staff" error notice.
        raise Exception('Too frequent. Please try it again.')
    else:
        if soup.find('div', {'class': 'search-result'}):
            result_num = soup.select('div.search-result > p > span')
            if len(result_num) == 0:
                raise Exception('query failed, please try it again.')
            if int(result_num[0].text) == 0:
                print soup.select('div.contentA1 > p')[0].text
                return []
            links = soup.select('div.search-result > ul > li > h3')
            if not links:
                raise Exception('query failed, please try it again.')
            result_list = []
            for each_link in links:
                link = each_link['onclick'].split('"')[1]
                result_list.append(link)
            return result_list
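# Hedged sketch of how get_result is chained in after the geetest slider has
# been solved. The geetest_data keys shown are assumptions drawn from the usual
# geetest flow; the only key the function itself relies on is 'geetest_challenge'.
#
#     url_result = ...  # search endpoint supplied by the calling module
#     geetest_data = {'geetest_challenge': challenge,
#                     'geetest_validate': validate,
#                     'geetest_seccode': seccode}
#     links = get_result(url_result, company, cookies, geetest_data)
#     for detail_link in links:
#         print detail_link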
def main(**kwargs):
    """Entry point called by the dispatcher; the name must not be changed.

    :param kwargs: name=company name
    :return: list of detail-page links
    """
    company_name = kwargs.get('name')
    # Cookies obtained while requesting the captcha.
    cookies = get_cookie()
    # Load the training set (relative path converted to an absolute path).
    path_a = os.path.dirname(__file__)
    path_a = os.path.abspath(os.path.join(path_a, '..'))
    abs_path = os.path.join(path_a,
                            'train_operator/d1_guangdong_train_operator.csv')
    trainX, trainY = loadTrainSet(abs_path)
    image = get_checkCode(cookies)
    code = main_check(image, trainX, trainY)
    if code is None:
        # A digit was recognised as an operator; call main again and start over.
        return main(name=company_name)
    # First request this page to obtain the textfield token.
    url = 'http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/checkCode.html'
    data = {'textfield': company_name, 'code': code}
    headers = {
        'Host': 'gsxt.gdgs.gov.cn',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/47.0.2526.73 Safari/537.36'),
        'Referer': 'http://gsxt.gdgs.gov.cn/'
    }
    response = url_requests.post(url, headers=headers, data=data,
                                 cookies=cookies, timeout=10)
    if response is not None:
        if 'textfield' in response.text:
            textfield = json.loads(response.text)['textfield']
        else:
            print 'Captcha was recognised incorrectly'
            return main(name=company_name)
    else:
        # No response for the checkCode request; retry from the top so that
        # textfield is never used before being assigned.
        return main(name=company_name)
    # Then use the textfield token to request the search results
    # (the list of matching companies).
    url = 'http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/showInfo.html'
    data = {'textfield': textfield, 'code': str(code)}
    response = url_requests.post(url, headers=headers, data=data,
                                 cookies=cookies, timeout=10)
    if response is not None:
        html = response.content
        soup = BeautifulSoup(html, 'lxml')
        items = soup.find_all('li', class_='font16')
        link_list = []
        for each in items:
            # Decoded so all links are consistently unicode.
            link = each.a['href'].decode('utf-8')
            if '..' in link:
                link = 'http://gsxt.gdgs.gov.cn/aiccips' + link[2:]
            link_list.append(link)
        return link_list
    else:
        return main(name=company_name)
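# Minimal local-run sketch, assuming this module is executed directly; in
# production main() is invoked by the dispatcher with name=<company name>.
# The company name below is only a placeholder.
if __name__ == '__main__':
    for detail_link in main(name=u'广州市某某有限公司'):
        print detail_link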
def guangdong(**kwargs):
    link = kwargs.get('id_tag')
    # First request the URL returned by the query script to obtain the three
    # parameters entNo, entType and regOrg; together they form the data posted
    # to every module.
    headers = {
        'Host': 'gsxt.gdgs.gov.cn',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/53.0.2785.143 Safari/537.36'),
        'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/main/initJyycInfoList.html'
    }
    response = url_requests.get(link, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    req_data = {}
    req_data['entNo'] = soup.find('input', {'name': 'entNo'})['value']
    req_data['entType'] = soup.find('input', {'name': 'entType'})['value']
    req_data['regOrg'] = soup.find('input', {'name': 'regOrg'})['value']
    # Then pass req_data to the URL of each module to collect its information.
    executeA = [
        'basicinfo_execute', 'abnormal_execute', 'black_info_execute',
        'adm_punishment_execute', 'b_c_execute', 'branch_execute',
        'member_execute', 'mortgage_basic_execute', 'pledge_execute',
        's_h_execute', 'spot_check_execute', 'stock_freeze_execute',
        'stockholder_change_execute'
    ]
    execute_d = ['c_mortgage_execute', 's_creditor_execute',
                 'mortgage_execute']
    businessInfo = BusinessInfo()
    loop = True    # check only once whether the licence has been revoked
    active = True  # assume it has not been revoked
    for each in executeA:
        print "%r %r %r" % ("*" * 20, each, "*" * 20)
        getattr(businessInfo, each)(req_data)
        # businessInfo.pledge_execute(req_data)
        if businessInfo.qyxx_basicinfo:
            while loop:
                loop = False
                if '已吊销' in businessInfo.qyxx_basicinfo[0]['check_type']:
                    active = False
        if not active:  # licence revoked: skip the remaining modules
            break
    # No chattel mortgage records have been seen for this site so far.
    # When there are multiple chattel mortgage items, the results of the three
    # detail tables are collected into their respective lists.
    if businessInfo.qyxx_mortgage_basic:
        L = businessInfo.qyxx_mortgage_basic
        headers = {
            'Host': 'gsxt.gdgs.gov.cn',
            'Referer': link,
            'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/54.0.2840.71 Safari/537.36')
        }
        for record in L:
            url = ('http://gsxt.gdgs.gov.cn/aiccips/GSpublicity'
                   '/GSpublicityList.html')
            pleNo = record['detail'].split("'")[1]
            data = {
                'service': 'pleInfoData',
                'pleNo': pleNo,
                'entNo': req_data.get('entNo'),
                'entType': req_data.get('entType'),
                'regOrg': req_data.get('regOrg')
            }
            html = url_requests.post(url, data=data, headers=headers,
                                     cookies=response.cookies).text
            pageSoup = BeautifulSoup(html, 'lxml')
            for c in execute_d:
                print "%s %s %s" % ("*" * 20, c, "*" * 20)
                getattr(businessInfo, c)(pageSoup)
    results = businessInfo.returnData()
    return results
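# Hedged usage sketch for guangdong(): id_tag is one of the detail-page links
# produced by the query step; the link below is only a placeholder, and the
# real wiring between the query and detail modules is done by the dispatcher.
#
#     detail_link = 'http://gsxt.gdgs.gov.cn/aiccips/...'  # placeholder link
#     results = guangdong(id_tag=detail_link)
#     print results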