Example #1
def judgementnoinfo(jsons, varietyinfo):
    try:
        # Parse the JSON string; the payload is a single-element list
        jsons = json.loads(jsons)[0]
        # Variety source
        varietysource = jsons['VarietySource']
        # Variety characteristics
        varietycharacter = jsons['VarietyCharacter']
        # Yield performance
        outputexpression = jsons['OutputExpression']
        # Cultivation requirements
        plantrequirment = jsons['PlantRequirment']
        # Examination (approval) opinion
        judgementsuggestion = jsons['JudgementSuggestion']

        varietyinfo['varietysource'] = varietysource
        varietyinfo['varietycharacter'] = varietycharacter
        varietyinfo['outputexpression'] = outputexpression
        varietyinfo['plantrequirment'] = plantrequirment
        varietyinfo['judgementsuggestion'] = judgementsuggestion
        return varietyinfo
    except Exception as e:
        print(e)
        logger.warning(e)
        varietyinfo['varietysource'] = ''
        varietyinfo['varietycharacter'] = ''
        varietyinfo['outputexpression'] = ''
        varietyinfo['plantrequirment'] = ''
        varietyinfo['judgementsuggestion'] = ''
        return varietyinfo
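
A minimal usage sketch for judgementnoinfo, assuming a hypothetical payload shaped like the API's single-element list (field names follow the source's keys, including the 'PlantRequirment' spelling; the values are placeholders, not real data):

import json

# Hypothetical sample payload; real data comes from the seed-approval API.
sample = '[{"VarietySource": "a/b", "VarietyCharacter": "...", "OutputExpression": "...", "PlantRequirment": "...", "JudgementSuggestion": "..."}]'
info = judgementnoinfo(sample, {})
print(info['varietysource'])  # a/b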
Example #2
def get_search(search_url, data):
    response = requests.get(search_url, allow_redirects=False, headers=headers)
    try:
        if response.status_code == 200:
            # A 200 means the target is wrapped in a meta refresh: URL='...'
            match = re.search(r"URL='(.*?)'", response.text, re.S)
            if match:
                search_url = match.group(1)
        elif response.status_code == 302:
            search_url = response.headers.get('location')
        data['search_url'] = search_url
        # Write the whole page into the database; a denoising step still needs
        # to be added in between. Left as-is for now.
        response = requests.get(search_url, headers=headers,
                                allow_redirects=False, timeout=1)

        if response.status_code == 200:
            text = response.text

            # Denoise, then persist the data
            data_save(data, text)
            return search_url, 1
        else:
            return search_url, -1
    except Exception as e:
        logger.warning(e)
        return search_url, -1
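
To illustrate the redirect handling above: on a 200 the wrapper page carries a meta refresh whose URL='...' the regex extracts. A self-contained sketch with a made-up response body (not a captured one):

import re

body = '<noscript><META http-equiv="refresh" content="0;URL=\'https://example.com/landing\'"></noscript>'
m = re.search(r"URL='(.*?)'", body, re.S)
print(m.group(1) if m else None)  # https://example.com/landing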
Example #3
def get_city():
    # Fetch the city-code mapping as JSON
    try:
        code_url = 'https://www.zhipin.com/common/data/city.json'
        r = requests.get(code_url, headers=headers).json()
        city_list = r['data']['cityList']
        code_dict = {}
        for city in city_list:
            sub_list = city['subLevelModelList']
            for sub in sub_list:
                code = sub['code']
                name = sub['name']
                code_dict[name] = code
        return code_dict
    except Exception as e:
        logger.warning(e)
        return {}
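
A short usage sketch for get_city, assuming the zhipin endpoint still serves the data.cityList structure the function expects (the district name below is only illustrative):

codes = get_city()
# Look up one district's code; .get avoids a KeyError for unknown names.
print(codes.get('海淀区'))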
Example #4
def parse(text):
    detail_url = 'https://www.zhipin.com'
    x_res = etree.HTML(text)
    url_xpath = []  # default so the loop below is safe if the lookup fails
    try:
        url_xpath = x_res.xpath(
            '//*[@id="main"]/div/div[@class="job-list"]/ul/li')
    except Exception as e:
        logger.warning(e)
    try:
        for url in url_xpath:
            time.sleep(10.1)  # throttle between detail-page requests
            a = url.xpath('./div/div[@class="info-primary"]/h3/a/@href')[0]
            url = detail_url + a
            response = requests.get(url, headers=headers)
            parse_detail(response.text)
    except Exception as e:
        logger.warning(e)
        print(e)
Example #5
def to_db(varietyinfo):
    col_name = "judgementno,cropID,varietyname,judgementregionID,judgementyear,applycompany,istransgenosis,varietyhaslincense," \
        "companyhaslincense,hasgrant,haspromotion,varietysource,varietycharacter,outputexpression,plantrequirment," \
        "judgementsuggestion,grants,promotion"
    values = "'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s'" % \
        (varietyinfo['judgementno'], varietyinfo['cropID'],
         varietyinfo['varietyname'], varietyinfo['judgementregionID'],
         varietyinfo['judgementyear'], varietyinfo['applycompany'],
         varietyinfo['istransgenosis'], varietyinfo['varietyhaslincense'],
         varietyinfo['companyhaslincense'], varietyinfo['hasgrant'],
         varietyinfo['haspromotion'], varietyinfo['varietysource'],
         varietyinfo['varietycharacter'], varietyinfo['outputexpression'],
         varietyinfo['plantrequirment'], varietyinfo['judgementsuggestion'],
         varietyinfo['grant'], varietyinfo['promotion'])
    sql = "insert into t_crawl_seed(%s) values(%s);" % (col_name, values)
    try:
        insertTB(sql)
    except Exception as e:
        print('Failed to write to the database:', e)
        logger.warning(e)
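
String-formatted SQL like the above breaks on values containing quotes and is open to injection. A safer sketch with DB-API placeholders, assuming a pymysql-style connection object conn (insertTB's internals are not shown here, so this is an alternative, not the author's helper):

# Trimmed column list for brevity; extend with the remaining fields.
cols = ['judgementno', 'cropID', 'varietyname']
sql = 'insert into t_crawl_seed({}) values({});'.format(
    ','.join(cols), ','.join(['%s'] * len(cols)))
with conn.cursor() as cur:
    cur.execute(sql, [varietyinfo[c] for c in cols])
conn.commit()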
Example #6
def get_baidu(search_data, page):
    start_url = 'https://www.baidu.com/s?wd={}&pn={}'.format(search_data, page)
    try:
        text = requests.get(start_url, headers=headers,
                            allow_redirects=False, timeout=1).text
    except requests.exceptions.Timeout as e:
        logger.warning(e)
        return
    x_res = etree.HTML(text)
    # Dict holding the scraped fields for this query
    data = {}
    data['search_data'] = search_data
    # Baidu organic results carry numeric ids 1-10 on each result page
    for i in range(1, 11):
        try:
            search_url = x_res.xpath('//*[@id="{}"]/h3/a/@href'.format(i))[0]
            title = x_res.xpath('//*[@id="{}"]/h3'.format(i))
            title = title[0].xpath(
                'string(.)').replace(' ', '').replace('\n', '')
            data['title'] = title
            get_search(search_url, data)
        except IndexError as e:
            print(e)
            break
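
Baidu's pn parameter is a result offset rather than a page index, so consecutive pages step by 10. A minimal driver sketch (the query string is hypothetical; headers and the functions above are assumed to exist at module level):

# Crawl the first three result pages for one query.
for page in range(0, 30, 10):
    get_baidu('种子 品种 审定', page)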
Example #7
def variety(jsons):
    # Parse the JSON payload (avoid eval on remote data)
    jsons = json.loads(jsons)
    for i in jsons:
        licence_info = {}
        # Licence number
        licence_no = i['LicenceNo']
        # Applicant company name
        apply_company_name = i['ApplyCompanyName']
        # Business scope (crops the company may produce and trade)
        production_manage_crops = i['ProductionManageCrops']
        # Issuing authority
        issuing_uthority_caption = i['IssuingAuthorityCaption']
        # Issue date
        publish_date = i['PublishDate']
        # Expiry date
        expire_date = i['ExpireDate']

        # Main certificate
        try:
            main_show = i['MainShow']
            # Main certificate id, taken from the link's query string
            main_id = main_show.split('id=')[-1]
            time.sleep(0.3)
            res = requests.get(
                'http://202.127.42.178:4000/SeedSearch/SeedSolution/Business/TempLicenseSelect.ashx',
                params={
                    'Type': 'SLImpLicence',
                    'LicenceID': main_id
                },
                headers=headers).json()
            main_info = re.findall(r"left: 48.5%;'\s?>(.*?)</span>",
                                   res['ResultData'])
        except AttributeError as e:
            logger.warning(e)
            continue

        # Deputy certificate
        try:
            deputy_show = i['DeputyShow']
            # Deputy certificate id, taken from the link's query string
            deputy_id = deputy_show.split('id=')[-1]
            deputy_url = 'http://202.127.42.47:8016/TwoLicenceManage/MainLicence/TwoLincenceSubWordBigData.aspx?showall=1&id='
            url = deputy_url + deputy_id
            time.sleep(0.3)
            # showall and id already ride in the URL's query string
            res = requests.get(url, headers=headers)
        except AttributeError as e:
            logger.warning(e)
            continue

        res_xpath = etree.HTML(res.text)
        text = res_xpath.xpath('/html/body/div[2]/div/div//text()')
        # Deputy certificate fields, whitespace-stripped
        deputy_info = []
        for line in text:
            line = line.replace(' ', '').replace('\r', '').replace('\n', '')
            if line != '':
                deputy_info.append(line)

        # Assemble the record and write it to the database
        licence_info['licence_no'] = licence_no
        licence_info['apply_company_name'] = apply_company_name
        licence_info['production_manage_crops'] = production_manage_crops
        licence_info['issuing_uthority_caption'] = issuing_uthority_caption
        licence_info['publish_date'] = publish_date
        licence_info['expire_date'] = expire_date
        licence_info['main_info'] = main_info
        licence_info['deputy_info'] = deputy_info

        col_name = "licence_no,apply_company_name,production_manage_crops,issuing_uthority_caption,publish_date,expire_date,main_info,deputy_info"
        values = "'%s','%s','%s','%s','%s','%s','%s','%s'" % \
            (licence_info['licence_no'], licence_info['apply_company_name'],
                licence_info['production_manage_crops'], licence_info['issuing_uthority_caption'],
                licence_info['publish_date'], licence_info['expire_date'],
                str(licence_info['main_info']).replace("'", "''"), str(licence_info['deputy_info']).replace("'", "''"))
        sql = "insert into t_crawl_company_licence(%s) values(%s);" % (
            col_name, values)
        try:
            insertTB(sql)
        except Exception as e:
            print('Failed to write to the database:', e)
            logger.warning(e)
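
To make the main_info extraction concrete, here the regex from above runs on a made-up fragment of ResultData (the span markup is a hypothetical reconstruction based only on the pattern itself, not a captured response):

import re

html = "<span style=\"left: 48.5%;' >Example Seed Co.</span><span style=\"left: 48.5%;'>A12345</span>"
print(re.findall(r"left: 48.5%;'\s?>(.*?)</span>", html))
# ['Example Seed Co.', 'A12345']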