import datetime
import logging
import os
import traceback
import urllib.request

import pdfkit
from lxml import etree
from selenium import webdriver


def from_url_bairong(url, data, target_path):
    # File naming rule
    year = datetime.datetime.now().strftime('%Y')
    month = datetime.datetime.now().strftime('%m')
    day = datetime.datetime.now().strftime('%d')
    file_path = os.path.join(target_path, year, month, day,
                             data["apply_no"], 'bairong')
    pdf_file_name = data['apply_no'] + '_bairong_' + str(
        data["file_seq_no"]) + '.pdf'

    # Report output location as stored in the database (forward slashes)
    file_path_sql = "/".join([target_path, year, month, day,
                              data["apply_no"], 'bairong'])

    if not os.path.exists(file_path):
        os.makedirs(file_path)

    try:
        print("Converting Bairong report to PDF")
        pdfkit.from_url(url, os.path.join(file_path, pdf_file_name))
        print("Bairong PDF conversion succeeded")
        # Crawl succeeded; update the database
        code = "S"
        updata_craw_state(code, data, file_path_sql, pdf_file_name)

        send_report_kafka(data, file_path, pdf_file_name)

    except Exception:
        traceback.print_exc()
        # Crawl failed; update the database
        code = "F"
        updata_craw_state(code, data, file_path_sql, pdf_file_name)
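

# All three downloaders in this module rebuild the same
# target_path/YYYY/MM/DD/apply_no/source directory layout by hand. A minimal
# sketch of a shared helper that could replace that duplication (the name
# build_report_paths is illustrative, not part of the original module):
def build_report_paths(target_path, apply_no, source):
    """Return (file_path, file_path_sql) and create the local directory."""
    now = datetime.datetime.now()
    parts = [now.strftime('%Y'), now.strftime('%m'),
             now.strftime('%d'), apply_no, source]
    file_path = os.path.join(target_path, *parts)
    # The database side always stores forward-slash paths
    file_path_sql = "/".join([target_path] + parts)
    os.makedirs(file_path, exist_ok=True)
    return file_path, file_path_sql
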
def from_url_ceg(url, data, target_path):
    logger = logging.getLogger('root')

    pdf_file_name = data['apply_no'] + '_ceg_100.pdf'

    # File naming rule
    year = datetime.datetime.now().strftime('%Y')
    month = datetime.datetime.now().strftime('%m')
    day = datetime.datetime.now().strftime('%d')
    file_path = os.path.join(target_path, year, month, day,
                             data["apply_no"], 'ceg')

    # Report output location as stored in the database (forward slashes)
    file_path_sql = "/".join([target_path, year, month, day,
                              data["apply_no"], 'ceg'])

    if not os.path.exists(file_path):
        os.makedirs(file_path)

    local_file_path = os.path.join(file_path, pdf_file_name)
    # `callback` is a urlretrieve progress reporthook defined elsewhere in the module
    urllib.request.urlretrieve(url, local_file_path, callback)
    logger.info('CEG report downloaded successfully')
    code = "S"
    updata_craw_state(code, data, file_path_sql, pdf_file_name)

    send_report_kafka(data, file_path, pdf_file_name)
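

# from_url_ceg passes `callback` to urlretrieve; its body is not shown here.
# A minimal sketch of such a hook, assuming the standard urlretrieve
# reporthook signature of (block_number, block_size, total_size); the name
# callback_sketch is illustrative, not the module's real implementation:
def callback_sketch(block_num, block_size, total_size):
    # total_size is -1 when the server does not send Content-Length
    if total_size > 0:
        percent = min(100.0, block_num * block_size * 100.0 / total_size)
        print('download progress: %.1f%%' % percent)
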
def get_page_source(url, data, target_path):
    logger = logging.getLogger('root')

    # Shujumohe (数据魔盒) report save location
    year = datetime.datetime.now().strftime('%Y')
    month = datetime.datetime.now().strftime('%m')
    day = datetime.datetime.now().strftime('%d')
    file_path = os.path.join(target_path, year, month, day,
                             data["apply_no"], 'sjmh')

    # Report output location as stored in the database (forward slashes)
    file_path_sql = "/".join([target_path, year, month, day,
                              data["apply_no"], 'sjmh'])

    # Create the directory if it does not exist
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    chrome_options = webdriver.ChromeOptions()
    # Have Chrome download the report PDF straight into file_path
    prefs = {"download.default_directory": file_path}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options=chrome_options)
    driver.implicitly_wait(15)
    start_time = datetime.datetime.now()
    driver.get(url)
    customer_name = []
    while True:
        try:
            text = driver.page_source
            tree = etree.HTML(text)
            # Extract the customer name to confirm the report has rendered
            str1 = '//*[@id="reportContant"]/div/div[2]/div/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]'
            customer_name = tree.xpath(str1 + '//text()')

            if len(customer_name) > 0:
                # Click the download button once the report is visible
                driver.find_element_by_xpath(
                    '//*[@id="pdfArea"]/div/div[1]/button[1]').click()
                break
        except Exception:
            logger.info("Page timed out")
            traceback.print_exc()
        finally:
            # Give up after 30 seconds
            if (datetime.datetime.now() - start_time).seconds > 30:
                break

    code = "F"
    # 修改文件命名规则
    modify_name = data['apply_no'] + '_sjmh_' + str(
        data["file_seq_no"]) + '.pdf'
    if len(customer_name) > 0:
        logger.info("数据魔盒数据获取成功")

        try:
            if is_file(file_path, modify_name, data):
                # 爬虫成功任务结束,更新数据库
                code = "S"

                send_report_kafka(data, file_path, modify_name)
        except Exception as e:
            logger.info("写入文件错误")
            traceback.print_exc()
        finally:
            driver.close()
    else:
        driver.close()

    updata_craw_state(code, data, file_path_sql, modify_name)
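

# `is_file` is defined elsewhere in the original module; get_page_source
# relies on it to wait for Chrome to finish the download and to rename the
# file to modify_name. A minimal sketch under those assumptions (the name
# is_file_sketch, the polling interval, and the timeout are illustrative):
import time


def is_file_sketch(file_path, modify_name, data, timeout=30):
    """Wait for a finished PDF in file_path, then rename it to modify_name."""
    # `data` is unused here; kept only to match the original call signature.
    deadline = time.time() + timeout
    while time.time() < deadline:
        # Chrome keeps in-progress downloads under a .crdownload suffix, so
        # only a plain .pdf indicates a completed download.
        pdfs = [f for f in os.listdir(file_path)
                if f.endswith('.pdf') and f != modify_name]
        if pdfs:
            os.rename(os.path.join(file_path, pdfs[0]),
                      os.path.join(file_path, modify_name))
            return True
        time.sleep(1)
    return False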