import datetime
import logging
import os
import traceback
import urllib.request

import pdfkit
from lxml import etree
from selenium import webdriver

# Helper functions (updata_craw_state, send_report_kafka, is_file, callback) and the
# constant file_target_path are assumed to be defined elsewhere in this module.


def from_url_bairong(url, data, targetPath):
    # File naming convention: <root>/<year>/<month>/<day>/<apply_no>/bairong
    # Note: this function builds paths from the module-level file_target_path,
    # not from the targetPath argument.
    year = datetime.datetime.now().strftime('%Y')
    month = datetime.datetime.now().strftime('%m')
    day = datetime.datetime.now().strftime('%d')
    file_path = (file_target_path + os.path.sep + year + os.path.sep + month + os.path.sep
                 + day + os.path.sep + data["apply_no"] + os.path.sep + 'bairong')
    pdf_file_name = data['apply_no'] + '_bairong_' + str(data["file_seq_no"]) + '.pdf'
    # Report output location recorded in the database
    file_path_sql = (file_target_path + "/" + year + "/" + month + "/" + day
                     + "/" + data["apply_no"] + "/" + 'bairong')
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    try:
        print("Converting Bairong report to PDF")
        pdfkit.from_url(url, file_path + os.path.sep + pdf_file_name)
        print("Bairong PDF conversion succeeded")
        # Crawl succeeded: finish the task and update the database
        code = "S"
        updata_craw_state(code, data, file_path_sql, pdf_file_name)
        send_report_kafka(data, file_path, pdf_file_name)
    except Exception:
        # Crawl failed: record the failure state in the database
        code = "F"
        updata_craw_state(code, data, file_path_sql, pdf_file_name)
def from_url_ceg(url, data, target_path):
    logger = logging.getLogger('root')
    pdf_file_name = data['apply_no'] + '_ceg_100.pdf'
    # File naming convention: <root>/<year>/<month>/<day>/<apply_no>/ceg
    year = datetime.datetime.now().strftime('%Y')
    month = datetime.datetime.now().strftime('%m')
    day = datetime.datetime.now().strftime('%d')
    file_path = (target_path + os.path.sep + year + os.path.sep + month + os.path.sep
                 + day + os.path.sep + data["apply_no"] + os.path.sep + 'ceg')
    # Report output location recorded in the database
    file_path_sql = (target_path + "/" + year + "/" + month + "/" + day
                     + "/" + data["apply_no"] + "/" + 'ceg')
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    local_file_path = file_path + os.path.sep + pdf_file_name
    # Download the report; `callback` is the urlretrieve reporthook defined elsewhere
    urllib.request.urlretrieve(url, local_file_path, callback)
    logger.info('success')
    code = "S"
    updata_craw_state(code, data, file_path_sql, pdf_file_name)
    send_report_kafka(data, file_path, pdf_file_name)
def get_page_source(url, data, targetPath):
    logger = logging.getLogger('root')
    # Save location for the Shujumohe (sjmh) report:
    # <root>/<year>/<month>/<day>/<apply_no>/sjmh
    year = datetime.datetime.now().strftime('%Y')
    month = datetime.datetime.now().strftime('%m')
    day = datetime.datetime.now().strftime('%d')
    file_path = (targetPath + os.path.sep + year + os.path.sep + month + os.path.sep
                 + day + os.path.sep + data["apply_no"] + os.path.sep + 'sjmh')
    # Report output location recorded in the database
    file_path_sql = (targetPath + "/" + year + "/" + month + "/" + day
                     + "/" + data["apply_no"] + "/" + 'sjmh')
    # Create the folder if it does not exist
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"download.default_directory": file_path}
    chromeOptions.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(chrome_options=chromeOptions)
    driver.implicitly_wait(15)
    start_time = datetime.datetime.now()
    driver.get(url)
    customer_name = []  # initialized so the check after the loop is safe if the page never loads
    while True:
        try:
            text = driver.page_source
            tree = etree.HTML(text)
            # Extract the customer name from the rendered report
            str1 = '//*[@id="reportContant"]/div/div[2]/div/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]'
            customer_name = tree.xpath(str1 + '//text()')
            if len(customer_name) > 0:
                # Report has rendered; click the download button
                driver.find_element_by_xpath('//*[@id="pdfArea"]/div/div[1]/button[1]').click()
                break
        except Exception:
            logger.info("Page timed out")
            traceback.print_exc()
        finally:
            # Give up after 30 seconds
            if (datetime.datetime.now() - start_time).seconds > 30:
                break
    code = "F"
    # File naming convention for the downloaded report
    modify_name = data['apply_no'] + '_sjmh_' + str(data["file_seq_no"]) + '.pdf'
    if len(customer_name) > 0:
        logger.info("Shujumohe data retrieved successfully")
        try:
            if is_file(file_path, modify_name, data):
                # Crawl succeeded: finish the task and update the database
                code = "S"
                send_report_kafka(data, file_path, modify_name)
        except Exception:
            logger.info("Error writing file")
            traceback.print_exc()
        finally:
            driver.close()
    else:
        driver.close()
    updata_craw_state(code, data, file_path_sql, modify_name)
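
# Hypothetical usage sketch (not part of the original module): shows how a crawl task
# payload might be dispatched to the three report handlers above. The field names in
# `task`, the dispatch keys ('bairong', 'ceg', 'sjmh'), and the example values are
# assumptions based on how `data` is used in this file.
if __name__ == '__main__':
    task = {
        "apply_no": "A20240101001",   # assumed application-number format
        "file_seq_no": 1,             # sequence number used in the PDF file name
    }
    report_url = "http://example.com/report"   # placeholder report URL
    target_dir = "/data/reports"               # placeholder output root

    source = "sjmh"  # which report source this task belongs to (assumed field)
    if source == "bairong":
        from_url_bairong(report_url, task, target_dir)
    elif source == "ceg":
        from_url_ceg(report_url, task, target_dir)
    elif source == "sjmh":
        get_page_source(report_url, task, target_dir)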