def run_file_reader_write(name, write_file_path):
    """
    Export the stored results of task ``name`` and then clear the url manager.

    :param name: task name used to build the url manager
    :param write_file_path: destination path handed to ``File_Reader.write``
    :return: None
    """
    url_mgr = nm.url_manager(name)
    excel_rw.File_Reader(url_mgr).write(write_file_path)
    # Clearing happens only after the export call has returned.
    url_mgr.clear(clear_self=True)
def run_thread(name, file_path):
    """
    Start-up entry point for the main machine: reads and writes the Excel file.

    The workbook is written, the url manager cleared and the workbook read
    before any worker is created. One url worker is built per source name,
    then one ``threads.download`` worker per source that is not handled by a
    dedicated class. All workers are started, then joined, after which the
    workbook is written again, a report is produced and the url manager is
    cleared.

    :param name: task name used to build the url manager and the output directory
    :param file_path: path of the Excel file handed to ``excel_rw.excels``
    :return: None
    """
    workers = []
    url_mgr = nm.url_manager(name)
    template_mgr = nm.template_manager()
    workbook = excel_rw.excels(file_path, url_mgr)

    workbook.write()
    url_mgr.clear()
    workbook.read()

    out_dir = create_dir(name)

    # One url worker per source name. A dedicated "Doaj" branch
    # (threads.Single_thread) used to exist here and is currently disabled.
    for source in url_mgr.get_sourcenames():
        if source == "Elsevier":
            worker = threads.Elsevier_download(source, url_mgr, template_mgr, out_dir)
        elif source == "IEEE":
            worker = threads.IEEE_download(source, url_mgr, template_mgr, out_dir)
        elif source == "osti":
            worker = threads.OSTI(source, url_mgr, template_mgr, out_dir)
        else:
            # NOTE(review): first_dir is not assigned in this function, while the
            # other branches pass the directory returned by create_dir -- confirm
            # it is an intentional module-level name.
            worker = threads.download_url(source, url_mgr, template_mgr, dir=first_dir)
        workers.append(worker)

    # Sources with a dedicated class above get no separate download worker.
    for source in url_mgr.get_sourcenames():
        if source in ("Elsevier", "IEEE", "osti"):
            continue
        workers.append(threads.download(source, url_mgr, out_dir))

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    workbook.write()
    workbook.report()
    url_mgr.clear()
def run_file_reader(name, file_path, url_thread_num=1, pdf_thread_num=1):
    """
    Read urls directly from a txt file and start downloading.

    Reading and writing are separated: this function only reads urls and
    downloads pdfs, with the results stored in redis. Use
    ``run_file_reader_write`` to export the results.

    :param name: task name used to build the url manager and the output directory
    :param file_path: path of the url file, read only when the url manager
        reports that it does not exist yet
    :param url_thread_num: number of url workers created per source name
    :param pdf_thread_num: number of ``threads.download`` workers created per
        source name that has no dedicated class
    :return: None
    """
    workers = []
    url_mgr = nm.url_manager(name)
    template_mgr = nm.template_manager()

    if not url_mgr.exist():
        logger.info("上传数据...")
        reader = excel_rw.File_Reader(url_mgr)
        reader.read(file_path)

    out_dir = create_dir(name)

    # url_thread_num url workers per source name. A dedicated "Doaj" branch
    # (threads.Single_thread) used to exist here and is currently disabled.
    for source in url_mgr.get_sourcenames():
        for _ in range(url_thread_num):
            if source == "Elsevier":
                worker = threads.Elsevier_download(source, url_mgr, template_mgr, out_dir)
            elif source == "IEEE":
                worker = threads.IEEE_download(source, url_mgr, template_mgr, out_dir)
            elif source == "osti":
                worker = threads.OSTI(source, url_mgr, template_mgr, out_dir)
            else:
                # NOTE(review): first_dir is not assigned in this function, while
                # the other branches pass the directory returned by create_dir --
                # confirm it is an intentional module-level name.
                worker = threads.download_url(source, url_mgr, template_mgr, dir=first_dir)
            workers.append(worker)

    # Sources with a dedicated class above get no separate download workers.
    for source in url_mgr.get_sourcenames():
        if source in ("Elsevier", "IEEE", "osti"):
            continue
        for _ in range(pdf_thread_num):
            workers.append(threads.download(source, url_mgr, out_dir))

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def start(name, file_path):
    """
    Run a full Excel-driven task using the ``init_*`` thread helpers.

    The workbook is written, the url manager cleared and the workbook read;
    the two helper functions then extend the worker list, every returned
    worker is joined, and finally the workbook is written again, a report is
    produced and the url manager is cleared.

    :param name: task name used to build the url manager and the output directory
    :param file_path: path of the Excel file handed to ``excel_rw.excels``
    :return: None
    """
    url_mgr = nm.url_manager(name)
    template_mgr = nm.template_manager()
    workbook = excel_rw.excels(file_path, url_mgr)

    workbook.write()
    url_mgr.clear()
    workbook.read()

    out_dir = create_dir(name)

    # NOTE(review): the workers are only joined here, never started -- presumably
    # the init_* helpers start them; confirm against their definitions.
    workers = init_download_url_thread(url_mgr, template_mgr, [], out_dir)
    workers = init_download_and_check_thread(url_mgr, workers, out_dir)
    for worker in workers:
        worker.join()

    workbook.write()
    workbook.report()
    url_mgr.clear()
def run_d_thread(name):
    """
    Build the url manager for ``name`` and a template manager.

    Nothing else is done with either object in this function.

    :param name: task name used to build the url manager
    :return: None
    """
    url_mgr = nm.url_manager(name)
    template_mgr = nm.template_manager()
def check_finsh_task(name):
    """
    Call ``query_finsh_url`` on the url manager of task ``name``.

    :param name: task name used to build the url manager
    :return: None
    """
    nm.url_manager(name).query_finsh_url()
def check_task(name):
    """
    Call ``query`` on the url manager of task ``name``.

    :param name: task name used to build the url manager
    :return: None
    """
    nm.url_manager(name).query()