def get_from_pmcid_mp(id_file_path, pdf_output_dir, kickback_path, num_thread=10):
    """Download PDFs for the PMC ids listed one-per-line in *id_file_path*.

    Each work item packed for the pool is the string
    ``"<id line>+<pdf_output_dir>+<kickback_path>"``; the single-argument
    worker ``_unpack`` splits it back apart.

    Args:
        id_file_path: text file with one id per line.
        pdf_output_dir: destination directory (normalized via ``_clean_path``).
        kickback_path: path where workers record ids that failed.
        num_thread: number of parallel pool workers.
    """
    pdf_output_dir = _clean_path(pdf_output_dir)
    pool = PoolMP(num_thread)
    pack = []
    # Fix: the original iterated `open(...)` directly, leaking the file handle.
    with open(id_file_path, 'r') as id_file:
        for line in id_file:
            # NOTE(review): `line` still carries its trailing newline here
            # (the L3 variant strips it) — presumably _unpack strips; confirm.
            pack.append(line + "+" + pdf_output_dir + "+" + kickback_path)
    results = pool.map(_unpack, pack)
    pool.close()
def get_error_mp(id_file_path, pdf_output_dir, kickback_dir, num_thread=2):
    """Re-run downloads for previously failed ids via the ``_unpack_error`` worker.

    NOTE(review): a second ``get_error_mp`` is defined later in this file and
    shadows this one at import time — confirm which definition is intended.

    Args:
        id_file_path: text file with one id per line.
        pdf_output_dir: destination directory (normalized via ``_clean_path``).
        kickback_dir: directory where workers record ids that failed again.
        num_thread: number of parallel pool workers.
    """
    pdf_output_dir = _clean_path(pdf_output_dir)
    kickback_dir = _clean_path(kickback_dir)
    pool = PoolMP(num_thread)
    pack = []
    # Fix: the original iterated `open(...)` directly, leaking the file handle.
    with open(id_file_path, 'r') as id_file:
        for line in id_file:
            # Pack id + both directories into one "+"-delimited string for
            # the single-argument pool.map worker.
            pack.append(line + "+" + pdf_output_dir + "+" + kickback_dir)
    results = pool.map(_unpack_error, pack)
    pool.close()
def get_error_mp(id_file_path, pdf_output_dir, kickback_dir, num_thread=2):
    """Re-run downloads for previously failed ids via ``_download_pdf_errors``.

    Unlike the earlier "+"-packed variants, this version passes the two
    directories through ``functools.partial`` and hands each worker just the
    stripped id.

    Args:
        id_file_path: text file with one id per line.
        pdf_output_dir: destination directory (normalized via ``_clean_path``).
        kickback_dir: directory where workers record ids that failed again.
        num_thread: number of parallel pool workers.
    """
    pdf_output_dir = _clean_path(pdf_output_dir)
    kickback_dir = _clean_path(kickback_dir)
    pool = PoolMP(num_thread)
    pack = []
    # Fix: use a context manager (original leaked the file handle) and strip
    # each line exactly once (original stripped twice).
    with open(id_file_path, 'r') as id_file:
        for line in id_file:
            pack.append(line.strip())
    results = pool.map(partial(_download_pdf_errors, pdf_output_dir, kickback_dir), pack)
    pool.close()
def get_pdf_json_mp(pdf_dir, out_dir, num_thread=2):
    """Send every PDF in *pdf_dir* to Science Parse via ``_post_science_parse``.

    Each work item is the string ``"<pdf path>+<out_dir>"`` consumed by the
    single-argument pool worker.

    Args:
        pdf_dir: directory of input PDFs (normalized via ``_clean_path``).
        out_dir: directory for the JSON output (normalized via ``_clean_path``).
        num_thread: number of parallel pool workers.
    """
    pdf_dir = _clean_path(pdf_dir)
    out_dir = _clean_path(out_dir)
    pool = PoolMP(num_thread)
    pack = []
    # Best-effort: if the directory listing fails, report it and fall through
    # with an empty work list rather than aborting.
    try:
        pack = [pdf_dir + entry + "+" + out_dir for entry in os.listdir(pdf_dir)]
    except Exception as e:
        print(str(e))
    results = pool.map(_post_science_parse, pack)
    pool.close()
def sort_url_mp(id_txt, out_csv, num_threads=8):
    """Resolve/sort URLs for each id in *id_txt* and write results to *out_csv*.

    The pool worker ``_sort_url_process`` returns one ``"||"``-delimited string
    per input line; each is split into a CSV row.

    Args:
        id_txt: text file with one id per line.
        out_csv: path of the CSV file to write.
        num_threads: number of parallel pool workers.
    """
    pool = PoolMP(num_threads)
    # Pool.map pulls ids in chunks straight from the open file object and
    # runs them through the worker; `results` holds one string per id.
    with open(id_txt) as source_file:
        results = pool.map(_sort_url_process, source_file)
    pool.close()
    # Fix: open the output file in a context manager — the original created it
    # inline inside csv.writer(...) and never closed it.
    with open(out_csv, 'w') as out_file:
        writer = csv.writer(out_file, lineterminator="\n")
        for i in results:
            writer.writerow(i.split("||"))
def get_abstracts_mp(txt_ids_in, csv_abstracts_out, num_threads=10):
    """Fetch abstracts for each id in *txt_ids_in* and write them as CSV.

    Each worker result is a single string with sentinel delimiters:
    ``<id>**<title>^&^&<journal url>$%$%<abstract>``, which is sliced apart
    into the four CSV columns.

    Args:
        txt_ids_in: text file with one PubMed id per line.
        csv_abstracts_out: path of the CSV file to write
            (``utf-8-sig`` so Excel detects the encoding).
        num_threads: number of parallel pool workers.
    """
    pool = PoolMP(num_threads)
    # Pool.map pulls ids in chunks straight from the open file object.
    with open(txt_ids_in) as source_file:
        results = pool.map(_process_get_abstracts, source_file)
    # Fix: the original never closed the pool (every sibling function does).
    pool.close()
    # Fix: open the output file in a context manager — the original created it
    # inline inside csv.writer(...) and never closed it.
    with open(csv_abstracts_out, 'w', encoding='utf-8-sig') as out_file:
        csvf = csv.writer(out_file, lineterminator="\n")
        csvf.writerow(["PubMed Id", "Title", "Journal URL", "Abstract"])
        for i in results:
            # Slice on the first occurrence of each sentinel; the offsets
            # (+2, +4, +4) skip the sentinel characters themselves.
            id_index = i.index("**")
            title_index = i.index("^&^&")
            abst_index = i.index("$%$%")
            csvf.writerow([i[:id_index],
                           i[id_index + 2:title_index],
                           i[title_index + 4:abst_index],
                           i[abst_index + 4:]])
def run_id_ruby_mp(file_path, kickback_path, num_threads=8):
    """Run each id in *file_path* through the ruby script in parallel.

    Failed ids (as returned by ``_process_line``) are written to
    *kickback_path*; the total elapsed time is returned.

    Args:
        file_path: text file with one id per line.
        kickback_path: file to receive the failed-id strings.
        num_threads: number of parallel downloads (pool workers).

    Returns:
        datetime.timedelta: wall-clock duration of the whole run.
    """
    # Timestamp taken up front so the return value covers pool setup too.
    started_at = datetime.now()
    # One worker per parallel download.
    worker_pool = PoolMP(num_threads)
    # Feed ids in chunks from the open file straight into the workers;
    # each result string is a failed id to record.
    with open(file_path) as ids:
        failures = worker_pool.map(_process_line, ids)
    with open(kickback_path, 'w') as kickback_file:
        for failed in failures:
            kickback_file.write(failed)
    return datetime.now() - started_at