Ejemplo n.º 1
0
def get_from_pmcid_mp(id_file_path,pdf_output_dir,kickback_path,num_thread=10):
    """Download the document for every PMC id in *id_file_path* using a worker pool.

    Pool.map forwards only a single argument to the worker, so each id is
    packed with the output/kickback paths into one "+"-delimited string that
    _unpack splits back apart.

    Parameters:
        id_file_path: text file with one id per line.
        pdf_output_dir: directory for downloaded PDFs (normalized via _clean_path).
        kickback_path: path the worker records failed ids to.
        num_thread: number of parallel workers.
    """
    pdf_output_dir = _clean_path(pdf_output_dir)
    pack = []
    # 'with' guarantees the id file is closed (the original leaked the handle)
    with open(id_file_path, 'r') as id_file:
        for line in id_file:
            # strip the trailing newline so the id field of the packed record
            # is clean — matches the sibling get_error_mp variant that strips
            pack.append(line.strip() + "+" + pdf_output_dir + "+" + kickback_path)
    pool = PoolMP(num_thread)
    try:
        results = pool.map(_unpack, pack)
    finally:
        pool.close()
Ejemplo n.º 2
0
def get_error_mp(id_file_path,pdf_output_dir,kickback_dir,num_thread=2):
    """Re-run downloads for the failed ids listed in *id_file_path*.

    Each id is packed with the output and kickback directories into a single
    "+"-delimited string because Pool.map passes one argument per task;
    _unpack_error splits it back apart in the worker.

    Parameters:
        id_file_path: text file with one failed id per line.
        pdf_output_dir: directory for downloaded PDFs (normalized via _clean_path).
        kickback_dir: directory for recording ids that fail again (normalized).
        num_thread: number of parallel workers.
    """
    pdf_output_dir = _clean_path(pdf_output_dir)
    kickback_dir = _clean_path(kickback_dir)
    pack = []
    # 'with' guarantees the id file is closed (the original leaked the handle)
    with open(id_file_path, 'r') as id_file:
        for line in id_file:
            # strip the newline so the id field of the packed record is clean,
            # consistent with the other get_error_mp variant in this module
            pack.append(line.strip() + "+" + pdf_output_dir + "+" + kickback_dir)
    pool = PoolMP(num_thread)
    try:
        results = pool.map(_unpack_error, pack)
    finally:
        pool.close()
Ejemplo n.º 3
0
def get_error_mp(id_file_path,pdf_output_dir,kickback_dir,num_thread=2):
    """Re-run downloads for the failed ids listed in *id_file_path*.

    Unlike the "+"-packing variant, this version binds the two directories
    with functools.partial so each worker receives just the bare id.

    Parameters:
        id_file_path: text file with one failed id per line.
        pdf_output_dir: directory for downloaded PDFs (normalized via _clean_path).
        kickback_dir: directory for recording ids that fail again (normalized).
        num_thread: number of parallel workers.
    """
    pdf_output_dir = _clean_path(pdf_output_dir)
    kickback_dir = _clean_path(kickback_dir)
    # 'with' closes the id file; single strip() (the original stripped twice)
    with open(id_file_path, 'r') as id_file:
        pack = [line.strip() for line in id_file]
    pool = PoolMP(num_thread)
    try:
        results = pool.map(partial(_download_pdf_errors, pdf_output_dir, kickback_dir), pack)
    finally:
        pool.close()
Ejemplo n.º 4
0
def get_pdf_json_mp(pdf_dir,out_dir,num_thread=2):
    """Feed every PDF under *pdf_dir* to science-parse workers, emitting JSON to *out_dir*.

    Each task is a single "<pdf path>+<output dir>" string because Pool.map
    forwards only one argument to _post_science_parse.
    """
    pdf_dir = _clean_path(pdf_dir)
    out_dir = _clean_path(out_dir)
    pool = PoolMP(num_thread)
    pack = []
    try:
        # build one packed record per file in the PDF directory
        pack = [pdf_dir + entry + "+" + out_dir for entry in os.listdir(pdf_dir)]
    except Exception as e:
        # best-effort: report the problem and continue with whatever was packed
        print(str(e))
    results = pool.map(_post_science_parse, pack)
    pool.close()
Ejemplo n.º 5
0
def sort_url_mp(id_txt, out_csv, num_threads=8):
    """Process the ids in *id_txt* in parallel and write the results to *out_csv*.

    Each worker returns one "||"-delimited record per id; the fields are
    split back apart into one CSV row each.

    Parameters:
        id_txt: text file with one id per line.
        out_csv: destination CSV path.
        num_threads: number of parallel workers.
    """
    pool = PoolMP(num_threads)

    # Pool.map hands the id lines to the workers in chunks;
    # each line is processed by _sort_url_process
    with open(id_txt) as source_file:
        results = pool.map(_sort_url_process, source_file)
    pool.close()

    # 'with' flushes and closes the output — the original passed open()
    # straight into csv.writer and leaked the handle
    with open(out_csv, 'w') as out_file:
        writer = csv.writer(out_file, lineterminator="\n")
        for record in results:
            writer.writerow(record.split("||"))
Ejemplo n.º 6
0
def get_abstracts_mp(txt_ids_in,csv_abstracts_out,num_threads=10):
    """Fetch abstracts for the PubMed ids in *txt_ids_in* and write them to a CSV.

    Workers return one string per id with fields joined by the sentinel
    markers "**", "^&^&" and "$%$%"; this function slices the fields back
    out and writes one row per id.

    Parameters:
        txt_ids_in: text file with one PubMed id per line.
        csv_abstracts_out: destination CSV path.
        num_threads: number of parallel workers.
    """
    pool = PoolMP(num_threads)

    # Pool.map hands the id lines to the workers in chunks;
    # each line is processed by _process_get_abstracts
    with open(txt_ids_in) as source_file:
        results = pool.map(_process_get_abstracts, source_file)
    pool.close()  # every sibling in this module closes its pool; this one didn't

    # utf-8-sig writes a BOM so spreadsheet apps pick the right encoding;
    # 'with' closes the handle the original version leaked
    with open(csv_abstracts_out, 'w', encoding='utf-8-sig') as out_file:
        csvf = csv.writer(out_file, lineterminator="\n")
        csvf.writerow(["PubMed Id","Title","Journal URL","Abstract"])
        for rec in results:
            # locate the sentinel markers, then slice out the four fields
            id_index = rec.index("**")
            title_index = rec.index("^&^&")
            abst_index = rec.index("$%$%")
            csvf.writerow([
                rec[:id_index],
                rec[id_index+2:title_index],
                rec[title_index+4:abst_index],
                rec[abst_index+4:],
            ])
Ejemplo n.º 7
0
def run_id_ruby_mp(file_path, kickback_path, num_threads=8):
    """Run the ruby download script over every id in *file_path* in parallel.

    Ids that fail to download are returned by the workers and written to
    *kickback_path* (worker output keeps its trailing newline, so records
    are written verbatim).

    Parameters:
        file_path: text file with one id per line.
        kickback_path: destination file for failed ids.
        num_threads: number of parallel downloads.

    Returns:
        datetime.timedelta: total wall-clock runtime.
    """
    # record start time to calculate total runtime later
    start = datetime.now()

    # one worker per parallel download
    pool = PoolMP(num_threads)

    # Pool.map hands the id lines to the workers in chunks;
    # each line is run through the ruby script by _process_line
    with open(file_path) as source_file:
        results = pool.map(_process_line, source_file)
    pool.close()  # siblings in this module close their pools; this one didn't

    # persist the list of failed ids
    with open(kickback_path, 'w') as f:
        f.writelines(results)

    return datetime.now() - start