def main(args):

    file_exists_or_exit(args.medline_paragraph_path)
    file_exists_or_exit(args.pmid_file)
    ensure_path_exists(args.output_directory)

    pmids_of_interest = get_pmid_set(args)
    print('Reading input files...')
    all_files = (f for f in Path(args.medline_paragraph_path).glob('**/*')
                 if f.is_file())

    found_count = 0
    for medline_file in tqdm(all_files):
        if medline_file.name in pmids_of_interest:
            if found_count % 1000 == 0:
                current_subdir = '{0:0>4}'.format(found_count)
                ensure_path_exists(
                    str(Path(args.output_directory) / current_subdir))
            found_count += 1
            new_file_path = str(
                Path(args.output_directory) / current_subdir /
                medline_file.name)
            shutil.copyfile(str(medline_file), new_file_path)

    print('Copied {} files (by PMID) to directory {}'.format(
        found_count, args.output_directory))
    def test_file_exists_or_exit(self):

        # silence STDERR output from this test
        sys.stderr = StringIO()

        with self.assertRaises(SystemExit) as e:
            util.file_exists_or_exit('/this/file/doesnt/exist')

        self.assertEqual(e.exception.code, 1)

        # restore STDERR so later tests are not silenced
        sys.stderr = sys.__stderr__
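The test above pins down the observable behavior of file_exists_or_exit: it writes to STDERR and raises SystemExit with code 1 when the path is missing. A minimal sketch consistent with that behavior (the exact message text is an assumption, not the project's actual wording):

import os
import sys

def file_exists_or_exit(file_path):
    """Exit with status 1, printing to STDERR, if file_path does not exist."""
    if not os.path.exists(file_path):
        print('File not found: {}'.format(file_path), file=sys.stderr)
        sys.exit(1)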
Example #4
def main(args):

    file_exists_or_exit(os.path.join(args.tmchem, 'tmChem.pl'))

    args.paragraph_path = os.path.abspath(args.paragraph_path)

    # glob.glob doesn't support double star expressions in Python 3.4, so using this:
    print('Reading input files...')
    all_files = [
        str(f) for f in tqdm(Path(args.paragraph_path).glob('**/*'),
                             disable=args.notqdm) if f.is_file()
    ]

    filelist_with_sublists = create_n_sublists(all_files,
                                               mp.cpu_count() * 1000)

    # check if save_directory exists and create if necessary
    if not os.path.isdir(args.output_directory):
        os.makedirs(args.output_directory)

    log_filename = os.path.join(args.logdir, 'chem_ner.log')
    logging_format = '%(asctime)s %(name)-15s %(levelname)-8s %(processName)-10s %(message)s'
    logging.basicConfig(filename=log_filename,
                        format=logging_format,
                        level=logging.INFO,
                        filemode='w')

    print('Using {} cores to process {} files...'.format(
        args.poolsize, len(all_files)))

    with mp.Pool(args.poolsize) as pool:
        mgr = mp.Manager()
        q = mgr.Queue()
        log_thread = threading.Thread(target=logging_thread, args=(q, ))
        log_thread.start()
        imap_gen = pool.imap_unordered(
            process_and_run_chunk,
            zip(filelist_with_sublists, repeat(args), repeat(q)))
        for i in tqdm(imap_gen,
                      total=len(filelist_with_sublists),
                      disable=args.notqdm):
            pass

    logging.info('Done processing {} files'.format(len(all_files)))

    # reorganize a directory with a huge number of files into a bunch of
    # subdirectories containing those same files, with a max of 10k files per
    # subdirectory
    reorganize_directory(args.output_directory,
                         max_files_per_subdir=10000,
                         quiet=args.notqdm)

    # end logging_thread
    q.put(None)
    log_thread.join()
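The reorganize_directory helper called above is described only by its comment (split a large flat output directory into subdirectories of at most 10,000 files each). A rough sketch under that assumption, reusing the zero-padded directory naming seen elsewhere in these examples:

import os
import shutil
from glob import glob
from tqdm import tqdm

def reorganize_directory(directory, max_files_per_subdir=10000, quiet=False):
    """Move each file in `directory` into numbered subdirectories
       ('0000', '0001', ...) holding at most max_files_per_subdir files each."""
    all_files = sorted(f for f in glob(os.path.join(directory, '*'))
                       if os.path.isfile(f))
    for i, file_path in enumerate(tqdm(all_files, disable=quiet)):
        subdir = os.path.join(directory,
                              '{0:0>4}'.format(i // max_files_per_subdir))
        os.makedirs(subdir, exist_ok=True)
        shutil.move(file_path, subdir)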
Example #5
def main(args):
    
    file_exists_or_exit(os.path.join(args.dnorm, 'ApplyDNorm.sh'))

    all_files = glob(os.path.join(args.paragraph_path, '*'))
    filelist_with_sublists = create_n_sublists(all_files, mp.cpu_count()*10)

    # check if save_directory exists and create if necessary
    if not os.path.isdir(args.output_directory):
        os.makedirs(args.output_directory)

    log_filename = os.path.join(args.logdir, 'disease_ner.log')
    logging_format = '%(asctime)s %(name)-15s %(levelname)-8s %(processName)-10s %(message)s'
    logging.basicConfig(filename=log_filename,
                        format=logging_format,
                        level=logging.INFO,
                        filemode='w')

    # calculate how many Java/DNorm processes to start, given available RAM
    # (DNorm requires 10GB RAM per process)
    num_java_processes = calc_dnorm_num_processes(num_cores=args.poolsize,
                                                  ram_gb=None)

    print('Using {} cores to process {} files...'.format(num_java_processes, 
                                                         len(all_files)))
    print('Allocating {} GB RAM'.format(num_java_processes*10))

    with mp.Pool(num_java_processes) as pool:
        mgr = mp.Manager()
        q = mgr.Queue()
        log_thread = threading.Thread(target=logging_thread, args=(q,))
        log_thread.start()
        imap_gen = pool.imap_unordered(process_and_run_chunk, 
                                       zip(filelist_with_sublists, 
                                           repeat(args),
                                           repeat(q)))
        for i in tqdm(imap_gen,
                      total=len(filelist_with_sublists),
                      disable=args.notqdm):
            pass

    logging.info('Done processing {} files'.format(len(all_files)))

    # reorganize a directory with a huge number of files into a bunch of
    # subdirectories containing those same files, with a max of 10k files per
    # subdirectory
    reorganize_directory(args.output_directory,
                         max_files_per_subdir=10000,
                         quiet=args.notqdm)
    
    # end logging_thread
    q.put(None)
    log_thread.join()
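calc_dnorm_num_processes is not shown in these examples; per the comment above it caps the pool size by core count and by available RAM, at roughly 10 GB per Java/DNorm process. A sketch under those assumptions (the Linux-only RAM probe is a guess, not the original code):

import os

def calc_dnorm_num_processes(num_cores, ram_gb=None, gb_per_process=10):
    """Return how many DNorm processes fit in num_cores and ram_gb of RAM."""
    if ram_gb is None:
        # assumed fallback: read physical memory from the OS (Linux only)
        ram_gb = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / 1024**3
    return max(1, min(num_cores, int(ram_gb // gb_per_process)))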
Example #6
def main(args):

    file_exists_or_exit(os.path.join(args.tmchem,'tmChem.pl'))

    args.paragraph_path = os.path.abspath(args.paragraph_path)

    # glob.glob doesn't support double star expressions in Python 3.4, so using this:
    print('Reading input files...')
    all_files = [str(f) for f in tqdm(Path(args.paragraph_path).glob('**/*'), disable=args.notqdm) if f.is_file()]

    filelist_with_sublists = create_n_sublists(all_files, mp.cpu_count()*1000)

    # check if save_directory exists and create if necessary
    if not os.path.isdir(args.output_directory):
        os.makedirs(args.output_directory)

    log_filename = os.path.join(args.logdir, 'chem_ner.log')
    logging_format = '%(asctime)s %(name)-15s %(levelname)-8s %(processName)-10s %(message)s'
    logging.basicConfig(filename=log_filename,
                        format=logging_format,
                        level=logging.INFO,
                        filemode='w')

    print('Using {} cores to process {} files...'.format(args.poolsize,
                                                         len(all_files)))

    with mp.Pool(args.poolsize) as pool:
        mgr = mp.Manager()
        q = mgr.Queue()
        log_thread = threading.Thread(target=logging_thread, args=(q,))
        log_thread.start()
        imap_gen = pool.imap_unordered(process_and_run_chunk, 
                                        zip(filelist_with_sublists, 
                                            repeat(args),
                                            repeat(q)))
        for i in tqdm(imap_gen,
                      total=len(filelist_with_sublists),
                      disable=args.notqdm):
            pass

    logging.info('Done processing {} files'.format(len(all_files)))

    # reorganize a directory with a huge number of files into a bunch of
    # subdirectories containing those same files, with a max of 10k files per
    # subdirectory
    reorganize_directory(args.output_directory,
                         max_files_per_subdir=10000,
                         quiet=args.notqdm)
    
    # end logging_thread
    q.put(None)
    log_thread.join()
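create_n_sublists, used throughout these examples to spread the file list across pool workers, is not shown. One plausible round-robin implementation (the exact chunking strategy is an assumption):

def create_n_sublists(items, n):
    """Split items into n sublists of roughly equal size (round-robin)."""
    sublists = [[] for _ in range(n)]
    for i, item in enumerate(items):
        sublists[i % n].append(item)
    return sublists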
Example #7
def main(args):

    file_exists_or_exit(os.path.join(args.dnorm, 'ApplyDNorm.sh'))

    all_files = glob(os.path.join(args.paragraph_path, '*'))
    filelist_with_sublists = create_n_sublists(all_files, mp.cpu_count() * 10)

    # check if save_directory exists and create if necessary
    if not os.path.isdir(args.output_directory):
        os.makedirs(args.output_directory)

    log_filename = os.path.join(args.logdir, 'disease_ner.log')
    logging_format = '%(asctime)s %(name)-15s %(levelname)-8s %(processName)-10s %(message)s'
    logging.basicConfig(filename=log_filename,
                        format=logging_format,
                        level=logging.INFO,
                        filemode='w')

    # calculate how many Java/DNorm processes to start, given available RAM
    # (DNorm requires 10GB RAM per process)
    num_java_processes = calc_dnorm_num_processes(num_cores=args.poolsize,
                                                  ram_gb=None)

    print('Using {} cores to process {} files...'.format(
        num_java_processes, len(all_files)))
    print('Allocating {} GB RAM'.format(num_java_processes * 10))

    with mp.Pool(num_java_processes) as pool:
        mgr = mp.Manager()
        q = mgr.Queue()
        log_thread = threading.Thread(target=logging_thread, args=(q, ))
        log_thread.start()
        imap_gen = pool.imap_unordered(
            process_and_run_chunk,
            zip(filelist_with_sublists, repeat(args), repeat(q)))
        for i in tqdm(imap_gen,
                      total=len(filelist_with_sublists),
                      disable=args.notqdm):
            pass

    logging.info('Done processing {} files'.format(len(all_files)))

    # reorganize a directory with a huge number of files into a bunch of
    # subdirectories containing those same files, with a max of 10k files per
    # subdirectory
    reorganize_directory(args.output_directory,
                         max_files_per_subdir=10000,
                         quiet=args.notqdm)

    # end logging_thread
    q.put(None)
    log_thread.join()
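The pool setup above follows the standard multiprocessing logging pattern: workers push log records onto a managed queue, and a listener thread in the main process drains it until it sees the None sentinel pushed after the pool closes. The logging_thread target itself is not shown; a minimal sketch consistent with that usage:

import logging

def logging_thread(q):
    """Re-emit log records sent by worker processes until the None sentinel."""
    while True:
        record = q.get()
        if record is None:
            break
        logging.getLogger(record.name).handle(record)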
Example #8
def main(args):

    # ensure that `qsub` command exists on this machine...
    shell_command_exists_or_exit('qsub')

    # make paths absolute so that all other paths are also absolute
    args.paragraph_path = os.path.abspath(args.paragraph_path)
    args.output_directory = os.path.abspath(args.output_directory)
    args.corenlp = os.path.abspath(args.corenlp)

    file_exists_or_exit(os.path.join(args.corenlp, 'corenlp.sh'))

    input_dir = os.path.join(args.output_directory, 'input_files')
    job_dir = os.path.join(args.output_directory, 'job_files')
    output_dir = os.path.join(args.output_directory, 'output_files')
    for dirpath in (args.output_directory, input_dir, job_dir, output_dir):
        ensure_path_exists(dirpath)

    all_files = glob(os.path.join(args.paragraph_path, '*'))
    number_of_chunks = max(1, len(all_files) // 100)  # at least one chunk
    filelist_with_sublists = create_n_sublists(all_files, number_of_chunks)
    jobs_submitted_success = 0
    job_file_paths_failed = []
    print('Creating {} input files in {} groups with matched job files...'.
          format(len(all_files), number_of_chunks))
    if args.submit:
        print('and submitting jobs to queue \'{}\'...'.format(args.queue))
    for chunk_num, subfilelist in enumerate(tqdm(filelist_with_sublists)):
        new_input_files = create_corenlp_input_files(subfilelist, input_dir)
        new_job_file_path = create_job_file(new_input_files, job_dir,
                                            output_dir, chunk_num, args)
        if args.submit:
            result = submit_pbs_job(new_job_file_path, queue=args.queue)
            if result:
                jobs_submitted_success += 1
            else:
                job_file_paths_failed.append(new_job_file_path)

    print('Created {} PBS job files in {}'.format(chunk_num + 1, job_dir))

    if args.submit:
        print(
            'Successfully submitted {} jobs with {} failed submissions'.format(
                jobs_submitted_success, len(job_file_paths_failed)))
        for path in job_file_paths_failed:
            print('FAILED TO SUBMIT', path)
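Given how its result is used above, submit_pbs_job only needs to return something truthy on success and falsy on failure. A sketch assuming a plain qsub submission (the error handling details are assumptions):

import subprocess

def submit_pbs_job(job_file_path, queue):
    """Submit a PBS job file with qsub; return True on success, else False."""
    try:
        subprocess.check_output(['qsub', '-q', queue, job_file_path],
                                stderr=subprocess.STDOUT)
        return True
    except subprocess.CalledProcessError:
        return False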
def main(args):

    file_exists_or_exit(args.pmid_file)
    file_exists_or_exit(args.bioconcepts_file)
    pmids_set = get_file_lines_set(args.pmid_file)

    records = produce_records(args.bioconcepts_file)
    print('Found {} distinct PMIDs in file {}'.format(len(pmids_set),
                                                      args.pmid_file))

    if not args.c:
        # create output subdirectories
        ensure_path_exists(args.output_directory)

        pubtator_outdir = os.path.join(args.output_directory, 'pubtator')
        ensure_path_exists(pubtator_outdir)
        abstract_outdir = os.path.join(args.output_directory, 'abstracts')
        ensure_path_exists(abstract_outdir)

    found_count = 0
    not_found_count = 0
    for record in tqdm(records):
        title, abstract, *ner_lines = record
        pmid = title.split('|')[0]
        if pmid in pmids_set:
            found_count += 1
            if not args.c:
                save_file(pmid, [
                    line + '\n'
                    for line in (title, abstract, '\n'.join(ner_lines))
                ], pubtator_outdir)
                _, parform_output = pubtator_to_parform(title,
                                                        abstract,
                                                        newlines=True)
                save_file(pmid, parform_output, abstract_outdir)
        else:
            not_found_count += 1

    print('Out of {} abstracts...'.format(found_count + not_found_count))
    print('- {} records in {}'.format(found_count, args.pmid_file))
    print('- {} records NOT in {}'.format(not_found_count, args.pmid_file))
    if not args.c:
        print('Parsed/paragraph abstracts saved to {}'.format(abstract_outdir))
        print('NER-annotated abstracts saved to {}'.format(pubtator_outdir))
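produce_records is unpacked above as (title, abstract, *ner_lines), with the PMID taken from the first '|'-delimited field of the title line, which matches the PubTator format. A sketch assuming blank-line-separated records in the bioconcepts file:

def produce_records(bioconcepts_file):
    """Yield one PubTator record at a time as [title, abstract, *ner_lines]."""
    record = []
    with open(bioconcepts_file) as f:
        for line in f:
            line = line.rstrip('\n')
            if line:
                record.append(line)
            elif record:
                yield record
                record = []
    if record:
        yield record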
def main(args):

    file_exists_or_exit(args.pmid_file)
    file_exists_or_exit(args.bioconcepts_file)
    pmids_set = get_file_lines_set(args.pmid_file)

    records = produce_records(args.bioconcepts_file)
    print('Found {} distinct PMIDs in file {}'.format(len(pmids_set),
                                                      args.pmid_file))

    if not args.c:
        # create output subdirectories
        ensure_path_exists(args.output_directory)

        pubtator_outdir = os.path.join(args.output_directory, 'pubtator')
        ensure_path_exists(pubtator_outdir)
        abstract_outdir = os.path.join(args.output_directory, 'abstracts')
        ensure_path_exists(abstract_outdir)

    found_count = 0
    not_found_count = 0
    for record in tqdm(records):
        title, abstract, *ner_lines = record
        pmid = title.split('|')[0]
        if pmid in pmids_set:
            found_count += 1
            if not args.c:
                save_file(pmid,
                          [line+'\n' for line in (title, abstract, '\n'.join(ner_lines))],
                          pubtator_outdir)
                _, parform_output = pubtator_to_parform(title,
                                                        abstract,
                                                        newlines=True)
                save_file(pmid, parform_output, abstract_outdir)
        else:
            not_found_count += 1

    print('Out of {} abstracts...'.format(found_count+not_found_count))
    print('- {} records in {}'.format(found_count, args.pmid_file))
    print('- {} records NOT in {}'.format(not_found_count, args.pmid_file))
    if not args.c:
        print('Parsed/paragraph abstracts saved to {}'.format(abstract_outdir))
        print('NER-annotated abstracts saved to {}'.format(pubtator_outdir))
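get_file_lines_set presumably just loads the PMID file into a set for fast membership tests. A minimal sketch of that assumption:

def get_file_lines_set(file_path):
    """Return the set of non-empty, stripped lines (e.g. PMIDs) in file_path."""
    with open(file_path) as f:
        return {line.strip() for line in f if line.strip()}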
def main(args):

    file_exists_or_exit(args.medline_paragraph_path)
    file_exists_or_exit(args.pmid_file)
    ensure_path_exists(args.output_directory)

    pmids_of_interest = get_pmid_set(args)
    print('Reading input files...')
    all_files = (f for f in Path(args.medline_paragraph_path).glob('**/*') if f.is_file())

    found_count = 0
    for medline_file in tqdm(all_files):
        if medline_file.name in pmids_of_interest:
            if found_count % 1000 == 0:
                current_subdir = '{0:0>4}'.format(found_count)
                ensure_path_exists(str(Path(args.output_directory) / current_subdir))
            found_count += 1
            new_file_path = str(Path(args.output_directory) / current_subdir / medline_file.name)
            shutil.copyfile(str(medline_file), new_file_path)

    print('Copied {} files (by PMID) to directory {}'.format(found_count, args.output_directory))
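ensure_path_exists mirrors the inline "check if save_directory exists and create if necessary" blocks in the other examples; a one-line sketch of that behavior:

import os

def ensure_path_exists(dir_path):
    """Create dir_path (and any missing parents) if it does not exist yet."""
    os.makedirs(dir_path, exist_ok=True)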
Example #12
def main(args):
    
    file_exists_or_exit(os.path.join(args.gnormplus, 'GNormPlus.pl'))

    all_files = glob(os.path.join(args.paragraph_path, '*'))
    filelist_with_sublists = create_n_sublists(all_files, mp.cpu_count()*10)

    # check if save_directory exists and create if necessary
    if not os.path.isdir(args.output_directory):
        os.makedirs(args.output_directory)

    log_filename = os.path.join(args.logdir, 'gene_ner.log')
    logging_format = '%(asctime)s %(name)-15s %(levelname)-8s %(processName)-10s %(message)s'
    logging.basicConfig(filename=log_filename,
                        format=logging_format,
                        level=logging.INFO,
                        filemode='w')

    print('Using {} cores to process {} files...'.format(mp.cpu_count(), 
                                                         len(all_files)))

    with mp.Pool() as pool:
        mgr = mp.Manager()
        q = mgr.Queue()
        log_thread = threading.Thread(target=logging_thread, args=(q,))
        log_thread.start()
        imap_gen = pool.imap_unordered(process_and_run_chunk, 
                                        zip(filelist_with_sublists, 
                                            repeat(args),
                                            repeat(q)))
        for i in tqdm(imap_gen, total=len(filelist_with_sublists)):
            pass

    logging.info('Done processing {} files'.format(len(all_files)))
    
    # end logging_thread
    q.put(None)
    log_thread.join()
Example #13
def main(args):

    # make all paths absolute (to simplify things later)
    args.paragraph_path = os.path.abspath(args.paragraph_path)
    args.output_directory = os.path.abspath(args.output_directory)
    args.logdir = os.path.abspath(args.logdir)
    args.dnorm = os.path.abspath(args.dnorm)
    args.bioshovel = os.path.abspath(args.bioshovel)

    file_exists_or_exit(os.path.join(args.dnorm, 'ApplyDNorm.sh'))

    # calculate how many Java/DNorm processes to start, given available RAM
    # (DNorm requires 10GB RAM per process)
    args.poolsize = calc_dnorm_num_processes(num_cores=args.poolsize,
                                             ram_gb=args.memgb)

    # get all files (recursive)
    print('Organizing input files and submitting jobs...')
    if args.resume:
        # filter out filenames that are already done...
        args.resume = os.path.abspath(args.resume)
        file_exists_or_exit(args.resume)
        print('Resuming job submission from path: {}'.format(args.resume))

        if os.path.exists(args.output_directory):
            print('Output directory changed to avoid file conflicts:')
            print('OLD: '+args.output_directory)
            args.output_directory += '_resume'
            print('NEW: '+args.output_directory)

        print('Reading previously completed files...')
        # this set may use a lot of RAM if args.resume path contains a ton of files...
        done_files = set(p.name for p in tqdm(Path(args.resume).glob('**/*')) if p.is_file())

        all_files = (str(f) for f in tqdm(Path(args.paragraph_path).glob('**/*')) if f.is_file() and f.name not in done_files)
    else:
        all_files = (str(f) for f in tqdm(Path(args.paragraph_path).glob('**/*')) if f.is_file())

    # divide list into chunks of size n
    filelist_with_sublists = create_sublists_sized_n(all_files, args.nfiles)

    # move job files into the appropriate subdirectories
    #   (don't store more than 1k per sub-subdirectory)
    # create job file for each subdirectory
    # submit jobs
    base_input_directory = os.path.join(args.output_directory, 'input_files')
    job_dir = os.path.join(args.output_directory, 'job_files')
    output_directory = os.path.join(args.output_directory, 'output')
    for path in (base_input_directory, job_dir, output_directory):
        ensure_path_exists(path)
    job_file_paths_failed = []
    jobs_submitted_success = 0
    for sublist_num, sublist in enumerate(filelist_with_sublists):
        sublist_dir = os.path.join(base_input_directory, 
                                   'sublist_{0:0>4}'.format(sublist_num))
        ensure_path_exists(sublist_dir)
        create_sublist_symlinks(sublist, sublist_dir, 1000)
        job_file_path = create_job_file(job_dir, sublist_dir, output_directory, sublist_num, args)
        if args.submit:
            result = submit_pbs_job(job_file_path, queue=args.queue)
            if result:
                jobs_submitted_success += 1
            else:
                job_file_paths_failed.append(job_file_path)

    print('Successfully submitted {} jobs with {} failed submissions'.format(jobs_submitted_success,
                                                                             len(job_file_paths_failed)))
    for path in job_file_paths_failed:
        print('FAILED TO SUBMIT', path)
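create_sublists_sized_n differs from create_n_sublists: it yields chunks of a fixed size rather than a fixed number of chunks, which keeps memory low when all_files is a generator, as above. A sketch of that assumed behavior:

from itertools import islice

def create_sublists_sized_n(iterable, n):
    """Yield successive lists of at most n items each from iterable."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk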
Example #14
def process_and_run_chunk(filepaths_args_tuple):

    ''' Generates reformatted files for each file path in 
        list_of_file_paths, saves them to a single temp directory,
        and calls DNorm on each individual file using subprocess
    '''

    list_of_file_paths, args, q = filepaths_args_tuple

    if not list_of_file_paths:
        return

    qh = logging.handlers.QueueHandler(q)
    l = logging.getLogger()
    # attach the queue handler so worker log records reach the logging thread
    # in the main process (guard against adding it more than once per worker)
    if not any(isinstance(h, logging.handlers.QueueHandler) for h in l.handlers):
        l.addHandler(qh)

    parsed_files = [parse_parform_file(file_path)
                    for file_path in list_of_file_paths]

    # filter out files with no title line 
    # (for which parse_parform_file returned None)
    parsed_files = [f for f in parsed_files if f]

    reformatted_files = [parform_to_pubtator(escaped_doi, title_line, body)
                         for escaped_doi, title_line, body in parsed_files]

    reqs = {'banner_ncbidisease': os.path.join(args.dnorm,
                                               'config', 
                                               'banner_NCBIDisease_UMLS2013AA_TEST.xml'),
            'ctd_diseases': os.path.join(args.dnorm,
                                         'data',
                                         'CTD_diseases.tsv'),
            'simmatrix': os.path.join(args.dnorm,
                                      'output',
                                      'simmatrix_NCBIDisease_e4.bin'),
            'ab3p_path': os.path.join(args.dnorm,
                                      '..',
                                      'Ab3P-v1.5')}
    for required_file in reqs:
        file_exists_or_exit(reqs[required_file])

    with tempfile.TemporaryDirectory() as input_tempdir, \
            tempfile.TemporaryDirectory() as output_tempdir, \
            tempfile.TemporaryDirectory() as dnorm_tempdir:
        for doi_filename, file_info in reformatted_files:
            save_file(doi_filename, file_info, input_tempdir)

            try:
                out = subprocess.check_output(['bash',
                                               'ApplyDNorm.sh',
                                               reqs['banner_ncbidisease'],
                                               reqs['ctd_diseases'],
                                               reqs['simmatrix'],
                                               reqs['ab3p_path'],
                                               dnorm_tempdir,
                                               os.path.join(input_tempdir,
                                                            doi_filename),
                                               os.path.join(output_tempdir,
                                                            doi_filename)],
                                               cwd=args.dnorm,
                                               stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as err:
                string_error = err.output.decode(encoding='UTF-8').rstrip('\n')
                l.critical('DNorm error: {}'.format(string_error))
                l.critical('DNorm error while processing chunk: {}'.format(list_of_file_paths))

        # grab all new output files and copy to args.output_directory
        all_tempfiles = glob(os.path.join(output_tempdir, '*'))

        try:
            subprocess.check_output(['cp', '-t', args.output_directory+'/'] + all_tempfiles)
        except subprocess.CalledProcessError:
            l.critical('Copy error, chunk: {}'.format(list_of_file_paths))
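parse_parform_file is only characterized above as returning None for files with no title line and otherwise a (escaped_doi, title_line, body) tuple. A heavily simplified sketch under those assumptions (treating the file name as the escaped DOI is a guess):

import os

def parse_parform_file(file_path):
    """Return (escaped_doi, title_line, body_lines), or None if no title line."""
    with open(file_path) as f:
        lines = [line.rstrip('\n') for line in f]
    if not lines or not lines[0].strip():
        return None
    escaped_doi = os.path.basename(file_path)  # assumption: filename is the escaped DOI
    return escaped_doi, lines[0], lines[1:]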
Example #15
def main(args):

    ensure_path_exists(args.output_directory)
    # pmid_path = os.path.join(args.output_directory, 'by_pmid')
    # ensure_path_exists(pmid_path)

    if args.doiindex:
        file_exists_or_exit(args.doiindex)
        # doi_path = os.path.join(args.output_directory, 'by_doi')
        # ensure_path_exists(doi_path)

    pmid_doi_map = create_pmid_doi_mapping(args)

    all_files = glob(os.path.join(args.xml_file_directory, '*'))
    doi_filenames = []
    pmid_filenames = []
    skipped_files = 0

    total_citation_count = 0
    print('Processing {} XML files...'.format(len(all_files)))
    for xml_filepath in tqdm(all_files):

        root = get_root_object(xml_filepath)
        citations = get_element('MedlineCitation', root)

        for citation in citations:

            # update save directories every 10k files.
            #
            # directory structure will be:
            #   [base_dir]/0000/by_pmid/
            #   [base_dir]/0000/by_doi/
            #   [base_dir]/0001/by_pmid/
            #   [base_dir]/0001/by_doi/
            #   ...
            #
            # (no subdirectory will have > 10000 files)
            if total_citation_count % 10000 == 0:
                pmid_path = os.path.join(args.output_directory,
                                         '{0:0>4}'.format(total_citation_count//10000),
                                         'by_pmid')
                ensure_path_exists(pmid_path)

                if args.doiindex:
                    doi_path = os.path.join(args.output_directory,
                                            '{0:0>4}'.format(total_citation_count//10000),
                                            'by_doi')
                    ensure_path_exists(doi_path)

            total_citation_count += 1

            abstracts = get_element('AbstractText', citation)
            if not abstracts: # not all MedlineCitations have Abstracts...
                continue

            # grab one of the abstracts identified in this citation and identify
            # its parent XML elements
            first_abstract_section = abstracts[0]
            d = get_abstract_parent_info(first_abstract_section)

            pmid = d['pmid']
            title = d['title']
            if not pmid or not d['title']:
                # if no PMID, no easy way to identify article (skip)
                # if no title, nothing to include in PubTator title line (skip)
                skipped_files += 1
                continue

            combined_abstract = combine_all_abstract_text_tags(d['parent_abstract'])
            file_lines = create_filelines(title, combined_abstract)

            doi_file_name = pmid_doi_map.get(pmid)
            if doi_file_name:
                # use DOI file name and file path
                doi_filenames.append(save_file(doi_file_name, file_lines, doi_path))
            else:
                # use PMID file name and file path
                pmid_filenames.append(save_file(pmid, file_lines, pmid_path))

    print('{} files saved by PMID to {}'.format(len(pmid_filenames), pmid_path))
    if args.doiindex:
        print('{} files saved by DOI to {}'.format(len(doi_filenames),
                                                   doi_path))
    print('{} MEDLINE citations skipped'.format(skipped_files))
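save_file takes a bare file name, a list of lines, and a target directory, and its return value is collected above only to be counted. A sketch consistent with those calls (the exact return value is an assumption):

import os

def save_file(file_name, file_lines, directory):
    """Write file_lines to directory/file_name and return the file name."""
    file_path = os.path.join(directory, file_name)
    with open(file_path, 'w') as f:
        f.writelines(file_lines)
    return file_name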
Example #16
def process_and_run_chunk(filepaths_args_tuple):
    ''' Generates reformatted files for each file path in 
        list_of_file_paths, saves them to a single temp directory,
        and calls DNorm on each individual file using subprocess
    '''

    list_of_file_paths, args, q = filepaths_args_tuple

    if not list_of_file_paths:
        return

    qh = logging.handlers.QueueHandler(q)
    l = logging.getLogger()
    # attach the queue handler so worker log records reach the logging thread
    # in the main process (guard against adding it more than once per worker)
    if not any(isinstance(h, logging.handlers.QueueHandler) for h in l.handlers):
        l.addHandler(qh)

    parsed_files = [
        parse_parform_file(file_path) for file_path in list_of_file_paths
    ]

    # filter out files with no title line
    # (for which parse_parform_file returned None)
    parsed_files = [f for f in parsed_files if f]

    reformatted_files = [
        parform_to_pubtator(escaped_doi, title_line, body)
        for escaped_doi, title_line, body in parsed_files
    ]

    reqs = {
        'banner_ncbidisease':
        os.path.join(args.dnorm, 'config',
                     'banner_NCBIDisease_UMLS2013AA_TEST.xml'),
        'ctd_diseases':
        os.path.join(args.dnorm, 'data', 'CTD_diseases.tsv'),
        'simmatrix':
        os.path.join(args.dnorm, 'output', 'simmatrix_NCBIDisease_e4.bin'),
        'ab3p_path':
        os.path.join(args.dnorm, '..', 'Ab3P-v1.5')
    }
    for required_file in reqs:
        file_exists_or_exit(reqs[required_file])

    with tempfile.TemporaryDirectory() as input_tempdir, \
            tempfile.TemporaryDirectory() as output_tempdir, \
            tempfile.TemporaryDirectory() as dnorm_tempdir:
        for doi_filename, file_info in reformatted_files:
            save_file(doi_filename, file_info, input_tempdir)

            try:
                out = subprocess.check_output([
                    'bash', 'ApplyDNorm.sh', reqs['banner_ncbidisease'],
                    reqs['ctd_diseases'], reqs['simmatrix'], reqs['ab3p_path'],
                    dnorm_tempdir,
                    os.path.join(input_tempdir, doi_filename),
                    os.path.join(output_tempdir, doi_filename)
                ],
                                              cwd=args.dnorm,
                                              stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as err:
                string_error = err.output.decode(encoding='UTF-8').rstrip('\n')
                l.critical('DNorm error: {}'.format(string_error))
                l.critical('DNorm error while processing chunk: {}'.format(
                    list_of_file_paths))

        # grab all new output files and copy to args.output_directory
        all_tempfiles = glob(os.path.join(output_tempdir, '*'))

        try:
            subprocess.check_output(['cp', '-t', args.output_directory + '/'] +
                                    all_tempfiles)
        except subprocess.CalledProcessError:
            l.critical('Copy error, chunk: {}'.format(list_of_file_paths))
Example #17
def main(args):

    ensure_path_exists(args.output_directory)
    # pmid_path = os.path.join(args.output_directory, 'by_pmid')
    # ensure_path_exists(pmid_path)

    if args.doiindex:
        file_exists_or_exit(args.doiindex)
        # doi_path = os.path.join(args.output_directory, 'by_doi')
        # ensure_path_exists(doi_path)

    pmid_doi_map = create_pmid_doi_mapping(args)

    all_files = glob(os.path.join(args.xml_file_directory, '*'))
    doi_filenames = []
    pmid_filenames = []
    skipped_files = 0

    total_citation_count = 0
    print('Processing {} XML files...'.format(len(all_files)))
    for xml_filepath in tqdm(all_files):

        root = get_root_object(xml_filepath)
        citations = get_element('MedlineCitation', root)

        for citation in citations:

            # update save directories every 10k files.
            #
            # directory structure will be:
            #   [base_dir]/0000/by_pmid/
            #   [base_dir]/0000/by_doi/
            #   [base_dir]/0001/by_pmid/
            #   [base_dir]/0001/by_doi/
            #   ...
            #
            # (no subdirectory will have > 10000 files)
            if total_citation_count % 10000 == 0:
                pmid_path = os.path.join(
                    args.output_directory,
                    '{0:0>4}'.format(total_citation_count // 10000), 'by_pmid')
                ensure_path_exists(pmid_path)

                if args.doiindex:
                    doi_path = os.path.join(
                        args.output_directory,
                        '{0:0>4}'.format(total_citation_count // 10000),
                        'by_doi')
                    ensure_path_exists(doi_path)

            total_citation_count += 1

            abstracts = get_element('AbstractText', citation)
            if not abstracts:  # not all MedlineCitations have Abstracts...
                continue

            # grab one of the abstracts identified in this citation and identify
            # its parent XML elements
            first_abstract_section = abstracts[0]
            d = get_abstract_parent_info(first_abstract_section)

            pmid = d['pmid']
            title = d['title']
            if not pmid or not d['title']:
                # if no PMID, no easy way to identify article (skip)
                # if no title, nothing to include in PubTator title line (skip)
                skipped_files += 1
                continue

            combined_abstract = combine_all_abstract_text_tags(
                d['parent_abstract'])
            file_lines = create_filelines(title, combined_abstract)

            doi_file_name = pmid_doi_map.get(pmid)
            if doi_file_name:
                # use DOI file name and file path
                doi_filenames.append(
                    save_file(doi_file_name, file_lines, doi_path))
            else:
                # use PMID file name and file path
                pmid_filenames.append(save_file(pmid, file_lines, pmid_path))

    print('{} files saved by PMID to {}'.format(len(pmid_filenames),
                                                pmid_path))
    if args.doiindex:
        print('{} files saved by DOI to {}'.format(len(doi_filenames),
                                                   doi_path))
    print('{} MEDLINE citations skipped'.format(skipped_files))
Example #18
def main(args):

    # make all paths absolute (to simplify things later)
    args.paragraph_path = os.path.abspath(args.paragraph_path)
    args.output_directory = os.path.abspath(args.output_directory)
    args.logdir = os.path.abspath(args.logdir)
    args.tmchem = os.path.abspath(args.tmchem)
    args.bioshovel = os.path.abspath(args.bioshovel)

    file_exists_or_exit(os.path.join(args.tmchem, 'tmChem.pl'))

    # get all files (recursive)
    print('Organizing input files and submitting jobs...')
    if args.resume:
        # filter out filenames that are already done...
        args.resume = os.path.abspath(args.resume)
        file_exists_or_exit(args.resume)
        print('Resuming job submission from path: {}'.format(args.resume))

        if os.path.exists(args.output_directory):
            print('Output directory changed to avoid file conflicts:')
            print('OLD: ' + args.output_directory)
            args.output_directory += '_resume'
            print('NEW: ' + args.output_directory)

        print('Reading previously completed files...')
        # this set may use a lot of RAM if args.resume path contains a ton of files...
        # note: str.rstrip removes a set of characters, not a suffix, so the
        # '.tmChem' extension is stripped explicitly here
        done_files = set(
            p.name[:-len('.tmChem')] if p.name.endswith('.tmChem') else p.name
            for p in tqdm(Path(args.resume).glob('**/*')) if p.is_file())

        all_files = (str(f)
                     for f in tqdm(Path(args.paragraph_path).glob('**/*'))
                     if f.is_file() and f.name not in done_files)
    else:
        all_files = (str(f)
                     for f in tqdm(Path(args.paragraph_path).glob('**/*'))
                     if f.is_file())

    # divide list into chunks of size n
    filelist_with_sublists = create_sublists_sized_n(all_files, args.nfiles)

    # move job files into the appropriate subdirectories
    #   (don't store more than 1k per sub-subdirectory)
    # create job file for each subdirectory
    # submit jobs
    base_input_directory = os.path.join(args.output_directory, 'input_files')
    job_dir = os.path.join(args.output_directory, 'job_files')
    output_directory = os.path.join(args.output_directory, 'output')
    for path in (base_input_directory, job_dir, output_directory):
        ensure_path_exists(path)
    job_file_paths_failed = []
    jobs_submitted_success = 0
    for sublist_num, sublist in enumerate(filelist_with_sublists):
        sublist_dir = os.path.join(base_input_directory,
                                   'sublist_{0:0>4}'.format(sublist_num))
        ensure_path_exists(sublist_dir)
        create_sublist_symlinks(sublist, sublist_dir, 1000)
        job_file_path = create_job_file(job_dir, sublist_dir, output_directory,
                                        sublist_num, args)
        if args.submit:
            result = submit_pbs_job(job_file_path, queue=args.queue)
            if result:
                jobs_submitted_success += 1
            else:
                job_file_paths_failed.append(job_file_path)

    print('Successfully submitted {} jobs with {} failed submissions'.format(
        jobs_submitted_success, len(job_file_paths_failed)))
    for path in job_file_paths_failed:
        print('FAILED TO SUBMIT', path)
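create_sublist_symlinks is described only by the comment above it (keep at most 1,000 files per sub-subdirectory). A sketch under that assumption, again reusing the zero-padded directory naming from these examples:

import os

def create_sublist_symlinks(file_paths, sublist_dir, max_per_subdir=1000):
    """Symlink each input file into zero-padded sub-subdirectories of
       sublist_dir, with at most max_per_subdir links per directory."""
    for i, file_path in enumerate(file_paths):
        subdir = os.path.join(sublist_dir,
                              '{0:0>4}'.format(i // max_per_subdir))
        os.makedirs(subdir, exist_ok=True)
        os.symlink(file_path,
                   os.path.join(subdir, os.path.basename(file_path)))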