Example 1
def main(data_dir, target_list, fastas_dir, pssms_dir, psfms_dir,
         secstructs_dir, num_cpus, blast_path, nr_path, psipred_path):
    """ Run psipreds to generate the PSSMs and secondary structure predictions
    for each residue in a protein structure.
    """
    logger = logging.getLogger(__name__)

    with open(target_list, 'r') as f:
        targets = [t.strip() for t in f.readlines()]

    logger.info("Running psipreds on {:} structures in {:}".format(
        len(targets), target_list))

    def __run(id, blast_path, nr_path, psipred_path, pssms_dir, psfms_dir,
              secstructs_dir, tmp_dir, target, fasta_file):
        logger.info("Processing target {:} ({:}/{:})".format(
            target, id, len(targets)))
        run_psipred(blast_path, nr_path, psipred_path, pssms_dir, psfms_dir,
                    secstructs_dir, tmp_dir, target, fasta_file)

    # Scratch directory for intermediate PSIPRED outputs (assumed location under data_dir).
    tmp_dir = os.path.join(data_dir, 'tmp')

    os.makedirs(pssms_dir, exist_ok=True)
    os.makedirs(psfms_dir, exist_ok=True)
    os.makedirs(secstructs_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    inputs = []
    for i, target in enumerate(targets):
        fasta_file = os.path.join(fastas_dir, '{:}.fasta'.format(target))
        if os.path.exists(fasta_file):
            inputs.append((i+1, blast_path, nr_path, psipred_path, pssms_dir,
                           psfms_dir, secstructs_dir, tmp_dir, target, fasta_file))
        else:
            logger.warning("FASTA for {:} does not exist".format(target))

    par.submit_jobs(__run, inputs, num_cpus)
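Every example in this section dispatches its per-item work through par.submit_jobs(func, inputs, num_cpus), where inputs is a list of argument tuples. The par module itself is not shown here; the following is a minimal sketch of a compatible implementation, assuming it behaves like a process-pool starmap that unpacks each tuple as positional arguments and returns the per-job results (Examples 10 and 14 sum that return value). The real module may differ; in particular, Example 1 passes a nested function, which a plain multiprocessing pool cannot pickle, so the sketch only illustrates the calling convention.

# Hypothetical stand-in for par.submit_jobs, inferred from how it is called
# in these examples.
import multiprocessing


def submit_jobs(func, inputs, num_cpus):
    """Run func(*args) for every tuple in inputs on num_cpus workers and
    return the list of per-job results, in input order."""
    if num_cpus <= 1:
        return [func(*args) for args in inputs]
    with multiprocessing.Pool(processes=num_cpus) as pool:
        return pool.starmap(func, inputs)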
Example 2
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [
        x[0] for x in db.get_all_filenames(work_keys,
                                           pdb_dataset,
                                           extension=ext,
                                           keyer=lambda x: db.get_pdb_name(x))
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".pkl")

    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
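Most of the remaining examples build their output paths the same way: each file is bucketed into a subdirectory named after the middle two characters of its PDB code. The scripts inline this logic; a hypothetical helper capturing the convention, assuming db.get_pdb_code and db.get_pdb_name behave as used above, could look like this.

# Illustrative helper only; `db` is the same project module used throughout
# these examples.
import os


def make_output_filename(output_dir, pdb_filename, ext):
    # A PDB code such as '1abc' is bucketed under 'ab', mirroring the
    # two-character subdirectory layout used by these scripts.
    sub_dir = os.path.join(output_dir, db.get_pdb_code(pdb_filename)[1:3])
    os.makedirs(sub_dir, exist_ok=True)
    return os.path.join(sub_dir, db.get_pdb_name(pdb_filename) + ext)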
Example 3
def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse pdb dataset (pdb files) to pandas dataframes."""
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')

    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [x[0] for x in
                      db.get_all_filenames(work_keys, pdb_dataset, enforcement=2)]

    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    inputs = [(key, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(parse, inputs, num_cpus)
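The bookkeeping in these scripts follows a single idiom: list the requested keys, list the keys already produced in the output directory, and only schedule the difference. A condensed sketch of that idiom, with illustrative names that do not appear in the source (db and logging are assumed from the surrounding scripts):

# Illustrative only; the scripts above inline this "requested minus produced"
# bookkeeping rather than sharing a helper.
import logging


def compute_work_keys(requested_filenames, produced_filenames):
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = set(db.get_pdb_name(x) for x in produced_filenames)
    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    return work_keys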
Example 4
def main(data_dir, target_list, labels_dir, struct_format,
         num_cpus, overwrite, tmscore_exe):
    """ Compute rmsd, tm-score, gdt-ts, gdt-ha of decoy structures
    """
    logger = logging.getLogger(__name__)
    logger.info("Compute rmsd, tm-score, gdt-ts, gdt-ha of decoys in {:}".format(
        data_dir))

    os.makedirs(labels_dir, exist_ok=True)

    with open(target_list, 'r') as f:
        requested_filenames = \
            [os.path.join(labels_dir, '{:}.dat'.format(x.strip())) for x in f]
    logger.info("{:} requested keys".format(len(requested_filenames)))

    produced_filenames = []
    if not overwrite:
        produced_filenames = [f for f in fi.find_files(labels_dir, 'dat') \
                              if 'targets' not in f]
    logger.info("{:} produced keys".format(len(produced_filenames)))

    inputs = []
    for filename in requested_filenames:
        if filename in produced_filenames:
            continue
        target_name = util.get_target_name(filename)
        target_dir = os.path.join(data_dir, target_name)
        inputs.append((tmscore_exe, filename, target_name,
                       target_dir, struct_format))

    logger.info("{:} work keys".format(len(inputs)))
    par.submit_jobs(run_tmscore_per_target, inputs, num_cpus)
Example 5
def main(pair_dir, tfrecord_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]

    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0]
        for x in db.get_all_filenames(work_keys, pair_dir, extension='.dill')
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".tfrecord")

    inputs = [(i, o) for i, o in zip(work_filenames, output_filenames)]
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)
Example 6
def shard_envs(input_path, output_path, num_threads=8, subsample=True):
    input_sharded = sh.Sharded.load(input_path)
    keys = input_sharded.get_keys()
    if keys != ['ensemble']:
        raise RuntimeError('Can only apply to sharded by ensemble.')
    output_sharded = sh.Sharded(output_path, keys)
    input_num_shards = input_sharded.get_num_shards()

    tmp_path = output_sharded.get_prefix() + f'_tmp@{input_num_shards:}'
    tmp_sharded = sh.Sharded(tmp_path, keys)

    not_written = []
    for i in range(input_num_shards):
        shard = output_sharded._get_shard(i)
        if not os.path.exists(shard):
            not_written.append(i)

    print(f'Using {num_threads:} threads')

    # Only schedule shards whose output has not been written yet.
    inputs = [(input_sharded, tmp_sharded, shard_num, subsample)
              for shard_num in not_written]

    # with multiprocessing.Pool(processes=num_threads) as pool:
    #     pool.starmap(_shard_envs, inputs)
    par.submit_jobs(_shard_envs, inputs, num_threads)

    sho.reshard(tmp_sharded, output_sharded)
    tmp_sharded.delete_files()
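Examples 6 and 12 share the same sharding pattern: workers write into a temporary sharded dataset that mirrors the input shard count, and the result is then resharded into the final output layout. Below is a generic sketch of that flow using only the sh, sho, and par calls seen above; the per-shard worker is a placeholder for functions such as _shard_envs or _shard_pairs.

# Sketch of the temporary-shard pattern; worker is any per-shard function
# taking (input_sharded, tmp_sharded, shard_num, *extra_args).
def process_sharded(input_path, output_path, worker, extra_args=(),
                    num_threads=8):
    input_sharded = sh.Sharded.load(input_path)
    keys = input_sharded.get_keys()
    output_sharded = sh.Sharded(output_path, keys)
    num_shards = input_sharded.get_num_shards()

    # Temporary sharded dataset with the same shard count as the input.
    tmp_sharded = sh.Sharded(
        output_sharded.get_prefix() + f'_tmp@{num_shards:}', keys)

    inputs = [(input_sharded, tmp_sharded, shard_num) + tuple(extra_args)
              for shard_num in range(num_shards)]
    par.submit_jobs(worker, inputs, num_threads)

    sho.reshard(tmp_sharded, output_sharded)  # merge tmp shards into final layout
    tmp_sharded.delete_files()                # drop the intermediate shards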
Example 7
def save_graphs(sharded, out_dir, num_threads=8):
    num_shards = sharded.get_num_shards()
    inputs = [(sharded, shard_num, out_dir) for shard_num in range(num_shards)]

    # with multiprocessing.Pool(processes=num_threads) as pool:
    #     pool.starmap(_save_graphs, inputs)
    par.submit_jobs(_save_graphs, inputs, num_threads)
    _rename(out_dir)
Example 8
def bsa_db(sharded_path, output_bsa, num_threads):
    sharded = sh.Sharded.load(sharded_path)
    num_shards = sharded.get_num_shards()

    dirname = os.path.dirname(output_bsa)
    if dirname != '':
        os.makedirs(dirname, exist_ok=True)

    inputs = [(sharded, x, output_bsa) for x in range(num_shards)]
    logger.info(f'{num_shards:} shards to do.')
    logger.info(f'Using {num_threads:} threads')

    par.submit_jobs(_bsa_db, inputs, num_threads)
Example 9
def all_complex_to_pairs(complexes, get_pairs, output_dir, num_cpus):
    """Reads in structures and produces appropriate pairings."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    requested_keys = complexes['data'].keys()
    produced_keys = complexes_from_pair_dir(output_dir)
    work_keys = [key for key in requested_keys if key not in produced_keys]

    inputs = [(complexes['data'][key], get_pairs, output_dir)
              for key in work_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    par.submit_jobs(complex_to_pairs, inputs, num_cpus)
Example 10
def main(pair_dir, to_keep_dir, output_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    to_keep_filenames = \
        db.get_structures_filenames(to_keep_dir, extension='.txt')
    if len(to_keep_filenames) == 0:
        logging.warning(
            "There is no to_keep file in {:}. All pair files from {:} "
            "will be copied into {:}".format(to_keep_dir, pair_dir,
                                             output_dir))

    to_keep_df = __load_to_keep_files_into_dataframe(to_keep_filenames)
    logging.info("There are {:} rows, cols in to_keep_df".format(
        to_keep_df.shape))

    logging.info("Looking for all pairs in {:}".format(pair_dir))
    work_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), output_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".dill")

    inputs = [(i, o, to_keep_df)
              for i, o in zip(work_filenames, output_filenames)]
    ncopied = np.sum(par.submit_jobs(process_pairs_to_keep, inputs, num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        ncopied, len(work_keys)))
Example 11
def generate_all_clean_complexes(args):
    """Clean all complexes in input_dir, writing them out to output_dir."""
    requested_keys = get_complex_pdb_codes(args.pdb_dataset)
    produced_filenames = db.get_structures_filenames(args.output_dir)
    produced_keys = []
    for pdb_code in requested_keys:
        res = get_files_for_complex(pdb_code, produced_filenames, 'db5')
        if len([x for x in res if x is None]) == 0:
            produced_keys.append(pdb_code)
    work_keys = [key for key in requested_keys if key not in produced_keys]

    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))

    inputs = [(pc, args.pdb_dataset, args.output_dir + '/' + pc, args.style)
              for pc in work_keys]
    par.submit_jobs(_generate_clean_complex, inputs, args.c)
Example 12
def shard_pairs(input_path, output_path, cutoff, cutoff_type, num_threads):
    input_sharded = sh.Sharded.load(input_path)
    keys = input_sharded.get_keys()
    if keys != ['ensemble']:
        raise RuntimeError('Can only apply to sharded by ensemble.')
    output_sharded = sh.Sharded(output_path, keys)
    input_num_shards = input_sharded.get_num_shards()

    tmp_path = output_sharded.get_prefix() + f'_tmp@{input_num_shards:}'
    tmp_sharded = sh.Sharded(tmp_path, keys)

    logger.info(f'Using {num_threads:} threads')

    inputs = [(input_sharded, tmp_sharded, shard_num, cutoff, cutoff_type)
              for shard_num in range(input_num_shards)]

    par.submit_jobs(_shard_pairs, inputs, num_threads)

    sho.reshard(tmp_sharded, output_sharded)
    tmp_sharded.delete_files()
Example 13
def gen_labels_sharded(sharded_path, data_csv, num_threads, overwrite):
    sharded = sh.Sharded.load(sharded_path)
    num_shards = sharded.get_num_shards()

    requested_shards = list(range(num_shards))
    if not overwrite:
        produced_shards = [
            x for x in requested_shards if sharded.has(x, 'labels')
        ]
    else:
        produced_shards = []

    work_shards = set(requested_shards).difference(produced_shards)
    logger.info(f'{len(requested_shards):} requested, '
                f'{len(produced_shards):} already produced, '
                f'{len(work_shards):} left to do.')
    logger.info(f'Using {num_threads:} threads')

    inputs = [(sharded, shard_num, data_csv) for shard_num in work_shards]

    par.submit_jobs(_gen_labels_shard, inputs, num_threads)
Example 14
def main(raw_pdb_dir, pruned_pairs_dir, output_dir, neighbor_def, cutoff,
         num_cpus):
    """Run postprocess_pruned_pairs on all provided complexes."""
    logging.info("Looking for all pairs in {:}".format(pruned_pairs_dir))
    work_filenames = \
        db.get_structures_filenames(pruned_pairs_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), output_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".dill")

    inputs = [(raw_pdb_dir, neighbor_def, cutoff, i, o)
              for i, o in zip(work_filenames, output_filenames)]
    n_copied = np.sum(
        par.submit_jobs(postprocess_pruned_pairs, inputs, num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        n_copied, len(work_keys)))
Example 15
def map_all_profile_hmms(pkl_dataset, pruned_dataset, output_dir, hhsuite_db,
                         num_cpu_jobs, num_cpus_per_job, source_type, num_iter,
                         rank, size, write_file):
    ext = '.pkl'
    if write_file:
        if source_type.lower() == 'rcsb':
            # Filter out pairs that did not survive pruning previously to reduce complexity
            pruned_pdb_names = [
                db.get_pdb_name(filename)
                for filename in db.get_structures_filenames(pruned_dataset,
                                                            extension='.dill')
            ]
            requested_filenames = [
                os.path.join(pkl_dataset,
                             db.get_pdb_code(pruned_pdb_name)[1:3],
                             pruned_pdb_name.split('_')[0] + ext)
                for pruned_pdb_name in pruned_pdb_names
            ]
        else:  # DB5 does not employ pair pruning, so there are no pairs to filter
            requested_filenames = [
                filename
                for filename in db.get_structures_filenames(pkl_dataset,
                                                            extension=ext)
            ]

        # Filter DB5 filenames to unbound type and get all work filenames
        requested_filenames = [
            filename for filename in requested_filenames
            if (source_type.lower() == 'db5' and '_u_' in filename) or (
                source_type.lower() in
                ['rcsb', 'evcoupling', 'casp_capri', 'input'])
        ]
        requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
        produced_filenames = db.get_structures_filenames(output_dir,
                                                         extension='.pkl')
        produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
        work_keys = [key for key in requested_keys if key not in produced_keys]
        establish_pdb_code_case = lambda pdb_code, source_type: pdb_code.lower() \
            if source_type.lower() == 'casp_capri' \
            else pdb_code.upper()
        work_filenames = [
            os.path.join(
                pkl_dataset,
                establish_pdb_code_case(db.get_pdb_code(work_key),
                                        source_type)[1:3], work_key + ext)
            for work_key in work_keys
        ]

        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))
        logging.info(
            "{:} requested keys, {:} produced keys, {:} work filenames".format(
                len(requested_keys), len(produced_keys), len(work_filenames)))

        if source_type.lower() == 'input':
            # Directly generate profile HMM features after aggregating input filenames
            logging.info("{:} work filenames".format(len(work_filenames)))

            output_filenames = []
            for pdb_filename in work_filenames:
                sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
                if not os.path.exists(sub_dir):
                    os.makedirs(sub_dir, exist_ok=True)
                output_filenames.append(sub_dir + '/' +
                                        db.get_pdb_name(pdb_filename) + '.pkl')

            inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                       num_iter)
                      for key, output in zip(work_filenames, output_filenames)]
            par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
        else:
            # Write out a local file containing all work filenames
            temp_df = pd.DataFrame({'filename': work_filenames})
            temp_df.to_csv(f'{source_type}_work_filenames.csv')
            logging.info(
                'File containing work filenames written to storage. Exiting...'
            )

    # Read from previously-created work filenames CSV
    else:
        work_filenames = pd.read_csv(
            f'{source_type}_work_filenames.csv').iloc[:, 1].to_list()
        work_filenames = list(
            set(work_filenames))  # Remove any duplicate filenames

        # Reserve an equally-sized portion of the full work load for a given rank in the MPI world
        work_filename_rank_batches = slice_list(work_filenames, size)
        work_filenames = work_filename_rank_batches[rank]

        logging.info("{:} work filenames".format(len(work_filenames)))

        output_filenames = []
        for pdb_filename in work_filenames:
            sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            output_filenames.append(sub_dir + '/' +
                                    db.get_pdb_name(pdb_filename) + '.pkl')

        inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                   num_iter)
                  for key, output in zip(work_filenames, output_filenames)]
        par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
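This example and the final map_all_pssms example split the deduplicated work list across MPI ranks with slice_list(work_filenames, size) and then index the result by rank. That helper is not shown here; below is a minimal sketch consistent with that usage, assuming it simply partitions a list into size roughly equal contiguous batches.

# Assumed behaviour of slice_list as used above: partition lst into n
# contiguous, roughly equal batches so that batches[rank] is one rank's share.
def slice_list(lst, n):
    quotient, remainder = divmod(len(lst), n)
    batches, start = [], 0
    for i in range(n):
        end = start + quotient + (1 if i < remainder else 0)
        batches.append(lst[start:end])
        start = end
    return batches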
Example 16
def regenerate_edge(edge, filelist, root, dryrun, job_server):
    def fileListSize(files):
        size = 0
        for f in files:
            stat = os.stat(f)
            size += stat.st_size
        return size

    if not edge.check_sources(filelist, root):
        return
    tgtdir = '%(root)s/%(lang)s/%(to_what)s' % {
        'root': root,
        'lang': edge.target['lang'],
        'to_what': edge.target['tag']
    }

    # printout
    msg = "# Doing %s" % str(edge)
    msg += ' (%s %s)' % ('.'.join([str(i) for i in time.localtime()[:3]]),
                         ':'.join([str(i) for i in time.localtime()[3:6]]))
    if dryrun:
        print(msg)
        if not os.path.exists(tgtdir):
            print('mkdir %s 2>/dev/null' % tgtdir)
        if hasattr(edge, 'collector') and edge.collector:
            print('# Collecting before running giza')
        else:
            #regenerating files one by one
            for f in filelist:
                regenerate_file(filename=f,
                                edge=edge,
                                root=root,
                                dryrun=dryrun)
    else:
        logg(msg)

        # Starting daemon if needed
        if edge.daemon and not job_server:
            start_daemon(edge)

        if not os.path.exists(tgtdir):
            os.mkdir(tgtdir)

        # checking if edge is in "collector" mode
        if hasattr(edge, 'collector') and edge.collector:
            dirs_to_clear = [tgtdir]
            # collecting all the data first
            sourceDirs = edge.sourceDirs(root)
            for i, src in enumerate(edge.sources):
                dirs_to_clear.append(sourceDirs[i])
                collecting.collect(sourceDirs[i],
                                   suffix='.%s.%s' % (src['lang'], src['tag']))

            # run the command on the collected data
            regenerate_file(filename=collecting.DATA,
                            edge=edge,
                            root=root,
                            dryrun=dryrun)

            # separating file
            src0dir = sourceDirs[0]
            src0_catalog = '%s/%s.%s.%s' % (src0dir, collecting.CATALOG,
                                            edge.sources[0]['lang'],
                                            edge.sources[0]['tag'])
            suffix = '.%s.%s' % (edge.target['lang'], edge.target['tag'])
            collecting.separate(tgtdir, suffix=suffix, catalog=src0_catalog)

            collecting.clear_dirs(dirs_to_clear)
        else:
            #regenerating files one by one
            if not job_server:
                sourceFiles0 = edge.sourceFiles(root, filelist)[0]
                targetSize = fileListSize(sourceFiles0)
                actualSize = 0
                for sourceFile, filename in zip(sourceFiles0, filelist):
                    regenerate_file(filename=filename,
                                    edge=edge,
                                    root=root,
                                    dryrun=dryrun)
                    actualSize += os.stat(sourceFile).st_size
                    try:
                        percent = float(actualSize) / float(targetSize) * 100.0
                    except ZeroDivisionError:
                        sys.stdout.write(
                            'Nothing to be done. Only empty files.')
                        break
                    sys.stdout.write('%.2f%% ' % percent)
                    sys.stdout.flush()
                sys.stdout.write('\n')
            else:
                jobs = []
                sourceFiles0 = edge.sourceFiles(root, filelist)[0]
                for sourceFile, filename in zip(sourceFiles0, filelist):
                    regenerate_file(filename=filename,
                                    edge=edge,
                                    root=root,
                                    dryrun=dryrun,
                                    job_server=job_server,
                                    jobs=jobs)
                if not edge.daemon:
                    for i, job in enumerate(jobs):
                        job()
                        percent = float(i) / float(len(jobs)) * 100.0
                        sys.stdout.write('%.2f%% ' % percent)
                        sys.stdout.flush()
                    sys.stdout.write('\n')
                else:
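                    # This submit_jobs takes a job server, the edge command, a
                    # numeric argument, a port, and the collected jobs; it is a
                    # different dispatcher from the par.submit_jobs used in the
                    # other examples.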
                    submit_jobs(job_server, edge.command, 6, edge.port, jobs)
Example 17
def map_all_protrusion_indices(psaia_dir, psaia_config_file, pdb_dataset,
                               pkl_dataset, pruned_dataset, output_dir,
                               source_type):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(pkl_dataset,
                                                                 extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri', 'input'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    requested_pdb_codes = [db.get_pdb_code(x) for x in requested_filenames]
    produced_filenames_path = os.path.join(output_dir, 'PSAIA',
                                           source_type.upper())
    produced_filenames = [
        path.as_posix()
        for path in Path(produced_filenames_path).rglob('*.tbl')
    ]
    produced_keys = [db.get_pdb_code(x) for x in produced_filenames]
    work_keys = [
        key for key, pdb_code in zip(requested_keys, requested_pdb_codes)
        if pdb_code not in produced_keys
    ]
    format_pdb_code_for_inputs = lambda pdb_code, source_type: pdb_code[1:3] \
        if source_type.lower() in ['input'] \
        else pdb_code.upper()
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pdb_dataset,
                         db.get_pdb_code(work_key)[1:3], work_key)
            for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(
                pdb_dataset,
                format_pdb_code_for_inputs(db.get_pdb_code(work_key),
                                           source_type), work_key)
            for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))

    # Report how many PDB files remain for PSAIA to process
    logging.info("{:} PDB files to process with PSAIA".format(
        len(work_filenames)))

    # Write a single file list that PSAIA will process (single-threadedly) to compute the requested features (e.g. protrusion)
    file_list_file = os.path.join(output_dir, 'PSAIA', source_type.upper(),
                                  'pdb_list.fls')
    with open(file_list_file, 'w') as file:
        for requested_pdb_filename in work_filenames:
            file.write(f'{requested_pdb_filename}\n')

    inputs = [(psaia_dir, psaia_config_file, file_list_file)]
    par.submit_jobs(map_protrusion_indices, inputs,
                    1)  # PSAIA is inherently single-threaded in execution
Example 18
def map_all_pssms(pkl_dataset, pruned_dataset, blastdb, output_dir, num_cpus,
                  source_type, rank, size):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(pkl_dataset,
                                                                 extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(work_key)[1:3], work_key + ext)
            for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(work_key)[1:3].upper(),
                         work_key + ext) for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))

    # Reserve an equally-sized portion of the full work load for a given rank in the MPI world
    work_filename_rank_batches = slice_list(work_filenames, size)
    work_filenames = work_filename_rank_batches[rank]

    logging.info(
        "{:} requested keys, {:} produced keys, {:} work filenames".format(
            len(requested_keys), len(produced_keys), len(work_filenames)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".pkl")

    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)