コード例 #1
0
def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse pdb dataset (pdb files) to pandas dataframes.

    Structures whose key already has a ``.pkl`` file under ``output_dir``
    are skipped; every remaining structure is submitted to ``parse`` through
    ``par.submit_jobs``.

    Args:
        pdb_dataset: Input location passed to
            ``db.get_structures_filenames`` and ``db.get_all_filenames``.
        output_dir: Root directory of the produced ``.pkl`` files. Outputs
            are written to ``<output_dir>/<pdb_code[1:3]>/<pdb_name>.pkl``;
            missing sub-directories are created here, before submission.
        num_cpus: Forwarded unchanged to ``par.submit_jobs``.
    """
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')

    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    # Membership is tested once per requested key; a set keeps the filter
    # linear instead of rescanning the produced list every time.
    already_produced = set(produced_keys)
    work_keys = [key for key in requested_keys
                 if key not in already_produced]
    work_filenames = [x[0] for x in
                      db.get_all_filenames(work_keys, pdb_dataset, enforcement=2)]

    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))

    output_filenames = []
    for pdb_filename in work_filenames:
        # Outputs are sharded by characters 1-2 of the pdb code.
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    # One (input_path, output_path) tuple per job.
    inputs = list(zip(work_filenames, output_filenames))
    par.submit_jobs(parse, inputs, num_cpus)
コード例 #2
0
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
    """Submit a ``map_pssms`` job for every structure lacking an output.

    Input ``.pkl`` files found under ``pdb_dataset`` are compared by key
    against the ``.pkl`` files already present under ``output_dir``; only
    the missing ones are submitted through ``par.submit_jobs``.

    Args:
        pdb_dataset: Input location passed to
            ``db.get_structures_filenames`` and ``db.get_all_filenames``.
        blastdb: Forwarded unchanged as the second element of every job
            tuple given to ``map_pssms``.
        output_dir: Root directory of the produced ``.pkl`` files. Outputs
            are written to ``<output_dir>/<pdb_code[1:3]>/<pdb_name>.pkl``;
            missing sub-directories are created here, before submission.
        num_cpus: Forwarded unchanged to ``par.submit_jobs``.
    """
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension=ext)
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    # Membership is tested once per requested key; a set keeps the filter
    # linear instead of rescanning the produced list every time.
    already_produced = set(produced_keys)
    work_keys = [key for key in requested_keys
                 if key not in already_produced]
    work_filenames = [
        x[0] for x in db.get_all_filenames(work_keys,
                                           pdb_dataset,
                                           extension=ext,
                                           keyer=db.get_pdb_name)
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        # Outputs are sharded by characters 1-2 of the pdb code.
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".pkl")

    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    # One (input_path, blastdb, output_path) tuple per job.
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
コード例 #3
0
ファイル: tfrecord.py プロジェクト: amorehead/DIPS
def main(pair_dir, tfrecord_dir, num_cpus):
    """Run pairs_to_tfrecord on every pair file not yet converted.

    ``.dill`` files found under ``pair_dir`` are compared by key against the
    ``.tfrecord`` files already present under ``tfrecord_dir``; only the
    missing ones are submitted through ``par.submit_jobs``.

    Args:
        pair_dir: Input location passed to ``db.get_structures_filenames``
            and ``db.get_all_filenames`` (extension ``.dill``).
        tfrecord_dir: Root directory of the produced ``.tfrecord`` files.
            Outputs are written to
            ``<tfrecord_dir>/<pdb_code[1:3]>/<pdb_name>.tfrecord``; missing
            sub-directories are created here, before submission.
        num_cpus: Forwarded unchanged to ``par.submit_jobs``.
    """
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]

    # Membership is tested once per requested key; a set keeps the filter
    # linear instead of rescanning the produced list every time.
    already_produced = set(produced_keys)
    work_keys = [key for key in requested_keys
                 if key not in already_produced]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0]
        for x in db.get_all_filenames(work_keys, pair_dir, extension='.dill')
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        # Outputs are sharded by characters 1-2 of the pdb code.
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".tfrecord")

    # One (input_path, output_path) tuple per job.
    inputs = list(zip(work_filenames, output_filenames))
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)