Example #1
def main():
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--download_threads', type=int, default=1, help='number of threads')

    # parser.add_argument('--pdb_dir', type=str, action='store_true',
    #                     help='now pdb_codes_or_directory is a path to a directory with mmcif files. Whole tree structure is inspected, all files are assumed to be mmcifs.')
    parser.add_argument('-i', '--input_type', default='json', choices=['json', 'pdb_codes'], help='how to interpret the positional `input` argument')
    parser.add_argument('input', help='a json file with a list of chains (records with a pdb_code column), or a comma-delimited list of pdb_codes, depending on --input_type')
    add_loglevel_args(parser)

    args = parser.parse_args()
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()

    if args.input_type == 'pdb_codes':
        # drop empty entries so that an empty input actually triggers the check below
        pdb_codes = [code.strip() for code in args.input.split(',') if code.strip()]

        if not pdb_codes:
            logger.error('No pdb codes specified')
            sys.exit(1)

    elif args.input_type == 'json':
        # todo to accept just list of pdb_codes, add column chain_id? And won't that break chain_whitelists (want empty set)
        chains = pd.read_json(args.input)
        chains_gb_pdb_code = chains.groupby('pdb_code')
        pdb_codes = chains_gb_pdb_code.indices.keys()

    download_structures(list(pdb_codes), args.download_threads)
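# Example invocations (a sketch; the script name, the json file and the pdb codes are placeholders,
# the options are the ones defined by the parser above):
#   python download_structures.py --download_threads 4 -i pdb_codes 1abc,2xyz
#   python download_structures.py -i json chains.json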
Example #2
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--workers',
                        type=int,
                        default=1,
                        help='number of threads for concurrent API requests')
    parser.add_argument(
        'structures_json',
        help=
        'annotate the list of structures with isoform data. File needs to contain list of objects with pdb_code and chain_id'
    )
    parser.add_argument(
        'output_file',
        help='writes input json annotated with isoform uniprot id')
    add_loglevel_args(parser)
    args = parser.parse_args()
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()

    with open(args.structures_json) as f:
        structures_info = json.load(f)

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=args.workers) as executor:

        def get_isoform_or_none(ordinal, s):
            logger.info(f'processing {ordinal}-th structure {s["pdb_code"]}')

            try:
                return get_isoform(s['pdb_code'], s['chain_id'])
            except APIException as e:
                if '404' in str(e.__cause__):  # todo this does not work EDIT it does catch 404s, so why did I write that?
                    logger.info(f'isoform not found for {s["pdb_code"]}')
                else:
                    logger.exception(f'api error for {s["pdb_code"]}')
            except Exception:
                logger.exception(f'unexpected error for {s["pdb_code"]}')

            return None

        # todo if a structure has more chains, cache the API result (though that's a problem with multithreading)
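        # A possible sketch for that (not used here): memoise one API call per structure, so that several
        # chains of the same pdb_code hit the API only once. `fetch_isoforms_for_structure` is a hypothetical
        # per-structure helper returning a chain_id -> isoform mapping; functools.lru_cache is safe to call
        # from multiple threads (worst case, a few duplicate requests race before the first result is cached).
        #
        #   from functools import lru_cache
        #
        #   @lru_cache(maxsize=None)
        #   def fetch_isoforms_for_structure(pdb_code):
        #       ...  # hypothetical: one request returning isoforms for all chains of the structure
        #
        #   def get_isoform_cached(pdb_code, chain_id):
        #       return fetch_isoforms_for_structure(pdb_code).get(chain_id)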
        results = executor.map(get_isoform_or_none, itertools.count(),
                               structures_info)

    # update the dict (json) with isoform information
    for isoform, structure_info in zip(results, structures_info):
        structure_info['isoform'] = isoform

    with open(args.output_file, 'w') as f:
        json.dump(structures_info, f)
Example #3
# is_holo is defined at module level (not nested inside main) so that ProcessPoolExecutor
# can pickle it and send it to the worker processes.
def is_holo(ordinal, s: Dict):
    logger.info(f'processing {ordinal}-th structure {s["pdb_code"]}')

    is_holo_analyzer = IsHolo()

    # ignore warnings emitted while parsing the mmCIF file
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = MMCIFParser().get_structure(s['pdb_code'], s['path'])[0]

    return is_holo_analyzer(model, model[s['chain_id']])


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--workers',
                        type=int,
                        default=1,
                        help='number of subprocesses')
    parser.add_argument(
        'structures_json',
        help=
        'annotate the list of structures with is_holo bool. File needs to contain list of objects with pdb_code and chain_id and path to the structure'
    )
    parser.add_argument(
        'output_file',
        help='writes input json annotated with boolean "is holo"')
    add_loglevel_args(parser)
    args = parser.parse_args()
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()

    with open(args.structures_json) as f:
        structures_info = json.load(f)

    # todo multiprocessing logging will only work to stdout: "Although logging is thread-safe, and logging to a single
    # file from multiple threads in a single process is supported, logging to a single file from multiple processes is
    # not supported, because there is no standard way to serialize access to a single file across multiple processes
    # in Python." (quoted from the Python Logging Cookbook) -- see the sketch below
    # todo if a structure has multiple chains, load it only once (but I will be doing all of this by groups, so surely
    # I have to know which chain I roughly want, so it would be just that one. But there can be more of them..)
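    # A possible stdlib sketch for the logging todo above (not wired in here): worker processes put log
    # records on a queue and the parent process writes them out, using logging.handlers.QueueHandler /
    # QueueListener; the QueueHandler would have to be installed in every worker, e.g. via the executor's
    # initializer.
    #
    #   import logging.handlers
    #   import multiprocessing
    #
    #   log_queue = multiprocessing.Manager().Queue()
    #   listener = logging.handlers.QueueListener(log_queue, logging.StreamHandler())
    #   listener.start()
    #   # in a ProcessPoolExecutor initializer:
    #   #     logging.getLogger().addHandler(logging.handlers.QueueHandler(log_queue))
    #   ...
    #   listener.stop()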
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=args.workers) as executor:
        results = executor.map(is_holo, itertools.count(), structures_info)

    # add is_holo flag to the structure info dicts
    for is_holo_flag, structure_info in zip(results, structures_info):
        structure_info['is_holo'] = is_holo_flag

    with open(args.output_file, 'w') as f:
        json.dump(structures_info, f)
Example #4
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--workers', type=int, default=1, help='number of threads for concurrent API requests')
    parser.add_argument('structures_json', type=Path, help='File needs to contain list of objects with pdb_code, chain_id, isoform '
                                                'and is_holo flag')
    parser.add_argument('output_file', type=Path, help='writes apo-holo pairs in json')
    add_loglevel_args(parser)
    args = parser.parse_args()
    # todo combine this and put in logs (I only use this in scripts anyway)
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()

    json_files = glob.glob(str(args.structures_json))
    structures_metadata = read_jsons_with_seqs(json_files, quiet=False)  # or: quiet=args.loglevel > logging.INFO
    pairs = list(make_pairs_with_lcs(structures_metadata, args.workers))

    with args.output_file.open('w') as f:
        json.dump(pairs, f, cls=CustomJSONEncoder)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--uniprot_ids', help='comma-separated list of primary uniprot accessions')
    parser.add_argument('--limit_group_size_to', type=int, help='limit the number of chains taken from each uniprot group')
    parser.add_argument('--seed', default=42, type=int, help='random seed used when limiting the group size')
    parser.add_argument('output_file', help='output filename for the json list of collected chains (one record per chain)')
    add_loglevel_args(parser)

    args = parser.parse_args()
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()

    if args.uniprot_ids:
        uniprot_ids = set((g.strip() for g in args.uniprot_ids.split(',')))
    else:
        # means all uniprot groups
        uniprot_ids = None

    chains = collect_chains_for_uniprot_ids(uniprot_ids, args.limit_group_size_to, args.seed)
    chains.to_json(args.output_file, orient='records')
Example #6
def main():
    # runs for all isoforms by default
    # optionally specify a single isoform with --isoform

    import argparse

    parser = argparse.ArgumentParser()
    # parser.add_argument('--limit_pairs_for_group', type=int, help='process only structures with main chain of that isoform')
    parser.add_argument(
        '--workers',
        default=4,
        type=int,
        help='number of worker processes')
    parser.add_argument('--opt_input_dir', type=Path, default=Path())
    # parser.add_argument('chains_json', help='list of structures {pdb_code: , path: , isoform_id: , is_holo: bool, ?main_chain_id: }')
    parser.add_argument(
        'pairs_json',
        help='json with potential apo-holo pairs: objects with pdb_code_apo, chain_id_apo, '
             'pdb_code_holo, chain_id_holo and lcs_result')
    add_loglevel_args(parser)

    args = parser.parse_args()
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()

    potential_pairs = load_pairs_json(args.pairs_json)
    print(potential_pairs)
    pairs = pairs_without_mismatches(potential_pairs)
    # pairs = pairs.iloc[:100]  # todo testing hack
    print(pairs)
    # if args.limit_pairs_for_group:

    # not needed, I don't strictly need uniprot right now..
    # chains = pd.read_json(args.chains_json)
    # pairs = pairs.merge(chains.set_index(['pdb_code', 'chain_id']), left_on=['pdb_code_apo', 'chain_id_apo'], right_index=True)

    # with open(args.output_file, 'w') as f:
    #     in case the serialization of analyses were incremental (a stream of csv rows/triples)

    # don't run analyses for each isoform group separately, as creating a process pool carries an overhead
    # median pairs per group is 6
    # but could reset the caches? No need, all are LRU..
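    # (if they ever did need resetting: a functools.lru_cache-wrapped function exposes cache_clear(),
    #  e.g. `some_cached_fn.cache_clear()` -- a hypothetical name here)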
    start_datetime = datetime.now()
    analyses_output_fpath = Path(
        f'output_apo_holo_{start_datetime.isoformat()}.json')
    domains_info_fpath = Path(
        f'output_domains_info_{start_datetime.isoformat()}.json')

    with Manager() as multiprocessing_manager:
        # get analyzers as configured
        # p = configure_pipeline(multiprocessing_manager)
        analyses_namespace = configure_pipeline(multiprocessing_manager,
                                                args.opt_input_dir)

        serializer = ConcurrentJSONAnalysisSerializer(analyses_output_fpath,
                                                      multiprocessing_manager)
        domains_info = multiprocessing_manager.list()
        one_struct_analyses_done_set = multiprocessing_manager.dict()

        # with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=args.workers,
                initializer=worker_initializer,
                initargs=(analyses_namespace, serializer, domains_info,
                          one_struct_analyses_done_set)) as executor:

            def get_args():
                for row in pairs.itertuples():
                    # todo, I still need to send at least lcs_result.i1, i2 and the length...
                    # in general it would be nice to be able not only to serialize these stored analyses (as I often
                    # do), but also to deserialize them. Then I could send the whole lcs_result here? It would surely
                    # take less memory than the dict itself, I guess
                    # likewise I should ideally be able to deserialize domains somehow as well...
                    # (But then pandas merges might work rather poorly?)

                    yield (
                        row.pdb_code_apo,
                        row.pdb_code_holo,
                        row.chain_id_apo,
                        row.chain_id_holo,
                        row.lcs_result,
                    )
                    # [p.comparators_of_apo_holo__residues_param,
                    # p.comparators_of_apo_holo__residue_ids_param,
                    # p.comparators_of_apo_holo_domains__residues_param,
                    # p.comparators_of_apo_holo_domains__residue_ids_param,
                    # p.comparators_of_apo_holo_2DA__residues_param,],
                    # p.get_domains,
                    # p.get_rmsd,
                    # p.get_interdomain_surface,
                    # serializer, domains_info, one_struct_analyses_done_set)

            fn = partial(fn_wrapper_unpack_args, process_pair)
            futures = submit_tasks(executor, 40 * args.workers, fn, get_args())
            # wait for all futures to complete
            for i, f in enumerate(futures):
                f.result()
                # log progress
                logger.info(f'done {i+1} / {len(pairs)}')

            serializer.dump_data()
            with domains_info_fpath.open('w') as f:
                json.dump(list(domains_info), f)

            print(start_datetime.isoformat())
            print(datetime.now().isoformat())
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--limit_pairs_for_group',
        type=int,
        help='process at most this many pairs per isoform group')
    parser.add_argument(
        '--workers',
        default=12,
        type=int,
        help='number of threads for concurrent API requests')
    parser.add_argument(
        'pairs_json',
        help='json with potential apo-holo pairs: objects with pdb_code_apo, chain_id_apo, '
             'pdb_code_holo, chain_id_holo and lcs_result')

    add_loglevel_args(parser)

    args = parser.parse_args()
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()
    # the pairs could already come without mismatches; that would save about a third of the pairs/memory in these 2-3 scripts.

    potential_pairs = load_pairs_json(args.pairs_json)
    print(potential_pairs)
    pairs = pairs_without_mismatches(potential_pairs)
    # pairs = pairs.iloc[:40]  # todo testing hack
    print(pairs)

    def rename_col(old_name):
        cols = ['pdb_code', 'chain_id']
        for col in cols:
            if old_name.startswith(col):
                return col
        raise ValueError('column not found')

    all_chains = pd.concat([
        pairs[['pdb_code_apo', 'chain_id_apo']].rename(columns=rename_col),
        pairs[['pdb_code_holo', 'chain_id_holo']].rename(columns=rename_col),
    ])

    all_structs = all_chains['pdb_code'].unique()
    print(all_structs)

    quiet = args.loglevel > logging.INFO

    # could open the shelf with flag fast and file sync once in a while, but API requests are slow anyway
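    # A sketch of that (assuming the gdbm backend, which accepts an extra 'f' = "fast" flag character and
    # provides sync()); not done here since the API requests dominate anyway:
    #
    #   with shelve.open('db_get_ss', flag='cf') as db:
    #       ...
    #       db.sync()  # e.g. every N writes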
    with ThreadPoolExecutor(max_workers=args.workers) as executor:

        logger.info(f'total structs: {len(all_structs)}')
        successes = errors = 0

        ss_futures = submit_tasks(executor, 40 * args.workers, get_ss,
                                  all_structs)

        with shelve.open('db_get_ss') as db:
            for i, pdb_code, f in zip(itertools.count(), all_structs,
                                      ss_futures):
                try:
                    db[pdb_code] = f.result()
                    successes += 1
                except Exception:
                    logger.exception(f'get_ss for {pdb_code} failed with:')
                    errors += 1

                maybe_print(
                    quiet,
                    f'\r suc {successes}, err {errors} done {i+1}/{len(all_structs)}',
                    end='')

        successes = errors = 0

        domain_futures = submit_tasks(executor, 40 * args.workers, get_domains,
                                      all_structs)

        with shelve.open('db_get_domains') as db:
            for i, pdb_code, f in zip(itertools.count(), all_structs,
                                      domain_futures):
                try:
                    db[pdb_code] = f.result()
                    successes += 1
                except Exception:
                    logger.exception(
                        f'get_domains for {pdb_code} failed with:')
                    errors += 1

                maybe_print(
                    quiet,
                    f'\r suc {successes}, err {errors} done {i+1}/{len(all_structs)}',
                    end='')
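# `submit_tasks` is a project helper used throughout these examples; a minimal sketch of what a bounded
# submission helper with this signature might look like (an assumption, not the project's actual
# implementation): keep at most `limit` futures in flight and yield them in submission order.
#
#   import collections
#
#   def submit_tasks(executor, limit, fn, *iterables):
#       in_flight = collections.deque()
#       for task_args in zip(*iterables):
#           if len(in_flight) >= limit:
#               yield in_flight.popleft()
#           in_flight.append(executor.submit(fn, *task_args))
#       while in_flight:
#           yield in_flight.popleft()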
Example #8
def main():
    # should allow supplying the files via a directory
    # or a comma-delimited list of pdb_codes

    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--workers',
                        type=int,
                        default=1,
                        help='number of subprocesses')
    parser.add_argument('--download_threads',
                        type=int,
                        default=1,
                        help='number of threads')
    parser.add_argument('--all_chains', default=False, action='store_true')
    parser.add_argument('--disallow_download',
                        default=False,
                        action='store_true')

    # parser.add_argument('--pdb_dir', type=str, action='store_true',
    #                     help='now pdb_codes_or_directory is a path to a directory with mmcif files. Whole tree structure is inspected, all files are assumed to be mmcifs.')
    parser.add_argument(
        '-i',
        '--input_type',
        default='json',
        choices=['json', 'pdb_dir', 'pdb_codes'],
        help='how to interpret the positional `input` argument')
    parser.add_argument(
        'input',
        help='a json file with a list of chains, a comma-delimited list of pdb_codes, '
             'or a directory with mmcif files, depending on --input_type')
    parser.add_argument(
        'output_file',
        help='output filename for the json list of chains that passed the filter. '
             'Paths to mmcif files are relative to the working directory.')
    add_loglevel_args(parser)

    args = parser.parse_args()
    project_logger.setLevel(args.loglevel)
    logger.setLevel(args.loglevel)  # unfortunately I have to set this one too, because of how loggers created with __name__ work...
    logging.basicConfig()

    assert args.input_type == 'json'  # todo temporary hack (so that contains uniprotkb_id metadata)

    chain_whitelists = None

    # translate input into structure filenames
    if args.input_type == 'pdb_dir':
        directory = args.input

        if not os.path.isdir(directory):
            logger.error(f'Directory {directory} does not exist')
            sys.exit(1)

        structure_filenames = get_structure_filenames_in_dir(directory)
    elif args.input_type == 'pdb_codes':
        # drop empty entries so that an empty input actually triggers the check below
        pdb_codes = [code.strip() for code in args.input.split(',') if code.strip()]

        if not pdb_codes:
            logger.error('No pdb codes specified')
            sys.exit(1)

        if args.disallow_download:
            structure_filenames = find_structures(pdb_codes)
        else:
            structure_filenames = download_structures(pdb_codes,
                                                      args.download_threads)
        # structure_filenames = (retrieve_structure_file_from_pdb(pdb_code) for pdb_code in pdb_codes)
    elif args.input_type == 'json':
        # todo to accept just list of pdb_codes, add column chain_id? And won't that break chain_whitelists (want empty set)
        chains = pd.read_json(args.input)
        # chains = chains.iloc[:100]  # todo test hack
        # chimeric - more than one UNP mapped to a single chain (or could it be an in-vivo chimera?)
        # skip them all (using just one unp does not make sense). Or include them with both unps?
        # todo obviously doesn't work if this script is run in batches...
        chains = chains.drop_duplicates(subset=['pdb_code', 'chain_id'],
                                        keep=False)
        chains_gb_pdb_code = chains.groupby('pdb_code')

        metadata_gb_structure = chains_gb_pdb_code.apply(
            lambda df: df.set_index('chain_id').to_dict(orient='index'))
        chain_whitelists = chains_gb_pdb_code['chain_id'].apply(
            lambda series: set(series.to_list()))
        pdb_codes = chains_gb_pdb_code.indices.keys()
        if args.disallow_download:
            structure_filenames = find_structures(pdb_codes)
        else:
            structure_filenames = download_structures(pdb_codes,
                                                      args.download_threads)
    else:
        raise ValueError('Unknown input type argument')

    structure_filenames = list(structure_filenames)
    logger.info(f'total structures to process: {len(structure_filenames)}')

    # load and filter structures
    extra_args = [metadata_gb_structure]
    if not args.all_chains and chain_whitelists is not None:
        extra_args.append(chain_whitelists)

    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        chain_metadata_futures = submit_tasks(
            executor, 40 * args.workers, get_chains_metadata_for_structure,
            itertools.count(), structure_filenames, *extra_args)

        # iterate over the futures and flatten the chain metadata
        # result of a single task is a list of chain metadata for each structure, flatten tasks results into a list of chains
        chains_of_structures_that_passed = []
        for struct_filename, chains_future in zip(structure_filenames,
                                                  chain_metadata_futures):
            try:
                chain_metadata = chains_future.result()
            except Exception:
                logger.exception(
                    f'Exception when a task for a structure `{struct_filename}` was executed.'
                )
                continue

            # flatten
            chains_of_structures_that_passed.extend(chain_metadata)

    # with concurrent.futures.ProcessPoolExecutor(max_workers=args.workers) as executor:
    #
    #
    #     # map to list of futures, so I can handle exceptions (with exeutor.map whole iterator stops in that case)
    #     chain_metadata_futures = list(map(
    #         lambda *args: executor.submit(get_chains_metadata_for_structure, *args),
    #         itertools.count(), structure_filenames, *extra_args,
    #     ))

    # iterate over the futures and flatten the chain metadata
    # result of a single task is a list of chain metadata for each structure, flatten tasks results into a list of chains
    # chains_of_structures_that_passed = []
    # for struct_filename, chains_future in zip(structure_filenames, chain_metadata_futures):
    #     try:
    #         chain_metadata = chains_future.result()
    #     except Exception:
    #         logger.exception(f'Exception when a task for a structure `{struct_filename}` was executed.')
    #         continue
    #
    #     # flatten
    #     chains_of_structures_that_passed.extend(chain_metadata)

    with open(args.output_file, 'w') as f:
        json.dump(chains_of_structures_that_passed, f)
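# Example invocation (a sketch; the script name and file names are placeholders, the options are the ones
# defined by the parser above; note the assert above currently restricts --input_type to json):
#   python filter_structures.py --workers 4 --download_threads 8 -i json chains.json filtered_chains.json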