Code example #1
def extend_meta_core(analyzed_hits,
                     query,
                     args_inner,
                     all_short,
                     multi_query,
                     iteration,
                     ih_model,
                     timeout=None):
    ml.debug(fname())
    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    blast_args = deepcopy(args_inner)
    locarna_args = deepcopy(args_inner)
    b_all_short = deepcopy(all_short)
    l_all_short = deepcopy(all_short)

    if args_inner.repredict_file is None:
        fd, repred_file = mkstemp(prefix='rba_',
                                  suffix='_18',
                                  dir=CONFIG.tmpdir)
        os.close(fd)
    else:
        repred_file = args_inner.repredict_file

    for i, args in enumerate([blast_args, locarna_args]):
        args.prediction_method = []
        args.pred_params = dict()
        args.dump = None
        args.pdf_out = None
        args.pandas_dump = None
        args.repredict_file = repred_file + str(i)
        args.dev_pred = False
        args.logfile = None
        args.json = None
        args.html = None
        args.cm_file = ih_model

    analyzed_hits_simple = deepcopy(analyzed_hits)
    analyzed_hits_locarna = deepcopy(analyzed_hits)

    analyzed_hits_simple, _, _, _ = extend_simple_core(analyzed_hits_simple,
                                                       query, blast_args,
                                                       b_all_short,
                                                       multi_query, iteration,
                                                       ih_model)
    analyzed_hits_locarna, _, _, _ = extend_locarna_core(analyzed_hits_locarna,
                                                         query,
                                                         locarna_args,
                                                         l_all_short,
                                                         multi_query,
                                                         iteration,
                                                         ih_model,
                                                         timeout=timeout)

    # add cmstat to query
    analyzed_hits.query = analyzed_hits_simple.query

    order_out = []

    b_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_simple.hits}
    l_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_locarna.hits}
    ok_keys = sorted(set(b_dict.keys()) | set(l_dict.keys()))
    for inum in ok_keys:
        bh = b_dict.get(inum, None)
        lh = l_dict.get(inum, None)

        hits = [bh, lh]
        # fall back to the other method if one of them returned an empty hit
        # also handle the case when both methods returned empty hits
        filtered_hits = [h for h in hits if h is not None]
        if len(filtered_hits) == 1:
            msg = 'Only one extension method completed successfully for {}. ' \
                  'Using the successfully extended sequence in the output.'.format(filtered_hits[0].extension.id)
            ml.info(msg)
            if ml.getEffectiveLevel() < 20:
                print(msg)
            analyzed_hits.hits.append(filtered_hits[0])
            continue
        elif len(filtered_hits) == 0:
            # append empty extension
            analyzed_hits.hits_failed.append(lh)
            continue

        # both methods succeeded: pick the extension with the higher cmstat bit score
        bit_scores = [
            i.extension.annotations['cmstat']['bit_sc'] for i in hits
        ]

        bit_index = bit_scores.index(max(bit_scores))
        order_out.append(bit_index)

        analyzed_hits.hits.append(hits[bit_index])

    # build failed hits
    b_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_simple.hits_failed
    }
    l_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_locarna.hits_failed
    }
    for inum in sorted(set(b_dict_failed) | set(l_dict_failed)):
        if inum not in ok_keys:
            if inum in b_dict_failed:
                analyzed_hits.hits_failed.append(b_dict_failed[inum])
            elif inum in l_dict_failed:
                analyzed_hits.hits_failed.append(l_dict_failed[inum])
            else:
                raise KeyError(
                    "Failed to find inum key in failed extensions. This should not happen."
                )

    # build the repredict file here if needed
    if args_inner.repredict_file:
        b_repredict = BA_support.iter2file_name(blast_args.repredict_file,
                                                multi_query, iteration)
        l_repredict = BA_support.iter2file_name(locarna_args.repredict_file,
                                                multi_query, iteration)
        o_repredict = BA_support.iter2file_name(args_inner.repredict_file,
                                                multi_query, iteration)
        with open(b_repredict, 'r') as barf, \
                open(l_repredict, 'r') as larf, \
                open(o_repredict, 'w') as reprf:
            # Note: the files must be merged in the same order as the methods
            #  were run in the loop above, i.e. the order in which order_out
            #  was filled.
            bb = (barf, larf)

            # copy the first two lines from the first file
            reprf.write(bb[0].readline())
            reprf.write(bb[0].readline())
            # discard the first line of the other files
            for other_file in bb[1:]:
                other_file.readline()

            for o in order_out:
                lll = [i.readline() for i in bb]
                reprf.write(lll[o])

    # recreate needed data from selected hits
    homology_prediction = []
    homol_seqs = []
    for hit in analyzed_hits.hits:
        homology_prediction.append(hit.hpred)
        if hit.hpred:
            homol_seqs.append(hit.extension)

        # add default prediction if it is not present
        if 'ss0' not in hit.extension.letter_annotations:
            if 'sss' not in hit.extension.annotations:
                hit.extension.annotations['sss'] = []
            hit.extension.annotations['sss'] += ['ss0']
            hit.extension.letter_annotations['ss0'] = '.' * len(
                hit.extension.seq)

    # remove description from hits and sources
    for hit in analyzed_hits.hits:
        hit.extension.description = ''

    if args_inner.cm_file or args_inner.use_rfam:
        cm_file_rfam_user = ih_model
    else:
        cm_file_rfam_user = None
        BA_support.remove_one_file_with_try(ih_model)
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
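A minimal sketch of how extend_meta_core might be invoked. The call site itself is hypothetical (not taken from the original source); the argument names simply mirror the parameter names above and are assumed to be prepared by the earlier BLAST-parsing and CM-extraction stages of the pipeline.

# Hypothetical call site; inputs are assumed to come from earlier pipeline steps.
analyzed_hits, homology_prediction, homol_seqs, cm_file = extend_meta_core(
    analyzed_hits,   # container with the parsed BLAST hits to extend
    query,           # query sequence record
    args_inner,      # parsed command-line arguments
    all_short,       # hits handled by the short-sequence path
    multi_query,     # True when several queries are processed in one run
    iteration,       # iteration index, used in per-iteration file names
    ih_model,        # covariance model file (inferred or taken from Rfam)
    timeout=None,    # optional timeout forwarded to the LoCARNA extension
)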
Code example #2
def infer_homology(analyzed_hits,
                   args,
                   cm_model_file,
                   multi_query=False,
                   iteration=0):
    """
    This is a wrapper for the homology inference methods. It handles the different options for generating the CM.
    :return: prediction flags, the selected hit extensions, and the CM file path (or None)
    """
    ml.info('Inferring homology...')
    ml.debug(fname())
    bits, eval, loc_score, alig_length = hit_cons_characteristic(
        analyzed_hits.hits)

    # always run cmscan against Rfam for informative purposes,
    #  but use the inferred CM only if --use_rfam was given;
    #  if a CM file was provided, still run the inference but use the provided file
    # print an explanation alongside this information

    # find and extract cm model
    # This code is moved to each extension method to allow fail-fast if model is found in RFAM
    # cm_model_file, analyzed_hits = find_and_extract_cm_model(args, analyzed_hits)

    # include query seq in fasta file to get relevant bit score
    fd_f, fd_fasta = mkstemp(prefix='rba_', suffix='_28', dir=CONFIG.tmpdir)
    with os.fdopen(fd_f, 'w') as f:
        for seq in [analyzed_hits.query] + analyzed_hits.res_2_record_list():
            f.write('>{}\n{}\n'.format(seq.id, str(seq.seq)))

    cm_msa, cm_align_scores = run_cmalign_with_scores(fd_fasta,
                                                      cm_model_file,
                                                      threads=args.threads)

    _add_rsearch_align_scores2anal_hits(analyzed_hits, cm_align_scores)

    # remove the first entry (the query) from the prediction scores
    prediction = infer_hits_cm(cm_align_scores[1:].bit_sc)

    # write scores to a table, compute it for all data and run some correlation statistics
    if args.repredict_file:
        # note that the first score is for the query and acts as a benchmark here
        cm_msa_conservation = alignment_sequence_conservation(cm_msa,
                                                              gap_chars='-.')

        repredict_file = BA_support.iter2file_name(args.repredict_file,
                                                   multi_query, iteration)
        with open(repredict_file, 'w') as f:
            _print_table_for_corelation(f, cm_align_scores.seq_name[1:], bits,
                                        eval, loc_score, alig_length,
                                        cm_msa_conservation[1:],
                                        cm_align_scores.bit_sc[1:],
                                        cm_msa_conservation[0],
                                        cm_align_scores.bit_sc[0])

    BA_support.remove_one_file_with_try(fd_fasta)

    selected_hits = [
        hit.extension for b, hit in zip(prediction, analyzed_hits.hits) if b
    ]

    if args.cm_file or args.use_rfam:
        r_cm_file = cm_model_file
    else:
        r_cm_file = None
        BA_support.remove_one_file_with_try(cm_model_file)

    return prediction, selected_hits, r_cm_file
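Similarly, a hypothetical call for infer_homology. The objects passed in are assumed to exist in the calling pipeline; only the shape of the call and of the returned values is illustrated here.

# Hypothetical call site (not from the original source).
prediction, selected_hits, r_cm_file = infer_homology(
    analyzed_hits,   # hits whose extended sequences are scored with cmalign
    args,            # parsed command-line arguments (threads, repredict_file, ...)
    cm_model_file,   # covariance model file used for the alignment
    multi_query=False,
    iteration=0,
)
# prediction holds one flag per hit, selected_hits the extensions judged
# homologous, and r_cm_file the CM path when it is kept for later steps.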