コード例 #1
0
# Defaults applied when the per-method parameter dict does not override them.
_RP_DEFAULT_SIM_TR_PERC = 90
_RP_DEFAULT_SCORE_TR = 0.0
_RP_QUERY_MAX_LEN_DIFF = 0.1

# prediction method -> (aligner, temp-file suffix) used with subopt_fold_alifold
_RP_SUBOPT_ALIFOLD_METHODS = {
    'C-A-sub': ('clustalo', '_64'),
    'M-A-sub': ('muscle', '_65'),
}

# prediction method -> (refold mode, MSA algorithm) used with
# alifold_refold_prediction
_RP_ALIFOLD_REFOLD_METHODS = {
    'C-A-r-Rc': ('refold_rnafoldc', 'clustalo'),
    'M-A-r-Rc': ('refold_rnafoldc', 'muscle'),
    'C-A-U-r-Rc': ('conserved_ss_rnafoldc', 'clustalo'),
    'M-A-U-r-Rc': ('conserved_ss_rnafoldc', 'muscle'),
}


def _rp_no_cm_model(prediction_method):
    """Warn that no CM file is available and build the failure result."""
    msg = "No CM model. Can't use {}.".format(prediction_method)
    ml.warning(msg)
    return None, None, [msg]


def _rp_selection_kwargs(all_hits, query, method_parameters,
                         sim_threshold_percent=None):
    """Build the keyword arguments shared by the hit-selection functions.

    ``sim_threshold_percent`` overrides the 'pred_sim_threshold' lookup when
    it is not None.
    """
    if sim_threshold_percent is None:
        sim_threshold_percent = method_parameters.get(
            'pred_sim_threshold', _RP_DEFAULT_SIM_TR_PERC)
    return dict(
        all_hits=all_hits,
        query=query,
        sim_threshold_percent=sim_threshold_percent,
        cmscore_tr=method_parameters.get('cmscore_tr', _RP_DEFAULT_SCORE_TR),
        cm_threshold_percent=method_parameters.get('cmscore_percent', None),
        len_diff=method_parameters.get('query_max_len_diff',
                                       _RP_QUERY_MAX_LEN_DIFF),
    )


def _rp_params_digest(method_parameters):
    """Return the SHA-1 hex digest of the sorted parameter items."""
    return sha1(str(sorted(method_parameters.items())).encode()).hexdigest()


def _rp_clean_centroid_structures(raw_structures, method_parameters):
    """Rewrite the 'ss0' annotation of each record unless the parameters
    allow non-canonical pairs / lonely pairs. Records are modified in place.
    """
    if not method_parameters.get('allow_noncanonical', False):
        for seq in raw_structures:
            seq.letter_annotations['ss0'] = find_nc_and_remove(
                str(seq.seq), structure=seq.letter_annotations['ss0'])

    if not method_parameters.get('allow_lonely_pairs', False):
        for seq in raw_structures:
            seq.letter_annotations['ss0'] = check_lonely_bp(
                seq.letter_annotations['ss0'])


def _rp_subopt_alifold(query, seqs2predict_fasta, all_hits_list,
                       method_parameters, threads, aligner, suffix):
    """Run subopt_fold_alifold on the selected homologous sequences."""
    nr_homo_hits_file, homologous_seqs, msgs = create_nr_trusted_hits_file_MSA_safe(
        **_rp_selection_kwargs(all_hits_list, query, method_parameters))
    # Only the returned sequence list is used here, not the file itself.
    BA_support.remove_one_file_with_try(nr_homo_hits_file)

    fd, homologous_sequence_file = mkstemp(prefix='rba_',
                                           suffix=suffix,
                                           dir=CONFIG.tmpdir)
    try:
        with os.fdopen(fd, 'w') as fh:
            SeqIO.write(homologous_seqs, fh, 'fasta')

        structures, exec_time = subopt_fold_alifold(
            seqs2predict_fasta,
            homologous_sequence_file,
            aligner=aligner,
            params=method_parameters,
            threads=threads,
        )
    finally:
        # Remove the temp file even when writing or prediction fails.
        BA_support.remove_one_file_with_try(homologous_sequence_file)
    return structures, exec_time, msgs


def _rp_alifold_refold(query, seqs2predict_fasta, all_hits_list,
                       method_parameters, threads, refold, msa_alg):
    """Run alifold_refold_prediction on the non-redundant trusted hits file."""
    nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
        **_rp_selection_kwargs(all_hits_list, query, method_parameters))
    try:
        structures, exec_time = alifold_refold_prediction(
            nr_homo_hits_file,
            seqs2predict_fasta,
            refold=refold,
            threads=threads,
            params=method_parameters,
            msa_alg=msa_alg,
        )
    finally:
        BA_support.remove_one_file_with_try(nr_homo_hits_file)
    return structures, exec_time, msgs


def repredict_structures_for_homol_seqs(
    query,
    seqs2predict_fasta,
    threads=None,
    prediction_method=None,
    pred_method_params=None,
    all_hits_list=None,
    seqs2predict_list=None,
    use_cm_file=None,
):
    """Run RNA structure prediction based on chosen method and parameters.

    :param query:               query record; its ``seq`` and
                                ``annotations['ambiguous']`` are read by some
                                methods
    :param seqs2predict_fasta:  sequences to predict, handed to the predictors
                                (presumably a FASTA file path - confirm
                                against callers)
    :param threads:             forwarded to predictors that accept a
                                thread/cpu count
    :param prediction_method:   name of the prediction method
    :param pred_method_params:  mapping ``method name -> parameter dict``;
                                ``None`` is treated as an empty mapping
    :param all_hits_list:       hits used for selecting homologous sequences
    :param seqs2predict_list:   sequences to predict as a list (used by the
                                TurboFold based methods)
    :param use_cm_file:         covariance model file required by the
                                ``rfam-*`` methods
    :return: ``(structures, exec_time, msgs)``. For the 'default' method the
             result is ``(None, None, [])``. When the method cannot be used
             (missing CM file, no homologous sequences, ambiguous query,
             failed subprocess or any other error) the result is
             ``(None, None, [message])``.
    :raises AssertionError: when ``prediction_method`` is not a known method
    """
    if pred_method_params is None:
        # The signature default would otherwise break every ``.get`` below.
        pred_method_params = {}

    try:
        if 'default' == prediction_method:
            # do nothing
            return None, None, []

        method_parameters = pred_method_params.get(prediction_method, {})

        if 'rfam-Rc' == prediction_method:
            if use_cm_file is None:
                return _rp_no_cm_model(prediction_method)

            structures, exec_time = cmmodel_rnafold_c(
                seqs2predict_fasta,
                use_cm_file,
                threads=threads,
                params=method_parameters)
            return structures, exec_time, []

        elif 'rfam-centroid' == prediction_method:
            # cmemit generates homologous seqs, centroid_homfold uses them
            if use_cm_file is None:
                return _rp_no_cm_model(prediction_method)

            cep = method_parameters.get('cmemit', '')
            if '-u' not in cep:
                cep += ' -u'
            if '-N' not in cep:
                cep += ' -N {}'.format(method_parameters.get('n_seqs', 10))

            hf_file = run_cmemit(use_cm_file, params=cep)
            try:
                structures, exec_time = me_centroid_homfold(
                    seqs2predict_fasta, hf_file, params=method_parameters)
            finally:
                BA_support.remove_one_file_with_try(hf_file)
            return structures, exec_time, []

        elif 'rfam-sub' == prediction_method:
            if use_cm_file is None:
                return _rp_no_cm_model(prediction_method)

            ref_structure = extract_ref_from_cm(use_cm_file)

            # This predictor receives None (not {}) when no parameters are set.
            structures, exec_time = rfam_subopt_pred(
                seqs2predict_fasta,
                ref_structure,
                params=pred_method_params.get(prediction_method, None),
                threads=threads,
            )
            return structures, exec_time, []

        elif 'rnafold' == prediction_method:
            structures, exec_time = rnafold_wrap_for_predict(
                seqs2predict_fasta,
                params=method_parameters.get('RNAfold', ''))
            return structures, exec_time, []

        elif 'fq-sub' == prediction_method:
            a, qf = mkstemp(prefix='rba_', suffix='_55', dir=CONFIG.tmpdir)
            try:
                with os.fdopen(a, 'w') as fd:
                    fd.write('>query\n{}\n'.format(str(query.seq)))

                # This predictor receives None (not {}) when no parameters
                # are set.
                structures, exec_time = subopt_fold_query(
                    seqs2predict_fasta,
                    qf,
                    params=pred_method_params.get(prediction_method, None),
                    threads=threads)
            finally:
                BA_support.remove_one_file_with_try(qf)
            return structures, exec_time, []

        elif prediction_method in _RP_SUBOPT_ALIFOLD_METHODS:
            aligner, suffix = _RP_SUBOPT_ALIFOLD_METHODS[prediction_method]
            return _rp_subopt_alifold(query, seqs2predict_fasta,
                                      all_hits_list, method_parameters,
                                      threads, aligner, suffix)

        elif prediction_method in _RP_ALIFOLD_REFOLD_METHODS:
            refold, msa_alg = _RP_ALIFOLD_REFOLD_METHODS[prediction_method]
            return _rp_alifold_refold(query, seqs2predict_fasta,
                                      all_hits_list, method_parameters,
                                      threads, refold, msa_alg)

        elif 'centroid' == prediction_method:
            nr_homo_hits_file, _, msgs = create_nr_homolog_hits_file_MSA_unsafe(
                **_rp_selection_kwargs(all_hits_list, query,
                                       method_parameters))
            try:
                raw_structures, exec_time = me_centroid_homfold(
                    seqs2predict_fasta,
                    nr_homo_hits_file,
                    params=method_parameters)
            finally:
                BA_support.remove_one_file_with_try(nr_homo_hits_file)

            _rp_clean_centroid_structures(raw_structures, method_parameters)
            return raw_structures, exec_time, msgs

        elif 'centroid-fast' == prediction_method:
            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            raw_structures, exec_time = centroid_homfold_fast(
                all_seqs=all_hits_list,
                query=query,
                all_seqs_fasta=seqs2predict_fasta,
                n=method_parameters.get('max_seqs_in_prediction', 10),
                centroid_homfold_params=method_parameters,
                len_diff=method_parameters.get('query_max_len_diff',
                                               _RP_QUERY_MAX_LEN_DIFF))

            _rp_clean_centroid_structures(raw_structures, method_parameters)
            return raw_structures, exec_time, []

        elif 'TurboFold' == prediction_method:
            all_hits_filtered = BA_support.filter_ambiguous_seqs_from_list(
                all_hits_list)
            seqs2predict_filtered = BA_support.filter_ambiguous_seqs_from_list(
                seqs2predict_list)
            if len(seqs2predict_list) != len(seqs2predict_filtered):
                ml.warning(
                    'Some sequences contain ambiguous bases - they will not be predicted.'
                )

            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            # sim_threshold_percent is fixed to 100 so that only identical
            # sequences are removed from the prediction with TurboFold. The
            # structure of redundant sequences is set according to the
            # predicted one.
            nr_homo_hits_file, _, msgs = create_nr_homolog_hits_file_MSA_unsafe(
                **_rp_selection_kwargs(all_hits_filtered, query,
                                       method_parameters,
                                       sim_threshold_percent=100))
            try:
                with open(nr_homo_hits_file, 'r') as nrf:
                    nr_homo_hits = list(SeqIO.parse(nrf, format='fasta'))

                structures_t, exec_time = turbofold_with_homologous(
                    all_sequences=seqs2predict_filtered,
                    nr_homologous=nr_homo_hits,
                    params=method_parameters.get('TurboFold', {}),
                    n=method_parameters.get('max_seqs_in_prediction', 3),
                    cpu=threads,
                    pkey=prediction_method,
                    sha1val=_rp_params_digest(method_parameters),
                )

                structures = BA_support.rebuild_structures_output_from_pred(
                    seqs2predict_list, structures_t)
            finally:
                BA_support.remove_one_file_with_try(nr_homo_hits_file)
            return structures, exec_time, msgs

        elif 'Turbo-fast' == prediction_method:
            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            structures_t, exec_time = turbofold_fast(
                all_seqs=all_hits_list,
                seqs2predict=seqs2predict_list,
                query=query,
                cpu=threads,
                n=method_parameters.get('max_seqs_in_prediction', 3),
                turbofold_params=method_parameters.get('TurboFold', {}),
                len_diff=method_parameters.get('query_max_len_diff',
                                               _RP_QUERY_MAX_LEN_DIFF),
                pkey=prediction_method,
                sha1val=_rp_params_digest(method_parameters),
            )

            structures = BA_support.rebuild_structures_output_from_pred(
                seqs2predict_list, structures_t)
            return structures, exec_time, []

    except exceptions.NoHomologousSequenceException:
        msg = nonhomseqwarn(prediction_method)
        return None, None, [msg]
    except exceptions.AmbiguousQuerySequenceException:
        msgfail = "Query sequence contains ambiguous characters. Can't use {}.".format(
            prediction_method)
        ml.warning(msgfail)
        return None, None, [msgfail]
    except exceptions.SubprocessException as e:
        msg = "{} can't be used. Error message follows: {} \n{}".format(
            prediction_method, str(e), e.errors)
        ml.error(msg)
        return None, None, [str(e)]
    except Exception as e:
        ml.error("{} can't be used. Error message follows: \n{}.".format(
            prediction_method, str(e)))
        return None, None, [str(e)]

    # Explicit raise instead of ``assert False`` so the guard survives ``-O``.
    raise AssertionError("Should not reach here (bad prediction method name).")
コード例 #2
0
def turbofold_ext_nr_fast(all_seqs,
                          nrset,
                          cpu,
                          n,
                          turbofold_params,
                          pkey='Turbo-fast',
                          sha1val='',
                          timeout=None):
    """Predict structures with TurboFold for each sequence in ``all_seqs``.

    A sequence is skipped when it is flagged ambiguous, when it already
    carries a ``pkey`` letter annotation computed with the same ``sha1val``,
    or when fewer than 2 sequences are available for its prediction set.

    :param all_seqs:          records to predict; ``annotations['msgs']`` is
                              created when missing and extended with notes
    :param nrset:             records used to build each prediction set; must
                              contain no ambiguous and no redundant sequences
    :param cpu:               1 runs serially, any other value is passed as
                              ``processes`` to ``multiprocessing.Pool``
    :param n:                 requested size of each prediction set
    :param turbofold_params:  parameters handed to the TurboFold runner
    :param pkey:              letter-annotation key marking a finished
                              prediction
    :param sha1val:           parameter digest compared with
                              ``annotations['sha1'][pkey]``
    :param timeout:           forwarded to ``run_turbofold`` on the serial
                              path
    :return: list of predicted records, one per successfully predicted
             sequence
    :raises AssertionError: when ``nrset`` contains ambiguous or redundant
                            sequences
    """
    # ambiguous sequence cannot be predicted with turbofold
    # do not predict sequence twice
    ml.debug(fname())

    msg_short_list = 'Turbo-fast: Number of sequences is less then required.'

    # Validate nrset; each helper is evaluated only once.
    nrset_len = len(nrset)
    has_ambiguous = nrset_len != len(
        BA_support.filter_ambiguous_seqs_from_list(nrset))
    has_redundant = nrset_len != len(BA_support.non_redundant_seqs(nrset))
    if has_ambiguous or has_redundant:
        msgfail = 'Wrong nr set specification.'
        if has_ambiguous:
            msgfail += ' nr set contains seq(s) with ambiguous character.'
        if has_redundant:
            msgfail += ' nr set contains redundant sequence(s).'
        ml.error(msgfail)
        raise AssertionError(msgfail)

    for seq in chain(all_seqs, nrset):
        if 'msgs' not in seq.annotations:
            seq.annotations['msgs'] = []

    list2predict = []
    # Records matching list2predict entry by entry, so a result can be
    # attributed to its record without searching all_seqs by id.
    targets = []
    for seq in all_seqs:
        if seq.annotations.get('ambiguous', False):
            ml.warning('Skipping prediction for {} (ambiguous base)'.format(
                seq.id))
            continue

        if pkey in seq.letter_annotations and seq.annotations.get(
                'sha1', {}).get(pkey) == sha1val:
            # nothing to do, the structure already computed
            continue

        seq_set = _prepare_set_n(seq, nrset, n)

        if len(seq_set) < 2:
            msgfail = "Turbo-fast can't be used with less then 2 sequences - {}".format(
                seq.id)
            ml.warning(msgfail)
            # print as well when the logger would not show the warning
            if ml.getEffectiveLevel() > 30:
                print(msgfail)
            continue

        if len(seq_set) < n:
            msg_short_list_custom = msg_short_list + ' n={} ({})'.format(
                len(seq_set), n)
            ml.info(msg_short_list_custom + ' ' + seq.id)
            seq.annotations['msgs'].append(msg_short_list_custom)

        list2predict.append((seq_set, turbofold_params, seq.id))
        targets.append(seq)

    if cpu == 1:
        pred_list = [
            run_turbofold(oneseqset, tpar, timeout=timeout)
            for oneseqset, tpar, _ in list2predict
        ]
    else:
        # NOTE(review): ``timeout`` is not part of the tuples handed to
        # _rt_wrapper, so it appears to be ignored on this path - confirm.
        # The context manager releases the workers even when map() raises.
        with multiprocessing.Pool(processes=cpu) as pool:
            pred_list = pool.map(_rt_wrapper, list2predict)

    # rebuild predicted TurboFold structures
    # - take care that prediction might be empty if TurboFold fails
    out_list = []
    for out, seq in zip(pred_list, targets):
        if isinstance(out, (exceptions.SubprocessException, Exception)):
            # A failed prediction arrives as an exception object; record its
            # text only, the captured call output is not propagated.
            seq.annotations['msgs'].append(str(out))
            continue

        sel = [o for o in out if o.id == seq.id]
        if len(sel) == 1:
            out_list.append(sel[0])

    return out_list
コード例 #3
0
def create_nr_trusted_hits_file_MSA_safe(
    sim_threshold_percent=None,
    all_hits=None,
    query=None,
    cmscore_tr=-2.03,
    cm_threshold_percent=None,
    check_unambiguous=False,
    len_diff=0.1,
):
    """
    Create a FASTA file with non redundant trusted hits.

    Some aligners need at least two sequences for profile alignment. When
    fewer than two sequences are selected, a copy of one homologous sequence
    (the row with the smallest value in the first column of the distance
    table) is appended under the id 'dummy_seq_01'.

    :param sim_threshold_percent:   seq similarity threshold for homology exclusion
    :param all_hits:                list of hits
    :param query:                   blast query
    :param cmscore_tr:              threshold for homology inclusion in bits
    :param cm_threshold_percent:    threshold for homology inclusion in % of query bits
    :param check_unambiguous:       bool whether to check unambiguous seqs
    :param len_diff:                threshold for exclusion of hits lq=len(query) lq - diff*lq < len(seq) < lq + diff*lq
    :return: tuple (path of the written FASTA file, list of homologous
             sequences, list of status messages)
    :raises exceptions.NoHomologousSequenceException: when the selection
             yields an empty distance table, or when ``check_unambiguous`` is
             set and removing ambiguous sequences leaves too few of them
    """
    ml.debug(fname())
    # The query has to stay in even with ambiguous bases, because it is used
    # as a reference during distance computation and subsequence selection.
    # The other homologous seqs do not all have to be kept.
    if check_unambiguous:
        all_hits = BA_support.filter_ambiguous_seqs_from_list(all_hits)

    dist_table, homologous_seqs, msgs = _trusted_hits_selection_wrapper(
        all_hits, query, cmscore_tr, cm_threshold_percent, len_diff_=len_diff)

    if dist_table.size == 0:
        raise exceptions.NoHomologousSequenceException

    to_include = rna_blast_analyze.BR_core.predict_structures.select_sequences_from_similarity_rec(
        dist_table, sim_threshold_percent=sim_threshold_percent)
    nr_homolog_hits = [homologous_seqs[i] for i in to_include]

    n_selected = len(nr_homolog_hits)

    if check_unambiguous:
        unambiguous_hits = BA_support.filter_ambiguous_seqs_from_list(
            nr_homolog_hits)
        n_unambiguous = len(unambiguous_hits)
        # The hits were already filtered above, so per the original author's
        # note a sequence lost here means the query contains ambiguous bases.
        # With at most two selected sequences, that leaves too little to use.
        if (n_selected < 2 and n_unambiguous == 0) or (n_selected == 2
                                                       and n_unambiguous == 1):
            raise exceptions.NoHomologousSequenceException
        if n_selected > 2:
            nr_homolog_hits = unambiguous_hits

    if n_selected < 2:
        msg = (
            'STATUS: Only one sequence remained under defined "pred_sim_threshold" parameter.\n'
            ' Mitigation: Adding the most dissimilar homologous sequence to the non redundant sequences list.'
        )
        msgs.append(msg)
        ml.info(msg)
        # Print as well when the logger would not show an INFO record; the
        # effective level also covers a level inherited from a parent logger.
        if ml.getEffectiveLevel() > 20:
            print(msg)

        # Must index the unfiltered homologous_seqs: its order matches the
        # rows of dist_table.
        dis_hom_index = dist_table[:, 0].argmin()
        nr_homolog_hits.append(
            SeqRecord(homologous_seqs[dis_hom_index].seq, id='dummy_seq_01'))

    if check_unambiguous:
        homologous_seqs = BA_support.filter_ambiguous_seqs_from_list(
            homologous_seqs)

    fd_h, nr_homo_hits_file = mkstemp(prefix='rba_',
                                      suffix='_58',
                                      dir=CONFIG.tmpdir)
    try:
        with os.fdopen(fd_h, 'w') as f:
            SeqIO.write(nr_homolog_hits, f, 'fasta')
    except Exception:
        # Do not leave a partially written temp file behind.
        BA_support.remove_one_file_with_try(nr_homo_hits_file)
        raise

    return nr_homo_hits_file, homologous_seqs, msgs