Example #1
def run_cmalign_with_scores(fasta_file, cm_file, threads=None):
    fd_sfile, cm_sfile_path = mkstemp(prefix='rba_',
                                      suffix='_29',
                                      dir=CONFIG.tmpdir)
    os.close(fd_sfile)
    if threads:
        cm_params = '--notrunc --cpu {} --sfile {}'.format(
            threads, cm_sfile_path)
    else:
        cm_params = '--notrunc --sfile {}'.format(cm_sfile_path)
    cm_msa_file = run_cmalign_on_fasta(fasta_file,
                                       cm_file,
                                       cmalign_params=cm_params)

    cm_msa = read_st(cm_msa_file)

    # combine the e-value and the cm_msa_conservation_score with the cmalign
    # scores; the per-sequence scores are accessible via the --sfile option
    # and are likely better than the ad-hoc msa_conservation score
    cm_align_scores = read_cmalign_sfile(cm_sfile_path)
    # the bit score is probably directly comparable with the BLAST bit score;
    # badly aligned sequences get a negative cmalign bit score, which can be
    # leveraged for filtering
    cm_align_scores.index = range(len(cm_align_scores.index))

    BA_support.remove_files_with_try([cm_sfile_path, cm_msa_file])

    return cm_msa, cm_align_scores
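
A standalone sketch of the filtering idea described in the comments above; the hit names and bit scores here are made up:

scores = {'hit1': 35.2, 'hit2': -4.7, 'hit3': 0.8}  # hypothetical cmalign bit scores
kept = [name for name, bit_sc in scores.items() if bit_sc > 0]  # negative ~ badly aligned
assert kept == ['hit1', 'hit3']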
Example #2
def refold_stockholm(stockholm_alig, consensus_structure):
    """
    run refold.pl from the ViennaRNA package
    :param stockholm_alig: StockholmAlig alignment to refold
    :param consensus_structure: consensus secondary structure in dot-bracket notation
    :return: list of refolded sequence records
    """
    ml.debug(fname())
    # convert to clustal alignment
    fd, clust_tempfile = mkstemp(prefix='rba_', suffix='_23', dir=CONFIG.tmpdir)
    with os.fdopen(fd, 'w') as f:
        stockholm_alig.write_clustal(f)

    # write fake alifold output with given consensus structure
    fd, alif_fake_file = mkstemp(prefix='rba_', suffix='_24', dir=CONFIG.tmpdir)
    with os.fdopen(fd, 'w') as f:
        # the consensus sequence in alifold file is really not used for anything
        f.write('A'*len(consensus_structure) + '\n')
        f.write(consensus_structure + '\n')

    # compute refold
    # refold_path = locate_refold()
    refold_constrained_file = compute_refold(clust_tempfile, alif_fake_file)

    parsed_seqs = []
    with open(refold_constrained_file, 'r') as f:
        # read the file
        for seq in BA_support.parse_seq_str(f):
            parsed_seqs.append(seq)

    # cleanup
    BA_support.remove_files_with_try([clust_tempfile, alif_fake_file, refold_constrained_file])

    return parsed_seqs
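
For reference, this is what the fake alifold record written above looks like for a short, made-up consensus; the placeholder sequence line is never used:

consensus = '(((...)))'  # made-up consensus structure
fake_alifold = 'A' * len(consensus) + '\n' + consensus + '\n'
print(fake_alifold)
# AAAAAAAAA
# (((...)))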
Example #3
def build_cm_model_rsearch(query_seq, path2selected_sim_array):
    ml.debug(fname())
    query_structure = rna_blast_analyze.BR_core.viennaRNA.RNAfold(
        str(query_seq.seq))[0]

    # remove any annotations from query:
    qs_clean = deepcopy(query_seq)
    qs_clean.annotations = dict()
    qs_clean.letter_annotations = dict()

    # query_structure = RNA.fold(str(analyzed_hits.query.seq))[0]
    # build a Stockholm-like file for use in cmbuild
    st_like = StockholmAlig()
    st_like.append(qs_clean)
    st_like.column_annotations['SS_cons'] = query_structure

    fds, stock_file = mkstemp(prefix='rba_', suffix='_30', dir=CONFIG.tmpdir)
    with os.fdopen(fds, 'w') as f:
        st_like.write_stockholm(f)

    # run actual cmbuild
    cm_model_file = run_cmbuild(
        stock_file,
        cmbuild_params='--rsearch {}'.format(path2selected_sim_array))

    # cleanup
    BA_support.remove_one_file_with_try(stock_file)
    return cm_model_file
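
A minimal sketch of the Stockholm record that st_like.write_stockholm(f) produces here, assuming a single query sequence plus its predicted structure; the exact column spacing of the real writer may differ:

stk_sketch = """# STOCKHOLM 1.0
query            GGGAAACCC
#=GC SS_cons     (((...)))
//
"""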
Example #4
def me_centroid_homfold(fasta2predict, fasta_homologous_seqs, params=None):
    """
    run centroid_homfold several times with a varying -g parameter to predict the best possible structure
    :param fasta2predict:
    :param fasta_homologous_seqs:
    :param params:
    :return:
    """

    # run centroid_homfold for several values of g (enabled by '-g -1')
    # and select the most stable structure

    if params is None:
        params = dict()

    ch_params = ''
    if params and ('centroid_homfold' in params) and params['centroid_homfold']:
        ch_params += params['centroid_homfold']

    if ('-g ' in ch_params and '-g -1' not in ch_params) or '-t ' in ch_params:
        print("centroid_homfold may only run in automatic mode, where the structure is predicted with multiple"
              " weights and the best-scoring structure is selected; thresholds are also forbidden as they imply -g.")
        raise AttributeError('Centroid homfold is not permitted to run with "-g" or "-t".')
    ch_params += ' -g -1'

    first_structures = run_centroid_homfold(fasta2predict, fasta_homologous_seqs, centroid_homfold_params=ch_params)
    structures2return = [ch_struc for ch_struc in centroid_homfold_select_best(first_structures)]
    BA_support.remove_one_file_with_try(first_structures)
    return structures2return
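
A hedged usage sketch: the params dict is expected to carry a raw centroid_homfold argument string, and anything that sets -g or -t explicitly is rejected by the guard above (file names here are hypothetical):

ok_params = {'centroid_homfold': ''}       # fine: '-g -1' is appended automatically
bad_params = {'centroid_homfold': '-g 4'}  # would raise AttributeError in the guard above
# structures = me_centroid_homfold('query.fasta', 'homologs.fasta', params=ok_params)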
Example #5
def create_report_object_from_locarna(exp_hit, locarna_alig):
    """
    create an object which will be appended to the BlastSearchRecompute class
    This needs to be a Subsequences object

    :param exp_hit:
    :param locarna_alig:
    :return:
    """
    ml.debug(fname())
    # chop alignment by seq
    query_ind = [i for i, j in enumerate(locarna_alig) if j.id == 'query']
    if len(query_ind) != 1:
        raise exceptions.SubseqMatchError('Got multiple hits with id "query" in the Locarna alignment.')
    trimmed_locarna_alig = trim_alignment_by_sequence(
        locarna_alig,
        str(locarna_alig[query_ind[0]].seq),
        structure_annotation='SS_cons'
    )

    aligned_subsequence = BA_support.select_analyzed_aligned_hit(trimmed_locarna_alig, exp_hit.id)

    # add annotations from exp hit
    aligned_subsequence.annotations = exp_hit.annotations
    aligned_subsequence.name = exp_hit.name

    # also add annotations from locarna, mainly score
    aligned_subsequence.annotations.update(locarna_alig.annotations)

    # get the structure
    # by refold
    refold_structures = refold_stockholm(trimmed_locarna_alig, trimmed_locarna_alig.column_annotations['SS_cons'])

    # select refold structure for my seq
    seq_refold_structure = _select_refold_structure(refold_structures, exp_hit.id)

    aligned_subsequence.letter_annotations['ss0'] = seq_refold_structure.letter_annotations['ss0']
    aligned_subsequence.annotations['sss'] = ['ss0']

    # prepare seq_record for subsequences
    aligned_subsequence.description = ''
    hit = BA_support.Subsequences(exp_hit)

    hit.extension = aligned_subsequence

    # find the matching sequence
    pos_match = re.search(str(aligned_subsequence.seq), str(exp_hit.seq), flags=re.IGNORECASE)
    if not pos_match:
        raise exceptions.SubseqMatchError(
            'Aligned portion of subject sequence in Locarna alignment was not found in parent sequence.'
        )

    hit.best_start, hit.best_end = compute_true_location_locarna(hit, pos_match)

    return hit
Example #6
def centroid_homfold_fast(all_seqs, query, all_seqs_fasta, n, centroid_homfold_params, len_diff):
    ml.debug(fname())

    selected_seqs = centroid_homfold_fast_prep(all_seqs, query, n, len_diff)

    ch, homologous_file = mkstemp(prefix='rba_', suffix='_74', dir=CONFIG.tmpdir)
    with os.fdopen(ch, 'w') as h:
        SeqIO.write(selected_seqs, h, 'fasta')

    structures, _ = me_centroid_homfold(all_seqs_fasta, homologous_file, params=centroid_homfold_params)
    BA_support.remove_one_file_with_try(homologous_file)
    return structures
Example #7
def _prepare_set_n(seq, nr_seqs, n):
    if len(nr_seqs) - n + 1 < 0:
        return BA_support.non_redundant_seqs([seq] + nr_seqs)

    i = 0
    while i != len(nr_seqs) - n + 1:
        seqs = [seq] + nr_seqs[:n - 1 + i]
        n_nr_seqs = BA_support.non_redundant_seqs(seqs)
        if len(n_nr_seqs) == n:
            return n_nr_seqs
        i += 1

    return BA_support.non_redundant_seqs([seq] + nr_seqs)
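
A standalone analogue of the selection loop above, using plain strings and dict.fromkeys in place of BA_support.non_redundant_seqs; this is a sketch of the logic, not the project's implementation:

def prepare_set_n_sketch(seq, nr_seqs, n):
    # grow the candidate window until exactly n unique items are collected
    if len(nr_seqs) - n + 1 < 0:
        return list(dict.fromkeys([seq] + nr_seqs))
    for i in range(len(nr_seqs) - n + 1):
        cand = list(dict.fromkeys([seq] + nr_seqs[:n - 1 + i]))
        if len(cand) == n:
            return cand
    return list(dict.fromkeys([seq] + nr_seqs))

assert prepare_set_n_sketch('A', ['A', 'B', 'C'], 2) == ['A', 'B']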
Example #8
def create_nr_homolog_hits_file_MSA_unsafe(sim_threshold_percent=None,
                                           all_hits=None,
                                           query=None,
                                           cmscore_tr=0.0,
                                           cm_threshold_percent=None,
                                           len_diff=0.1):
    """
    create non redundant homologous hits file
    """
    ml.debug(fname())
    dist_table, homologous_seqs, msgs = _trusted_hits_selection_wrapper(
        all_hits, query, cmscore_tr, cm_threshold_percent, len_diff_=len_diff)
    if dist_table.size == 0:
        nr_homolog_hits = [query]
    else:
        # normal execution
        to_include = BA_support.select_sequences_from_similarity_rec(
            dist_table, sim_threshold_percent=sim_threshold_percent)
        nr_homolog_hits = [homologous_seqs[i] for i in to_include]

    fd_h, nr_homo_hits_file = mkstemp(prefix='rba_',
                                      suffix='_59',
                                      dir=CONFIG.tmpdir)
    with os.fdopen(fd_h, 'w') as f:
        SeqIO.write(nr_homolog_hits, f, 'fasta')

    return nr_homo_hits_file, homologous_seqs, msgs
Example #9
def centroid_homfold_fast_prep(all_seqs, query, n, len_diff):
    ml.debug(fname())

    assert n >= 1, "Number of sequences for centroid-fast must be greater than 0."

    if query.annotations['ambiguous']:
        msgfail = "Query sequence contains ambiguous characters. Can't use centroid-fast."
        ml.error(msgfail)
        raise AmbiguousQuerySequenceException(msgfail)

    nr_na_ld = BA_support.sel_seq_simple(all_seqs, query, len_diff)
    nr_na_ld_n = nr_na_ld[:int(n)]
    return nr_na_ld_n
Example #10
def _parse_centroid_homefold_output_file(file):
    with open(file, 'r') as f:
        for sr in BA_support.parse_one_rec_in_multiline_structure(f):
            cf = sr.strip().splitlines()

            cfr = SeqRecord(Seq(cf[1]), id=cf[0])
            cfr.annotations['sss'] = []
            for i, ll in enumerate(cf[2:]):
                structure, ann = ll.split()
                cfr.letter_annotations['ss' + str(i)] = structure
                cfr.annotations['ss' + str(i)] = ann
                cfr.annotations['sss'].append('ss' + str(i))

            yield cfr
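
An assumed sketch of the record layout this parser expects: an id line, the sequence, then one line per predicted structure with a single whitespace-separated annotation token (the exact annotation text of real centroid_homfold output may differ):

record_sketch = """>seq1
GGGAAACCC
(((...))) (g=1,th=0.5)
((.....)) (g=4,th=0.941)
"""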
Example #11
def _turbofold_worker(sequences, params, timeout=None):
    env = os.environ.copy()
    if 'DATAPATH' not in env:
        env['DATAPATH'] = CONFIG.rnastructure_datapath

    tmpdir, con_file, output_structure_files = write_turbofold_confile(
        input_sequences=sequences,
        turbofold_params=params,
    )

    # run without prediction progress reporting output
    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        cmd = ['{}TurboFold'.format(CONFIG.turbofold_path), con_file]
        ml.debug(cmd)
        r = call(cmd, env=env, stdout=tmp, stderr=tmp, timeout=timeout)

        if r:
            msgfail = 'Call to TurboFold failed, cmd below:'
            ml.info(msgfail)
            ml.info(cmd)
            tmp.seek(0)
            raise exceptions.TurboFoldException(msgfail, tmp.read())

        # now convert ct files produced by TurboFold
        new_structures = []
        for out_str_file, orig_seq in zip(output_structure_files, sequences):
            with open(out_str_file, 'r') as o:
                seq_with_pred_str = BA_support.ct2db(o, energy_txt='ENERGY')

            assert str(seq_with_pred_str[0].seq) == str(orig_seq.seq), \
                'Input and output sequences of TurboFold call do not match.\n{}\n{}'.format(
                    str(seq_with_pred_str[0].seq), str(orig_seq.seq)
                )

            new_structures.append(
                SeqRecord(orig_seq.seq,
                          id=orig_seq.id,
                          annotations={'sss': ['ss0']},
                          letter_annotations={
                              'ss0':
                              seq_with_pred_str[0].letter_annotations['ss0']
                          }))

        rmtree(tmpdir)
        return new_structures
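
The DATAPATH handling above follows the RNAstructure package convention: TurboFold locates its thermodynamic tables through that variable. A minimal standalone equivalent, with an assumed install path:

import os

env = os.environ.copy()
# point RNAstructure tools at their data tables if not already configured
env.setdefault('DATAPATH', '/opt/RNAstructure/data_tables')  # hypothetical path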
Example #12
def create_blast_only_report_object(exp_hit, query_len):
    # init new Subsequences object
    #  here the object source shadows the final hit
    hit = BA_support.Subsequences(exp_hit)

    # init new SeqRecord object
    ns = deepcopy(exp_hit)
    ann = ns.annotations
    tss = ann['trimmed_ss']
    tse = ann['trimmed_se']
    tes = ann['trimmed_es']
    tee = ann['trimmed_ee']

    ns.letter_annotations['ss0'] = '.' * len(ns.seq)
    ns.annotations['sss'] = ['ss0']
    ns.description = ''

    hit.extension = ns

    bl = ns.annotations['blast'][1]

    if bl.sbjct_start < bl.sbjct_end:
        bls = bl.sbjct_start - bl.query_start + 1
        ble = bl.sbjct_end + (query_len - bl.query_end)
    elif bl.sbjct_end < bl.sbjct_start:
        bls = bl.sbjct_end - (query_len - bl.query_end)
        ble = bl.sbjct_start + bl.query_start - 1
    else:
        raise exceptions.UnknownStrand(
            "Can't determine HSP strand (sbjct_start appears equal to sbjct_end)"
        )

    # if the whole subject sequence is too short, these assertions would fail
    if tss or tse or tes or tee:
        msg = 'STATUS: Skipping sequence check ({}) - subject sequence too short.'.format(
            ns.id)
        ml.info(msg)
    else:
        assert len(ns.seq) == abs(bls - ble) + 1
        assert bls == ns.annotations['extended_start']
        assert ble == ns.annotations['extended_end']

    hit.best_start, hit.best_end = compute_true_location_se(hit, query_len)

    return hit
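
A worked example of the plus-strand coordinate arithmetic above, using a made-up HSP: subject positions 101-150 aligned to query positions 11-60 of a 70 nt query:

query_len, q_start, q_end, s_start, s_end = 70, 11, 60, 101, 150
bls = s_start - q_start + 1        # 91: extend left by the unaligned query prefix
ble = s_end + (query_len - q_end)  # 160: extend right by the unaligned query suffix
assert (bls, ble) == (91, 160)
assert abs(bls - ble) + 1 == query_len  # the extended span covers the full query length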
Example #13
def turbofold_fast(all_seqs, seqs2predict, query, cpu, n, turbofold_params,
                   len_diff, pkey, sha1val):
    """Run Turbo-fast prediction
    - ambiguous sequences cannot be predicted with TurboFold
    - do not predict sequence twice
    - seqs2predict must be subset of all_seqs
    """
    ml.debug(fname())

    # this is backup (in rboAnalyzer this is handled on higher level)
    if query.annotations['ambiguous']:
        msgfail = "Query sequence contains ambiguous characters. Can't use Turbo-fast."
        ml.error(msgfail)
        raise exceptions.AmbiguousQuerySequenceException(msgfail)

    nr_na_ld = BA_support.sel_seq_simple(all_seqs, query, len_diff)

    return turbofold_ext_nr_fast(seqs2predict, nr_na_ld, cpu, n,
                                 turbofold_params, pkey, sha1val)
Example #14
def extend_meta_core(analyzed_hits,
                     query,
                     args_inner,
                     all_short,
                     multi_query,
                     iteration,
                     ih_model,
                     timeout=None):
    ml.debug(fname())
    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    blast_args = deepcopy(args_inner)
    locarna_args = deepcopy(args_inner)
    b_all_short = deepcopy(all_short)
    l_all_short = deepcopy(all_short)

    if args_inner.repredict_file is None:
        fd, repred_file = mkstemp(prefix='rba_',
                                  suffix='_18',
                                  dir=CONFIG.tmpdir)
        os.close(fd)
    else:
        repred_file = args_inner.repredict_file

    for i, args in enumerate([blast_args, locarna_args]):
        args.prediction_method = []
        args.pred_params = dict()
        args.dump = None
        args.pdf_out = None
        args.pandas_dump = None
        args.repredict_file = repred_file + str(i)
        args.dev_pred = False
        args.logfile = None
        args.json = None
        args.html = None
        args.cm_file = ih_model

    analyzed_hits_simple = deepcopy(analyzed_hits)
    analyzed_hits_locarna = deepcopy(analyzed_hits)

    analyzed_hits_simple, _, _, _ = extend_simple_core(analyzed_hits_simple,
                                                       query, blast_args,
                                                       b_all_short,
                                                       multi_query, iteration,
                                                       ih_model)
    analyzed_hits_locarna, _, _, _ = extend_locarna_core(analyzed_hits_locarna,
                                                         query,
                                                         locarna_args,
                                                         l_all_short,
                                                         multi_query,
                                                         iteration,
                                                         ih_model,
                                                         timeout=timeout)

    # add cmstat to query
    analyzed_hits.query = analyzed_hits_simple.query

    order_out = []

    b_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_simple.hits}
    l_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_locarna.hits}
    ok_keys = sorted(set(b_dict.keys()) | set(l_dict.keys()))
    for inum in ok_keys:
        bh = b_dict.get(inum, None)
        lh = l_dict.get(inum, None)

        hits = [bh, lh]
        # fall back to simple if Locarna returned an empty hit;
        # also handle the situation when both methods returned empty hits
        filtered_hits = [h for h in hits if h is not None]
        if len(filtered_hits) == 1:
            msg = 'Only one extension method completed successfully for {}. ' \
                  'Adding the successfully extended sequence to the output.'.format(filtered_hits[0].extension.id)
            ml.info(msg)
            if ml.getEffectiveLevel() < 20:
                print(msg)
            analyzed_hits.hits.append(filtered_hits[0])
            continue
        elif len(filtered_hits) == 0:
            # append empty extension
            analyzed_hits.hits_failed.append(lh)
            continue

        bit_scores = [
            i.extension.annotations['cmstat']['bit_sc'] for i in hits
        ]

        mb = max(bit_scores)
        bit_index = [i for i, j in enumerate(bit_scores) if j == mb][0]
        order_out.append(bit_index)

        analyzed_hits.hits.append(hits[bit_index])

    # build failed hits
    b_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_simple.hits_failed
    }
    l_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_locarna.hits_failed
    }
    for inum in sorted(set(b_dict_failed) | set(l_dict_failed)):
        if inum not in ok_keys:
            if inum in b_dict_failed:
                analyzed_hits.hits_failed.append(b_dict_failed[inum])
            elif inum in l_dict_failed:
                analyzed_hits.hits_failed.append(l_dict_failed[inum])
            else:
                raise KeyError(
                    "Failed to find inum key in failed extensions. This should not happen."
                )

    # build the repredict file here if needed
    if args_inner.repredict_file:
        b_repredict = BA_support.iter2file_name(blast_args.repredict_file,
                                                multi_query, iteration)
        l_repredict = BA_support.iter2file_name(locarna_args.repredict_file,
                                                multi_query, iteration)
        o_repredict = BA_support.iter2file_name(args_inner.repredict_file,
                                                multi_query, iteration)
        with open(b_repredict,
                  'r') as barf, open(l_repredict,
                                     'r') as larf, open(o_repredict,
                                                        'w') as reprf:
            """
            note that the order of files to merge must be the same as the order of methods in the previous for loop,
            i.e. the order in which the order_out variable was set
            """
            bb = (barf, larf)

            fl = bb[0].readline()
            reprf.write(fl)
            fl = bb[0].readline()
            reprf.write(fl)
            # skip the first line of the other documents
            for i in bb[1:]:
                i.readline()

            for o in order_out:
                lll = [i.readline() for i in bb]
                reprf.write(lll[o])

    # recreate needed data from selected hits
    homology_prediction = []
    homol_seqs = []
    for hit in analyzed_hits.hits:
        homology_prediction.append(hit.hpred)
        if hit.hpred:
            homol_seqs.append(hit.extension)

        # add default prediction if it is not present
        if 'ss0' not in hit.extension.letter_annotations:
            if 'sss' not in hit.extension.annotations:
                hit.extension.annotations['sss'] = []
            hit.extension.annotations['sss'] += ['ss0']
            hit.extension.letter_annotations['ss0'] = '.' * len(
                hit.extension.seq)

    # remove description from hits and sources
    for hit in analyzed_hits.hits:
        hit.extension.description = ''

    if args_inner.cm_file or args_inner.use_rfam:
        cm_file_rfam_user = ih_model
    else:
        cm_file_rfam_user = None
        BA_support.remove_one_file_with_try(ih_model)
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
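
The per-hit selection above reduces to picking the extension with the higher cmstat bit score; a standalone sketch with assumed scores:

bit_scores = [12.5, 17.3]  # assumed scores for [simple extension, Locarna extension]
best_index = max(range(len(bit_scores)), key=bit_scores.__getitem__)
assert best_index == 1     # the Locarna extension wins here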
Example #15
def extend_simple_core(analyzed_hits, query, args_inner, all_short,
                       multi_query, iteration, ih_model):
    # the extra here is given "pro forma"; the sequence is extended exactly by the lengths of the unaligned portions of the query
    if args_inner.db_type == "blastdb":
        shorts_expanded, _ = rna_blast_analyze.BR_core.extend_hits.expand_hits(
            all_short,
            args_inner.blast_db,
            len(query),
            extra=0,
            blast_regexp=args_inner.blast_regexp,
            skip_missing=args_inner.skip_missing,
            msgs=analyzed_hits.msgs,
        )
    elif args_inner.db_type in ["fasta", "gb", "server", "entrez"]:
        shorts_expanded, _ = rna_blast_analyze.BR_core.extend_hits.expand_hits_from_fasta(
            all_short,
            args_inner.blast_db,
            len(query),
            extra=0,
            blast_regexp=args_inner.blast_regexp,
            skip_missing=args_inner.skip_missing,
            msgs=analyzed_hits.msgs,
            format=args_inner.db_type,
            entrez_email=args_inner.entrez,
            blast_input_file=args_inner.blast_in,
        )
    else:
        raise exceptions.IncorrectDatabaseChoice()

    # check whether blast hits are non-overlapping; if so, add the overlapping hit info to the longer hit
    # and reflect this in the user output
    # shorts_expanded = merge_blast_hits(shorts_expanded)

    shorts_expanded = trim_before(shorts_expanded)

    shorts_expanded = BA_support.rc_hits_2_rna(shorts_expanded)

    query_seq = query.seq.transcribe()

    # blast only extension
    for exp_hit in shorts_expanded:
        try:
            _out = create_blast_only_report_object(exp_hit, len(query_seq))
            analyzed_hits.hits.append(_out)
        except AssertionError as e:
            exp_hit.annotations['msgs'] += [str(e)]
            analyzed_hits.hits_failed.append(BA_support.Subsequences(exp_hit))
        except exceptions.UnknownStrand as e:
            exp_hit.annotations['msgs'] += [str(e)]
            analyzed_hits.hits_failed.append(BA_support.Subsequences(exp_hit))
        except Exception as e:
            ml.error("Unexpected error when extending with 'simple'.")
            exp_hit.annotations['msgs'] += [str(e)]
            analyzed_hits.hits_failed.append(BA_support.Subsequences(exp_hit))

    if len(analyzed_hits.hits) == 0:
        ml.error(
            "Extension failed for all sequences. Please see the error message. You can also try '--mode locarna'."
        )
        sys.exit(1)

    # assign None to the Locarna score, as it is not directly accessible from mlocarna
    for hit in analyzed_hits.hits:
        hit.extension.annotations['score'] = None

    # this part predicts homology - it is not truly part of repredict
    homology_prediction, homol_seqs, cm_file_rfam_user = infer_homology(
        analyzed_hits=analyzed_hits,
        args=args_inner,
        cm_model_file=ih_model,
        multi_query=multi_query,
        iteration=iteration)
    for hit, pred in zip(analyzed_hits.hits, homology_prediction):
        hit.hpred = pred
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
Example #16
0
def infer_homology(analyzed_hits,
                   args,
                   cm_model_file,
                   multi_query=False,
                   iteration=0):
    """
    This is a wrapper for the homology inference methods. It deals with the different options for generating a CM.
    :return:
    """
    ml.info('Inferring homology...')
    ml.debug(fname())
    bits, eval, loc_score, alig_length = hit_cons_characteristic(
        analyzed_hits.hits)

    # always run cmscan on Rfam for informative reasons,
    #  but use the inferred CM only if --use_rfam was given
    #  if a CM is provided, also run the inference but use the provided file
    # print an explanation alongside this information

    # find and extract cm model
    # This code is moved to each extension method to allow fail-fast if model is found in RFAM
    # cm_model_file, analyzed_hits = find_and_extract_cm_model(args, analyzed_hits)

    # include query seq in fasta file to get relevant bit score
    fd_f, fd_fasta = mkstemp(prefix='rba_', suffix='_28', dir=CONFIG.tmpdir)
    with os.fdopen(fd_f, 'w') as f:
        for seq in [analyzed_hits.query] + analyzed_hits.res_2_record_list():
            f.write('>{}\n{}\n'.format(seq.id, str(seq.seq)))

    cm_msa, cm_align_scores = run_cmalign_with_scores(fd_fasta,
                                                      cm_model_file,
                                                      threads=args.threads)

    _add_rsearch_align_scores2anal_hits(analyzed_hits, cm_align_scores)

    # skip the first entry (the query) in the prediction scores
    prediction = infer_hits_cm(cm_align_scores[1:].bit_sc)

    # write scores to a table, compute it for all data and run some correlation statistics
    if args.repredict_file:
        # note that the first score is for the query and acts as a benchmark here
        cm_msa_conservation = alignment_sequence_conservation(cm_msa,
                                                              gap_chars='-.')

        repredict_file = BA_support.iter2file_name(args.repredict_file,
                                                   multi_query, iteration)
        with open(repredict_file, 'w') as f:
            _print_table_for_corelation(f, cm_align_scores.seq_name[1:], bits,
                                        eval, loc_score, alig_length,
                                        cm_msa_conservation[1:],
                                        cm_align_scores.bit_sc[1:],
                                        cm_msa_conservation[0],
                                        cm_align_scores.bit_sc[0])

    BA_support.remove_one_file_with_try(fd_fasta)

    selected_hits = [
        hit.extension for b, hit in zip(prediction, analyzed_hits.hits) if b
    ]

    if args.cm_file or args.use_rfam:
        r_cm_file = cm_model_file
    else:
        r_cm_file = None
        BA_support.remove_one_file_with_try(cm_model_file)

    return prediction, selected_hits, r_cm_file
Example #17
def create_nr_trusted_hits_file_MSA_safe(
    sim_threshold_percent=None,
    all_hits=None,
    query=None,
    cmscore_tr=-2.03,
    cm_threshold_percent=None,
    check_unambiguous=False,
    len_diff=0.1,
):
    """
    create non redundant trusted hits file

    at minimum two sequences are needed for profile alignment with some aligners,
    so this function always returns two or more sequences or raises an exception

    :param sim_threshold_percent:   seq similarity threshold for homology exclusion
    :param all_hits:                list of hits
    :param query:                   blast query
    :param cmscore_tr:              threshold for homology inclusion in bits
    :param cm_threshold_percent:    threshold for homology inclusion in % of query bits
    :param check_unambiguous:       bool whether to check unambiguous seqs
    :param len_diff:                threshold for exclusion of hits lq=len(query) lq - diff*lq < len(seq) < lq + diff*lq
    :return:
    """
    ml.debug(fname())
    # the query must be kept, even with ambiguous basepairs,
    # because it is used as a reference during distance computation and subsequence selection;
    # however, not all homologous seqs need to be kept
    if check_unambiguous:
        all_hits = BA_support.filter_ambiguous_seqs_from_list(all_hits)

    dist_table, homologous_seqs, msgs = _trusted_hits_selection_wrapper(
        all_hits, query, cmscore_tr, cm_threshold_percent, len_diff_=len_diff)

    if dist_table.size == 0:
        raise exceptions.NoHomologousSequenceException

    to_include = rna_blast_analyze.BR_core.predict_structures.select_sequences_from_similarity_rec(
        dist_table, sim_threshold_percent=sim_threshold_percent)
    nr_homolog_hits = [homologous_seqs[i] for i in to_include]

    # final check of the nr homologs
    # if a sequence is filtered here, there is an ambiguous basepair in the query
    # removing it is fine if multiple homologous sequences are present,
    # but a problem arises when only one homologous sequence remains
    # if a sequence was added in the previous step, raise an exception; else behave as in the previous step

    msg = (
        'STATUS: Only one sequence remained under defined "pred_sim_threshold" parameter.\n'
        ' Mitigation: Adding the most dissimilar homologous sequence to the non redundant sequences list.'
    )
    if len(nr_homolog_hits) < 2 and not check_unambiguous:
        msgs.append(msg)
        ml.info(msg)
        if ml.level > 20:
            print(msg)

        dis_hom_index = dist_table[:, 0].argmin()
        nr_homolog_hits.append(
            SeqRecord(homologous_seqs[dis_hom_index].seq, id='dummy_seq_01'))
        del dis_hom_index

    elif len(nr_homolog_hits) < 2 and check_unambiguous:
        if len(BA_support.filter_ambiguous_seqs_from_list(
                nr_homolog_hits)) == 0:
            # this means the query contains ambiguous bases
            raise exceptions.NoHomologousSequenceException
        else:
            msgs.append(msg)
            ml.info(msg)
            if ml.level > 20:
                print(msg)

            dis_hom_index = dist_table[:, 0].argmin()
            nr_homolog_hits.append(
                SeqRecord(homologous_seqs[dis_hom_index].seq,
                          id='dummy_seq_01'))
            del dis_hom_index
        homologous_seqs = BA_support.filter_ambiguous_seqs_from_list(
            homologous_seqs)

    elif len(nr_homolog_hits) >= 2 and not check_unambiguous:
        pass

    elif len(nr_homolog_hits) > 2 and check_unambiguous:
        nr_homolog_hits = BA_support.filter_ambiguous_seqs_from_list(
            nr_homolog_hits)
        homologous_seqs = BA_support.filter_ambiguous_seqs_from_list(
            homologous_seqs)

    elif len(nr_homolog_hits) == 2 and check_unambiguous:
        homologous_seqs = BA_support.filter_ambiguous_seqs_from_list(
            homologous_seqs)
        if len(BA_support.filter_ambiguous_seqs_from_list(
                nr_homolog_hits)) == 1:
            # this means that the query contains an ambiguous base
            raise exceptions.NoHomologousSequenceException

    else:
        raise Exception('Unexpected combination of nr_homolog_hits count and check_unambiguous.')

    fd_h, nr_homo_hits_file = mkstemp(prefix='rba_',
                                      suffix='_58',
                                      dir=CONFIG.tmpdir)
    with os.fdopen(fd_h, 'w') as f:
        SeqIO.write(nr_homolog_hits, f, 'fasta')

    return nr_homo_hits_file, homologous_seqs, msgs
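
A worked example of the length window from the docstring above (len_diff=0.1 for a 100 nt query): only lengths strictly between 90 and 110 pass:

def in_window(length, lq=100, len_diff=0.1):
    return lq * (1 - len_diff) < length < lq * (1 + len_diff)

assert in_window(95) and not in_window(90) and not in_window(110)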
Example #18
def wrapped_ending_with_prediction(
    args_inner,
    analyzed_hits,
    pred_method=None,
    method_params=None,
    used_cm_file=None,
    multi_query=False,
    iteration=0,
):
    """
    wrapper for prediction of secondary structures
    :param args_inner: Namespace of input arguments
    :param analyzed_hits: BlastSearchRecompute object
    :param pred_method:
    :param method_params:
    :param used_cm_file: cmfile if cmfile is known (user given or computed)
    :return:
    """
    ml.debug(fname())
    exec_time = {}
    msg = 'Entering structure prediction...'
    if ml.level < 21:
        ml.info(msg)
    else:
        print(msg)
        ml.info(msg)

    if pred_method is None:
        pred_method = args_inner.prediction_method

    if isinstance(pred_method, str):
        pred_method = (pred_method, )

    if method_params is None:
        method_params = args_inner.pred_params

    # ======= filter if needed =======
    # do the filtering based on e-val or bitscore
    # homologous hits still get used for prediction

    # annotate ambiguous bases
    query = BA_support.annotate_ambiguos_base(analyzed_hits.query)

    # copy the list before filtering
    all_hits_list = [i.extension for i in analyzed_hits.get_all_hits()]

    if args_inner.filter_by_eval is not None:
        hits2predict = filter_by_eval(analyzed_hits.get_all_hits(),
                                      BA_support.blast_hit_getter_from_subseq,
                                      args_inner.filter_by_eval)
        _hits = HitList()
        for h in hits2predict:
            _hits.append(h)
        analyzed_hits.hits = _hits
    elif args_inner.filter_by_bitscore is not None:
        hits2predict = filter_by_bits(analyzed_hits.get_all_hits(),
                                      BA_support.blast_hit_getter_from_subseq,
                                      args_inner.filter_by_bitscore)
        _hits = HitList()
        for h in hits2predict:
            _hits.append(h)
        analyzed_hits.hits = _hits
    else:
        analyzed_hits.hits = analyzed_hits.get_all_hits()

    # if used_cm_file is provided do not override it with CM from RFAM
    # if use_rfam flag was given, then used_cm_file is already the best_matching model
    # if analyzed_hits.best_matching_model is None - then we could not find the best matching model in RFAM
    #  and the rfam based methods should fail (i.e. not predict anything)
    delete_cm = False
    if used_cm_file is None and analyzed_hits.best_matching_model is not None:
        rfam = RfamInfo()
        used_cm_file = run_cmfetch(
            rfam.file_path, analyzed_hits.best_matching_model['target_name'])
        delete_cm = True

    fd, seqs2predict_fasta = mkstemp(prefix='rba_',
                                     suffix='_83',
                                     dir=CONFIG.tmpdir)
    with os.fdopen(fd, 'w') as fah:
        for hit in analyzed_hits.hits:
            if len(hit.extension.seq) == 0:
                continue
            fah.write('>{}\n{}\n'.format(hit.extension.id,
                                         str(hit.extension.seq)))

    if not isinstance(method_params, dict):
        raise Exception('prediction method parameters must be python dict')

    # prediction methods present in analyzed_hits
    #  might be loaded from an intermediate file

    # check whether the structures of a method are predicted for all required hits
    # and whether the prediction parameters of that method were the same

    prediction_msgs = []
    # compute prediction methods which were not computed
    for pkey in set(pred_method):
        # add sha1 hashes
        nh = sha1()
        nh.update(str(sorted(method_params.get(pkey, {}).items())).encode())
        current_hash = nh.hexdigest()

        if all(pkey in h.extension.letter_annotations for h in analyzed_hits.hits) and \
                len(
                    {
                        h.extension.annotations.get('sha1', {}).get(pkey, None) for h in analyzed_hits.hits
                    } | {current_hash, }
                ) == 1:
            msg_skip = 'All structures already computed for {}. Skipping...'.format(
                pkey)
            ml.info(msg_skip)
            if ml.level > 20:
                print(msg_skip, flush=True)
            continue

        msg_run = 'Running: {}...'.format(pkey)
        ml.info(msg_run)

        if ml.level > 20:
            print(msg_run, flush=True)

        structures, etime, msgs = repredict_structures_for_homol_seqs(
            query,
            seqs2predict_fasta,
            args_inner.threads,
            prediction_method=pkey,
            pred_method_params=method_params,
            all_hits_list=all_hits_list,
            seqs2predict_list=[i.extension for i in analyzed_hits.hits],
            use_cm_file=used_cm_file,
        )

        exec_time[pkey] = etime

        if structures is None:
            msg = 'Structures not predicted with {} method'.format(pkey)
            ml.info(msg)
            if ml.level > 20:
                print('STATUS: ' + msg)

        else:
            for i, hit in enumerate(analyzed_hits.hits):
                assert str(hit.extension.seq) == str(structures[i].seq)
                hit.extension.annotations['sss'] += [pkey]

                hit.extension.annotations['msgs'] += structures[
                    i].annotations.get('msgs', [])

                # expects "predicted" in annotations - for now, if not given, default is True, as not all prediction
                #  methods implement "predicted" in their output
                if structures[i].annotations.get('predicted', True):
                    hit.extension.letter_annotations[pkey] = structures[
                        i].letter_annotations['ss0']

                if 'sha1' not in hit.extension.annotations:
                    hit.extension.annotations['sha1'] = dict()
                hit.extension.annotations['sha1'][pkey] = current_hash

                try:
                    del hit.extension.letter_annotations['ss0']
                except KeyError:
                    pass
                try:
                    hit.extension.annotations['sss'].remove('ss0')
                except ValueError:
                    pass

            analyzed_hits.update_hit_stuctures()

        # check if msgs are not empty
        if msgs:
            prediction_msgs.append('{}: {}'.format(pkey, '\n'.join(msgs)))

        analyzed_hits.msgs = prediction_msgs

        with open(args_inner.blast_in + '.r-' + args_inner.sha1[:10],
                  'r+') as f:
            all_saved_data = json.load(f)
            all_saved_data[iteration] = blastsearchrecompute2dict(
                analyzed_hits)
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f, indent=2)

    # remove structures predicted by different methods (which might be saved from previous computation)
    for hit in analyzed_hits.hits:
        for pkey in set(hit.extension.letter_annotations.keys()):
            if pkey not in pred_method:
                del hit.extension.letter_annotations[pkey]
                try:
                    hit.extension.annotations['sss'].remove(pkey)
                except ValueError:
                    pass

    BA_support.remove_one_file_with_try(seqs2predict_fasta)

    if delete_cm:
        BA_support.remove_one_file_with_try(used_cm_file)

    add_loc_to_description(analyzed_hits)

    # write html if requested
    if args_inner.html:
        html_file = iter2file_name(args_inner.html, multi_query, iteration)
        ml.info('Writing html to {}.'.format(html_file))
        with open(html_file, 'wb') as h:
            h.write(write_html_output(analyzed_hits))

    # write csv file if requested
    if args_inner.csv:
        csv_file = iter2file_name(args_inner.csv, multi_query, iteration)
        ml.info('Writing csv to {}.'.format(csv_file))
        analyzed_hits.to_csv(csv_file)

    # replace with json
    if args_inner.json:
        json_file = iter2file_name(args_inner.json, multi_query, iteration)
        ml.info('Writing json to {}.'.format(json_file))
        j_obj = json.dumps(blastsearchrecompute2dict(analyzed_hits), indent=2)
        if getattr(args_inner, 'zip_json', False):
            with open(json_file + '.gz', 'wb') as ff:
                ff.write(gzip.compress(j_obj.encode()))
        else:
            with open(json_file, 'w') as ff:
                ff.write(j_obj)

    if args_inner.pandas_dump:
        pickle_file = iter2file_name(args_inner.pandas_dump, multi_query,
                                     iteration)
        ml.info('Writing pandas pickle to {}.'.format(pickle_file))
        pandas.to_pickle(analyzed_hits.pandas, pickle_file)

    if args_inner.dump:
        dump_file = iter2file_name(args_inner.dump, multi_query, iteration)
        ml.info('Writing dump files base: {}.'.format(dump_file))
        with open(dump_file, 'wb') as pp:
            pickle.dump(analyzed_hits, pp, pickle.HIGHEST_PROTOCOL)

        with open(dump_file + '.time_dump', 'wb') as pp:
            pickle.dump(exec_time, pp, pickle.HIGHEST_PROTOCOL)

    return analyzed_hits
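
The sha1 caching above hashes a sorted, stringified view of one method's parameters, so recomputation is skipped whenever the parameters are unchanged, regardless of dict key order; a standalone illustration:

from hashlib import sha1

def params_hash(d):
    return sha1(str(sorted(d.items())).encode()).hexdigest()

assert params_hash({'b': 2, 'a': 1}) == params_hash({'a': 1, 'b': 2})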
Example #19
def _trusted_hits_selection_wrapper(all_hits_,
                                    query_,
                                    cmscore_tr_,
                                    cm_threshold_percent_,
                                    len_diff_=0.1):
    """
    runs a basic non-redundant sequence calculation (i.e. exact sequence match)
    and selects homologous sequences from the all-hits list by cmscore threshold or by query sequence

    behaviour:
        returns a distance array with similarities in % (including the query sequence)
        and a list of homologous sequences including the query sequence

        if no sequence is homologous,
        returns an empty array for the distance matrix and a list with just the query sequence
    """
    ml.debug(fname())
    msgs = []
    # trusted sequence selection
    # ========================================================
    assert (cmscore_tr_ == 0) or cm_threshold_percent_ is None

    score = _extract_cmscore_from_hom_seqs(all_hits_)

    if cm_threshold_percent_ is not None:
        selection_threshold = cm_threshold_percent_ * query_.annotations[
            'cmstat'].bit_sc / 100
    else:
        selection_threshold = cmscore_tr_

    pred = infer_hits_cm(score, tr=selection_threshold)
    trusted_seqs_ = [i for i, j in zip(all_hits_, pred) if j]

    if len(trusted_seqs_) == 0:
        msg = 'STATUS: No estimated full-length sequences from BLAST output ' \
              'selected as reference for structure prediction.\n' \
              ' Using query sequence as reference.'
        msgs.append(msg)
        ml.info(msg)
        if ml.level > 20:
            print(msg)
        return np.empty(0), [query_], msgs

    # add query to trusted sequences
    trusted_seqs_query = [query_] + trusted_seqs_

    # make nr list of sequences -> faster alignment
    # better selection
    nr_trusted_seqs_query = BA_support.non_redundant_seqs(trusted_seqs_query)

    # check that the homologous sequences are not an exact match of the query
    #  (i.e. the non-redundant set would contain only one sequence)
    if len(nr_trusted_seqs_query) == 1:
        msg = 'STATUS: All sequences selected as reference are exactly same as query sequence.'
        msgs.append(msg)
        ml.info(msg)
        if ml.level > 20:
            print(msg)
        return np.empty(0), [query_], msgs

    # select only sequences within a predefined length range of the query
    # this is needed for longish ncRNAs
    #   tolerate 10 % length difference by default
    ref_len = len(query_)
    nr_len_selected_trusted = [
        seq for seq in nr_trusted_seqs_query
        if ref_len * (1 - len_diff_) < len(seq) < ref_len * (1 + len_diff_)
    ]

    # check whether only one sequence remained after filtering by length difference
    if len(nr_len_selected_trusted) == 1:
        msg = \
            'No sequence satisfies the length difference condition ({}: {}-{})'.format(
                len_diff_,
                ref_len * (1 - len_diff_),
                ref_len * (1 + len_diff_)
            )
        msgs.append(msg)
        ml.info(msg)
        if ml.level > 20:
            print(msg)
        return np.empty(0), [query_], msgs

    # sanitize seq names (muscle has issues with too long names)
    san_hom_seqs, san_dict = BA_support.sanitize_fasta_names_in_seqrec_list(
        nr_len_selected_trusted)

    c_fd, trusted_sequence_file_ = mkstemp(prefix='rba_',
                                           suffix='_60',
                                           dir=CONFIG.tmpdir)
    with os.fdopen(c_fd, 'w') as f:
        SeqIO.write(san_hom_seqs, f, 'fasta')

    align_file = BA_support.run_muscle(trusted_sequence_file_, reorder=True)
    alig = AlignIO.read(align_file, format='clustal')
    distance_calc = DistanceCalculator(model='identity')
    dist_mat = distance_calc.get_distance(alig)
    # rebuild index from sanitized
    orig_index = [san_dict[i] for i in dist_mat.names]
    dist_mat_pd = pandas.DataFrame.from_records(dist_mat.matrix,
                                                index=orig_index)
    dist_table_ = (1 - dist_mat_pd.values) * 100

    BA_support.remove_files_with_try([align_file, trusted_sequence_file_])
    return dist_table_, trusted_seqs_query, msgs
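
The final conversion above turns identity distances into percent similarities, e.g. a pairwise distance of 0.12 becomes 88 % similarity:

dist = 0.12  # made-up identity distance
similarity = (1 - dist) * 100
assert abs(similarity - 88.0) < 1e-9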
Example #20
def extend_locarna_core(analyzed_hits, query, args_inner, all_short, multi_query, iteration, ih_model):
    # expand hits according to the query, +- extra nucleotides (subseq_window_locarna)
    if args_inner.db_type == "blastdb":
        shorts_expanded, _ = rna_blast_analyze.BR_core.extend_hits.expand_hits(
            all_short,
            args_inner.blast_db,
            len(query),
            extra=args_inner.subseq_window_locarna,
            blast_regexp=args_inner.blast_regexp,
            skip_missing=args_inner.skip_missing,
            msgs=analyzed_hits.msgs,
        )
    elif args_inner.db_type in ["fasta", "gb", "server", "entrez"]:
        shorts_expanded, _ = rna_blast_analyze.BR_core.extend_hits.expand_hits_from_fasta(
            all_short,
            args_inner.blast_db,
            len(query),
            extra=args_inner.subseq_window_locarna,
            blast_regexp=args_inner.blast_regexp,
            skip_missing=args_inner.skip_missing,
            msgs=analyzed_hits.msgs,
            format=args_inner.db_type,
            entrez_email=args_inner.entrez,
            blast_input_file=args_inner.blast_in,
        )
    else:
        raise exceptions.IncorrectDatabaseChoice()

    shorts_expanded = BA_support.rc_hits_2_rna(shorts_expanded)

    query_seq = query.seq.transcribe()

    # compute alignment here

    if args_inner.threads == 1:
        result = []
        for oeh in shorts_expanded:
            result.append(
                locarna_worker(
                    (
                        oeh,
                        query_seq,
                        args_inner.locarna_params,
                        args_inner.locarna_anchor_length
                    )
                )
            )
    else:
        pack = []
        for oeh in shorts_expanded:
            pack.append(
                (
                    oeh,
                    query_seq,
                    args_inner.locarna_params,
                    args_inner.locarna_anchor_length
                )
            )
        pool = Pool(processes=args_inner.threads)
        result = pool.map(locarna_worker, pack)
        pool.close()

    for res in result:
        if res.extension is None:
            analyzed_hits.hits_failed.append(res)
        else:
            analyzed_hits.hits.append(res)

    # this part predicts homology - it is not truly part of repredict
    homology_prediction, homol_seqs, cm_file_rfam_user = infer_homology(
        analyzed_hits=analyzed_hits, args=args_inner, cm_model_file=ih_model, multi_query=multi_query,
        iteration=iteration
    )
    # add homology prediction to the data
    for hit, pred in zip(analyzed_hits.hits, homology_prediction):
        hit.hpred = pred
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
Example #21
def turbofold_ext_nr_fast(all_seqs,
                          nrset,
                          cpu,
                          n,
                          turbofold_params,
                          pkey='Turbo-fast',
                          sha1val='',
                          timeout=None):
    # ambiguous sequences cannot be predicted with turbofold
    # do not predict a sequence twice
    ml.debug(fname())

    msg_short_list = 'Turbo-fast: Number of sequences is less than required.'

    if not len(nrset) == len(
            BA_support.filter_ambiguous_seqs_from_list(nrset)) == len(
                BA_support.non_redundant_seqs(nrset)):
        msgfail = 'Wrong nr set specification.'
        if len(nrset) != len(
                BA_support.filter_ambiguous_seqs_from_list(nrset)):
            msgfail += ' nr set contains seq(s) with ambiguous characters.'
        if len(nrset) != len(BA_support.non_redundant_seqs(nrset)):
            msgfail += ' nr set contains redundant sequence(s).'
        ml.error(msgfail)
        raise AssertionError(msgfail)

    for seq in chain(all_seqs, nrset):
        if 'msgs' not in seq.annotations:
            seq.annotations['msgs'] = []

    list2predict = []
    for seq in all_seqs:
        if seq.annotations.get('ambiguous', False):
            ml.warning('Skipping prediction for {} (ambiguous base)'.format(
                seq.id))
            continue

        if pkey in seq.letter_annotations and seq.annotations.get(
                'sha1', {}).get(pkey) == sha1val:
            # nothing to do, the structure already computed
            continue

        seq_set = _prepare_set_n(seq, nrset, n)

        if len(seq_set) < 2:
            msgfail = "Turbo-fast can't be used with fewer than 2 sequences - {}".format(
                seq.id)
            ml.warning(msgfail)
            if ml.getEffectiveLevel() > 30:
                print(msgfail)
            continue

        if len(seq_set) < n:
            msg_short_list_custom = msg_short_list + ' n={} ({})'.format(
                len(seq_set), n)
            ml.info(msg_short_list_custom + ' ' + seq.id)
            seq.annotations['msgs'].append(msg_short_list_custom)
        list2predict.append((seq_set, turbofold_params, seq.id))

    if cpu == 1:
        pred_list = []
        for oneseqset, tpar, _ in list2predict:
            pred_list.append(run_turbofold(oneseqset, tpar, timeout=timeout))
    else:
        pool = multiprocessing.Pool(processes=cpu)
        pred_list = pool.map(_rt_wrapper, list2predict)
        pool.close()

    # rebuild predicted TurboFold structures
    # - note that a prediction might be empty if TurboFold fails
    out_list = []
    for out, l_in in zip(pred_list, list2predict):
        if isinstance(out, exceptions.SubprocessException):
            seq = next(s for s in all_seqs if s.id == l_in[2])
            seq.annotations['msgs'].append(str(out))
            # do not propagate call output
            # seq.annotations['msgs'].append(out.errors)
            continue
        elif isinstance(out, Exception):
            seq = next(s for s in all_seqs if s.id == l_in[2])
            seq.annotations['msgs'].append(str(out))
            continue

        sel = [o for o in out if o.id == l_in[2]]
        if len(sel) == 1:
            out_list.append(sel[0])

    return out_list
Example #22
    def test_nr(self):
        for seqs_list, check_n in self.test_cases:
            nr = BA_support.non_redundant_seqs(seqs_list)
            self.assertEqual(len(nr), len(check_n), 'lengths do not match')
            self.assertEqual({str(seq.seq) for seq in nr}, check_n)
Example #23
def repredict_structures_for_homol_seqs(
    query,
    seqs2predict_fasta,
    threads=None,
    prediction_method=None,
    pred_method_params=None,
    all_hits_list=None,
    seqs2predict_list=None,
    use_cm_file=None,
):
    """Run RNA structure prediction based on chosen method and parameters.
    """

    default_sim_tr_perc = 90
    default_score_tr = 0.0
    query_max_len_diff = 0.1

    try:
        if 'default' == prediction_method:
            # do nothing
            return None, None, []

        elif 'rfam-Rc' == prediction_method:
            if use_cm_file is None:
                msg = "No CM model. Can't use {}.".format(prediction_method)
                ml.warning(msg)
                return None, None, [msg]
            else:
                structures, exec_time = cmmodel_rnafold_c(
                    seqs2predict_fasta,
                    use_cm_file,
                    threads=threads,
                    params=pred_method_params.get(prediction_method, {}))
                return structures, exec_time, []

        elif 'rfam-centroid' == prediction_method:
            # run cmscan if needed
            # run cmfetch
            # run cmemit -> homologous seqs
            # run centroid_homfold

            method_parameters = pred_method_params.get(prediction_method, {})
            if use_cm_file is None:
                msg = "No CM model. Can't use {}.".format(prediction_method)
                ml.warning(msg)
                return None, None, [msg]
            else:
                cep = method_parameters.get('cmemit', '')
                if '-u' not in cep:
                    cep += ' -u'
                if '-N' not in cep:
                    cep += ' -N {}'.format(method_parameters.get('n_seqs', 10))

                hf_file = run_cmemit(use_cm_file, params=cep)

                structures, exec_time = me_centroid_homfold(
                    seqs2predict_fasta, hf_file, params=method_parameters)

                BA_support.remove_one_file_with_try(hf_file)
                return structures, exec_time, []

        elif 'rfam-sub' == prediction_method:
            if use_cm_file is None:
                msg = "No CM model. Can't use {}.".format(prediction_method)
                ml.warning(msg)
                return None, None, [msg]
            else:
                ref_structure = extract_ref_from_cm(use_cm_file)

                structures, exec_time = rfam_subopt_pred(
                    seqs2predict_fasta,
                    ref_structure,
                    params=pred_method_params.get(prediction_method, None),
                    threads=threads,
                )
                return structures, exec_time, []

        elif 'rnafold' == prediction_method:
            structures, exec_time = rnafold_wrap_for_predict(
                seqs2predict_fasta,
                params=pred_method_params.get(prediction_method,
                                              {}).get('RNAfold', ''))
            return structures, exec_time, []

        elif 'fq-sub' == prediction_method:
            a, qf = mkstemp(prefix='rba_', suffix='_55', dir=CONFIG.tmpdir)
            with os.fdopen(a, 'w') as fd:
                fd.write('>query\n{}\n'.format(str(query.seq)))

            structures, exec_time = subopt_fold_query(
                seqs2predict_fasta,
                qf,
                params=pred_method_params.get(prediction_method, None),
                threads=threads)
            BA_support.remove_one_file_with_try(qf)
            return structures, exec_time, []

        elif 'C-A-sub' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, homologous_seqs, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file

            f, homologous_sequence_file = mkstemp(prefix='rba_',
                                                  suffix='_64',
                                                  dir=CONFIG.tmpdir)
            with os.fdopen(f, 'w') as fh:
                SeqIO.write(homologous_seqs, fh, 'fasta')

            structures, exec_time = subopt_fold_alifold(
                seqs2predict_fasta,
                homologous_sequence_file,
                aligner='clustalo',
                params=method_parameters,
                threads=threads)
            BA_support.remove_one_file_with_try(homologous_sequence_file)
            del homologous_sequence_file
            del homologous_seqs
            return structures, exec_time, msgs

        elif 'M-A-sub' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, homologous_seqs, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file

            f, homologous_sequence_file = mkstemp(prefix='rba_',
                                                  suffix='_65',
                                                  dir=CONFIG.tmpdir)
            with os.fdopen(f, 'w') as fh:
                SeqIO.write(homologous_seqs, fh, 'fasta')

            structures, exec_time = subopt_fold_alifold(
                seqs2predict_fasta,
                homologous_sequence_file,
                aligner='muscle',
                params=method_parameters,
                threads=threads,
            )

            BA_support.remove_one_file_with_try(homologous_sequence_file)
            del homologous_sequence_file
            del homologous_seqs
            return structures, exec_time, msgs

        elif 'C-A-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='refold_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='clustalo')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'M-A-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )
            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='refold_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='muscle')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'C-A-U-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )
            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='conserved_ss_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='clustalo')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'M-A-U-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )
            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='conserved_ss_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='muscle')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'centroid' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_homolog_hits_file_MSA_unsafe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            raw_structures, exec_time = me_centroid_homfold(
                seqs2predict_fasta,
                nr_homo_hits_file,
                params=method_parameters)

            # check noncanonical basepairs; method_parameters already
            # defaults to {}, so .get() covers the missing-settings case
            allow_nc = method_parameters.get('allow_noncanonical', False)
            allow_lp = method_parameters.get('allow_lonely_pairs', False)
            if not allow_nc:
                for seq in raw_structures:
                    repstr = find_nc_and_remove(
                        str(seq.seq), structure=seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            # check lonely basepairs
            if not allow_lp:
                for seq in raw_structures:
                    repstr = check_lonely_bp(seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return raw_structures, exec_time, msgs

        elif 'centroid-fast' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})
            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            raw_structures, exec_time = centroid_homfold_fast(
                all_seqs=all_hits_list,
                query=query,
                all_seqs_fasta=seqs2predict_fasta,
                n=method_parameters.get('max_seqs_in_prediction', 10),
                centroid_homfold_params=method_parameters,
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff))

            # check noncanonical basepairs; method_parameters already
            # defaults to {}, so .get() covers the missing-settings case
            allow_nc = method_parameters.get('allow_noncanonical', False)
            allow_lp = method_parameters.get('allow_lonely_pairs', False)
            if not allow_nc:
                for seq in raw_structures:
                    repstr = find_nc_and_remove(
                        str(seq.seq), structure=seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            # check lonely basepairs
            if not allow_lp:
                for seq in raw_structures:
                    repstr = check_lonely_bp(seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            return raw_structures, exec_time, []

        elif 'TurboFold' == prediction_method:
            # sim_threshold_percent is deliberately 100: only identical
            # sequences are removed before TurboFold prediction, and each
            # redundant sequence later receives the structure predicted for
            # its representative
            all_hits_filtered = BA_support.filter_ambiguous_seqs_from_list(
                all_hits_list)
            seqs2predict_filtered = BA_support.filter_ambiguous_seqs_from_list(
                seqs2predict_list)
            if len(seqs2predict_list) != len(seqs2predict_filtered):
                ml.warning(
                    'Some sequences contain ambiguous bases - they will not be predicted.'
                )

            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_homolog_hits_file_MSA_unsafe(
                all_hits=all_hits_filtered,
                query=query,
                sim_threshold_percent=100,
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            with open(nr_homo_hits_file, 'r') as nrf:
                nr_homo_hits = list(SeqIO.parse(nrf, format='fasta'))

            nh = sha1()
            nh.update(str(sorted(method_parameters.items())).encode())
            nh_str = nh.hexdigest()
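            # the digest of the sorted parameter items identifies this exact
            # parameter set; together with pkey it is presumably used to key
            # checkpointed TurboFold results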

            structures_t, exec_time = turbofold_with_homologous(
                all_sequences=seqs2predict_filtered,
                nr_homologous=nr_homo_hits,
                params=method_parameters.get('TurboFold', {}),
                n=method_parameters.get('max_seqs_in_prediction', 3),
                cpu=threads,
                pkey=prediction_method,
                sha1val=nh_str,
            )

            structures = BA_support.rebuild_structures_output_from_pred(
                seqs2predict_list, structures_t)
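            # the rebuild above maps the predictions back onto the full input
            # list, so sequences skipped as redundant apparently receive the
            # structure predicted for their identical representative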

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del structures_t
            del nr_homo_hits
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'Turbo-fast' == prediction_method:
            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            nh = sha1()
            nh.update(
                str(
                    sorted(
                        pred_method_params.get(prediction_method,
                                               {}).items())).encode())
            nh_str = nh.hexdigest()

            structures_t, exec_time = turbofold_fast(
                all_seqs=all_hits_list,
                seqs2predict=seqs2predict_list,
                query=query,
                cpu=threads,
                n=pred_method_params.get(prediction_method,
                                         {}).get('max_seqs_in_prediction', 3),
                turbofold_params=pred_method_params.get(prediction_method,
                                                        {}).get(
                                                            'TurboFold', {}),
                len_diff=pred_method_params.get(prediction_method, {}).get(
                    'query_max_len_diff', query_max_len_diff),
                pkey=prediction_method,
                sha1val=nh_str,
            )

            structures = BA_support.rebuild_structures_output_from_pred(
                seqs2predict_list, structures_t)

            del structures_t
            return structures, exec_time, []

    except exceptions.NoHomologousSequenceException:
        msg = nonhomseqwarn(prediction_method)
        return None, None, [msg]
    except exceptions.AmbiguousQuerySequenceException:
        msgfail = "Query sequence contains ambiguous characters. Can't use {}.".format(
            prediction_method)
        ml.warning(msgfail)
        return None, None, [msgfail]
    except exceptions.SubprocessException as e:
        msg = "{} can't be used. Error message follows: {} \n{}".format(
            prediction_method, str(e), e.errors)
        ml.error(msg)
        return None, None, [str(e)]
    except Exception as e:
        ml.error("{} can't be used. Error message follows: \n{}.".format(
            prediction_method, str(e)))
        return None, None, [str(e)]

    assert False, "Should not reach here (bad prediction method name)."
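
# Note on the return contract: each branch above resolves to the same triple
# (structures or None, exec_time or None, list of messages), so callers can
# treat every prediction method uniformly. A minimal consumer sketch follows;
# 'predict_structures' and its arguments are hypothetical stand-ins for the
# enclosing dispatcher, whose definition lies above this excerpt.
#
#     structures, exec_time, msgs = predict_structures(...)
#     if structures is None:
#         for m in msgs:
#             ml.warning(m)
#     else:
#         ml.info('{} structures predicted'.format(len(structures)))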
Example #24
0
def locarna_worker(pack):
    ml.debug(fname())
    one_expanded_hit, query_seq, locarna_params, anchor_length = pack

    locarna_file1 = locarna_file2 = loc_out_file = None

    try:
        # read the BLAST-aligned segment and derive anchors from it: LocARNA
        # is run in local mode on the query sequence and the extended hit
        # sequence, with the BLAST-aligned segment serving as the anchor
        blast_entry = one_expanded_hit.annotations['blast'][1]

        anchors = LocarnaAnchor(
            to_rna(blast_entry.query),
            blast_entry.match,
            to_rna(blast_entry.sbjct),
            anchor_length=anchor_length
        )

        if anchors.too_many_anchors:
            ml.info('Too many anchors for {}. Can handle up to 520 distinct anchors.'.format(one_expanded_hit.id))
        # the extracted sequence serves as the query here

        # access the locarna aligner directly
        fd1, locarna_file1 = mkstemp(prefix='rba_', suffix='_20', dir=CONFIG.tmpdir)
        with os.fdopen(fd1, 'w') as fp_locarna_file_1:
            ql1, ql2 = anchors.anchor_whole_seq(str(query_seq), 'query')
            write_clustal_like_file_with_anchors(fp_locarna_file_1,
                                                 'query',
                                                 str(query_seq),
                                                 (
                                                     ('#A1', ql1.split()[0]),
                                                     ('#A2', ql2.split()[0])
                                                 ))
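        # the '#A1' / '#A2' rows are anchor-annotation lines in the
        # clustal-like input: each column of the two rows spells a
        # two-character anchor name, which is apparently where the
        # 520-distinct-anchor limit mentioned above comes from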

        fd2, locarna_file2 = mkstemp(prefix='rba_', suffix='_21', dir=CONFIG.tmpdir)
        with os.fdopen(fd2, 'w') as fp_locarna_file_2:
            sl1, sl2 = anchors.anchor_whole_seq(str(one_expanded_hit.seq), 'subject')
            write_clustal_like_file_with_anchors(fp_locarna_file_2,
                                                 one_expanded_hit.id,
                                                 str(one_expanded_hit.seq),
                                                 (
                                                     ('#A1', sl1.split()[0]),
                                                     ('#A2', sl2.split()[0])
                                                 ))

        loc_out_file = run_locarna(
            locarna_file1,
            locarna_file2,
            locarna_params
        )

        # read locarna alignment
        with open(loc_out_file, 'r') as f:
            locarna_alig = parse_locarna_alignment(f)

        if len(locarna_alig) != 2:
            raise exceptions.SubseqMatchError('There must be 2 sequences in Locarna alignment.')

        loc_rep = create_report_object_from_locarna(one_expanded_hit, locarna_alig)

        return loc_rep
    except exceptions.LocarnaException as e:
        one_expanded_hit.annotations['msgs'] = [str(e), e.errors]
        empty_hit = BA_support.Subsequences(one_expanded_hit)
        return empty_hit
    except (exceptions.SubseqMatchError, exceptions.ParsingError) as e:
        one_expanded_hit.annotations['msgs'] = [str(e)]
        empty_hit = BA_support.Subsequences(one_expanded_hit)
        return empty_hit
    except (TypeError, AttributeError, FileNotFoundError) as e:
        one_expanded_hit.annotations['msgs'] = [str(e)]
        empty_hit = BA_support.Subsequences(one_expanded_hit)
        return empty_hit
    finally:
        for f in [locarna_file1, locarna_file2, loc_out_file]:
            if f is not None:
                BA_support.remove_one_file_with_try(f)
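
# Usage sketch (hypothetical): locarna_worker takes a single packed tuple,
# which makes it convenient for a process pool. The names 'expanded_hits',
# 'query_seq', 'locarna_params' and 'anchor_length' below are illustrative
# assumptions, not definitions from the example above.
#
#     from multiprocessing import Pool
#     packs = [(hit, query_seq, locarna_params, anchor_length)
#              for hit in expanded_hits]
#     with Pool(processes=4) as pool:
#         reports = pool.map(locarna_worker, packs)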
Example #25
0
def lunch_computation(args_inner, shared_list=None):
    ml.debug(fname())
    if not shared_list:
        shared_list = []

    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    p_blast = BA_support.blast_in(args_inner.blast_in, b=args_inner.b_type)
    query_seqs = list(SeqIO.parse(args_inner.blast_query, 'fasta'))

    if len(p_blast) != len(query_seqs):
        ml.error(
            'Number of query sequences in provided BLAST output file ({}) does not match number of query sequences'
            ' in query FASTA file ({}).'.format(len(p_blast), len(query_seqs)))
        sys.exit(1)

    # check that the BLAST output does not contain unexpected sequence characters
    validate_args.check_blast(p_blast)

    # create list of correct length if needed
    all_saved_data = [None] * len(query_seqs)
    saved_file = '{}.r-{}'.format(args_inner.blast_in, args_inner.sha1[:10])
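    # the backup file is expected to exist already (presumably created
    # earlier in the pipeline) and to contain JSON 'null' on a fresh run;
    # 'r+' lets it be read and then rewritten in place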
    with open(saved_file, 'r+') as f:
        _saved = json.load(f)
        if _saved is None:
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f)
        else:
            msg = "Loading backup data."
            print('STATUS: ' + msg)
            ml.info(msg + ' file: ' + saved_file)
            all_saved_data = _saved

            for saved_data in all_saved_data:
                # we can have partially computed data
                if saved_data is None:
                    continue
                if saved_data['args']['sha1'] != args_inner.sha1:
                    msg = "Input argument hash does not match the saved argument hash. "
                    if saved_data['args']['sha1'][:10] == args_inner.sha1[:10]:
                        msg += "This is because of truncating hashes to first 10 characters. "
                        msg += "Please remove the '{}' file.".format(
                            saved_file)
                        ml.error(msg)
                        sys.exit(1)
                    else:
                        msg += "Please remove the '{}' file.".format(
                            saved_file)
                        ml.error(msg)
                        sys.exit(1)

    multi_query = len(p_blast) > 1

    # this is done for each query
    ml_out_line = []
    all_analyzed = []
    for iteration, (bhp, query, saved_data) in enumerate(
            zip(p_blast, query_seqs, all_saved_data)):
        if saved_data is None:
            print('STATUS: processing query: {}'.format(query.id))
            validate_args.verify_query_blast(blast=bhp, query=query)

            analyzed_hits = BlastSearchRecompute(args_inner, query, iteration)
            analyzed_hits.multi_query = multi_query

            # build the CM model first; this allows failing fast if Rfam was
            # selected but no matching model is found
            ih_model, analyzed_hits = find_and_extract_cm_model(
                args_inner, analyzed_hits)

            # select all
            all_blast_hits = BA_support.blast_hsps2list(bhp)

            if len(all_blast_hits) == 0:
                ml.error('No hits found in {} - {}. Nothing to do.'.format(
                    args_inner.blast_in, bhp.query))
                continue

            # filter if needed
            if args_inner.filter_by_eval is not None:
                tmp = filter_by_eval(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_eval)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue
                all_blast_hits = tmp
            elif args_inner.filter_by_bitscore is not None:
                tmp = filter_by_bits(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_bitscore)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue
                all_blast_hits = tmp

            all_short = all_blast_hits

            # now this is different for each mode
            if args_inner.mode == 'simple':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_simple_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'locarna':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_locarna_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'meta':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_meta_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            else:
                raise ValueError(
                    'Unknown option - should be caught by argparse.')

            if len(analyzed_hits.hits) == 0:
                ml.error(
                    "Extension failed for all sequences. Please see the error message. You can also try '--mode simple'."
                )
                sys.exit(1)

            analyzed_hits.copy_hits()

            with open(saved_file, 'r+') as f:
                all_saved_data = json.load(f)
                all_saved_data[iteration] = blastsearchrecompute2dict(
                    analyzed_hits)
                f.seek(0)
                f.truncate()
                json.dump(all_saved_data, f, indent=2)
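                # checkpoint: persisting per-query results lets a rerun
                # resume from the backup instead of recomputing everything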

        else:
            print(
                'STATUS: extended sequences loaded from backup file for query {}'
                .format(query.id))
            analyzed_hits = blastsearchrecomputefromdict(saved_data)

            # overwrite the saved args with the current ones; this updates
            # the prediction methods in use and other non-essential settings
            analyzed_hits.args = args_inner

            if analyzed_hits.args.cm_file:
                cm_file_rfam_user = analyzed_hits.args.cm_file
            else:
                cm_file_rfam_user = None

        all_analyzed.append(analyzed_hits)

        # write all hits to fasta
        fda, all_hits_fasta = mkstemp(prefix='rba_',
                                      suffix='_22',
                                      dir=CONFIG.tmpdir)
        os.close(fda)
        analyzed_hits.write_results_fasta(all_hits_fasta)

        out_line = []
        # multiple prediction params
        if args_inner.dev_pred:
            dp_list = []
            # accommodate multiple dev pred outputs; the filename extension
            # is trimmed as a suffix here (str.strip() would remove a
            # character set instead)
            dpfile = None
            if getattr(args_inner, 'dump', False):
                dpfile = args_inner.dump.rsplit('dump', 1)[0]
            if getattr(args_inner, 'pandas_dump', False):
                dpfile = args_inner.pandas_dump.rsplit('pandas_dump', 1)[0]
            if getattr(args_inner, 'json', False):
                dpfile = args_inner.json.rsplit('json', 1)[0]

            # optimization: fetch the Rfam CM file only once and reuse it
            # for every prediction-method variant below
            if cm_file_rfam_user is None and 'rfam' in ''.join(
                    args_inner.prediction_method):
                best_model = get_cm_model(args_inner.blast_query,
                                          threads=args_inner.threads)
                rfam = RfamInfo()
                cm_file_rfam_user = run_cmfetch(rfam.file_path, best_model)

            for method in args_inner.prediction_method:
                # cycle through the prediction method settings
                # and get the set of params for each prediction
                selected_pred_params = [
                    kk for kk in args_inner.pred_params if method in kk
                ]
                shuffle(selected_pred_params)
                # for method_params in args_inner.pred_params:
                for i, method_params in enumerate(selected_pred_params):
                    ah = deepcopy(analyzed_hits)

                    random_flag = BA_support.generate_random_name(
                        8, shared_list)
                    shared_list.append(random_flag)

                    pname = re.sub(' ', '', str(method))
                    flag = '|pred_params|' + random_flag

                    # rebuild the args with only the actually used prediction settings
                    ah.args.prediction_method = method
                    ah.args.pred_params = method_params

                    if getattr(args_inner, 'dump', False):
                        spa = args_inner.dump.split('.')
                        ah.args.dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pandas_dump', False):
                        spa = args_inner.pandas_dump.split('.')
                        ah.args.pandas_dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pdf_out', False):
                        spa = args_inner.pdf_out.split('.')
                        ah.args.pdf_out = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'json', False):
                        spa = args_inner.json.split('.')
                        ah.args.json = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]

                    wrapped_ending_with_prediction(
                        args_inner=ah.args,
                        analyzed_hits=ah,
                        pred_method=method,
                        method_params=method_params,
                        used_cm_file=cm_file_rfam_user,
                        multi_query=multi_query,
                        iteration=iteration,
                    )
                    success = True
                    out_line.append(to_tab_delim_line_simple(ah.args))

                    dp_list.append((i, method_params, success, flag, pname,
                                    random_flag, args_inner.pred_params))

            if dpfile is not None:
                with open(dpfile + 'devPredRep', 'wb') as devf:
                    pickle.dump(dp_list, devf)
        else:
            wrapped_ending_with_prediction(
                args_inner=args_inner,
                analyzed_hits=analyzed_hits,
                used_cm_file=cm_file_rfam_user,
                multi_query=multi_query,
                iteration=iteration,
            )
            out_line.append(to_tab_delim_line_simple(args_inner))

        ml_out_line.append('\n'.join(out_line))

        if cm_file_rfam_user is not None and os.path.exists(cm_file_rfam_user):
            BA_support.remove_one_file_with_try(cm_file_rfam_user)

        BA_support.remove_one_file_with_try(all_hits_fasta)
    return '\n'.join(ml_out_line), all_analyzed