Esempio n. 1
0
def _prepare_set_n(seq, nr_seqs, n):
    """Assemble a non-redundant set of ``n`` sequences that includes ``seq``.

    ``seq`` is placed in front of progressively longer prefixes of
    ``nr_seqs`` (the shortest prefix tried holds ``n - 1`` entries, the
    longest holds all but the last entry) and every candidate list is reduced
    with ``BA_support.non_redundant_seqs``. The first reduced list that holds
    exactly ``n`` sequences is returned.

    When no prefix produces ``n`` sequences - including the case where
    ``nr_seqs`` is too short for any prefix to be tried - the result of
    reducing ``seq`` plus the whole of ``nr_seqs`` is returned, regardless of
    its size.
    """
    # Prefix lengths n-1 .. len(nr_seqs)-1; the range is empty when nr_seqs
    # has fewer than n entries, which sends us straight to the fallback.
    for prefix_len in range(n - 1, len(nr_seqs)):
        candidate = BA_support.non_redundant_seqs([seq] + nr_seqs[:prefix_len])
        if len(candidate) == n:
            return candidate

    # fallback: use everything that is available
    return BA_support.non_redundant_seqs([seq] + nr_seqs)
 def test_nr(self):
     """Check ``BA_support.non_redundant_seqs`` against every stored case.

     Each entry of ``self.test_cases`` pairs an input list of records with
     the set of sequence strings expected to remain after reduction.
     """
     for records, expected in self.test_cases:
         reduced = BA_support.non_redundant_seqs(records)
         # the number of kept records must equal the number of expected sequences
         self.assertEqual(len(reduced), len(expected), 'length do not match')
         # and the kept sequences themselves must be exactly the expected ones
         observed = {str(record.seq) for record in reduced}
         self.assertEqual(observed, expected)
Esempio n. 3
0
def turbofold_ext_nr_fast(all_seqs,
                          nrset,
                          cpu,
                          n,
                          turbofold_params,
                          pkey='Turbo-fast',
                          sha1val='',
                          timeout=None):
    """Run TurboFold for each sequence of ``all_seqs`` against a reference set.

    For every sequence that is neither flagged ambiguous nor already
    predicted, a set of up to ``n`` sequences is built with
    ``_prepare_set_n(seq, nrset, n)`` and handed to TurboFold.

    Args:
        all_seqs: records to predict; each needs ``id``, ``annotations`` and
            ``letter_annotations``. ``annotations['msgs']`` is created when
            missing and receives diagnostic messages.
        nrset: reference records; must be free of ambiguous sequences and of
            redundancy as judged by the ``BA_support`` helpers.
        cpu: ``1`` runs predictions in this process, any other value is used
            as the size of a ``multiprocessing.Pool``.
        n: requested size of the per-sequence prediction set.
        turbofold_params: passed through unchanged to the TurboFold runner.
        pkey: key looked up in ``letter_annotations`` and in
            ``annotations['sha1']`` to detect an existing prediction.
        sha1val: value ``annotations['sha1'][pkey]`` must equal for an
            existing prediction to be reused.
        timeout: forwarded to ``run_turbofold`` when ``cpu == 1``.

    Returns:
        list with one predicted record per successfully processed sequence;
        a prediction is kept only when exactly one output record carries the
        id of the sequence it was requested for.

    Raises:
        AssertionError: ``nrset`` contains ambiguous or redundant sequences.
    """
    # ambiguous sequences cannot be predicted with TurboFold
    # do not predict a sequence twice
    ml.debug(fname())

    msg_short_list = 'Turbo-fast: Number of sequences is less then required.'

    # Validate the reference set; each helper is evaluated once and the
    # outcome is reused for both the check and the error message.
    nrset_len = len(nrset)
    has_ambiguous = nrset_len != len(
        BA_support.filter_ambiguous_seqs_from_list(nrset))
    has_redundant = nrset_len != len(BA_support.non_redundant_seqs(nrset))
    if has_ambiguous or has_redundant:
        msgfail = 'Wrong nr set specification.'
        if has_ambiguous:
            msgfail += ' nr set contains seq(s) with ambiguous character.'
        if has_redundant:
            msgfail += ' nr set contains redundant sequence(s).'
        ml.error(msgfail)
        raise AssertionError(msgfail)

    for seq in chain(all_seqs, nrset):
        seq.annotations.setdefault('msgs', [])

    list2predict = []
    # records queued for prediction, kept parallel to list2predict so that a
    # failure is reported on the very record that was submitted
    queued_seqs = []
    for seq in all_seqs:
        if seq.annotations.get('ambiguous', False):
            ml.warning('Skipping prediction for {} (ambiguous base)'.format(
                seq.id))
            continue

        if pkey in seq.letter_annotations and seq.annotations.get(
                'sha1', {}).get(pkey) == sha1val:
            # nothing to do, the structure is already computed
            continue

        seq_set = _prepare_set_n(seq, nrset, n)

        if len(seq_set) < 2:
            msgfail = "Turbo-fast can't be used with less then 2 sequences - {}".format(
                seq.id)
            ml.warning(msgfail)
            # echo to stdout when the logger would not show a warning
            if ml.getEffectiveLevel() > 30:
                print(msgfail)
            continue

        if len(seq_set) < n:
            # predict anyway, but leave a note that the set is smaller than asked
            msg_short_list_custom = msg_short_list + ' n={} ({})'.format(
                len(seq_set), n)
            ml.info(msg_short_list_custom + ' ' + seq.id)
            seq.annotations['msgs'].append(msg_short_list_custom)
        list2predict.append((seq_set, turbofold_params, seq.id))
        queued_seqs.append(seq)

    if cpu == 1:
        pred_list = [
            run_turbofold(oneseqset, tpar, timeout=timeout)
            for oneseqset, tpar, _ in list2predict
        ]
    else:
        # NOTE(review): ``timeout`` is not part of the tuples given to
        # ``_rt_wrapper`` - confirm whether the pooled path should honour it.
        pool = multiprocessing.Pool(processes=cpu)
        try:
            pred_list = pool.map(_rt_wrapper, list2predict)
        finally:
            # always release the worker processes, even when map() raises
            pool.close()
            pool.join()

    # rebuild predicted TurboFold structures
    # - take care that prediction might be empty if TurboFold fails
    out_list = []
    for out, l_in, seq in zip(pred_list, list2predict, queued_seqs):
        if isinstance(out, (exceptions.SubprocessException, Exception)):
            # keep only the message; the call output is not propagated
            seq.annotations['msgs'].append(str(out))
            continue

        sel = [o for o in out if o.id == l_in[2]]
        if len(sel) == 1:
            out_list.append(sel[0])

    return out_list
Esempio n. 4
0
def _trusted_hits_selection_wrapper(all_hits_,
                                    query_,
                                    cmscore_tr_,
                                    cm_threshold_percent_,
                                    len_diff_=0.1):
    """
    Select trusted (homologous) hits and compute their pairwise similarities.

    Hits are selected from ``all_hits_`` by a score threshold: either
    ``cm_threshold_percent_`` percent of ``query_.annotations['cmstat'].bit_sc``
    or, when ``cm_threshold_percent_`` is None, the absolute ``cmscore_tr_``.
    The query is prepended, the list is made non-redundant, restricted to
    sequences whose length lies strictly within ``len_diff_`` (a fraction) of
    the query length, aligned with MUSCLE, and an identity distance matrix is
    converted to similarities in %.

    Returns a 3-tuple ``(similarity_table, sequences, msgs)``:
        - on success: the similarity table, the query followed by all
          selected hits, and the collected messages
        - when no hit is selected, when all selected hits equal the query, or
          when fewer than two sequences pass the length filter:
          ``np.empty(0)``, ``[query_]`` and the messages explaining why
    """
    ml.debug(fname())
    msgs = []

    def _fallback_to_query(msg):
        # Record why no reference set could be built, log it, echo it to
        # stdout when the logger level is above INFO, and build the
        # "query only" result.
        msgs.append(msg)
        ml.info(msg)
        if ml.level > 20:
            print(msg)
        return np.empty(0), [query_], msgs

    # trusted sequence selection
    # ========================================================
    # the two thresholds are mutually exclusive
    assert (cmscore_tr_ == 0) or cm_threshold_percent_ is None

    score = _extract_cmscore_from_hom_seqs(all_hits_)

    if cm_threshold_percent_ is not None:
        selection_threshold = cm_threshold_percent_ * query_.annotations[
            'cmstat'].bit_sc / 100
    else:
        selection_threshold = cmscore_tr_

    pred = infer_hits_cm(score, tr=selection_threshold)
    trusted_seqs_ = [i for i, j in zip(all_hits_, pred) if j]

    if len(trusted_seqs_) == 0:
        return _fallback_to_query(
            'STATUS: No estimated full-length sequences from BLAST output '
            'selected as reference for structure prediction.\n'
            ' Using query sequence as reference.')

    # add query to trusted sequences
    trusted_seqs_query = [query_] + trusted_seqs_

    # make nr list of sequences -> faster alignment
    # better selection
    nr_trusted_seqs_query = BA_support.non_redundant_seqs(trusted_seqs_query)

    # check if the homologous sequence is not exact match as query
    #  (ie taking non redundant set would be only one sequence)
    if len(nr_trusted_seqs_query) == 1:
        return _fallback_to_query(
            'STATUS: All sequences selected as reference are exactly same as query sequence.'
        )

    # select only sequences in some predefined length range to query
    # this is needed for longish ncRNAs
    #   tolerate 10 % length difference?
    ref_len = len(query_)
    min_len = ref_len * (1 - len_diff_)
    max_len = ref_len * (1 + len_diff_)
    nr_len_selected_trusted = [
        seq for seq in nr_trusted_seqs_query if min_len < len(seq) < max_len
    ]

    # An alignment needs at least two sequences. One survivor means only a
    # single sequence is left after filtering; zero survivors happen when
    # ``len_diff_`` is not positive (the bounds are strict).
    if len(nr_len_selected_trusted) < 2:
        return _fallback_to_query(
            'No sequence satisfy the length difference condition ({}: {}-{})'.format(
                len_diff_, min_len, max_len))

    # sanitize seq names (muscle has issues with too long names)
    san_hom_seqs, san_dict = BA_support.sanitize_fasta_names_in_seqrec_list(
        nr_len_selected_trusted)

    c_fd, trusted_sequence_file_ = mkstemp(prefix='rba_',
                                           suffix='_60',
                                           dir=CONFIG.tmpdir)
    # files to delete once done, whether or not the steps below succeed
    tmp_files = [trusted_sequence_file_]
    try:
        with os.fdopen(c_fd, 'w') as f:
            SeqIO.write(san_hom_seqs, f, 'fasta')

        align_file = BA_support.run_muscle(trusted_sequence_file_,
                                           reorder=True)
        tmp_files.insert(0, align_file)
        alig = AlignIO.read(align_file, format='clustal')
        distance_calc = DistanceCalculator(model='identity')
        dist_mat = distance_calc.get_distance(alig)
        # rebuild index from sanitized
        orig_index = [san_dict[i] for i in dist_mat.names]
        dist_mat_pd = pandas.DataFrame.from_records(dist_mat.matrix,
                                                    index=orig_index)
        # identity distance (0..1) -> similarity in %
        dist_table_ = (1 - dist_mat_pd.values) * 100
    finally:
        BA_support.remove_files_with_try(tmp_files)

    # NOTE(review): the table is computed over the non-redundant,
    # length-filtered sequences in alignment order, while the returned list
    # is the unfiltered ``trusted_seqs_query`` - confirm callers do not index
    # one by the other.
    return dist_table_, trusted_seqs_query, msgs