# Example #1
def create_report_object_from_locarna(exp_hit, locarna_alig):
    """
    Create the object which will be appended to the BlastSearchRecompute class.
    This needs to be a Subsequences object.

    :param exp_hit: expanded BLAST hit (SeqRecord-like) to report on
    :param locarna_alig: Locarna alignment holding the query and the hit
    :return: BA_support.Subsequences with the aligned, annotated extension
    :raises exceptions.SubseqMatchError: if the id 'query' is not unique in
        the alignment, or the aligned subsequence is not found in the parent.
    """
    ml.debug(fname())
    # chop alignment by seq
    query_ind = [i for i, j in enumerate(locarna_alig) if j.id == 'query']
    if len(query_ind) != 1:
        raise exceptions.SubseqMatchError('Got multiple hits with id "query" in the Locarna alignment.')
    trimmed_locarna_alig = trim_alignment_by_sequence(
        locarna_alig,
        str(locarna_alig[query_ind[0]].seq),
        structure_annotation='SS_cons'
    )

    aligned_subsequence = BA_support.select_analyzed_aligned_hit(trimmed_locarna_alig, exp_hit.id)

    # add annotations from exp hit
    # NOTE(review): this aliases exp_hit.annotations (no copy), so the
    # .update() below also mutates exp_hit.annotations — confirm intended.
    aligned_subsequence.annotations = exp_hit.annotations
    aligned_subsequence.name = exp_hit.name

    # also add annotations from locarna, mainly score
    aligned_subsequence.annotations.update(locarna_alig.annotations)

    # get the structure
    # by refold
    refold_structures = refold_stockholm(trimmed_locarna_alig, trimmed_locarna_alig.column_annotations['SS_cons'])

    # select refold structure for my seq
    seq_refold_structure = _select_refold_structure(refold_structures, exp_hit.id)

    aligned_subsequence.letter_annotations['ss0'] = seq_refold_structure.letter_annotations['ss0']
    aligned_subsequence.annotations['sss'] = ['ss0']

    # prepare seq_record for subsequences
    aligned_subsequence.description = ''
    hit = BA_support.Subsequences(exp_hit)

    hit.extension = aligned_subsequence

    # find the matching sequence; escape the subsequence so that any regex
    # metacharacter in it is matched literally, not treated as a pattern
    pos_match = re.search(re.escape(str(aligned_subsequence.seq)), str(exp_hit.seq), flags=re.IGNORECASE)
    if not pos_match:
        raise exceptions.SubseqMatchError(
            'Aligned portion of subject sequence in Locarna alignment was not found in parent sequence.'
        )

    hit.best_start, hit.best_end = compute_true_location_locarna(hit, pos_match)

    return hit
def create_blast_only_report_object(exp_hit, query_len):
    """Build a Subsequences report object from a BLAST hit alone (no realignment).

    :param exp_hit: expanded BLAST hit (SeqRecord-like)
    :param query_len: length of the full query sequence
    :return: BA_support.Subsequences carrying a copy of the hit with a
        placeholder (all-unpaired) secondary structure
    :raises exceptions.UnknownStrand: when HSP start and end coincide
    """
    # the Subsequences container keeps the original hit as its source
    #  here the object source shadows the final hit
    report = BA_support.Subsequences(exp_hit)

    # work on an independent copy of the hit record
    extension = deepcopy(exp_hit)
    annot = extension.annotations
    trimmed_flags = (
        annot['trimmed_ss'],
        annot['trimmed_se'],
        annot['trimmed_es'],
        annot['trimmed_ee'],
    )

    # placeholder dot-bracket structure: everything unpaired
    extension.letter_annotations['ss0'] = '.' * len(extension.seq)
    extension.annotations['sss'] = ['ss0']
    extension.description = ''

    report.extension = extension

    hsp = extension.annotations['blast'][1]

    if hsp.sbjct_start < hsp.sbjct_end:
        # plus strand: extend subject coordinates by the unaligned query flanks
        ext_start = hsp.sbjct_start - hsp.query_start + 1
        ext_end = hsp.sbjct_end + (query_len - hsp.query_end)
    elif hsp.sbjct_end < hsp.sbjct_start:
        # minus strand: the flanks are applied in swapped order
        ext_start = hsp.sbjct_end - (query_len - hsp.query_end)
        ext_end = hsp.sbjct_start + hsp.query_start - 1
    else:
        raise exceptions.UnknownStrand(
            "Can't determine HSP strand (sbjct_start appears equal to sbjct_end)"
        )

    # if whole subject sequence too short, this assertion will fail
    if any(trimmed_flags):
        ml.info(
            'STATUS: Skipping sequence check ({}) - subject sequence too short.'.format(extension.id)
        )
    else:
        assert len(extension.seq) == abs(ext_start - ext_end) + 1
        assert ext_start == extension.annotations['extended_start']
        assert ext_end == extension.annotations['extended_end']

    report.best_start, report.best_end = compute_true_location_se(report, query_len)

    return report
def extend_simple_core(analyzed_hits, query, args_inner, all_short,
                       multi_query, iteration, ih_model):
    """Extend short BLAST hits with the 'simple' strategy and infer homology.

    Hits are expanded from the configured database, trimmed, converted to
    RNA, wrapped into Subsequences report objects, and then homology is
    predicted for the successfully extended hits.

    :raises exceptions.IncorrectDatabaseChoice: for an unsupported db_type
    """
    # the extra here is given "pro forma" the sequence is extended exactly by lenghts of unaligned portions of query
    shared_kwargs = dict(
        extra=0,
        blast_regexp=args_inner.blast_regexp,
        skip_missing=args_inner.skip_missing,
        msgs=analyzed_hits.msgs,
    )
    if args_inner.db_type == "blastdb":
        shorts_expanded, _ = rna_blast_analyze.BR_core.extend_hits.expand_hits(
            all_short,
            args_inner.blast_db,
            len(query),
            **shared_kwargs
        )
    elif args_inner.db_type in ("fasta", "gb", "server", "entrez"):
        shorts_expanded, _ = rna_blast_analyze.BR_core.extend_hits.expand_hits_from_fasta(
            all_short,
            args_inner.blast_db,
            len(query),
            format=args_inner.db_type,
            entrez_email=args_inner.entrez,
            blast_input_file=args_inner.blast_in,
            **shared_kwargs
        )
    else:
        raise exceptions.IncorrectDatabaseChoice()

    # check, if blast hits are non - overlapping, if so, add the overlapping hit info to the longer hit
    # reflect this in user output
    # shorts_expanded = merge_blast_hits(shorts_expanded)

    shorts_expanded = trim_before(shorts_expanded)
    shorts_expanded = BA_support.rc_hits_2_rna(shorts_expanded)

    query_seq = query.seq.transcribe()

    # blast only extension; failed hits are recorded with their error message
    for exp_hit in shorts_expanded:
        try:
            analyzed_hits.hits.append(
                create_blast_only_report_object(exp_hit, len(query_seq))
            )
        except (AssertionError, exceptions.UnknownStrand) as e:
            # expected failure modes: length-check asserts and undecidable strand
            exp_hit.annotations['msgs'] += [str(e)]
            analyzed_hits.hits_failed.append(BA_support.Subsequences(exp_hit))
        except Exception as e:
            ml.error("Unexpected error when extending with 'simple'.")
            exp_hit.annotations['msgs'] += [str(e)]
            analyzed_hits.hits_failed.append(BA_support.Subsequences(exp_hit))

    if not analyzed_hits.hits:
        ml.error(
            "Extension failed for all sequences. Please see the error message. You can also try '--mode locarna'."
        )
        sys.exit(1)

    # assign Locarna score to None as it is not directly accessible from mlocarna
    for hit in analyzed_hits.hits:
        hit.extension.annotations['score'] = None

    # this part predicts homology - it is not truly part of repredict
    homology_prediction, homol_seqs, cm_file_rfam_user = infer_homology(
        analyzed_hits=analyzed_hits,
        args=args_inner,
        cm_model_file=ih_model,
        multi_query=multi_query,
        iteration=iteration)
    for hit, pred in zip(analyzed_hits.hits, homology_prediction):
        hit.hpred = pred
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
# Example #4
def locarna_worker(pack):
    """Align one expanded hit against the query with Locarna and build a report.

    :param pack: tuple (expanded hit, query sequence, locarna parameters,
        anchor length)
    :return: Subsequences report on success; on failure an empty Subsequences
        wrapper with the error message(s) stored in the hit's annotations
    """
    ml.debug(fname())
    expanded_hit, query_seq, locarna_params, anchor_length = pack

    query_anchor_file = subject_anchor_file = alignment_file = None

    try:
        # the BLAST-aligned segment supplies anchors so Locarna can be run
        # in local mode on the query vs. the extended subject sequence
        blast_entry = expanded_hit.annotations['blast'][1]

        anchors = LocarnaAnchor(
            to_rna(blast_entry.query),
            blast_entry.match,
            to_rna(blast_entry.sbjct),
            anchor_length=anchor_length
        )

        if anchors.too_many_anchors:
            ml.info('Too many anchors for {}. Can handle up to 520 distinct anchors.'.format(expanded_hit.id))

        # write the query sequence together with its anchor annotation lines
        fd1, query_anchor_file = mkstemp(prefix='rba_', suffix='_20', dir=CONFIG.tmpdir)
        with os.fdopen(fd1, 'w') as query_fp:
            ql1, ql2 = anchors.anchor_whole_seq(str(query_seq), 'query')
            write_clustal_like_file_with_anchors(
                query_fp,
                'query',
                str(query_seq),
                (
                    ('#A1', ql1.split()[0]),
                    ('#A2', ql2.split()[0])
                ))

        # write the extended subject sequence with its anchor annotation lines
        fd2, subject_anchor_file = mkstemp(prefix='rba_', suffix='_21', dir=CONFIG.tmpdir)
        with os.fdopen(fd2, 'w') as subject_fp:
            sl1, sl2 = anchors.anchor_whole_seq(str(expanded_hit.seq), 'subject')
            write_clustal_like_file_with_anchors(
                subject_fp,
                expanded_hit.id,
                str(expanded_hit.seq),
                (
                    ('#A1', sl1.split()[0]),
                    ('#A2', sl2.split()[0])
                ))

        alignment_file = run_locarna(
            query_anchor_file,
            subject_anchor_file,
            locarna_params
        )

        # read locarna alignment
        with open(alignment_file, 'r') as f:
            locarna_alig = parse_locarna_alignment(f)

        if len(locarna_alig) != 2:
            raise exceptions.SubseqMatchError('There must be 2 sequences in Locarna alignment.')

        return create_report_object_from_locarna(expanded_hit, locarna_alig)
    except exceptions.LocarnaException as e:
        # keep the tool's captured stderr alongside the message
        expanded_hit.annotations['msgs'] = [str(e), e.errors]
        return BA_support.Subsequences(expanded_hit)
    except (exceptions.SubseqMatchError, exceptions.ParsingError,
            TypeError, AttributeError, FileNotFoundError) as e:
        expanded_hit.annotations['msgs'] = [str(e)]
        return BA_support.Subsequences(expanded_hit)
    finally:
        # temporary files are removed no matter how the alignment went
        for tmp_file in (query_anchor_file, subject_anchor_file, alignment_file):
            if tmp_file is not None:
                BA_support.remove_one_file_with_try(tmp_file)