Exemple #1
0
def refold_stockholm(stockholm_alig, consensus_structure):
    """
    Run refold (refold.pl from the Vienna RNA package, via ``compute_refold``)
    on an alignment using the supplied consensus structure.

    The alignment is written to a temporary clustal file and the consensus
    structure to a temporary mock alifold output; both, together with the
    file produced by ``compute_refold``, are removed before returning, even
    when one of the steps raises.

    :param stockholm_alig: alignment object providing ``write_clustal(handle)``
    :param consensus_structure: consensus structure string for the alignment
    :return: list of records yielded by ``BA_support.parse_seq_str`` for the
        refold output file
    """
    ml.debug(fname())

    # every temporary path is registered here right after creation so that
    # the ``finally`` clause can remove whatever exists at the time of failure
    temp_files = []
    try:
        # convert to clustal alignment
        fd, clust_tempfile = mkstemp(prefix='rba_', suffix='_23', dir=CONFIG.tmpdir)
        temp_files.append(clust_tempfile)
        with os.fdopen(fd, 'w') as f:
            stockholm_alig.write_clustal(f)

        # write fake alifold output with given consensus structure
        fd, alif_fake_file = mkstemp(prefix='rba_', suffix='_24', dir=CONFIG.tmpdir)
        temp_files.append(alif_fake_file)
        with os.fdopen(fd, 'w') as f:
            # the consensus sequence in alifold file is really not used for anything
            f.write('A' * len(consensus_structure) + '\n')
            f.write(consensus_structure + '\n')

        # compute refold
        refold_constrained_file = compute_refold(clust_tempfile, alif_fake_file)
        temp_files.append(refold_constrained_file)

        # the parser is consumed while the file is still open
        with open(refold_constrained_file, 'r') as f:
            parsed_seqs = list(BA_support.parse_seq_str(f))
    finally:
        # cleanup (also on failure, so no temporary files are left behind)
        if temp_files:
            BA_support.remove_files_with_try(temp_files)

    return parsed_seqs
Exemple #2
0
def alifold_refold_prediction(nr_homologs_hits_fasta,
                              all_hits_fasta,
                              refold='refold',
                              threads=None,
                              params=None,
                              msa_alg='clustalo'):
    """
    return predicted structures for all hits based on provided sequence homologs
    ! beware, clustal mixes order of sequences in profile alignment, correct for it

    Outline:
      1. both fasta inputs are sanitized and the homologs are aligned
         (``_aligner_block``)
      2. a consensus is computed from that alignment (``compute_alifold``)
      3. all hits are profile-aligned to the homolog alignment
         (``run_clustal_profile2seqs_align``)
      4. the consensus sequence and structure are mapped onto the columns of
         the new alignment
      5. the structures are derived with the procedure selected by ``refold``

    possible values of ``refold``: "refold", "refold_rnafoldc",
    "conserved_ss_rnafoldc"

    param keys read in this function: "alifold", "clustalo_profile", "RNAfold",
    "repred_unpaired_tr", "conseq_conserved". ``params`` is also handed to
    ``_aligner_block``, which presumably consumes further keys such as
    "clustal" (not visible here, verify there).

    All intermediate files are removed before returning, also when a step
    fails.

    raises ValueError when ``refold`` is not one of the recognized procedures
    """
    ml.debug(fname())

    # validate the arguments before any intermediate file gets created
    ref_pred = ['refold', 'refold_rnafoldc', 'conserved_ss_rnafoldc']
    if refold not in ref_pred:
        raise ValueError(
            'refold procedure not recognized: {}, possible values are {}'.
            format(refold, ' '.join(ref_pred)))

    if params is None:
        params = dict()

    # every intermediate file is registered right after it is created so the
    # ``finally`` clause removes whatever exists at the time of a failure
    temp_files = []
    try:
        nr_path, san_dict = sanitize_fasta_file(nr_homologs_hits_fasta)
        temp_files.append(nr_path)
        all_path, san_dict = sanitize_fasta_file(all_hits_fasta,
                                                 used_dict=san_dict)
        temp_files.append(all_path)

        cl_file = _aligner_block(nr_path, params, msa_alg, threads)
        temp_files.append(cl_file)

        # cannot rely on that, the order of a cl_file would be the same as the order of the nr_homolog_hits_file
        ali_file = compute_alifold(cl_file,
                                   alifold_params=params.get('alifold', ''))
        temp_files.append(ali_file)

        consensus_record = read_seq_str(ali_file)[0]

        clustalo_profile_params = '--outfmt clustal '
        clustalo_profile_params += params.get('clustalo_profile', '')
        if threads:
            clustalo_profile_params += ' --threads {}'.format(threads)
        realign_file = run_clustal_profile2seqs_align(
            cl_file, all_path, clustalo_params=clustalo_profile_params)
        temp_files.append(realign_file)
        realign_alig = AlignIO.read(realign_file, format='clustal')

        # slice alignment ( get seqname from nr_homolog_hits_file, find it in the realign and slice the whole segment off
        #  take care that the id may be the same and it must be checked for multiple occurence

        first_nr_record = _parse_first_record_only(nr_path)

        realign_allseq_possition = [
            i for i, seq in enumerate(realign_alig)
            if seq.id == first_nr_record.id
        ]

        # the last occurrence of the id marks the start of the original
        # (profile) alignment inside the realigned file; an IndexError here
        # means the id was not found in the realignment at all
        split_at = realign_allseq_possition[-1]
        new_alig_for_refold = realign_alig[:split_at]
        old_alig_in_new = realign_alig[split_at:]

        orig_alignment = AlignIO.read(cl_file, format='clustal')

        first_original_alignment_record = orig_alignment[0]

        match_original_seq_in_new_alig = [
            i for i in old_alig_in_new
            if i.id == first_original_alignment_record.id
        ][0]

        # column mapping between the original alignment and its copy in the
        # realigned file, derived from the same sequence in both
        mapping = _map_alignment_columns_from_profile_match(
            first_original_alignment_record, match_original_seq_in_new_alig)

        # map and repair structure when mapping is unambiguous
        cs_encode = encode_structure_unicode(
            consensus_record.letter_annotations['ss0'])
        # NOTE(review): gap_char=49 is the code point used for gap columns in
        # the encoded structure - confirm against encode_structure_unicode
        new_consensus_structure_encoded = _repair_consensus_structure_by_maping(
            cs_encode,
            mapping,
            len(match_original_seq_in_new_alig.seq),
            gap_char=49)
        new_consensus_structure_repaired = repair_structure_any_variant(
            new_consensus_structure_encoded)

        new_consensus_structure = decode_structure_unicode(
            new_consensus_structure_repaired)

        # the consensus sequence goes through the same column mapping
        new_consensus_sequence = _repair_consensus_structure_by_maping(
            str(consensus_record.seq),
            mapping,
            len(match_original_seq_in_new_alig.seq),
            gap_char=ord('_'))

        # write new consensus to a file
        a_fd, new_alifold_consensus_file = mkstemp(prefix='rba_',
                                                   suffix='_33',
                                                   dir=CONFIG.tmpdir)
        temp_files.append(new_alifold_consensus_file)
        with os.fdopen(a_fd, 'w') as f:
            f.write(new_consensus_sequence + '\n')
            f.write(new_consensus_structure + '\n')

        # write sliced alignment to a file
        sa_fd, sliced_alignment_file = mkstemp(prefix='rba_',
                                               suffix='_34',
                                               dir=CONFIG.tmpdir)
        temp_files.append(sliced_alignment_file)
        with os.fdopen(sa_fd, 'w') as f:
            AlignIO.write(new_alig_for_refold, f, 'clustal')

        # now process the file, and map alignment to consensus structure
        if refold in ['refold', 'refold_rnafoldc']:
            refold_file = compute_refold(sliced_alignment_file,
                                         new_alifold_consensus_file)
            temp_files.append(refold_file)

            if refold == 'refold_rnafoldc':
                rnafold_parameters = params.get('RNAfold', '')
                # the refold output is used as constraints, so -C is enforced
                if '-C' not in rnafold_parameters:
                    rnafold_parameters += ' -C'

                seq_str = rnafold_prediction(refold_file,
                                             params=rnafold_parameters)

            else:
                seq_str = read_seq_str(refold_file)

        else:
            st_alig_file = build_stockholm_from_clustal_alig(
                sliced_alignment_file, new_alifold_consensus_file)
            temp_files.append(st_alig_file)
            repred_tr = str(params.get('repred_unpaired_tr', '9'))
            conseq_conserved = params.get('conseq_conserved', 1)

            seq_str = _refold_with_unpaired_conservation(
                st_alig_file,
                repred_tr=repred_tr,
                conseq_conserved=conseq_conserved)

        # restore the original sequence names replaced during sanitization
        structures_out = desanitize_fasta_names_in_seqrec_list(
            seq_str, san_dict)
    finally:
        # cleanup (also on failure, so no intermediate files are left behind)
        if temp_files:
            remove_files_with_try(temp_files)

    return structures_out
Exemple #3
0
def cmmodel_rnafold_c(allhits_fasta,
                      cmmodel_file,
                      threads=None,
                      params=None,
                      timeout=None):
    """
    Predict structures for all hits by aligning them to a covariance model and
    folding each sequence with RNAfold under constraints from that alignment.

    Outline:
      1. the input fasta is sanitized and aligned to ``cmmodel_file``
         (``run_cmalign_on_fasta``)
      2. the upper-cased alignment is written as clustal together with a mock
         consensus built from its 'RF' and 'SS_cons' column annotations
      3. ``compute_refold`` turns these into per-sequence constraints
      4. ``rnafold_prediction`` folds the sequences with those constraints

    param keys read: "cmalign" (extra cmalign arguments; ``--cpu`` is added
    when ``threads`` is set and ``--notrunc`` is always ensured) and "RNAfold"
    (RNAfold arguments, must be a str; ``-C`` is always ensured).

    ``timeout`` is forwarded to ``run_cmalign_on_fasta``, ``compute_refold``
    and ``rnafold_prediction``.

    All intermediate files are removed before returning, also when a step
    fails (e.g. on a timeout).

    returns the predicted structures with the original sequence names restored
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    # ---- prepare and validate parameters before any file is created ----
    cmalign_params = ''
    if threads:
        cmalign_params += '--cpu {}'.format(threads)

    if 'cmalign' in params and params['cmalign']:
        cmalign_params += ' ' + params['cmalign']

    if '--notrunc' not in cmalign_params:
        cmalign_params += ' --notrunc'

    # rnafold params
    rnafold_params = params.get('RNAfold', '-C')
    assert isinstance(rnafold_params,
                      str), "Incorrect parameters for RNAfold -C"
    if '-C' not in rnafold_params:
        # some parameters given but -C not present
        rnafold_params += ' -C'

    # every intermediate file is registered right after it is created so the
    # ``finally`` clause removes whatever exists at the time of a failure
    temp_files = []
    try:
        allhits_fasta_file, san_dict = sanitize_fasta_file(allhits_fasta)
        temp_files.append(allhits_fasta_file)

        alig_file = run_cmalign_on_fasta(allhits_fasta_file,
                                         cmmodel_file,
                                         cmalign_params=cmalign_params,
                                         timeout=timeout)
        temp_files.append(alig_file)
        # multiple sequence cm align
        # split by sequence, then run the rest
        cm_alig = read_st(alig_file)

        # the alignment is in memory now, its input and output files can go
        remove_files_with_try(temp_files)
        temp_files = []

        # ===== use refold.pl directly ====
        cm_alig_upper = cm_alig.to_upper()
        cons_fd, temp_mock_consensus = mkstemp(prefix='rba_',
                                               suffix='_41',
                                               dir=CONFIG.tmpdir)
        temp_files.append(temp_mock_consensus)
        aln_fd, temp_clustal_aln = mkstemp(prefix='rba_',
                                           suffix='_42',
                                           dir=CONFIG.tmpdir)
        temp_files.append(temp_clustal_aln)
        with os.fdopen(aln_fd, 'w') as h_clustal, \
                os.fdopen(cons_fd, 'w') as h_constraints:
            cm_alig_upper.write_clustal(h_clustal)

            # mock consensus: the RF annotation with every character other
            # than A/C/T/G/U replaced by '_', followed by the SS_cons
            # annotation converted by cm_strucutre2br
            h_constraints.write('{}\n{}\n'.format(
                re.sub('[^ACTGU]',
                       '_',
                       cm_alig_upper.column_annotations['RF'],
                       flags=re.IGNORECASE),
                cm_strucutre2br(cm_alig_upper.column_annotations['SS_cons'])))

        temp_constraint_file = compute_refold(temp_clustal_aln,
                                              temp_mock_consensus,
                                              timeout=timeout)
        temp_files.append(temp_constraint_file)
        structures = rnafold_prediction(temp_constraint_file,
                                        params=rnafold_params,
                                        timeout=timeout)
        # restore the original sequence names replaced during sanitization
        str_out = desanitize_fasta_names_in_seqrec_list(structures, san_dict)
    finally:
        # cleanup (also on failure, so no intermediate files are left behind)
        if temp_files:
            remove_files_with_try(temp_files)

    return str_out