Ejemplo n.º 1
0
def pair_align_SeqRecords(seqr_a, seqr_b, ex_aligner = needle_align):
    ''' Pairwise align two SeqRecords using external or internal aligner.

        seqr_a, seqr_b: SeqRecords to align; both are passed through
                        ungap_SeqRecord() before being aligned
        ex_aligner: helper function that aligns sequences in two files; it
                    is called with the names of two temporary files
                    *If ex_aligner is None, use internal aligner*

        Internal aligner:
            align.globalds() with MatrixInfo.blosum62, gap open penalty
            -10.0 and gap extend penalty -0.5; only the first alignment
            returned is used

        The temporary files written for the external aligner are removed
        even when writing the second file or running the aligner raises.

        Returns whatever ex_aligner returns, or an
        Align.MultipleSeqAlignment built from the internal alignment

    '''

    if ex_aligner is None:
        inaln = align.globalds(ungap_SeqRecord(seqr_a), ungap_SeqRecord(seqr_b),
                               MatrixInfo.blosum62, -10.0, -0.5)
        # inaln[0] is the first alignment; its first two items are the
        # aligned sequences.
        return Align.MultipleSeqAlignment(inaln[0][:2])

    tmp_fa = make_tmp_fa(ungap_SeqRecord(seqr_a))
    try:
        tmp_ref_fa = make_tmp_fa(ungap_SeqRecord(seqr_b))
        try:
            return ex_aligner(tmp_fa.name, tmp_ref_fa.name)
        finally:
            # Clean up the second file whether or not the aligner succeeded.
            remove(tmp_ref_fa.name)
    finally:
        # Clean up the first file even if creating the second one failed.
        remove(tmp_fa.name)
def map_pose_indices(ref_ungapped, pose, chain):
    """Map positions of the ungapped reference onto indices of a pose chain.

    The chain sequence and its indices come from ``get_chain_info(pose,
    chain)``.  The chain sequence is globally aligned against
    ``ref_ungapped`` and only the first alignment returned is used.

    Returns a dict whose keys are 0-based positions in the reference
    (counting only non-gap characters) and whose values are the matching
    entries of the chain's index list.  Reference positions aligned to a gap
    in the chain are absent from the dict.
    """
    chain_seq, chain_indices = get_chain_info(pose, chain)

    # BLOSUM80 is used because the two sequences are expected to be very
    # similar (the choice should not matter much).  The gap penalties are the
    # ones BLAST 2.2.27 pairs with that matrix, taken from:
    #
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/
    best = align.globalds(chain_seq, ref_ungapped, blosum80, -10, -1)[0]
    pose_row, ref_row, _score, _start, _end = best

    gap_chars = '-.'
    ref_to_pose = {}
    ref_pos = 0
    pose_pos = 0

    for ref_char, pose_char in zip(ref_row, pose_row):
        ref_has_residue = ref_char not in gap_chars
        pose_has_residue = pose_char not in gap_chars

        if ref_has_residue and pose_has_residue:
            ref_to_pose[ref_pos] = chain_indices[pose_pos]

        # Each counter advances only past real residues of its own sequence.
        if ref_has_residue:
            ref_pos += 1
        if pose_has_residue:
            pose_pos += 1

    return ref_to_pose
Ejemplo n.º 3
0
def align_sequences_match_residues(mobile_seq,
                                   target_seq,
                                   seq_align_mat='BLOSUM80',
                                   gap_penalty=-1.0,
                                   verbosity=0):
    """ Align two amino acid sequences using Bio.pairwise2 globalds and flag which residues are matched.

    The target sequence is aligned against the mobile sequence and only the first alignment returned is used. For
    each sequence a list of booleans is built with one entry per residue (non-gap position) of that sequence; an
    entry is True when the residue is aligned to a residue of the other sequence and False when it faces a gap.

    :param str mobile_seq: sequence of mobile protein
    :param str target_seq: sequence of target (reference) protein
    :param str seq_align_mat: name of a substitution matrix, loaded with Bio.Align.substitution_matrices.load
    :param float gap_penalty: penalty used for both gap opening and gap extension
    :param int verbosity: sets the verbosity level
    :raises ImportError: if Biopython cannot be imported
    :raises FileNotFoundError: if the substitution matrix cannot be loaded
    :rtype: tuple
    :return: (flags for the mobile sequence, flags for the target sequence)
    """
    try:
        from Bio.pairwise2 import align
        from Bio.Align import substitution_matrices
        seq_align_mat = substitution_matrices.load(seq_align_mat)
    except ImportError as error:
        os_util.local_print(
            'Failed to import Biopython with error: {}\nBiopython is necessary to sequence '
            'alignment. Sequences to be aligned:\nReference: {}\nMobile: {}'
            ''.format(error, target_seq, mobile_seq),
            msg_verbosity=os_util.verbosity_level.error,
            current_verbosity=verbosity)
        raise ImportError(error)
    except FileNotFoundError as error:
        # Loading failed, so seq_align_mat still holds the requested name; calling load() without arguments is used
        # here to obtain the matrices that are available for the message.
        available_matrices = substitution_matrices.load()
        os_util.local_print(
            'Failed to import substitution matrix {} with error: {}\nSubstitution matrix must be one '
            'of: {}'
            ''.format(seq_align_mat, error, available_matrices),
            msg_verbosity=os_util.verbosity_level.error,
            current_verbosity=verbosity)
        raise FileNotFoundError(error)
    else:
        # The same penalty is passed for gap opening and gap extension.
        align_result = align.globalds(target_seq, mobile_seq, seq_align_mat,
                                      gap_penalty, gap_penalty)[0]
        os_util.local_print(
            'This is the alignment result to be used in protein alignment:\n{}'
            ''.format(align_result),
            msg_verbosity=os_util.verbosity_level.info,
            current_verbosity=verbosity)

        # The target was passed first, so item 0 is the aligned target and item 1 the aligned mobile sequence.
        aligned_target, aligned_mobile = align_result[0], align_result[1]
        ref_align_str = [
            mobile_res != '-'
            for target_res, mobile_res in zip(aligned_target, aligned_mobile)
            if target_res != '-'
        ]
        mob_align_str = [
            target_res != '-'
            for target_res, mobile_res in zip(aligned_target, aligned_mobile)
            if mobile_res != '-'
        ]

        return mob_align_str, ref_align_str
Ejemplo n.º 4
0
def pairwise_align(comp_seq, ref_seq):
    '''
    Globally align two sequences and map their residue numbers onto each other.

    The alignment is computed with align.globalds using matlist.blosum62,
    a gap opening penalty of -11 and a gap extension penalty of -1, with
    end gaps left unpenalized. Only a single alignment is requested.

    Numbering in both sequences starts at 1 and counts alphabetic
    characters only. Positions aligned against a gap are not present in
    either dictionary.

    Args:
        comp_seq (str): A comparison protein sequence.
        ref_seq (str): A reference protein sequence.

    Returns:
        dict: Comparison sequence numbering (key) mapped to reference
            sequence numbering (value)
        dict: Reference sequence numbering (key) mapped to comparison
            sequence numbering (value)
    '''
    best = align.globalds(comp_seq,
                          ref_seq,
                          matlist.blosum62,
                          -11,
                          -1,
                          penalize_end_gaps=False,
                          one_alignment_only=True)[0]
    aligned_comp, aligned_ref = best[0], best[1]

    comp_to_ref = {}
    ref_to_comp = {}
    comp_number = 1
    ref_number = 1
    for comp_char, ref_char in zip(aligned_comp, aligned_ref):
        comp_is_residue = comp_char.isalpha()
        ref_is_residue = ref_char.isalpha()

        # Only columns holding a residue in both sequences are recorded.
        if comp_is_residue and ref_is_residue:
            comp_to_ref[comp_number] = ref_number
            ref_to_comp[ref_number] = comp_number

        # Each counter moves on whenever its own sequence has a residue.
        if comp_is_residue:
            comp_number += 1
        if ref_is_residue:
            ref_number += 1

    return comp_to_ref, ref_to_comp
def main():
    """Plot deletion counts along the sequence for one or more workspaces.

    Command-line arguments are parsed by docopt from the module docstring.
    For every ``<dels_workspace>`` path, the weighted MSA and the deletions
    table are loaded and the result of ``count_deletions`` becomes one curve.

    With ``--align`` (exactly two workspaces expected) the two sequences are
    globally aligned, the x coordinates of both curves are replaced by their
    alignment column indices, and the sequence identity is printed.

    The process forks before plotting: the parent exits and the child draws
    and shows the figure.
    """
    import docopt
    args = docopt.docopt(__doc__)

    plots = []

    for path in args['<dels_workspace>']:
        work_dels = DeletionsWorkspace.from_path(path)
        msa = load_weighted_msa(work_dels.msa)
        dels = pd.read_hdf(work_dels.deletions_hdf5)

        plot = Plot()
        plot.y = count_deletions(msa, dels)
        plot.x = np.arange(len(plot.y))
        plot.label = f'{work_dels.relpath} (N={len(dels)})'
        plot.seq = msa.ref_ungapped
        plots.append(plot)

    if args['--align']:
        if len(plots) != 2:
            fatal("Must specify 2 workspaces to use the --align option.")

        # BLOSUM62 is used because the two sequences in this case may not be
        # particularly similar.  The gap penalties are the ones BLAST 2.2.27
        # pairs with that matrix, taken from:
        #
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/

        alignments = align.globalds(
            plots[0].seq,
            plots[1].seq,
            blosum62,
            -11,
            -1,
        )
        aligned_seq1, aligned_seq2, score, start, end = alignments[0]

        # Each residue is plotted at the alignment column it occupies, so
        # gap columns are skipped in the corresponding curve.
        plots[0].x = np.array(
            [i for i, aa in enumerate(aligned_seq1) if aa != '-'])
        plots[1].x = np.array(
            [i for i, aa in enumerate(aligned_seq2) if aa != '-'])

        # Fraction of identical non-gap columns relative to the longer of
        # the two input sequences.
        percent_id = sum(aa1 == aa2 and aa1 != '-'
                         for aa1, aa2 in zip(aligned_seq1, aligned_seq2))
        percent_id /= max(len(p.seq) for p in plots)

        print(f"Scaffolds aligned with {100*percent_id:.2f}% identity.")

    # Detach from the shell: the parent exits and the child shows the plot.
    if os.fork():
        sys.exit()

    for p in plots:
        plt.plot(p.x, p.y, label=p.label)

    plt.xlabel("aligned residue index" if args['--align'] else "residue index")
    plt.ylabel("relative deletions" if args['--normalize'] else "deletions")
    # The upper limit is the largest final x over all plots; the generator
    # must bind ``p`` itself rather than reuse the loop variable above.
    plt.xlim((0, max(p.x[-1] for p in plots)))
    plt.legend(loc='best')
    plt.show()
Ejemplo n.º 6
0
def align_sequences_match_residues(mobile_seq,
                                   target_seq,
                                   seq_align_mat='blosum80',
                                   gap_penalty=-1.0,
                                   verbosity=0):
    """ Align two amino acid sequences using Bio.pairwise2 globalds and flag which residues are matched.

    The target sequence is aligned against the mobile sequence and only the first alignment returned is used. For
    each sequence a list of booleans is built with one entry per residue (non-gap position) of that sequence; an
    entry is True when the residue is aligned to a residue of the other sequence and False when it faces a gap.

    :param str mobile_seq: sequence of mobile protein
    :param str target_seq: sequence of target (reference) protein
    :param str seq_align_mat: name of a substitution matrix defined in Bio.SubsMat.MatrixInfo
    :param float gap_penalty: penalty used for both gap opening and gap extension
    :param int verbosity: sets the verbosity level
    :raises ImportError: if Biopython cannot be imported
    :raises KeyError: if seq_align_mat is not a name found in Bio.SubsMat.MatrixInfo
    :rtype: tuple
    :return: (flags for the mobile sequence, flags for the target sequence)
    """
    try:
        from Bio.pairwise2 import align
        seq_align_mat = import_module(
            'Bio.SubsMat.MatrixInfo').__dict__[seq_align_mat]
    except ImportError as error:
        os_util.local_print(
            'Failed to import Biopython with error: {}\nBiopython is necessary to sequence '
            'alignment. Sequences to be aligned:\nReference: {}\nMobile: {}'
            ''.format(error, target_seq, mobile_seq),
            msg_verbosity=os_util.verbosity_level.error,
            current_verbosity=verbosity)
        raise ImportError(error)
    except KeyError as error:
        # The lookup failed, so seq_align_mat still holds the requested name; the available matrices are imported
        # only to make the error message more helpful.
        try:
            from Bio.SubsMat.MatrixInfo import available_matrices
        except ImportError:
            os_util.local_print(
                "Failed to import Biopython. The sequences of your protein structures mismatch, so I "
                "need Biopython to align them. See documentation.",
                msg_verbosity=os_util.verbosity_level.error,
                current_verbosity=verbosity)
            raise SystemExit(1)
        os_util.local_print(
            'Failed to import substitution matrix {} with error: {}\nSubstitution matrix must be one '
            'from Bio.SubsMat.MatrixInfo (in this installation: {})'
            ''.format(seq_align_mat, error, available_matrices),
            msg_verbosity=os_util.verbosity_level.error,
            current_verbosity=verbosity)
        raise KeyError(error)
    else:
        # The same penalty is passed for gap opening and gap extension.
        align_result = align.globalds(target_seq, mobile_seq, seq_align_mat,
                                      gap_penalty, gap_penalty)[0]
        os_util.local_print(
            'This is the alignment result to be used in protein alignment:\n{}'
            ''.format(align_result),
            msg_verbosity=os_util.verbosity_level.info,
            current_verbosity=verbosity)

        # The target was passed first, so item 0 is the aligned target and item 1 the aligned mobile sequence.
        aligned_target, aligned_mobile = align_result[0], align_result[1]
        ref_align_str = [
            mobile_res != '-'
            for target_res, mobile_res in zip(aligned_target, aligned_mobile)
            if target_res != '-'
        ]
        mob_align_str = [
            target_res != '-'
            for target_res, mobile_res in zip(aligned_target, aligned_mobile)
            if mobile_res != '-'
        ]

        return mob_align_str, ref_align_str