def setUp(self):
     aligner = PairwiseAligner()
     aligner.internal_open_gap_score = -1
     aligner.internal_extend_gap_score = -0.0
     aligner.match_score = +1
     aligner.mismatch_score = -1
     aligner.mode = "local"
     self.aligner = aligner
예제 #2
0
def bioPython_default_local_aligner(a, b):
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -7
    aligner.extend_gap_score = -2

    sequence1 = SeqIO.read('./resource/fasta' + str(a) + '.fasta', 'fasta')
    sequence2 = SeqIO.read('./resource/fasta' + str(b) + '.fasta', 'fasta')
    alignments = aligner.align(sequence1.seq, sequence2.seq)
예제 #3
0
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the 
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster
    list
        list of int - cluster memberships of the originally input list
    """
    if len(seqlist) == 1:
        # Skip alignment if there is only one sequence
        return([seqlist], [0])
    else:
        aligner = PairwiseAligner()
        aligner.mode = "local"

        # Convert sequence list to distance matrix
        distmatrix = []
        for seq1 in seqlist:
            row = []
            for seq2 in seqlist:
                maxlen = max([len(seq1), len(seq2)])
                # Take percentage identity of pairwise alignment score (match base
                # +1, all other operations +0) over the longer sequence in pair
                idval = aligner.align(seq1, seq2).score / maxlen
                distval = 1 - idval  # convert to distance fraction
                row.append(distval)
            distmatrix.append(row)
        # Hierarchical clustering from the distance matrix
        htree = treecluster(data=None, distancematrix=array(distmatrix))
        # Find number of branches with length longer than threshold, and add 1
        # to get number of cuts
        cuts = 1 + len([htree[i].distance for i in range(len(htree))
                        if htree[i].distance > dist_threshold])
        clust_ids = list(htree.cut(cuts))
        clust_seqs_dict = defaultdict(list)
        for i in range(len(seqlist)):
            clust_seqs_dict[clust_ids[i]] += [seqlist[i]]
        # Convert dict of lists to list of lists
        clust_seqs = [clust_seqs_dict[i] for i in clust_seqs_dict]
        return(clust_seqs, clust_ids)
예제 #4
0
    def pairwise(self, potential_parent):
        """
        Парное выравнивание последовательности листа на потенциальных родителей и сохранение скора

        Args:
            potential_parent (str): последовательность потенциального родителя
        """
        aligner = PairwiseAligner()

        # используем локальное выравнивание
        aligner.mode = 'local'

        # заменим дефолтные символы пропуска на другие
        seq = self.seq.replace('-', '')
        potential_parent_undersores = potential_parent.replace('-', '')

        # выравнивание
        score = aligner.score(seq, potential_parent_undersores)

        # сохраняем скор
        self.parent_scores[potential_parent] = score
def perform_randomized_tests(n=1000):
    """Perform randomized tests and compare to pslMap.

    Run this function to perform 8 x n mappings for alignments of randomly
    generated sequences, get the alignment in PSL format, and compare the
    result to that of pslMap.
    """
    aligner = PairwiseAligner()
    aligner.internal_open_gap_score = -1
    aligner.internal_extend_gap_score = -0.0
    aligner.match_score = +1
    aligner.mismatch_score = -1
    aligner.mode = "local"
    for i in range(n):
        nBlocks1 = random.randint(1, 10)
        nBlocks2 = random.randint(1, 10)
        test_random(aligner, nBlocks1, nBlocks2, "+", "+")
        test_random(aligner, nBlocks1, nBlocks2, "+", "-")
        test_random(aligner, nBlocks1, nBlocks2, "-", "+")
        test_random(aligner, nBlocks1, nBlocks2, "-", "-")
        test_random_sequences("+", "+")
        test_random_sequences("+", "-")
        test_random_sequences("-", "+")
        test_random_sequences("-", "-")
예제 #6
0
                    type=str,
                    required=True)

parser.add_argument('-r',
                    '--reference',
                    help='Reference to be aligned to',
                    type=str,
                    required=True)

parser.add_argument('-n',
                    '--seq_name',
                    help='Name of the aligned sequence',
                    type=str,
                    required=True)

args = parser.parse_args()

aligner = PairwiseAligner()
aligner.mode = 'global'
aligner.match_score = 1
aligner.mismatch_score = 0
aligner.open_gap_score = -2
aligner.extend_gap_score = -1

ref = SeqIO.read(args.reference, "fasta")
ref.seq = str(ref.seq.upper()).replace('-', 'N')
cons = SeqIO.read(args.infile, "fasta")
aln = aligner.align(ref.seq, cons.seq)
with open(args.outfile, 'w') as out:
    print(">", args.seq_name, file=out)
    print(str(aln[0]).strip().split('\n')[2], file=out)
        args.open_gap_score = -5
    if not args.extend_gap_score:
        args.extend_gap_score = -2
else:
    sub_matrix = getattr(import_module('Bio.SubsMat.MatrixInfo'),
                         args.sub_matrix)
    aligners['global'].substitution_matrix = sub_matrix
    if not args.open_gap_score:
        args.open_gap_score = -11
    if not args.extend_gap_score:
        args.extend_gap_score = -1
aligners['global'].open_gap_score = args.open_gap_score
aligners['global'].extend_gap_score = args.extend_gap_score
if args.sim_algo == 'smith-waterman':
    aligners['local'] = PairwiseAligner()
    aligners['local'].mode = 'local'
    if args.seq_type in ('dna', 'rna'):
        aligners['local'].match = args.match_score
        aligners['local'].mismatch = args.mismatch_score
    else:
        aligners['local'].substitution_matrix = sub_matrix
    aligners['local'].open_gap_score = args.open_gap_score
    aligners['local'].extend_gap_score = args.extend_gap_score

# Karlin-Altschul parameter values
if args.seq_type in ('dna', 'rna'):
    if ((args.match_score, args.mismatch_score) in KA_PARAMS['na']
            and (abs(args.open_gap_score), abs(args.extend_gap_score))
            in KA_PARAMS['na'][(args.match_score, args.mismatch_score)]):
        args.ka_gapped_l = KA_PARAMS['na'][(args.match_score,
                                            args.mismatch_score)][(
예제 #8
0
# # DEMO: Izračun lastne matrike in njena uporaba
#
# V spodnjem primeru je prikazan izračun lastne matrike (enako, kot pri prejšnji vaji - [VAJA: Izračun matrike zamenjav (Python)](matrika_zamenjav.ipynb)) ter uporaba tako izračunane matrike za poravnavo dveh zaporedij.
#
# ## Izračun matrike

# In[1]:

from Bio import SeqIO
sequence1 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_horse.fasta', 'fasta')
sequence2 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_rat.fasta', 'fasta')
# v mapi vhod sta tudi zaporedji mišjega in človeškega nebulina, ki sta bistveno daljši
from Bio.Align import PairwiseAligner
aligner = PairwiseAligner()
aligner.mode = 'local'
aligner.match_score = 2
aligner.mismatch_score = -3
aligner.open_gap_score = -7
aligner.extend_gap_score = -2
alignments = aligner.align(sequence1.seq, sequence2.seq)
alignment = alignments[0]
from Bio.Align.substitution_matrices import Array
frequency = Array('ACGT', dims=2)
for (start1, end1), (start2, end2) in zip(*alignment.aligned):
    seq1 = sequence1[start1:end1]
    seq2 = sequence2[start2:end2]
    for c1, c2 in zip(seq1, seq2):
        frequency[c1, c2] += 1
import numpy
probabilities = frequency / numpy.sum(frequency)