Esempio n. 1
0
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Every pair of sequences is scored with a local pairwise alignment. The
    score is divided by the length of the longer sequence of the pair and
    subtracted from 1 to give a distance. The resulting distance matrix is
    clustered hierarchically and the tree is cut at every branch longer than
    ``dist_threshold``.

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster
    list
        list of int - cluster memberships of the originally input list

    Both lists are empty when ``seqlist`` is empty.
    """
    if len(seqlist) == 0:
        # Nothing to cluster; avoid handing an empty matrix to treecluster
        return ([], [])
    if len(seqlist) == 1:
        # Skip alignment if there is only one sequence
        return ([seqlist], [0])

    aligner = PairwiseAligner()
    aligner.mode = "local"

    # Convert sequence list to distance matrix
    distmatrix = []
    for seq1 in seqlist:
        row = []
        for seq2 in seqlist:
            maxlen = max(len(seq1), len(seq2))
            # Take percentage identity of pairwise alignment score over the
            # longer sequence in pair. This relies on the aligner's default
            # scoring (described by the original author as match +1, all
            # other operations +0). Only the score is needed, so the
            # alignments themselves are never built.
            idval = aligner.score(seq1, seq2) / maxlen
            row.append(1 - idval)  # convert to distance fraction
        distmatrix.append(row)

    # Hierarchical clustering from the distance matrix
    htree = treecluster(data=None, distancematrix=array(distmatrix))
    # Find number of branches with length longer than threshold, and add 1
    # to get number of cuts
    long_branches = sum(
        1 for i in range(len(htree)) if htree[i].distance > dist_threshold
    )
    cuts = 1 + long_branches
    clust_ids = list(htree.cut(cuts))

    # Group the input sequences by the cluster id assigned to each of them
    clust_seqs_dict = defaultdict(list)
    for seq, clust_id in zip(seqlist, clust_ids):
        clust_seqs_dict[clust_id].append(seq)
    # Convert dict of lists to list of lists
    clust_seqs = list(clust_seqs_dict.values())
    return (clust_seqs, clust_ids)
Esempio n. 2
0
def nw_bio_mat(seq1, seq2, cost_mat, key):
    """Align two sequences using a custom substitution matrix.

    Parameters
    ----------
    seq1, seq2
        The sequences to align.
    cost_mat
        Flat, indexable collection of scores. The first ``len(key) ** 2``
        entries are the substitution scores in row-major order (entry
        ``i * len(key) + j`` scores ``key[i]`` against ``key[j]``); the entry
        at index ``len(key) ** 2`` is the gap score.
    key
        The alphabet; its order defines the layout of ``cost_mat``.

    Returns
    -------
    list
        One ``[line0, line2, score]`` entry per alignment, where ``line0`` and
        ``line2`` are the first and third lines of the alignment's printed
        form and ``score`` is the alignment score truncated to ``int``.
    """
    size = len(key)
    aligner = PairwiseAligner(alphabet=key)

    # Unflatten cost_mat into a {(symbol, symbol): score} mapping.
    pair_scores = {
        (row_symbol, col_symbol): cost_mat[row * size + col]
        for row, row_symbol in enumerate(key)
        for col, col_symbol in enumerate(key)
    }
    aligner.substitution_matrix = substitution_matrices.Array(data=pair_scores)
    aligner.gap_score = cost_mat[size ** 2]

    formated_alignments = []
    for alignment in aligner.align(seq1, seq2):
        text_lines = str(alignment).split("\n")
        formated_alignments.append(
            [text_lines[0], text_lines[2], int(alignment.score)]
        )
    return formated_alignments
Esempio n. 3
0
def make_aligner() -> PairwiseAligner:
    """Return a PairwiseAligner configured from the module-level score constants.

    Match and mismatch scores come from ``MATCH_SCORE`` and ``MISMATCH_SCORE``.
    End gaps use ``END_GAP_PENALTY`` / ``END_GAP_EXTEND_PENALTY`` and internal
    gaps use ``GAP_PENALTY`` / ``GAP_EXTEND_PENALTY``.
    """
    scoring = dict(
        match_score=MATCH_SCORE,
        mismatch_score=MISMATCH_SCORE,
        end_open_gap_score=END_GAP_PENALTY,
        end_extend_gap_score=END_GAP_EXTEND_PENALTY,
        internal_open_gap_score=GAP_PENALTY,
        internal_extend_gap_score=GAP_EXTEND_PENALTY,
    )
    return PairwiseAligner(**scoring)
Esempio n. 4
0
def bioPython_default_local_aligner(a, b):
    """Locally align the sequences stored in two FASTA files.

    Parameters
    ----------
    a, b
        Values used to build the input file names: the sequences are read
        from ``./resource/fasta<a>.fasta`` and ``./resource/fasta<b>.fasta``.

    NOTE(review): the visible code stores the result in ``alignments`` but
    does not return it; the snippet may be cut off here -- confirm against
    the full source.
    """
    # Local alignment with match +2, mismatch -3, gap open -7, gap extend -2.
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -7
    aligner.extend_gap_score = -2

    # Each file is read with SeqIO.read, i.e. as a single FASTA record.
    sequence1 = SeqIO.read('./resource/fasta' + str(a) + '.fasta', 'fasta')
    sequence2 = SeqIO.read('./resource/fasta' + str(b) + '.fasta', 'fasta')
    alignments = aligner.align(sequence1.seq, sequence2.seq)
 def setUp(self):
     """Create a local-mode PairwiseAligner and store it on ``self.aligner``."""
     # Applied in this order: internal gap scores, match/mismatch, then mode.
     settings = (
         ("internal_open_gap_score", -1),
         ("internal_extend_gap_score", -0.0),
         ("match_score", +1),
         ("mismatch_score", -1),
         ("mode", "local"),
     )
     local_aligner = PairwiseAligner()
     for attribute, value in settings:
         setattr(local_aligner, attribute, value)
     self.aligner = local_aligner
Esempio n. 6
0
def create_aligner() -> PairwiseAligner:
    """
    Build the local-mode aligner used to search for proteins.

    Matches score +1 and mismatches -1. Gaps at the left or right end of
    either sequence score 0, while internal gaps in either sequence score -1.
    """
    scoring = {
        # Reward matches and penalize mismatches.
        "mismatch_score": -1,
        "match_score": 1,
        # Leading and trailing gaps are free because the search is local.
        "query_left_gap_score": 0,
        "query_right_gap_score": 0,
        "target_right_gap_score": 0,
        "target_left_gap_score": 0,
        # Internal gaps are penalized to narrow down the search space.
        "query_internal_gap_score": -1,
        "target_internal_gap_score": -1,
    }

    aligner = PairwiseAligner(mode="local")
    for attribute, value in scoring.items():
        setattr(aligner, attribute, value)
    return aligner
    def pairwise(self, potential_parent):
        """
        Locally align this leaf's sequence against a potential parent and store the score.

        The score is saved in ``self.parent_scores`` under the parent sequence
        exactly as it was passed in (gap characters included).

        Args:
            potential_parent (str): sequence of the potential parent
        """
        local_aligner = PairwiseAligner()
        local_aligner.mode = 'local'

        # Gap characters ('-') are removed from both sequences before scoring.
        leaf_ungapped = self.seq.replace('-', '')
        parent_ungapped = potential_parent.replace('-', '')

        # Only the alignment score is needed, not the alignment itself.
        self.parent_scores[potential_parent] = local_aligner.score(
            leaf_ungapped, parent_ungapped
        )
Esempio n. 8
0
def nw_bio(seq1, seq2, cost_table):
    """Align two sequences with simple match / mismatch / gap scores.

    Parameters
    ----------
    seq1, seq2
        The sequences to align; the aligner's alphabet is the set of symbols
        occurring in their concatenation.
    cost_table
        Indexable scores: ``[0]`` match, ``[1]`` mismatch, ``[2]`` gap.

    Returns
    -------
    list
        One ``[line0, line2, score]`` entry per alignment, where ``line0`` and
        ``line2`` are the first and third lines of the alignment's printed
        form and ``score`` is the alignment score truncated to ``int``.
    """
    symbols = list(set(seq1 + seq2))
    aligner = PairwiseAligner(alphabet=symbols)
    aligner.match_score = cost_table[0]
    aligner.mismatch_score = cost_table[1]
    aligner.gap_score = cost_table[2]

    formated_alignments = []
    for alignment in aligner.align(seq1, seq2):
        text_lines = str(alignment).split("\n")
        formated_alignments.append(
            [text_lines[0], text_lines[2], int(alignment.score)]
        )
    return formated_alignments
Esempio n. 9
0
 def _remove_missing_res(self, record: SeqRecord, pdb: Path):
     """Blank out residues of ``record`` that have no counterpart in the PDB structure.

     The sequence of the structure is rebuilt by concatenating the peptides
     found by ``CaPPBuilder`` and aligned against the ungapped record
     sequence. Wherever the alignment path advances in the record sequence
     but not in the structure sequence, the corresponding residues of
     ``record.seq`` are replaced by ``'-'``.

     Args:
         record: sequence record to edit; its ``seq`` is replaced.
         pdb: path of the PDB file to parse.

     Returns:
         The same ``record`` object with the updated sequence.
     """
     structure = PDBParser().get_structure(record.id, pdb)
     peptides = CaPPBuilder().build_peptides(structure, aa_only=False)
     sequence = ''.join([str(peptide.get_sequence()) for peptide in peptides])

     best_alignment = PairwiseAligner().align(record.seq.ungap('-'), sequence)[0]
     path = best_alignment.path

     # Path steps whose second coordinate stays put are stretches of the
     # record sequence aligned to nothing; keep their (start, end) range.
     gaps = [
         (step[0], next_step[0])
         for step, next_step in zip(path[:-1], path[1:])
         if step[1] == next_step[1]
     ]

     mut = record.seq.tomutable()
     # Work from the last range to the first: blanked positions are skipped
     # by the residue counter on later passes, so earlier ranges must still
     # see their original numbering.
     for gap_start, gap_end in reversed(gaps):
         residue_index = 0
         for position, res in enumerate(mut):
             if res == '-':
                 continue
             if gap_start <= residue_index < gap_end:
                 mut[position] = '-'
             residue_index += 1
     record.seq = mut.toseq()
     return record
def perform_randomized_tests(n=1000):
    """Run randomized mapping tests and compare the results to pslMap.

    Each of the ``n`` rounds draws two random block counts between 1 and 10
    and runs ``test_random`` for all four strand combinations, followed by
    ``test_random_sequences`` for the same four combinations, giving 8 x n
    mappings in total.
    """
    aligner = PairwiseAligner()
    aligner.internal_open_gap_score = -1
    aligner.internal_extend_gap_score = -0.0
    aligner.match_score = +1
    aligner.mismatch_score = -1
    aligner.mode = "local"

    strand_pairs = (("+", "+"), ("+", "-"), ("-", "+"), ("-", "-"))
    for _ in range(n):
        nBlocks1 = random.randint(1, 10)
        nBlocks2 = random.randint(1, 10)
        for strand1, strand2 in strand_pairs:
            test_random(aligner, nBlocks1, nBlocks2, strand1, strand2)
        for strand1, strand2 in strand_pairs:
            test_random_sequences(strand1, strand2)
Esempio n. 11
0
                    type=str,
                    required=True)

# Path of the FASTA file holding the reference sequence.
parser.add_argument('-r',
                    '--reference',
                    help='Reference to be aligned to',
                    type=str,
                    required=True)

# Name written in the header line of the output file.
parser.add_argument('-n',
                    '--seq_name',
                    help='Name of the aligned sequence',
                    type=str,
                    required=True)

args = parser.parse_args()

# Global alignment: match +1, mismatch 0, gap open -2, gap extend -1.
aligner = PairwiseAligner()
aligner.mode = 'global'
aligner.match_score = 1
aligner.mismatch_score = 0
aligner.open_gap_score = -2
aligner.extend_gap_score = -1

ref = SeqIO.read(args.reference, "fasta")
# Upper-case the reference and turn its gap characters into 'N'; note that
# ref.seq is a plain str from here on.
ref.seq = str(ref.seq.upper()).replace('-', 'N')
# args.infile / args.outfile come from options declared before this excerpt.
cons = SeqIO.read(args.infile, "fasta")
aln = aligner.align(ref.seq, cons.seq)
with open(args.outfile, 'w') as out:
    # print() puts a space between ">" and the name.
    print(">", args.seq_name, file=out)
    # Third line of the first alignment's printed form -- presumably the
    # aligned `cons` row; this depends on Biopython's text layout, confirm.
    print(str(aln[0]).strip().split('\n')[2], file=out)
Esempio n. 12
0
# Pull the scoring parameters out of scores_dict; a missing entry is reported
# as a ValueError that names the absent key.
try:
    GAP_PENALTY = scores_dict['gap penalty']
    GAP_EXTEND_PENALTY = scores_dict['gap extend penalty']
    END_GAP_PENALTY = scores_dict['end gap penalty']
    END_GAP_EXTEND_PENALTY = scores_dict['end gap extend penalty']
    MATCH_SCORE = scores_dict['match score']
    MISMATCH_SCORE = scores_dict['mismatch score']
except KeyError as ex:
    raise ValueError(f"'{ex.args[0]}' is missing in data/scores.tab") from ex

# 16 x 16 substitution matrix, every cell filled by the loop below.
score_matrix = np.empty((16, 16))

# The indices are treated as bit masks: a pair scores as a match when the two
# indices share at least one set bit (so index 0 never matches anything).
# Presumably these are the 4-bit nucleotide codes of seq_read_dict below.
for i, j in itertools.product(range(0, 16), range(0, 16)):
    score_matrix[i, j] = MATCH_SCORE if i & j else MISMATCH_SCORE

# Aligner using the matrix above, with separate end-gap and internal-gap scores.
aligner = PairwiseAligner(substitution_matrix=score_matrix, end_open_gap_score=END_GAP_PENALTY,
                          end_extend_gap_score=END_GAP_EXTEND_PENALTY, internal_open_gap_score=GAP_PENALTY, internal_extend_gap_score=GAP_EXTEND_PENALTY)

seq_read_dict = {
    '-':0,
    'A':1,
    'C':2,
    'G':4,
    'T':8,
    'R':5,
    'Y':10,
    'S':6,
    'W':9,
    'K':12,
    'M':3,
    'B':14,
    'D':13,
Esempio n. 13
0
# Command-line interface: FASTA file, substitution matrix name and gap scores
# are all required.
parser = argparse.ArgumentParser(description='Computes a pairwise similarity matrix from a fasta file.')
parser.add_argument('-f',help='name of the fasta file',required=True)
parser.add_argument('-s',help='name of subsitution matrix from BioPython',required=True,choices=MatrixInfo.available_matrices)
parser.add_argument('-go',help='gap opening score',type=float,required=True)
parser.add_argument('-ge',help='gap extension score',type=float,required=True)
args = parser.parse_args()

# Read every record of the FASTA file into a list.
seqs = list(SeqIO.parse(args.f,'fasta'))

# Look up the substitution matrix by name on the MatrixInfo module.
# NOTE(review): confirm that the object found there is in a form the
# aligner's substitution_matrix attribute accepts.
substitution_matrix = getattr(MatrixInfo,args.s)

# Configure the pairwise aligner with the gap scores and the chosen matrix.
aligner = PairwiseAligner()
aligner.open_gap_score, aligner.extend_gap_score  = args.go, args.ge
aligner.substitution_matrix = substitution_matrix

# Align sequences and build matrix
def similarity_matrix(seqs, n=None):
    """Return the all-vs-all alignment score matrix of ``seqs``.

    Parameters
    ----------
    seqs
        Sequence records; each must expose the sequence as ``.seq``.
    n : int, optional
        Side length of the returned square matrix. Defaults to
        ``len(seqs)``, taken from the argument at call time.

    Returns
    -------
    numpy.ndarray
        ``n`` x ``n`` float array in which entry ``[i][j]`` is the score
        obtained by aligning ``seqs[i]`` with ``seqs[j]`` using the
        module-level ``aligner``. Cells that no pair of records reaches
        stay 0.
    """
    if n is None:
        n = len(seqs)
    scores = np.zeros([n, n])
    for i, record_i in enumerate(seqs):
        for j, record_j in enumerate(seqs):
            # Only the score is needed, so skip building the alignments.
            scores[i][j] = aligner.score(record_i.seq, record_j.seq)
    return scores

# All-vs-all score matrix for the records parsed from the FASTA file.
m = similarity_matrix(seqs)

def print_matrix(m):
args = parser.parse_args()

# Both input paths must point to existing files; parser.error() reports the
# problem and exits.
for file in (args.query_seq, args.target_seq):
    if not path.isfile(file):
        parser.error("File %s doesn't exist" % file)

# alphabet: pick the alphabet object matching the sequence type; anything
# other than 'dna' or 'rna' is treated as protein.
if args.seq_type == 'dna':
    args.seq_abc = IUPACAmbiguousDNA()
elif args.seq_type == 'rna':
    args.seq_abc = IUPACAmbiguousRNA()
else:
    args.seq_abc = ExtendedIUPACProtein()

# Aligners setup: only the global aligner is created here; the 'local' slot
# starts out as None.
aligners = {'global': PairwiseAligner(), 'local': None}
aligners['global'].mode = 'global'
if args.seq_type in ('dna', 'rna'):
    aligners['global'].match = args.match_score
    aligners['global'].mismatch = args.mismatch_score
    if not args.open_gap_score:
        args.open_gap_score = -5
    if not args.extend_gap_score:
        args.extend_gap_score = -2
else:
    sub_matrix = getattr(import_module('Bio.SubsMat.MatrixInfo'),
                         args.sub_matrix)
    aligners['global'].substitution_matrix = sub_matrix
    if not args.open_gap_score:
        args.open_gap_score = -11
    if not args.extend_gap_score:
Esempio n. 15
0
# coding: utf-8

# # DEMO: Computing a custom matrix and using it
#
# The example below shows how to compute a custom matrix (the same as in the previous exercise - [EXERCISE: Computing a substitution matrix (Python)](matrika_zamenjav.ipynb)) and how to use the resulting matrix to align two sequences.
#
# ## Computing the matrix

# In[1]:

from Bio import SeqIO
sequence1 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_horse.fasta', 'fasta')
sequence2 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_rat.fasta', 'fasta')
# the vhod folder also contains the mouse and human nebulin sequences, which are considerably longer
from Bio.Align import PairwiseAligner
# Local alignment: match +2, mismatch -3, gap open -7, gap extend -2.
aligner = PairwiseAligner()
aligner.mode = 'local'
aligner.match_score = 2
aligner.mismatch_score = -3
aligner.open_gap_score = -7
aligner.extend_gap_score = -2
alignments = aligner.align(sequence1.seq, sequence2.seq)
# Only the first reported alignment is used for counting.
alignment = alignments[0]
from Bio.Align.substitution_matrices import Array
# Two-dimensional count table indexed by pairs of letters.
# NOTE(review): the alphabet is the nucleotide set 'ACGT' while the input
# files are named after myoglobin -- confirm the inputs are nucleotide
# sequences, otherwise other letters cannot be counted.
frequency = Array('ACGT', dims=2)
# Walk the aligned (gap-free) segments pairwise and count each letter pair.
for (start1, end1), (start2, end2) in zip(*alignment.aligned):
    seq1 = sequence1[start1:end1]
    seq2 = sequence2[start2:end2]
    for c1, c2 in zip(seq1, seq2):
        frequency[c1, c2] += 1
import numpy