Esempio n. 1
0
def bioPython_default_local_aligner(a, b):
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -7
    aligner.extend_gap_score = -2

    sequence1 = SeqIO.read('./resource/fasta' + str(a) + '.fasta', 'fasta')
    sequence2 = SeqIO.read('./resource/fasta' + str(b) + '.fasta', 'fasta')
    alignments = aligner.align(sequence1.seq, sequence2.seq)
Esempio n. 2
0
def nw_bio(seq1, seq2, cost_table):
    aligner = PairwiseAligner(alphabet=list(set(seq1 + seq2)))
    aligner.match_score = cost_table[0]
    aligner.mismatch_score = cost_table[1]
    aligner.gap_score = cost_table[2]
    alignments = aligner.align(seq1, seq2)
    formated_alignments = []

    for i in range(len(alignments)):
        als = str(alignments[i]).split("\n")
        formated_alignments.append([als[0], als[2], int(alignments[i].score)])

    return formated_alignments
Esempio n. 3
0
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the 
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster
    list
        list of int - cluster memberships of the originally input list
    """
    if len(seqlist) == 1:
        # Skip alignment if there is only one sequence
        return([seqlist], [0])
    else:
        aligner = PairwiseAligner()
        aligner.mode = "local"

        # Convert sequence list to distance matrix
        distmatrix = []
        for seq1 in seqlist:
            row = []
            for seq2 in seqlist:
                maxlen = max([len(seq1), len(seq2)])
                # Take percentage identity of pairwise alignment score (match base
                # +1, all other operations +0) over the longer sequence in pair
                idval = aligner.align(seq1, seq2).score / maxlen
                distval = 1 - idval  # convert to distance fraction
                row.append(distval)
            distmatrix.append(row)
        # Hierarchical clustering from the distance matrix
        htree = treecluster(data=None, distancematrix=array(distmatrix))
        # Find number of branches with length longer than threshold, and add 1
        # to get number of cuts
        cuts = 1 + len([htree[i].distance for i in range(len(htree))
                        if htree[i].distance > dist_threshold])
        clust_ids = list(htree.cut(cuts))
        clust_seqs_dict = defaultdict(list)
        for i in range(len(seqlist)):
            clust_seqs_dict[clust_ids[i]] += [seqlist[i]]
        # Convert dict of lists to list of lists
        clust_seqs = [clust_seqs_dict[i] for i in clust_seqs_dict]
        return(clust_seqs, clust_ids)
Esempio n. 4
0
def nw_bio_mat(seq1, seq2, cost_mat, key):
    aligner = PairwiseAligner(alphabet=key)
    matrix = {}
    for i in range(len(key)):
        for j in range(0, len(key)):
            matrix[(key[i], key[j])] = cost_mat[i * len(key) + j]
    aligner.substitution_matrix = substitution_matrices.Array(data=matrix)
    aligner.gap_score = cost_mat[len(key)**2]
    alignments = aligner.align(seq1, seq2)
    formated_alignments = []

    for i in range(len(alignments)):
        als = str(alignments[i]).split("\n")
        formated_alignments.append([als[0], als[2], int(alignments[i].score)])

    return formated_alignments
Esempio n. 5
0
                    type=str,
                    required=True)

parser.add_argument('-r',
                    '--reference',
                    help='Reference to be aligned to',
                    type=str,
                    required=True)

parser.add_argument('-n',
                    '--seq_name',
                    help='Name of the aligned sequence',
                    type=str,
                    required=True)

args = parser.parse_args()

aligner = PairwiseAligner()
aligner.mode = 'global'
aligner.match_score = 1
aligner.mismatch_score = 0
aligner.open_gap_score = -2
aligner.extend_gap_score = -1

ref = SeqIO.read(args.reference, "fasta")
ref.seq = str(ref.seq.upper()).replace('-', 'N')
cons = SeqIO.read(args.infile, "fasta")
aln = aligner.align(ref.seq, cons.seq)
with open(args.outfile, 'w') as out:
    print(">", args.seq_name, file=out)
    print(str(aln[0]).strip().split('\n')[2], file=out)
Esempio n. 6
0
# Parse fasta file #
seqs = list(SeqIO.parse(args.f,'fasta'))

# Get substitution matrix 
substitution_matrix = getattr(MatrixInfo,args.s)

#Pairwise alignment 
aligner = PairwiseAligner()
aligner.open_gap_score, aligner.extend_gap_score  = args.go, args.ge
aligner.substitution_matrix = substitution_matrix

# Align sequences and build matrix
def similarity_matrix(seqs,n=len(seqs)):
  similarity_matrix = np.zeros([n,n])
	for i in range(len(seqs)):
    for j in range(len(seqs)):
			alignment = aligner.align(seqs[i].seq,seqs[j].seq)
			similarity_matrix[i][j] = alignment.score
	return similarity_matrix

m = similarity_matrix(seqs)

def print_matrix(m):
	for i in m:
		row = ""
		for j in i:
			row+=str(j)+'\t'
		print(row)

print_matrix(m)
Esempio n. 7
0
# ## Izračun matrike

# In[1]:

from Bio import SeqIO
sequence1 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_horse.fasta', 'fasta')
sequence2 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_rat.fasta', 'fasta')
# v mapi vhod sta tudi zaporedji mišjega in človeškega nebulina, ki sta bistveno daljši
from Bio.Align import PairwiseAligner
aligner = PairwiseAligner()
aligner.mode = 'local'
aligner.match_score = 2
aligner.mismatch_score = -3
aligner.open_gap_score = -7
aligner.extend_gap_score = -2
alignments = aligner.align(sequence1.seq, sequence2.seq)
alignment = alignments[0]
from Bio.Align.substitution_matrices import Array
frequency = Array('ACGT', dims=2)
for (start1, end1), (start2, end2) in zip(*alignment.aligned):
    seq1 = sequence1[start1:end1]
    seq2 = sequence2[start2:end2]
    for c1, c2 in zip(seq1, seq2):
        frequency[c1, c2] += 1
import numpy
probabilities = frequency / numpy.sum(frequency)
probabilities = (probabilities + probabilities.transpose()) / 2.0
background = numpy.sum(probabilities, 0)
expected = numpy.dot(background[:, None], background[None, :])
oddsratios = probabilities / expected
scoring_matrix = numpy.log2(oddsratios)