def main():
    """Print the pairwise genome distances for Exercise 1 and Exercise 3.

    Each exercise loads three aligned FASTA genomes with ``getGenome`` and
    reports two distances computed by ``DistanceCalculator._pairwise``
    (a protected member of the calculator).
    """
    calc = DistanceCalculator()

    # Exercise 1: African and Indian genomes, each compared with Mammoth.
    print("Exercise 1:")
    african, indian, mammoth = [
        getGenome(path)
        for path in (
            "genomes/africanAligned.fasta",
            "genomes/indianAligned.fasta",
            "genomes/mammothAligned.fasta",
        )
    ]
    african_vs_mammoth = calc._pairwise(african, mammoth)
    indian_vs_mammoth = calc._pairwise(indian, mammoth)
    print("Distance between African and Mammoth is {}.".format(african_vs_mammoth))
    print("Distance between Indian and Mammoth is {}.".format(indian_vs_mammoth))

    # Exercise 3: Whale compared with Cow and with Hippo.
    print("\nExercise 3:")
    whale, cow, hippo = [
        getGenome(path)
        for path in (
            "genomes/whaleAligned.fasta",
            "genomes/cowAligned.fasta",
            "genomes/hippoAligned.fasta",
        )
    ]
    whale_vs_cow = calc._pairwise(whale, cow)
    whale_vs_hippo = calc._pairwise(whale, hippo)
    print("Distance between Whale and Cow is {}.".format(whale_vs_cow))
    print("Distance between Whale and Hippo is {}.".format(whale_vs_hippo))
def calculate_weight_vector(aln_obj, algorithm='pairwise', calc_mx='identity',
                            repeat=1000, nucl=False):
    """Return one normalised weight per sequence of an alignment.

    Two weighting schemes are available:

    * ``'voronoi'`` -- for ``repeat`` rounds, a test sequence is drawn with
      ``generate_sequence_sampled_from_alignment`` and its distance to every
      alignment sequence is computed with ``DistanceCalculator._pairwise``.
      The sequence(s) at minimal distance share one vote equally. The vote
      totals are divided by their sum.
    * ``'pairwise'`` -- a tree is built with ``tree_construct`` and, for each
      sequence, ``tree.distance`` to every sequence of the alignment (itself
      included) is summed. The sums are divided by their grand total.

    :param aln_obj: Alignment to weight. It is iterated, ``len()`` is taken
        of it, and its items are read through ``.seq`` and ``.id``
        (presumably a Biopython MultipleSeqAlignment -- confirm with callers).
    :param algorithm: Either ``'voronoi'`` or ``'pairwise'``.
    :param calc_mx: Distance model handed to ``DistanceCalculator`` and
        ``tree_construct``.
    :param repeat: Number of sampling rounds; used by ``'voronoi'`` only.
    :param nucl: Forwarded to ``tree_construct``; used by ``'pairwise'`` only.
    :returns: A list of weights in alignment order.
    :raises ValueError: If ``algorithm`` is not one of the supported names.
    :raises ZeroDivisionError: If the normalising total is zero for a
        non-empty alignment (e.g. ``repeat`` <= 0 for ``'voronoi'``, or all
        summed tree distances being zero for ``'pairwise'``).
    """
    alg_types = ['voronoi', 'pairwise']
    if algorithm not in alg_types:
        raise ValueError("Invalid algorithm type. Expected one of: %s" % alg_types)

    if algorithm == 'voronoi':
        calculator = DistanceCalculator(calc_mx)
        votes = [0] * len(aln_obj)
        # A dedicated counter name keeps the round count separate from the
        # index variables used in the comprehensions below.
        rounds_done = 0
        while rounds_done < repeat:
            test_seq = generate_sequence_sampled_from_alignment(aln_obj)
            distances = [
                calculator._pairwise(seq_obj.seq, test_seq)
                for seq_obj in aln_obj
            ]
            nearest = min(distances)
            winners = [
                idx for idx, dist in enumerate(distances) if dist == nearest
            ]
            # Ties split the single vote of this round evenly.
            share = 1 / len(winners)
            for idx in winners:
                votes[idx] += share
            rounds_done += 1
        # Compute the normaliser once rather than once per element.
        total_votes = sum(votes)
        return [vote / total_votes for vote in votes]

    # Only 'pairwise' can reach this point after the validation above.
    tree = tree_construct(aln_obj, nucl=nucl, calc_mx=calc_mx)
    distance_sums = [
        sum(tree.distance(seq_obj.id, other.id) for other in aln_obj)
        for seq_obj in aln_obj
    ]
    # Compute the normaliser once rather than once per element.
    grand_total = sum(distance_sums)
    return [dist_sum / grand_total for dist_sum in distance_sums]
def get_co_len(msa, circular_order):
    '''
    Score a circular order.

    The score is the sum of the 'blastn' pairwise distances between every
    pair of neighbours in the order, including the wrap-around pair formed
    by the last and the first entry.

    param:
        msa: list of string(sequence)
        circular_order: indices into msa, one per sequence
    '''
    assert len(msa) > 3
    assert len(msa) == len(
        circular_order), 'length of msa and circular order must be equal'

    scorer = DistanceCalculator('blastn')
    n_items = len(circular_order)
    total = 0
    for pos in range(n_items):
        # The modulo makes the last entry pair up with the first one.
        current = circular_order[pos]
        following = circular_order[(pos + 1) % n_items]
        total += scorer._pairwise(msa[current], msa[following])
    return total
def distances_to_seq(alignment, sequence, distance_model="identity"):
    """Compute the distances from one sequence to every sequence of an MSA.

    Use this when the complete sequence-sequence distance matrix is not
    needed, only the distances to one particular sequence.

    Beware: this calls ``_pairwise``, a protected member of
    DistanceCalculator.

    :param alignment: A MultipleSeqAlignment object.
    :param sequence: A SeqRecord object. Must be of the same length as the
        records in the alignment.
    :param distance_model: One of either 'identity', 'blastn', or 'trans'.
        Defines the distance of a nucleotide pair. See
        Bio.Phylo.TreeConstruction.DistanceCalculator documentation.
    :returns: A list of distances between the given sequence and all
        sequences in the MSA, in the order in which the sequences are in
        the MSA.
    """
    calculator = DistanceCalculator(distance_model)
    distances = []
    for record in alignment:
        distances.append(calculator._pairwise(sequence, record))
    return distances
s2 = seq_list[1] #s3 = seq_list[2] #calculate average of three gc %s GC_ave = sum(gc_ave) / len(gc_ave) #make it a string to write it to file GC_str_ave = str(GC_ave) #find average length (they are all the same) Len_ave = sum(len_ave) / len(len_ave) seq_len = Len_ave str_len = str(Len_ave) #===============================================# #this could be shortened with a function #calculate pairwise distance, and find average across the 3 seqs in each file, I multiple by 100 to make it a number rather than a decimal calculator = DistanceCalculator('identity') #pdist s1 and s2 pd1 = (calculator._pairwise(s1, s2)) * 100 #pdist s1 and s3 #pd2 = (calculator._pairwise(s1,s3))*100 #pdist s2 and s3 #pd3 = (calculator._pairwise(s2,s3))*100 #pd_ave = [pd1,pd2,pd3] #pd_mean = numpy.mean(pd_ave) #pd_mean2 = str(pd_mean) #write all outputs to the final output file fh.write(file_name + '\t' + GC_str_ave + '\t' + str_len + '\t' + str(pd1) + '\t' + '\n') fh2.close() #close the file for appending, open below for reading fh.close() #Part two-print summary statistics from the gc output file