예제 #1
0
def main():
    """Print pairwise genome distances for Exercise 1 and Exercise 3.

    Aligned genomes are loaded with ``getGenome`` from FASTA files under
    ``genomes/`` and compared with ``DistanceCalculator._pairwise`` (the
    calculator is constructed with no arguments).  Exercise 1 compares the
    African and Indian genomes against the Mammoth genome; Exercise 3
    compares the Whale genome against the Cow and Hippo genomes.
    """
    calc = DistanceCalculator()

    # Exercise 1
    print("Exercise 1:")
    african = getGenome("genomes/africanAligned.fasta")
    indian = getGenome("genomes/indianAligned.fasta")
    mammoth = getGenome("genomes/mammothAligned.fasta")

    # Both distances are computed before anything is reported.
    mammoth_report = (
        ("Distance between African and Mammoth is {}.",
         calc._pairwise(african, mammoth)),
        ("Distance between Indian and Mammoth is {}.",
         calc._pairwise(indian, mammoth)),
    )
    for template, distance in mammoth_report:
        print(template.format(distance))

    # Exercise 3
    print("\nExercise 3:")
    whale = getGenome("genomes/whaleAligned.fasta")
    cow = getGenome("genomes/cowAligned.fasta")
    hippo = getGenome("genomes/hippoAligned.fasta")

    whale_report = (
        ("Distance between Whale and Cow is {}.",
         calc._pairwise(whale, cow)),
        ("Distance between Whale and Hippo is {}.",
         calc._pairwise(whale, hippo)),
    )
    for template, distance in whale_report:
        print(template.format(distance))
def calculate_weight_vector(aln_obj,
                            algorithm='pairwise',
                            calc_mx='identity',
                            repeat=1000,
                            nucl=False):
    """Compute one normalised weight per sequence of an alignment.

    Two weighting schemes are supported:

    * ``'voronoi'`` -- ``repeat`` times, a test sequence is drawn with
      ``generate_sequence_sampled_from_alignment`` and the alignment
      sequence(s) with the smallest ``DistanceCalculator._pairwise`` value
      to it receive one vote, split equally between ties.
    * ``'pairwise'`` -- a tree is built with ``tree_construct`` and each
      sequence is scored by the sum of its tree distances to every
      sequence of the alignment.

    In both schemes the raw scores are divided by their total before being
    returned.

    :param aln_obj: sized, iterable alignment whose items expose ``.seq``
        (used by 'voronoi') and ``.id`` (used by 'pairwise').
    :param algorithm: either ``'voronoi'`` or ``'pairwise'``.
    :param calc_mx: model name handed to ``DistanceCalculator`` /
        ``tree_construct``.
    :param repeat: number of sampling rounds ('voronoi' only).
    :param nucl: forwarded to ``tree_construct`` ('pairwise' only).
    :returns: list of weights, in the order of ``aln_obj``.
    :raises ValueError: if ``algorithm`` is not a supported name.
    :raises ZeroDivisionError: if the raw scores of a non-empty alignment
        sum to zero (for example ``repeat=0`` with 'voronoi').
    """
    alg_types = ['voronoi', 'pairwise']
    if algorithm not in alg_types:
        raise ValueError("Invalid algorithm type. Expected one of: %s" %
                         alg_types)

    if algorithm == 'voronoi':
        calculator = DistanceCalculator(calc_mx)
        votes = [0] * len(aln_obj)
        # A dedicated counter name: comprehension variables must never be
        # able to clobber the loop counter.
        rounds_done = 0
        while rounds_done < repeat:
            test_seq = generate_sequence_sampled_from_alignment(aln_obj)
            distances = [
                calculator._pairwise(seq_obj.seq, test_seq)
                for seq_obj in aln_obj
            ]
            shortest = min(distances)
            nearest = [
                pos for pos, dist in enumerate(distances) if dist == shortest
            ]
            # 1.0 keeps this a true division under any division semantics.
            share = 1.0 / len(nearest)
            for pos in nearest:
                votes[pos] += share
            rounds_done += 1
        # Normalise with a total computed once rather than once per element.
        total_votes = sum(votes)
        return [vote / total_votes for vote in votes]

    # Only 'pairwise' can reach this point after the validation above.
    tree = tree_construct(aln_obj, nucl=nucl, calc_mx=calc_mx)
    distance_sums = []
    for seq_obj in aln_obj:
        curr_seq_dist = 0
        for other_obj in aln_obj:
            curr_seq_dist += tree.distance(seq_obj.id, other_obj.id)
        distance_sums.append(curr_seq_dist)
    total_distance = sum(distance_sums)
    return [dist / total_distance for dist in distance_sums]
예제 #3
0
def get_co_len(msa, circular_order):
    '''
    Score a circular order.

    The score is the sum of the values returned by
    DistanceCalculator('blastn')._pairwise for every pair of sequences that
    are neighbours in the order, including the wrap-around pair
    (last, first).

    param:
        msa: list of string(sequence)
        circular_order: sequence of indices into msa, same length as msa
    return:
        sum of the neighbour scores
    '''
    assert len(msa) > 3
    assert len(msa) == len(
        circular_order), 'length of msa and circular order must be equal'

    scorer = DistanceCalculator('blastn')
    count = len(circular_order)

    # The modulo makes the final position wrap back to the first entry, so
    # the closing pair needs no separate step.
    neighbour_scores = [
        scorer._pairwise(msa[circular_order[pos]],
                         msa[circular_order[(pos + 1) % count]])
        for pos in range(count)
    ]
    return sum(neighbour_scores)
def distances_to_seq(alignment, sequence, distance_model="identity"):
    """Compute the distances from one sequence to every alignment member.

    Unlike a full sequence-sequence distance matrix, only the distances
    involving the given sequence are calculated.

    Beware: relies on a protected member of DistanceCalculator
    (``_pairwise``).

    :param alignment: A MultipleSeqAlignment object.

    :param sequence: A SeqRecord object. Must be of the same length as the
        records in the alignment.

    :param distance_model: One of either 'identity', 'blastn', or 'trans'.
        Defines the distance of a nucleotide pair. See
        Bio.Phylo.TreeConstruction.DistanceCalculator documentation.

    :returns: A list of distances between the given sequence and all sequences
        in the MSA, in the order in which the sequences are in the MSA.
    """
    calculator = DistanceCalculator(distance_model)
    distances = []
    for record in alignment:
        # The query sequence is always the first argument of the comparison.
        distances.append(calculator._pairwise(sequence, record))
    return distances
예제 #5
0
        s2 = seq_list[1]
        #s3 = seq_list[2]
        #calculate average of three gc %s
        GC_ave = sum(gc_ave) / len(gc_ave)
        #make it a string to write it to file
        GC_str_ave = str(GC_ave)
        #find average length (they are all the same)
        Len_ave = sum(len_ave) / len(len_ave)
        seq_len = Len_ave
        str_len = str(Len_ave)
        #===============================================#
        #this could be shortened with a function
        #calculate the pairwise distance between s1 and s2, multiplied by 100 to express it as a percentage rather than a decimal (the s3 comparisons and the three-way average are commented out below)
        calculator = DistanceCalculator('identity')
        #pdist s1 and s2
        pd1 = (calculator._pairwise(s1, s2)) * 100
        #pdist s1 and s3
        #pd2 = (calculator._pairwise(s1,s3))*100
        #pdist s2 and s3
        #pd3 = (calculator._pairwise(s2,s3))*100
        #pd_ave = [pd1,pd2,pd3]
        #pd_mean = numpy.mean(pd_ave)
        #pd_mean2 = str(pd_mean)
        #write all outputs to the final output file
        fh.write(file_name + '\t' + GC_str_ave + '\t' + str_len + '\t' +
                 str(pd1) + '\t' + '\n')
        fh2.close()
#close the file for appending, open below for reading
fh.close()

#Part two-print summary statistics from the gc output file