Ejemplo n.º 1
0
    def test_log_probability_full(self):
        """Check log-probabilities of two contigs under two fitted models.

        Parameters are fitted on the first three and on the last three
        records of the FASTA fixture; the first contig of each cluster is
        then scored against both parameter sets and compared with
        reference values (scaled by 1/10000).
        """
        file_name = os.path.join(cur_dir,"..","data/generated_contigs_test.fna")
        f = fileinput.input(file_name)
        try:
            c = list(SeqIO.parse(f,"fasta"))
        finally:
            # fileinput.input() keeps module-global state: if parsing raised
            # and the stream stayed open, every later fileinput.input() call
            # would fail with "input() already active".
            f.close()

        # Explicit indices (rather than slices) so that a fixture with too
        # few records still fails loudly with an IndexError.
        cluster1 = [dna.DNA(id = c[i].id, seq = str(c[i].seq))
                    for i in (0, 1, 2)]
        cluster2 = [dna.DNA(id = c[i].id, seq = str(c[i].seq))
                    for i in (-3, -2, -1)]
        for contig in cluster1 + cluster2:
            contig.calculate_signature()

        parameters1 = model.fit_nonzero_parameters(cluster1)
        parameters2 = model.fit_nonzero_parameters(cluster2)

        # These tests are probably too shaky, due to the
        # numerical optimization for finding the parameters
        s1 = cluster1[0]
        log_prob1 = model.log_probability(s1,parameters1)
        assert_almost_equal(log_prob1/10000.0, -0.450, places = 1)
        log_prob2 = model.log_probability(s1,parameters2)
        assert_almost_equal(log_prob2/10000.0,-0.4676, places = 2)

        s2 = cluster2[0]
        log_prob3 = model.log_probability(s2,parameters1)
        assert_almost_equal(log_prob3/10000.0,-0.517, places = 2)
        log_prob4 = model.log_probability(s2,parameters2)
        assert_almost_equal(log_prob4/10000.0,-0.483, places = 2)
Ejemplo n.º 2
0
    def test_choose_algorithm_fit_parameters(self):
        """Check that the ``algorithm`` argument changes the fitted parameters.

        The same two contigs are fitted once with the default algorithm
        (presumably TNC, judging by the variable name -- confirm against
        ``model``) and once with ``algorithm="bfgs"``; the sums of the two
        parameter vectors are expected to differ.
        """
        c = dna.DNA(id="ADADAD", seq='ACTTTAAACCC')
        c.calculate_signature()
        d = dna.DNA(id="ADADAD", seq='ACTTTACGAACCC')
        d.calculate_signature()

        # Each call gets its own list, exactly as before; the previously
        # unused ``dna_l`` local has been dropped.
        alpha_fit_tnc = model.fit_nonzero_parameters([c, d])
        alpha_fit_bfgs = model.fit_nonzero_parameters([c, d], algorithm="bfgs")

        assert_not_equal(sum(alpha_fit_tnc), sum(alpha_fit_bfgs))
def main(contigs_file,taxonomy_file, dir_path, kmer_length, contig_length, algorithm):
    """Score every contig against every genome and print a TSV table.

    Each genome is split into fixed-size parts and a parameter vector is
    fitted on those parts.  A contig is then scored with
    ``model.log_probability`` against each genome.  When the contig comes
    from the genome it is compared with (matching ``id``), the genome
    parts overlapping the contig are left out and the parameters are
    refitted, so the contig is not scored against a model trained on its
    own sequence.

    Parameters:
        contigs_file: passed to ``read_contigs_file`` (with start positions).
        taxonomy_file: passed to ``genome_info_from_parsed_taxonomy_file``.
        dir_path: passed to ``read_FASTA_files_no_groups`` together with
            the parsed genome metadata.
        kmer_length: passed to ``DNA.generate_kmer_hash``.
        contig_length: integer length added to a contig's start position
            to find the last genome part the contig overlaps.
        algorithm: forwarded to ``model.fit_nonzero_parameters``.

    Output:
        Writes a header row followed by ``str(score)`` for every
        contig/genome pair to standard output.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file,start_position=True)

    # Genome metadata parsed from the taxonomy file
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)

    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)

    genome_part_l = 10000
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        genome.pseudo_par = model.fit_nonzero_parameters(
            genome.parts, algorithm=algorithm)

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                s = int(contig.start_position)
                # Floor division keeps the indices integral on Python 3 as
                # well; true division would yield floats, which are not
                # valid slice bounds.
                start_part_index = s // genome_part_l
                end_part_index = (s + contig_length) // genome_part_l
                # Leave out every part the contig overlaps.  When the
                # contig lies within a single part both indices coincide,
                # so one slice expression covers both cases.
                remaining_parts = (genome.parts[:start_part_index]
                                   + genome.parts[end_part_index + 1:])
                pseudo_par = model.fit_nonzero_parameters(
                    remaining_parts, algorithm=algorithm)
            else:
                pseudo_par = genome.pseudo_par

            p_val = model.log_probability(contig, pseudo_par)
            scores.append(
                Score(p_val, contig, genome, contig.contig_id))

    # Use '\n' like the data rows below: on a text-mode stream os.linesep
    # would be translated a second time (giving "\r\r\n" on Windows).
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + '\n')
    for score in scores:
        sys.stdout.write(str(score) + '\n')
Ejemplo n.º 4
0
 def test_fit_nonzero_parameters(self):
     """Check the length and positivity of fitted parameters.

     Two short contigs are fitted with the default algorithm; the
     result must have 136 entries (presumably one per distinct k-mer
     signature bin -- confirm against ``dna``) and every entry must be
     strictly positive.
     """
     contigs = []
     for sequence in ('ACTTTAAACCC', 'AAACCCCTATC'):
         contig = dna.DNA(id="ADADAD", seq=sequence)
         contig.calculate_signature()
         contigs.append(contig)

     alpha_fit = model.fit_nonzero_parameters(contigs)

     # Output has the expected length
     assert_equal(len(alpha_fit), 136)
     # Every fitted parameter is strictly positive
     all_positive = (alpha_fit > 0).all()
     assert_equal(all_positive, True)
Ejemplo n.º 5
0
    def test_log_probability_order(self):
        """Check that a contig scores higher under its own cluster's model.

        Parameters are fitted on the first three and on the last three
        records of the FASTA fixture; the first record must get a larger
        log-probability under the parameters fitted on the first cluster
        than under those fitted on the second.
        """
        file_name = os.path.join(cur_dir,"..","data/generated_contigs_test.fna")
        f = fileinput.input(file_name)
        try:
            c = list(SeqIO.parse(f,"fasta"))
        finally:
            # fileinput.input() keeps module-global state: if parsing raised
            # and the stream stayed open, every later fileinput.input() call
            # would fail with "input() already active".
            f.close()

        # Explicit indices (rather than slices) so that a fixture with too
        # few records still fails loudly with an IndexError.
        cluster1 = [dna.DNA(id = c[i].id, seq = str(c[i].seq))
                    for i in (0, 1, 2)]
        cluster2 = [dna.DNA(id = c[i].id, seq = str(c[i].seq))
                    for i in (-3, -2, -1)]
        for contig in cluster1 + cluster2:
            contig.calculate_signature()

        parameters1 = model.fit_nonzero_parameters(cluster1)
        parameters2 = model.fit_nonzero_parameters(cluster2)

        own_contig = cluster1[0]
        log_prob1 = model.log_probability(own_contig,parameters1)
        log_prob2 = model.log_probability(own_contig,parameters2)
        assert_equal(log_prob1>log_prob2,True)