Esempio n. 1
0
 def test_read_write_kmers(self):
     """ Test that k-mer spectrums survive a write/read round trip.

     The spectrums of two identical sequences (k-mer size 2) are written
     to a temporary file inside self.datadir and read back. The array read
     must have the same shape as the one written and every value must
     match within a tolerance of 0.005.
     """
     sequence = "ACTGGGTATCGATGACGTATATGCATTGAGAGTACGTATGNNNACTG"
     kcounter = Kmer.KmerCounter(2)
     kcomparer = Kmer.KmerComparer(kcounter)
     spectrums = kcomparer.compute_spectrums([sequence, sequence], ["A", "B"])
     spectrums = np.array(spectrums)
     fn = os.path.join(self.datadir, "temp.x")
     try:
         Kmer.write_spectrums(spectrums, fn)
         specs = Kmer.read_spectrums(fn)
         # The loop below only visits the indices present in specs, so a
         # truncated result would pass unnoticed without this check.
         self.assertEqual(specs.shape, spectrums.shape,
                          "Problem reading/writing k-mer spectrums")
         for i, j in itertools.product(range(specs.shape[0]),
                                       range(specs.shape[1])):
             self.assertAlmostEqual(
                 specs[i][j], spectrums[i][j], delta=0.005,
                 msg="Problem reading/writing k-mer spectrums")
     finally:
         # Remove the temporary file even when an assertion fails
         if os.path.exists(fn):
             os.remove(fn)
Esempio n. 2
0
 def test_L1_distance(self):
     """ Test the L1 distance function.

     The spectrum of self.sequence is computed twice with the same counter
     for the k-mer sizes 2, 3 and 4; the distance between the two spectrums
     must be 0 within a tolerance of 1e-5.
     """
     expected_distance = 0.0
     for k in (2, 3, 4):
         counter = Kmer.KmerCounter(k)
         distance = Kmer.L1_distance(counter.get_spectrum(self.sequence),
                                     counter.get_spectrum(self.sequence))
         message = "L1 distance: {0}. Expected: {1}  k = {2}".format(
             distance, expected_distance, k)
         self.assertAlmostEqual(distance, expected_distance,
                                delta=1e-5, msg=message)
Esempio n. 3
0
 def test_kmer_counter_whit_unknown2(self):
     """ Test the counts of the kmers of size 2 obtained for
     self.seq_with_unknown, a sequence that has unknown values
     """
     expected_counts = np.array(
         [1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0])
     observed_counts = Kmer.KmerCounter(2).count(self.seq_with_unknown)
     self.check(observed_counts, expected_counts)
Esempio n. 4
0
 def test_kmer_counter1(self):
     """ Test the counts of the kmers of size 1 obtained for self.sequence
     """
     expected_counts = np.array([5, 4, 5, 6])
     observed_counts = Kmer.KmerCounter(1).count(self.sequence)
     self.check(observed_counts, expected_counts)
Esempio n. 5
0
 def test_kmer_counter2(self):
     """ Test the counts of the kmers of size 2 obtained for self.sequence
     """
     expected_counts = np.array(
         [1, 1, 1, 2, 1, 1, 0, 2, 1, 2, 1, 1, 1, 0, 3, 1])
     observed_counts = Kmer.KmerCounter(2).count(self.sequence)
     self.check(observed_counts, expected_counts)
Esempio n. 6
0
    def add_scaffold_spectrums(self, kmer_size):
        """ Calculate the k-mer spectrums for the scaffolds

            The sequences of the scaffolds are read from the scaffolds table
            and processed in batches of 1000. Each spectrum is serialized as
            its values joined with "#" and stored in the "spectrum" column,
            which is added to the table when it is not present yet. Every
            batch is committed right after being written.

            @param kmer_size The size of the kmers
            @raise ValueError if the scaffolds table does not exist
        """
        log.debug(
            "Adding a column with the k-mer spectrums to the scaffolds table")
        if not self.table_exists(self.ScaffoldsTable):
            raise ValueError(
                "Cannot add k-mer spectrums. Scaffolds table  does not exist")
        if "spectrum" not in self.get_table_column_names(self.ScaffoldsTable):
            self.add_column(self.ScaffoldsTable, "spectrum", str)

        kcounter = Kmer.KmerCounter(kmer_size)
        kcomparer = Kmer.KmerComparer(kcounter)
        sql_command = """SELECT scaffold, sequence FROM {0}""".format(
            self.ScaffoldsTable)
        update_command = """ UPDATE {0} SET spectrum=? WHERE scaffold=? """.format(
            self.ScaffoldsTable)
        batch_size = 1000

        def store_batch(batch_sequences, batch_scaffolds):
            # Compute the spectrums of one batch and write them to the table
            spectrums = kcomparer.compute_spectrums(batch_sequences,
                                                    batch_scaffolds)
            data = [("#".join(map(str, sp)), sc)
                    for sp, sc in zip(spectrums, batch_scaffolds)]
            self.executemany(update_command, data)
            self.commit()

        # NOTE(review): rows keep being fetched from this cursor while the
        # updates are committed; confirm that the database wrapper keeps the
        # SELECT valid across commits.
        cursor = self.execute(sql_command)
        sequences = []
        scaffolds = []
        record = cursor.fetchone()
        while record:
            scaffolds.append(record["scaffold"])
            sequences.append(record["sequence"])
            if len(sequences) == batch_size:
                store_batch(sequences, scaffolds)
                # Start new lists instead of clearing the old ones, in case
                # the comparer kept a reference to them
                sequences = []
                scaffolds = []
            record = cursor.fetchone()
        if sequences:
            # Last, incomplete batch
            store_batch(sequences, scaffolds)
Esempio n. 7
0
 def test_read_write_kmers(self):
     """ Test that k-mer spectrums survive a write/read round trip.

     The spectrums of two identical sequences (k-mer size 2) are written
     to a temporary file inside self.datadir and read back. The array read
     must have the same shape as the one written and every value must
     match within a tolerance of 0.005.
     """
     sequence = "ACTGGGTATCGATGACGTATATGCATTGAGAGTACGTATGNNNACTG"
     kcounter = Kmer.KmerCounter(2)
     kcomparer = Kmer.KmerComparer(kcounter)
     spectrums = kcomparer.compute_spectrums([sequence, sequence],
                                             ["A", "B"])
     spectrums = np.array(spectrums)
     fn = os.path.join(self.datadir, "temp.x")
     try:
         Kmer.write_spectrums(spectrums, fn)
         specs = Kmer.read_spectrums(fn)
         # The loop below only visits the indices present in specs, so a
         # truncated result would pass unnoticed without this check.
         self.assertEqual(specs.shape, spectrums.shape,
                          "Problem reading/writing k-mer spectrums")
         for i, j in itertools.product(range(specs.shape[0]),
                                       range(specs.shape[1])):
             self.assertAlmostEqual(
                 specs[i][j],
                 spectrums[i][j],
                 delta=0.005,
                 msg="Problem reading/writing k-mer spectrums")
     finally:
         # Remove the temporary file even when an assertion fails
         if os.path.exists(fn):
             os.remove(fn)
Esempio n. 8
0
def do_kmer_comparison(args):
    """ Compares the Kmer spectrums.

    The combined sequences of each genus, obtained from the scaffold k-mer
    comparison table, are registered in the comparer as reference sequences.
    Every scaffold of the scaffolds table that is not in the set of already
    assigned scaffolds is added to the comparer, which is run in batches of
    1000 sequences. All the results are stored back in the scaffold k-mer
    comparison table. The database is closed even if the comparison fails.

    NOTE(review): the original description says that the assigned scaffolds
    are the ones assigned using blast, but the code reads them from
    ScaffoldKmerComparisonTable - confirm which one is intended.

    @param args Object providing the attributes kmer, dist12, threshold and
                fn_database
    """
    log.info("Performing kmer comparison. Parameters: ")
    log.info("kmer size: %s dist12: %s threshold: %s", args.kmer,
             args.dist12, args.threshold)

    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)

        # add the combined sequences of the scaffolds belonging to the same
        # genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldKmerComparisonTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus], genus)

        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_assignments = []
        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            # presumably run() empties the pending sequences, otherwise this
            # check could not trigger a second time - verify in KmerComparer
            if kcomparer.get_number_of_sequences() == batch_size:
                all_assignments.extend(kcomparer.run())
            record = cursor.fetchone()
        if kcomparer.get_number_of_sequences() > 0:
            # Last, incomplete batch
            all_assignments.extend(kcomparer.run())
        db.store_data(db.ScaffoldKmerComparisonTable, all_assignments)
    finally:
        # Release the database also when the comparison raises
        db.close()
Esempio n. 9
0
def kmer_comparison_one_iteration(args):
    """ This function is the one-iteration version of the iterative function

    The scaffold k-mer comparison table is dropped (if present) and created
    again. The combined sequences of each genus, obtained from the scaffolds
    assignments table, are registered in the comparer as reference
    sequences. Every scaffold of the scaffolds table that is not in the set
    of already assigned scaffolds is added to the comparer, which is run in
    batches of 1000 sequences. Only the matches whose second element is not
    equal to False are stored in the scaffold k-mer comparison table. The
    database is closed even if the comparison fails.

    @param args Object providing the attributes fn_database, kmer, threshold
                and dist12
    """
    def reliable(matches):
        # kcomparer will return False if a reliable match has not been found.
        # The equality test (instead of "is not False") is kept on purpose so
        # that the filtering is exactly the one used so far.
        return [m for m in matches if m[1] != False]  # noqa: E712

    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.ScaffoldKmerComparisonTable in names:
            db.drop_table(db.ScaffoldKmerComparisonTable)
        db.create_scaffold_kmer_comparison_table()
        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)

        # add the combined sequences of the scaffolds belonging to the same
        # genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldsAssignmentsTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus], genus)

        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_matches = []
        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            # presumably run() empties the pending sequences, otherwise this
            # check could not trigger a second time - verify in KmerComparer
            if kcomparer.get_number_of_sequences() == batch_size:
                all_matches.extend(reliable(kcomparer.run()))
            record = cursor.fetchone()
        if kcomparer.get_number_of_sequences() > 0:
            # Last, incomplete batch
            all_matches.extend(reliable(kcomparer.run()))
        db.store_data(db.ScaffoldKmerComparisonTable, all_matches)
    finally:
        # Release the database also when the comparison raises
        db.close()
Esempio n. 10
0
 def test_spectrum_of_unique_kmers(self):
     """ Test the spectrums of unique kmers

     The sequence is the second line of thermus.fasta (the first line is
     the header). For the k-mer sizes 2, 3 and 4 the test checks that:
       - the spectrum is longer than the spectrum of unique kmers
       - both spectrums sum 1 (tolerance 0.001)
       - the kmer counts and the unique kmer counts have the same sum
     """
     fn = os.path.join(self.datadir, "thermus.fasta")
     # The context manager closes the file even if reading fails
     with open(fn, 'r') as f:
         f.readline()  # discard first line (header)
         sequence = f.readline()
     for kmersize in [2, 3, 4]:
         kmercounter = Kmer.KmerCounter(kmersize)
         spectrum = kmercounter.get_spectrum(sequence)
         uspectrum = kmercounter.get_unique_kmers_spectrum(sequence)
         self.assertGreater(len(spectrum), len(uspectrum))
         self.assertAlmostEqual(spectrum.sum(),
                                1,
                                delta=0.001,
                                msg="problem with spectrum")
         self.assertAlmostEqual(uspectrum.sum(),
                                1,
                                delta=0.001,
                                msg="problem with spectrum of unique kmers")
         counts = kmercounter.count(sequence)
         ucounts = kmercounter.get_unique_kmers_counts(sequence)
         self.assertEqual(counts.sum(), ucounts.sum(),
                          "The kmer counts must be the same")
Esempio n. 11
0
logging.root.setLevel(logging.DEBUG)
"""
    This script shows how to calculate the spectrum of a DNA sequence

"""

# read the DNA: the sequence is the line that follows the header.
# The context manager closes the file even if reading fails.
with open('thermus.fasta', 'r') as f:
    f.readline()  # discard first line (header)
    sequence = f.readline()

kmersize = 4
alphabet = "ACGT"

# The print calls below use a single argument so that the output is the
# same with the print statement (Python 2) and the print function (Python 3)
kmers = Kmer.generate_kmers(kmersize, alphabet)
print("The kmers are ")
print(kmers)
kmers = Kmer.remove_reversed(kmers)
print("the {0} unique kmers are {1}".format(len(kmers), kmers))

kmercounter = Kmer.KmerCounter(kmersize)
kmercounter.set_alphabet(alphabet)

spectrum = kmercounter.get_spectrum(sequence)
print("The length of the kmer spectrum is {0}".format(
    kmercounter.get_spectrum_length()))
print("And the spectrum is:")
print(spectrum)

spectrum = kmercounter.get_unique_kmers_spectrum(sequence)
print("The length of the spectrum of unique kmers is {0}".format(
    len(spectrum)))
Esempio n. 12
0
"""
    This script shows how to calculate the spectrum of a DNA sequence

"""

# read the DNA: the sequence is the line that follows the header.
# The context manager closes the file even if reading fails.
with open("thermus.fasta", "r") as f:
    f.readline()  # discard first line (header)
    sequence = f.readline()

kmersize = 4
alphabet = "ACGT"

# The print calls below use a single argument so that the output is the
# same with the print statement (Python 2) and the print function (Python 3)
kmers = Kmer.generate_kmers(kmersize, alphabet)
print("The kmers are ")
print(kmers)
kmers = Kmer.remove_reversed(kmers)
print("the {0} unique kmers are {1}".format(len(kmers), kmers))


kmercounter = Kmer.KmerCounter(kmersize)
kmercounter.set_alphabet(alphabet)

spectrum = kmercounter.get_spectrum(sequence)
print("The length of the kmer spectrum is {0}".format(
    kmercounter.get_spectrum_length()))
print("And the spectrum is:")
print(spectrum)