def test_read_write_kmers(self):
    """
    Test that k-mer spectrums survive a write/read round trip.

    The spectrums of two copies of the same sequence are computed, written
    to a temporary file inside self.datadir with Kmer.write_spectrums and
    read back with Kmer.read_spectrums. Every value read must match the
    value written within a tolerance of 0.005.
    """
    sequence = "ACTGGGTATCGATGACGTATATGCATTGAGAGTACGTATGNNNACTG"
    kcounter = Kmer.KmerCounter(2)
    kcomparer = Kmer.KmerComparer(kcounter)
    spectrums = kcomparer.compute_spectrums([sequence, sequence], ["A", "B"])
    spectrums = np.array(spectrums)
    fn = os.path.join(self.datadir, "temp.x")
    try:
        Kmer.write_spectrums(spectrums, fn)
        specs = Kmer.read_spectrums(fn)
        # Only the index range of the array read back is visited
        for i, j in itertools.product(range(specs.shape[0]),
                                      range(specs.shape[1])):
            self.assertAlmostEqual(
                specs[i][j], spectrums[i][j], delta=0.005,
                msg="Problem reading/writing k-mer spectrums")
    finally:
        # Clean up the temporary file even when the test fails, so a
        # failed run does not leave stale data in the data directory
        if os.path.exists(fn):
            os.remove(fn)
def test_L1_distance(self):
    """
    Test the L1 distance function.

    The spectrum of self.sequence is computed twice for k-mer sizes
    2, 3 and 4; the distance between both copies must be 0.
    """
    expected_distance = 0.0
    for k in (2, 3, 4):
        counter = Kmer.KmerCounter(k)
        first = counter.get_spectrum(self.sequence)
        second = counter.get_spectrum(self.sequence)
        d = Kmer.L1_distance(first, second)
        message = "L1 distance: {0}. Expected: {1} k = {2}".format(
            d, expected_distance, k)
        self.assertAlmostEqual(d, expected_distance, delta=1e-5, msg=message)
def test_kmer_counter_whit_unknown2(self):
    """
    Test the counts of kmers of size 2 for self.seq_with_unknown,
    a sequence that has unknown values
    """
    expected = np.array([1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0])
    result = Kmer.KmerCounter(2).count(self.seq_with_unknown)
    self.check(result, expected)
def test_kmer_counter1(self):
    """
    Test the counts of kmers of size 1 for self.sequence
    """
    expected = np.array([5, 4, 5, 6])
    result = Kmer.KmerCounter(1).count(self.sequence)
    self.check(result, expected)
def test_kmer_counter2(self):
    """
    Test the counts of kmers of size 2 for self.sequence
    """
    expected = np.array([1, 1, 1, 2, 1, 1, 0, 2, 1, 2, 1, 1, 1, 0, 3, 1])
    result = Kmer.KmerCounter(2).count(self.sequence)
    self.check(result, expected)
def add_scaffold_spectrums(self, kmer_size):
    """
    Calculate the k-mer spectrums for the scaffolds

    The sequences of the scaffolds are read from the scaffolds table and
    processed in batches. The spectrum of each scaffold is stored in the
    "spectrum" column of the same table as a string with its values
    joined by "#". The column is created if it does not exist.

    @param kmer_size The size of the kmers
    @raise ValueError if the scaffolds table does not exist
    """
    log.debug(
        "Adding a column with the k-mer spectrums to the scaffolds table")
    if not self.table_exists(self.ScaffoldsTable):
        raise ValueError(
            "Cannot add k-mer spectrums. Scaffolds table does not exist")
    if "spectrum" not in self.get_table_column_names(self.ScaffoldsTable):
        self.add_column(self.ScaffoldsTable, "spectrum", str)
    kcounter = Kmer.KmerCounter(kmer_size)
    kcomparer = Kmer.KmerComparer(kcounter)
    sql_command = """SELECT scaffold, sequence FROM {0}""".format(
        self.ScaffoldsTable)
    update_command = """ UPDATE {0} SET spectrum=? WHERE scaffold=? """.format(
        self.ScaffoldsTable)

    def store_batch(sequences, scaffolds):
        # Compute the spectrums of one batch and write them to the table
        spectrums = kcomparer.compute_spectrums(sequences, scaffolds)
        data = [("#".join(map(str, sp)), sc)
                for sp, sc in zip(spectrums, scaffolds)]
        self.executemany(update_command, data)
        self.commit()

    # NOTE(review): rows are updated and committed while the SELECT cursor
    # is still being read; confirm that the cursor returned by
    # self.execute() stays valid across self.commit()
    cursor = self.execute(sql_command)
    batch_size = 1000
    sequences = []
    scaffolds = []
    record = cursor.fetchone()
    while record:
        scaffolds.append(record["scaffold"])
        sequences.append(record["sequence"])
        if len(sequences) == batch_size:
            store_batch(sequences, scaffolds)
            sequences = []
            scaffolds = []
        record = cursor.fetchone()
    # Store the last, incomplete batch
    if sequences:
        store_batch(sequences, scaffolds)
def test_read_write_kmers(self):
    """
    Test that k-mer spectrums survive a write/read round trip.

    The spectrums of two copies of the same sequence are computed, written
    to a temporary file inside self.datadir with Kmer.write_spectrums and
    read back with Kmer.read_spectrums. Every value read must match the
    value written within a tolerance of 0.005.
    """
    sequence = "ACTGGGTATCGATGACGTATATGCATTGAGAGTACGTATGNNNACTG"
    kcounter = Kmer.KmerCounter(2)
    kcomparer = Kmer.KmerComparer(kcounter)
    spectrums = kcomparer.compute_spectrums([sequence, sequence],
                                            ["A", "B"])
    spectrums = np.array(spectrums)
    fn = os.path.join(self.datadir, "temp.x")
    try:
        Kmer.write_spectrums(spectrums, fn)
        specs = Kmer.read_spectrums(fn)
        rows = range(specs.shape[0])
        columns = range(specs.shape[1])
        for i, j in itertools.product(rows, columns):
            self.assertAlmostEqual(
                specs[i][j], spectrums[i][j], delta=0.005,
                msg="Problem reading/writing k-mer spectrums")
    finally:
        # Remove the temporary file even if writing, reading or one of
        # the assertions fails
        if os.path.exists(fn):
            os.remove(fn)
def do_kmer_comparison(args):
    """
    Compares the Kmer spectrums.

    The combined sequences of the genera obtained from the scaffold k-mer
    comparison table are used as references. Every scaffold of the
    scaffolds table that is not already assigned is compared against them,
    in batches, and the results are stored in the scaffold k-mer
    comparison table.

    @param args Object with the attributes fn_database, kmer, threshold
                and dist12
    """
    log.info("Performing kmer comparison. Parameters: ")
    log.info("kmer size: %s dist12: %s threshold: %s", args.kmer,
             args.dist12, args.threshold)
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)
        # add the combined sequences of the scaffolds belonging to the
        # same genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldKmerComparisonTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus],
                                             genus)
        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_assignments = []

        def run_pending():
            # Compare the sequences queued in the comparer and keep the
            # results.
            # NOTE(review): unlike kmer_comparison_one_iteration, matches
            # are not filtered before being stored; confirm it is intended
            all_assignments.extend(kcomparer.run())

        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            if kcomparer.get_number_of_sequences() == batch_size:
                run_pending()
            record = cursor.fetchone()
        # Process the last, incomplete batch
        if kcomparer.get_number_of_sequences() > 0:
            run_pending()
        db.store_data(db.ScaffoldKmerComparisonTable, all_assignments)
    finally:
        # Release the database even if the comparison fails
        db.close()
def kmer_comparison_one_iteration(args):
    """
    This function is the one-iteration version of the iterative function

    The scaffold k-mer comparison table is dropped (if present) and
    created again. The combined sequences of the genera obtained from the
    scaffolds assignments table are used as references. Every scaffold of
    the scaffolds table that is not already assigned is compared against
    them, in batches, and the matches whose second element is not False
    are stored in the scaffold k-mer comparison table.

    @param args Object with the attributes fn_database, kmer, threshold
                and dist12
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.ScaffoldKmerComparisonTable in names:
            db.drop_table(db.ScaffoldKmerComparisonTable)
        db.create_scaffold_kmer_comparison_table()
        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)
        # add the combined sequences of the scaffolds belonging to the
        # same genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldsAssignmentsTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus],
                                             genus)
        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_matches = []

        def run_pending():
            # Compare the sequences queued in the comparer and keep the
            # reliable matches only
            matches = kcomparer.run()
            # kcomparer will return False if a reliable match has not
            # been found. The comparison is kept as "!= False" on purpose,
            # to filter exactly the same values as before
            all_matches.extend(
                [m for m in matches if m[1] != False])  # noqa: E712

        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            if kcomparer.get_number_of_sequences() == batch_size:
                run_pending()
            record = cursor.fetchone()
        # Process the last, incomplete batch
        if kcomparer.get_number_of_sequences() > 0:
            run_pending()
        db.store_data(db.ScaffoldKmerComparisonTable, all_matches)
    finally:
        # Release the database even if the comparison fails
        db.close()
def test_spectrum_of_unique_kmers(self):
    """
    Test the spectrums of unique kmers

    For k-mer sizes 2, 3 and 4, using the second line of thermus.fasta
    as sequence:
    - the spectrum of unique kmers must be shorter than the full spectrum
    - both spectrums must sum 1 (within 0.001)
    - the counts of kmers and of unique kmers must have the same sum
    """
    fn = os.path.join(self.datadir, "thermus.fasta")
    # The context manager closes the file even if reading fails
    with open(fn, 'r') as f:
        f.readline()  # discard first line (header)
        # NOTE(review): readline() keeps the trailing newline, if any
        sequence = f.readline()
    for kmersize in [2, 3, 4]:
        kmercounter = Kmer.KmerCounter(kmersize)
        spectrum = kmercounter.get_spectrum(sequence)
        uspectrum = kmercounter.get_unique_kmers_spectrum(sequence)
        self.assertGreater(len(spectrum), len(uspectrum))
        self.assertAlmostEqual(spectrum.sum(), 1, delta=0.001,
                               msg="problem with spectrum")
        self.assertAlmostEqual(uspectrum.sum(), 1, delta=0.001,
                               msg="problem with spectrum of unique kmers")
        counts = kmercounter.count(sequence)
        ucounts = kmercounter.get_unique_kmers_counts(sequence)
        self.assertEqual(counts.sum(), ucounts.sum(),
                         "The kmer counts must be the same")
logging.root.setLevel(logging.DEBUG)
""" This script shows how to calculate the spectrum of a DNA sequence """

# read the DNA. The context manager closes the file even if reading fails
with open('thermus.fasta', 'r') as f:
    f.readline()  # discard first line (header)
    sequence = f.readline()

kmersize = 4
alphabet = "ACGT"

# All the kmers of the given size, and the ones left after removing the
# reversed ones.
# Every print call below receives one single argument, so the text printed
# is the same with the print statement (Python 2) and the print function
kmers = Kmer.generate_kmers(kmersize, alphabet)
print("The kmers are ")
print(kmers)
kmers = Kmer.remove_reversed(kmers)
print("the %s unique kmers are %s" % (len(kmers), kmers))

# Spectrum of the sequence and spectrum of its unique kmers
kmercounter = Kmer.KmerCounter(kmersize)
kmercounter.set_alphabet(alphabet)
spectrum = kmercounter.get_spectrum(sequence)
print("The length of the kmer spectrum is %s" %
      (kmercounter.get_spectrum_length(),))
print("And the spectrum is:")
print(spectrum)
spectrum = kmercounter.get_unique_kmers_spectrum(sequence)
print("The length of the spectrum of unique kmers is %s" % (len(spectrum),))

""" This script shows how to calculate the spectrum of a DNA sequence """

# read the DNA. The context manager closes the file even if reading fails
with open("thermus.fasta", "r") as f:
    f.readline()  # discard first line (header)
    sequence = f.readline()

kmersize = 4
alphabet = "ACGT"

# All the kmers of the given size, and the ones left after removing the
# reversed ones
kmers = Kmer.generate_kmers(kmersize, alphabet)
print("The kmers are ")
print(kmers)
kmers = Kmer.remove_reversed(kmers)
print("the %s unique kmers are %s" % (len(kmers), kmers))

# Spectrum of the sequence
kmercounter = Kmer.KmerCounter(kmersize)
kmercounter.set_alphabet(alphabet)
spectrum = kmercounter.get_spectrum(sequence)
print("The length of the kmer spectrum is %s" %
      (kmercounter.get_spectrum_length(),))
print("And the spectrum is:")
print(spectrum)