def add_scaffold_spectrums(self, kmer_size):
    """ Calculate the k-mer spectrums for the scaffolds

        The sequences of the scaffolds are read from the database in batches
        and the spectrum of each scaffold is written to the "spectrum" column
        of the scaffolds table. The column is added when the table does not
        have it yet. A spectrum is stored as text: its values converted to
        strings and joined with "#".

        @param kmer_size The size of the kmers
        @raise ValueError if the scaffolds table does not exist
    """
    log.debug(
        "Adding a column with the k-mer spectrums to the scaffolds table")
    if not self.table_exists(self.ScaffoldsTable):
        raise ValueError(
            "Cannot add k-mer spectrums. Scaffolds table does not exist")
    if "spectrum" not in self.get_table_column_names(self.ScaffoldsTable):
        self.add_column(self.ScaffoldsTable, "spectrum", str)

    kcounter = Kmer.KmerCounter(kmer_size)
    kcomparer = Kmer.KmerComparer(kcounter)
    sql_command = """SELECT scaffold, sequence FROM {0}""".format(
        self.ScaffoldsTable)
    update_command = """ UPDATE {0} SET spectrum=? WHERE scaffold=? """.format(
        self.ScaffoldsTable)

    def store_batch(batch_sequences, batch_scaffolds):
        # Compute the spectrums of one batch and write them to the table.
        # Used both for full batches and for the last, partial one.
        spectrums = kcomparer.compute_spectrums(batch_sequences,
                                                batch_scaffolds)
        data = [("#".join(map(str, sp)), sc)
                for sp, sc in zip(spectrums, batch_scaffolds)]
        self.executemany(update_command, data)
        self.commit()

    cursor = self.execute(sql_command)
    batch_size = 1000
    sequences = []
    scaffolds = []
    record = cursor.fetchone()
    while record:
        scaffolds.append(record["scaffold"])
        sequences.append(record["sequence"])
        if len(sequences) == batch_size:
            store_batch(sequences, scaffolds)
            # start a fresh batch
            sequences = []
            scaffolds = []
        record = cursor.fetchone()
    # scaffolds left over after the last full batch
    if sequences:
        store_batch(sequences, scaffolds)
def test_read_write_kmers(self):
    """ test read/write kmers

        The spectrums of two identical sequences are written to a temporary
        file in the data directory and read back. Every value read must
        match the value written within a tolerance of 0.005.
    """
    sequence = "ACTGGGTATCGATGACGTATATGCATTGAGAGTACGTATGNNNACTG"
    kcounter = Kmer.KmerCounter(2)
    kcomparer = Kmer.KmerComparer(kcounter)
    spectrums = kcomparer.compute_spectrums([sequence, sequence], ["A", "B"])
    spectrums = np.array(spectrums)
    fn = os.path.join(self.datadir, "temp.x")
    try:
        Kmer.write_spectrums(spectrums, fn)
        specs = Kmer.read_spectrums(fn)
        for i, j in itertools.product(range(specs.shape[0]),
                                      range(specs.shape[1])):
            self.assertAlmostEqual(
                specs[i][j], spectrums[i][j], delta=0.005,
                msg="Problem reading/writing k-mer spectrums")
    finally:
        # Remove the temporary file even when writing, reading or an
        # assertion fails, so a failed run does not leave it behind.
        if os.path.exists(fn):
            os.remove(fn)
def do_kmer_comparison(args):
    """ Compares the Kmer spectrums.

        Compares the scaffolds assigned using blast with the not assigned
        scaffolds. The reference sequences are the per-genus sequences
        returned by the database for the scaffold k-mer comparison table.
        Scaffolds that are not yet assigned are compared in batches and
        all the results are stored in the scaffold k-mer comparison table.

        @param args Object with the attributes fn_database (database file),
                    kmer (k-mer size), dist12 (first-to-second distance
                    ratio) and threshold (k-mer distance threshold)
    """
    log.info("Performing kmer comparison. Parameters: ")
    log.info("kmer size: %s dist12: %s threshold: %s", args.kmer,
             args.dist12, args.threshold)
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)

        # add the combined sequences of the scaffolds belonging to the
        # same genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldKmerComparisonTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus],
                                             genus)

        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_assignments = []
        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            # run the comparison each time a full batch has been queued
            if kcomparer.get_number_of_sequences() == batch_size:
                matches = kcomparer.run()
                all_assignments.extend(matches)
            record = cursor.fetchone()
        # sequences left over after the last full batch
        if kcomparer.get_number_of_sequences() > 0:
            matches = kcomparer.run()
            all_assignments.extend(matches)
        db.store_data(db.ScaffoldKmerComparisonTable, all_assignments)
    finally:
        # close the database even if the comparison fails midway
        db.close()
def kmer_comparison_one_iteration(args):
    """ This function is the one-iteration version of the iterative function

        The scaffold k-mer comparison table is dropped if it exists and
        created again. The reference sequences are the per-genus sequences
        returned by the database for the scaffolds assignments table.
        Scaffolds that are not yet assigned are compared in batches and
        only the matches whose second element is not False are stored.

        @param args Object with the attributes fn_database (database file),
                    kmer (k-mer size), dist12 (first-to-second distance
                    ratio) and threshold (k-mer distance threshold)
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        # start from an empty comparison table
        names = db.get_tables_names()
        if db.ScaffoldKmerComparisonTable in names:
            db.drop_table(db.ScaffoldKmerComparisonTable)
        db.create_scaffold_kmer_comparison_table()

        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)

        # add the combined sequences of the scaffolds belonging to the
        # same genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldsAssignmentsTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus],
                                             genus)

        def reliable(matches):
            # kcomparer will return False if a reliable match has not been
            # found. NOTE(review): "!= False" is kept on purpose instead of
            # "is not False"; the type of m[1] is decided by
            # KmerComparer.run() and an identity test could change which
            # matches are kept - confirm before changing.
            return [m for m in matches if m[1] != False]  # noqa: E712

        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_matches = []
        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            # run the comparison each time a full batch has been queued
            if kcomparer.get_number_of_sequences() == batch_size:
                all_matches.extend(reliable(kcomparer.run()))
            record = cursor.fetchone()
        # sequences left over after the last full batch
        if kcomparer.get_number_of_sequences() > 0:
            all_matches.extend(reliable(kcomparer.run()))
        db.store_data(db.ScaffoldKmerComparisonTable, all_matches)
    finally:
        # close the database even if the comparison fails midway
        db.close()