コード例 #1
0
    def add_scaffold_spectrums(self, kmer_size):
        """ Compute the k-mer spectrum of every scaffold and store it

            Every (scaffold, sequence) row of the scaffolds table is read,
            the spectrums are computed in batches, and each spectrum is
            written to the "spectrum" column as a "#"-separated string.
            The column is added to the table when it is not there yet.
            @param kmer_size The size of the kmers
            @raise ValueError if the scaffolds table does not exist
        """
        log.debug(
            "Adding a column with the k-mer spectrums to the scaffolds table")
        table = self.ScaffoldsTable
        if not self.table_exists(table):
            raise ValueError(
                "Cannot add k-mer spectrums. Scaffolds table  does not exist")
        if "spectrum" not in self.get_table_column_names(table):
            self.add_column(table, "spectrum", str)

        comparer = Kmer.KmerComparer(Kmer.KmerCounter(kmer_size))
        # NOTE(review): rows keep being fetched from this cursor while
        # updates are executed and committed through self; this presumably
        # relies on self.execute() returning an independent cursor - confirm
        cursor = self.execute(
            """SELECT scaffold, sequence FROM {0}""".format(table))
        update_sql = """ UPDATE {0} SET spectrum=? WHERE scaffold=? """.format(
            table)
        batch_size = 1000

        def store_batch(names, seqs):
            # zip() pairs each computed spectrum with the scaffold name
            # found at the same position
            computed = comparer.compute_spectrums(seqs, names)
            rows = [("#".join(str(value) for value in spectrum), name)
                    for spectrum, name in zip(computed, names)]
            self.executemany(update_sql, rows)
            self.commit()

        pending_names = []
        pending_seqs = []
        while True:
            row = cursor.fetchone()
            if not row:
                break
            pending_names.append(row["scaffold"])
            pending_seqs.append(row["sequence"])
            if len(pending_seqs) == batch_size:
                store_batch(pending_names, pending_seqs)
                pending_names = []
                pending_seqs = []
        # Whatever is left after the last full batch
        if len(pending_seqs) > 0:
            store_batch(pending_names, pending_seqs)
コード例 #2
0
 def test_read_write_kmers(self):
     """ Check that k-mer spectrums survive a write/read round trip

     The 2-mer spectrums of two copies of one sequence are written to
     "temp.x" inside self.datadir with Kmer.write_spectrums, read back
     with Kmer.read_spectrums, and compared element by element with an
     absolute tolerance of 0.005. The temporary file is removed even
     when the test fails.
     """
     sequence = "ACTGGGTATCGATGACGTATATGCATTGAGAGTACGTATGNNNACTG"
     kcounter = Kmer.KmerCounter(2)
     kcomparer = Kmer.KmerComparer(kcounter)
     spectrums = kcomparer.compute_spectrums([sequence, sequence],
                                             ["A", "B"])
     spectrums = np.array(spectrums)
     fn = os.path.join(self.datadir, "temp.x")
     try:
         Kmer.write_spectrums(spectrums, fn)
         specs = Kmer.read_spectrums(fn)
         # The shape read back from disk drives the comparison
         for i, j in itertools.product(range(specs.shape[0]),
                                       range(specs.shape[1])):
             self.assertAlmostEqual(
                 specs[i][j],
                 spectrums[i][j],
                 delta=0.005,
                 msg="Problem reading/writing k-mer spectrums")
     finally:
         # Clean up on failure too, so a broken run does not leave the
         # temporary file behind in the data directory
         if os.path.exists(fn):
             os.remove(fn)
コード例 #3
0
ファイル: kmer_comparison.py プロジェクト: javang/engr230_ngs
def do_kmer_comparison(args):
    """ Compares the Kmer spectrums.

    Compares the scaffolds assigned using blast with the not assigned
    scaffolds. The reference sequences are the per-genus sequences
    returned by db.get_genera_sequences_from() for the k-mer comparison
    table; every scaffold not listed as assigned is queued in the
    comparer and processed in batches. All the results returned by the
    comparer are stored in the k-mer comparison table.

    The database is closed even if the comparison raises.

    @param args Object with the attributes fn_database, kmer, dist12 and
                threshold
    """
    log.info("Performing kmer comparison. Parameters: ")
    log.info("kmer size: %s dist12: %s threshold: %s", args.kmer,
             args.dist12, args.threshold)

    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)

        # add the combined sequences of the scaffolds belonging to the same
        # genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldKmerComparisonTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus],
                                             genus)

        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_assignments = []
        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            # NOTE(review): presumably run() empties the comparer queue so
            # the count starts again from zero - confirm in KmerComparer
            if kcomparer.get_number_of_sequences() == batch_size:
                matches = kcomparer.run()
                all_assignments.extend(matches)
            record = cursor.fetchone()
        # Process the sequences left after the last full batch
        if kcomparer.get_number_of_sequences() > 0:
            matches = kcomparer.run()
            all_assignments.extend(matches)
        db.store_data(db.ScaffoldKmerComparisonTable, all_assignments)
    finally:
        # Release the database handle on both the success and error paths
        db.close()
コード例 #4
0
ファイル: kmer_comparison.py プロジェクト: javang/engr230_ngs
def kmer_comparison_one_iteration(args):
    """ This function is the one-iteration version of the iterative function

    The k-mer comparison table is dropped if it exists and created again.
    The reference sequences are the per-genus sequences returned by
    db.get_genera_sequences_from() for the scaffolds assignments table.
    Every scaffold not listed as assigned is queued in the comparer and
    processed in batches; only the matches whose second element is not
    False are stored in the k-mer comparison table.

    The database is closed even if the comparison raises.

    @param args Object with the attributes fn_database, kmer, dist12 and
                threshold
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.ScaffoldKmerComparisonTable in names:
            db.drop_table(db.ScaffoldKmerComparisonTable)
        db.create_scaffold_kmer_comparison_table()
        kcounter = Kmer.KmerCounter(args.kmer)
        kcomparer = Kmer.KmerComparer(kcounter)
        kcomparer.set_kmer_distance_threshold(args.threshold)
        kcomparer.set_first_to_second_distance_ratio(args.dist12)

        # add the combined sequences of the scaffolds belonging to the same
        # genera
        genus2sequence_dict, assigned_scaffolds = \
            db.get_genera_sequences_from(db.ScaffoldsAssignmentsTable)
        for genus in genus2sequence_dict:
            kcomparer.add_reference_sequence(genus2sequence_dict[genus],
                                             genus)

        sql_command = "SELECT scaffold, sequence FROM {0}".format(
            db.ScaffoldsTable)
        cursor = db.execute(sql_command)
        batch_size = 1000
        all_matches = []
        record = cursor.fetchone()
        while record:
            scaffold = record["scaffold"]
            if scaffold not in assigned_scaffolds:
                kcomparer.add_sequence(record["sequence"], scaffold)
            if kcomparer.get_number_of_sequences() == batch_size:
                matches = kcomparer.run()
                # kcomparer will return False if a reliable match has not
                # been found
                all_matches.extend([m for m in matches if m[1] != False])
            record = cursor.fetchone()
        # Process the sequences left after the last full batch
        if kcomparer.get_number_of_sequences() > 0:
            matches = kcomparer.run()
            all_matches.extend([m for m in matches if m[1] != False])
        db.store_data(db.ScaffoldKmerComparisonTable, all_matches)
    finally:
        # Release the database handle on both the success and error paths
        db.close()