Beispiel #1
0
def plot_label_propagation(args):
    """ Plot the scaffolds labelled with their label-propagation genus.

    Reads coverage, GC content and length of the scaffolds joined with the
    genus and probability stored in the label propagation results table.
    A scaffold keeps its genus only when its probability is greater than
    args.lbl_prob; otherwise it is labelled with defs.not_assigned.
    The values are handed to Plots.fig3 together with args.fn_plot.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.coverage, {0}.GC, {0}.length, {1}.genus, {1}.probability
                     FROM {0}
                     INNER JOIN {1}
                     WHERE {0}.scaffold = {1}.scaffold
                  """.format(database.ScaffoldsTable,
                             database.LabelPropagationResultsTable)
    rows = database.retrieve_data(query)
    database.close()

    coverages, gc_contents, lengths, genera = [], [], [], []
    for row in rows:
        # Only confident assignments keep their genus
        is_confident = row["probability"] > args.lbl_prob
        genera.append(row["genus"] if is_confident else defs.not_assigned)
        coverages.append(row["coverage"])
        gc_contents.append(row["GC"])
        lengths.append(row["length"])

    # Test figure (coverage vs coverage/GC)
    Plots.fig3(coverages, gc_contents, lengths, genera, args.fn_plot)
Beispiel #2
0
def plot_kmeans_clusters(args):
    """ Plot the scaffolds labelled with their k-means cluster.

    Coverage, GC content and length of each scaffold are read together with
    the cluster stored in the k-means results table (rows ordered by
    scaffold) and handed to Plots.fig2 together with args.fn_plot.
    """
    log.info("Plotting the K-means clusters")
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length, {1}.cluster
                     FROM {0}
                     INNER JOIN {1}
                     WHERE {0}.scaffold = {1}.scaffold
                     ORDER BY {0}.scaffold
                  """.format(database.ScaffoldsTable,
                             database.KmeansResultsTable)
    rows = database.retrieve_data(query)
    database.close()

    # One parallel list per plotted magnitude
    coverages, gc_contents, lengths, clusters = [], [], [], []
    for row in rows:
        coverages.append(row["coverage"])
        gc_contents.append(row["GC"])
        lengths.append(row["length"])
        clusters.append(row["cluster"])
    Plots.fig2(coverages, gc_contents, lengths, clusters, args.fn_plot)
def go(args):
    """ Write one FASTA file per marker COG with the sequences of its genes.

    The COG identifiers are the first column of the space-delimited file
    args.fn_marker_cogs; the first line of that file is skipped. For each
    COG, the genes with that cog_id are read from the database
    args.fn_database together with their sequences and written to
    "<cog>.faa" (path relative to the working directory). Each record uses
    ">gene_id,cog_id" as header line followed by the sequence.

    Raises:
        ValueError: if the file does not provide any marker COG.
    """
    # Read file marker cogs. The context manager closes the file even when
    # the ValueError below is raised.
    with open(args.fn_marker_cogs, "rU") as fhandle:
        reader = csv.reader(fhandle, delimiter=" ")
        # ignore comment; the default avoids StopIteration on an empty file
        next(reader, None)
        # csv yields an empty list for blank lines: skip them
        markercogs = [row[0] for row in reader if row]
    if not markercogs:
        raise ValueError("No marker COGs provided")

    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        for cog in markercogs:
            log.info(
                "Getting the sequences of all the genes belonging to COG %s",
                cog)
            sql_command = """SELECT {0}.gene_id, {0}.cog_id, {1}.sequence
                         FROM {0}
                         INNER JOIN {1}
                         WHERE {0}.cog_id="{2}" AND {0}.gene_id={1}.gene_id
                        """.format(db.GenesTable, db.SequenceTable, cog)
            data = db.retrieve_data(sql_command)
            with open("{0}.faa".format(cog), "w") as fhandle:
                for row in data:
                    fhandle.write(">{0},{1}\n".format(row["gene_id"],
                                                      row["cog_id"]))
                    fhandle.write("{0}\n".format(row["sequence"]))
    finally:
        # Release the database also when a query or a write fails
        db.close()
def do_label_propagation_after_kmeans(args):
    """ Applies label propagation to k-means clusters

    The cluster of each scaffold in the k-means results table is encoded as
    an integer label; scaffolds of the scaffolds table that do not appear in
    the k-means results get the label -1 (unknown). A LabelSpreading model
    (knn kernel) is fitted on the matrix returned by
    design_matrices.get_spectrums_coverage_matrix and used to predict a
    label and the class probabilities of every scaffold.

    The k-means label propagation results table is recreated and filled
    with one (scaffold, cluster, probability) tuple per scaffold, where the
    probability is the largest class probability. When that probability is
    NaN the tuple stored is (scaffold, defs.not_assigned, 0).
    """
    log.info("Applying label propagataion to the k-mer spectrums")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """SELECT scaffold, cluster FROM {0} """.format(
            db.KmeansResultsTable)
        assigned_scaffolds = db.retrieve_data(sql_command)
        # calculate labels
        encoder = sklearn.preprocessing.LabelEncoder()
        known_labels = encoder.fit_transform(
            [r["cluster"] for r in assigned_scaffolds])
        log.debug("Labels %s", encoder.classes_)
        log.debug("Number of labels: %s", len(known_labels))
        # fit_transform returns one encoded label per input row, in the same
        # order, so there is no need to call transform again for each row
        scaffold2label_dict = dict()
        for r, label in zip(assigned_scaffolds, known_labels):
            scaffold2label_dict[r["scaffold"]] = label

        sql_command = """SELECT scaffold, coverage, spectrum
                         FROM {0} ORDER BY scaffold""".format(
            db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
        mat = design_matrices.get_spectrums_coverage_matrix(data)
        all_labels = []
        scaffolds = []
        for r in data:
            s = r["scaffold"]
            # -1 is the unknown label
            all_labels.append(scaffold2label_dict.get(s, -1))
            scaffolds.append(s)

        clamping_factor = 0.5
        label_spread = label_propagation.LabelSpreading(
            kernel='knn', n_neighbors=7, alpha=clamping_factor)
        label_spread.fit(mat, all_labels)
        output_labels = label_spread.predict(mat)
        probabilities = label_spread.predict_proba(mat)

        if db.table_exists(db.KmeansLPResultsTable):
            db.drop_table(db.KmeansLPResultsTable)
        db.create_table(db.KmeansLPResultsTable, db.KmeansLPResultsFields,
                        db.KmeansLPResultsTypes)
        data = []
        for s, lab, probs in zip(scaffolds, output_labels, probabilities):
            p = probs.max()
            if np.isnan(p):
                data.append((s, defs.not_assigned, 0))
            else:
                # inverse_transform expects an array-like of labels, so the
                # single label is wrapped and the single result unwrapped
                cluster = encoder.inverse_transform([lab])[0]
                data.append((s, cluster, p))
        db.store_data(db.KmeansLPResultsTable, data)
    finally:
        # Release the database also when one of the steps above fails
        db.close()
Beispiel #5
0
def plot_genus_assignments(args):
    """ Draws a plot of the read coverage for the scaffolds vs their GC content

        Each of the genera is assigned a color.
        This new version assumes that the ScaffoldKmerComparisonTable
        of final assignments has merged the results from ScaffoldsAssignmentsTable
        (the scaffolds assigned with BLAST)

        The number of values read for each magnitude is printed to the
        standard output before calling Plots.fig2 with args.fn_plot.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = """SELECT {1}.scaffold, {1}.genus, {0}.length, {0}.GC, {0}.coverage
                     FROM {1}
                     INNER JOIN {0}
                     WHERE {1}.scaffold = {0}.scaffold

                  """.format(db.ScaffoldsTable, db.ScaffoldKmerComparisonTable)
    data = db.retrieve_data(sql_command)
    # The rows are already retrieved: release the database, as the other
    # plotting functions do
    db.close()
    coverages = []
    gcs = []
    lengths = []
    genera = []
    for r in data:
        coverages.append(r["coverage"])
        gcs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(r["genus"])
    # Single pre-formatted string: prints the same text under Python 2 and 3
    print("coverages %s gcs %s lengths %s genera %s" % (
        len(coverages), len(gcs), len(lengths), len(genera)))
    Plots.fig2(coverages, gcs, lengths, genera, args.fn_plot)
Beispiel #6
0
    def test_database(self):
        """ Test the creation of the database for the metagenome

        Creates a temporary database in self.datadir, fills the table of
        genes and the table of protein sequences from the test files and
        checks the number of records and the contents of one known gene.
        The temporary database is closed and removed even if a check fails.
        """
        log.debug("Test creating a database with the metagenome data")
        fn_database = os.path.join(self.datadir, "tmp_database.db")
        db = MetagenomeDatabase.MetagenomeDatabase(fn_database, overwrite=True)
        try:
            # test the gene table
            fn_genes = os.path.join(self.datadir, "gene_info_test_file.xls")
            db.create_genes_table(fn_genes)
            sql_command = "SELECT * FROM {0}".format(db.GenesTable)
            genes = db.retrieve_data(sql_command)
            self.assertEqual(len(genes), 171)
            sql_command = """ SELECT *
                              FROM {0}
                              WHERE locus_tag="sg4i_00000050" """.format(
                db.GenesTable)
            genes = db.retrieve_data(sql_command)
            self.assertEqual(len(genes), 1)
            gene_t = GeneParser.GeneRecordTuple._make(genes[0])
            self.assertEqual(gene_t.gene_id, "2061973757",
                             "Gene id test failed")

            # test the table of sequences
            fn_sequences = os.path.join(self.datadir, "proteins.faa")
            db.create_protein_sequences_table(fn_sequences)
            sql_command = """ SELECT * FROM {0}""".format(db.SequenceTable)
            sequences = db.retrieve_data(sql_command)
            self.assertEqual(len(sequences), 5)
            sql_command = """ SELECT * FROM {0}
                              WHERE gene_id="2061973757" """.format(
                db.SequenceTable)
            sequences = db.retrieve_data(sql_command)
            self.assertEqual(len(sequences), 1)
            # the protein length of the gene record must match the sequence
            self.assertEqual(gene_t.protein_length,
                             len(sequences[0]["sequence"]))
        finally:
            # Clean up also when an assertion fails, so that a failed run
            # does not leave an open connection and a stale file behind
            db.close()
            if os.path.exists(fn_database):
                os.remove(fn_database)
Beispiel #7
0
def plot_kmeans_assignments(args):
    """ Plot of the genus assignments for each of the scaffolds
        after performing k-means clustering

        For each k-means cluster, the bit scores of the assigned scaffolds
        in the cluster are summed per genus and the genus with the largest
        sum is given to all the scaffolds of the cluster. A cluster without
        any assigned scaffold gets defs.not_assigned.

        Raises:
            ValueError: if the number of scaffolds in the scaffolds table
            differs from the number of scaffolds in the k-means results.
    """
    log.info("Plotting the K-means assignments")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """ SELECT DISTINCT cluster FROM {0}
                      """.format(db.KmeansResultsTable)
        data = db.retrieve_data(sql_command)
        clusters = [r["cluster"] for r in data]

        pairs_scaffold_genus = []
        for cluster in clusters:
            # Select the scaffolds assigned in the cluster, sum the
            # bit scores of each of the genera, and sort by the sum.
            # The ordering must use the aggregate: a bare "bits" column in
            # a grouped query takes its value from an arbitrary row of the
            # group, not from the sum.
            sql_command = """ SELECT {0}.scaffold, {0}.genus, SUM({0}.bits)
                            FROM {0}
                            INNER JOIN {1}
                            WHERE cluster = {2} AND
                            {0}.scaffold = {1}.scaffold
                            GROUP BY {0}.genus
                            ORDER BY SUM({0}.bits) DESC
                        """.format(db.ScaffoldsAssignmentsTable,
                                   db.KmeansResultsTable, cluster)
            data = db.retrieve_data(sql_command)
            # the genus with the largest number of bits assigned is the
            # first entry:
            if len(data) == 0:
                genus = defs.not_assigned
            else:
                genus = data[0]["genus"]
            # Assign the genus to all the scaffolds in the cluster
            sql_command = """ SELECT {0}.scaffold
                            FROM {0}
                            WHERE cluster = {1}
                        """.format(db.KmeansResultsTable, cluster)
            data = db.retrieve_data(sql_command)
            pairs_scaffold_genus.extend(
                [(r["scaffold"], genus) for r in data])
        # Sorted by scaffold to pair them with the rows of the query below.
        # NOTE(review): this relies on Python and the database ordering the
        # scaffold names in the same way -- confirm.
        pairs_scaffold_genus.sort()

        sql_command = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length
                         FROM {0} ORDER BY scaffold
                      """.format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
    finally:
        # Release the database also when a query fails
        db.close()
    if len(data) != len(pairs_scaffold_genus):
        raise ValueError("The number of scaffolds in the database is not the " \
         "same as the number of scaffolds assigned with k-means")
    coverages = []
    cgs = []
    lengths = []
    genera = []
    for r, pair in zip(data, pairs_scaffold_genus):
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(pair[1])
    Plots.fig2(coverages, cgs, lengths, genera, args.fn_plot)
Beispiel #8
0
def create_database(args):
    """ Fill the metagenome database with the input files that are given.

    Each input of args (genes, protein sequences, scaffolds and scaffold
    coverage) is optional: its loader is called only when the file name is
    not empty. The loaders run in the order listed below.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    # (input file name, loader of the database that consumes it)
    loading_steps = (
        (args.fn_genes, database.create_genes_table),
        (args.fn_protein_sequences, database.create_protein_sequences_table),
        (args.fn_scaffolds, database.fill_scaffolds_table),
        (args.fn_scaffold_coverage, database.add_scaffold_coverage),
    )
    for fn_input, load in loading_steps:
        if fn_input:
            load(fn_input)
    database.close()
Beispiel #9
0
def blast_marker_cogs(args):
    """ Run BLAST for the genes that belong to the marker COGs.

    The marker COGs are the first column of the tab-delimited file
    args.fn_marker_cogs. For each marker COG, the file "<cog>.phr" must
    exist in args.cogsdbdir. The sequence of every gene whose cog_id is a
    marker COG is sent to blast() in batches of 100 sequences, using
    args.cogsdbdir/<cog_id> as BLAST database, and each batch of results is
    stored with db.store_blast_results. The table of BLAST results is
    recreated before storing anything.

    Genes that do not have exactly one sequence in the database are
    reported in the log and skipped.

    Raises:
        ValueError: if the database lacks the table of genes or the table
            of sequences, or if no marker COG is provided.
        IOError: if the database file of one of the marker COGs is missing.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.GenesTable not in names:
            raise ValueError("The database does not have a table of genes")
        if db.SequenceTable not in names:
            raise ValueError("The database does not have a table sequences")

        # Read file marker cogs (the file is closed on leaving the block)
        with open(args.fn_marker_cogs, "r") as fhandle:
            reader = csv.reader(fhandle, delimiter="\t")
            # csv yields an empty list for blank lines: skip them
            markercogs = frozenset([row[0] for row in reader if row])
        if len(markercogs) == 0:
            raise ValueError("No marker COGs provided")

        for cog in markercogs:
            fn = os.path.join(args.cogsdbdir, cog + ".phr")
            if not os.path.exists(fn):
                raise IOError(
                    "The database file {0} for the COG {1} does not exist".format(fn,cog))

        # Get genes
        sql_command = """SELECT gene_id,cog_id FROM {0}""".format(
            db.GenesTable)
        data = db.retrieve_data(sql_command)
        if db.BlastResultsTable in names:
            db.drop_table(db.BlastResultsTable)
        db.create_blast_results_table()

        log.info("Running BLAST for %s marker COGS",len(markercogs))
        n_batch_sequences = 100 # sequences to blast per batch
        sequence_tuples = []
        for gene_id, cog_id in data:
            if cog_id not in markercogs:
                continue
            sql_command = """SELECT sequence FROM {0}
                        WHERE gene_id="{1}" """.format(db.SequenceTable, gene_id)
            records = db.retrieve_data(sql_command)
            if len(records) != 1:
                # Report but do not raise, continue processing other genes
                log.error("Problem with gene_id %s. There are no sequences in the database or "
                "there are more than one", gene_id)
                continue
            blast_database = os.path.join(args.cogsdbdir, cog_id)
            sequence_tuples.append((records[0][0], gene_id, blast_database))
            if len(sequence_tuples) == n_batch_sequences:
                batch_results = blast(sequence_tuples)
                db.store_blast_results(batch_results)
                sequence_tuples = []
        # Final run with the sequences of the last, incomplete batch
        if sequence_tuples:
            batch_results = blast(sequence_tuples)
            db.store_blast_results(batch_results)
    finally:
        # Release the database also when a check or a BLAST run fails
        db.close()
Beispiel #10
0
def assignments2csv(args):
    """ Write the genus assignment of the scaffolds to a CSV file.

    Each row of args.fn_csv contains the scaffold, coverage, length and GC
    of a scaffold of the "Scaffolds" table followed by the genus stored for
    it in the "ScaffoldKmerComparison" table. Fields are comma-separated
    and no header row is written.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """SELECT {0}.scaffold, {0}.coverage, {0}.length, {0}.GC, {1}.genus
                         FROM {0}
                         INNER JOIN {1}
                         WHERE {0}.scaffold={1}.scaffold
                          """.format("Scaffolds", "ScaffoldKmerComparison")
        cursor = db.execute(sql_command)
        # The context manager closes the CSV file even if fetching or
        # writing a record fails
        with open(args.fn_csv, "w") as f:
            writer = csv.writer(f, delimiter=",")
            # Records are fetched one at a time instead of loading them all
            record = cursor.fetchone()
            while record:
                writer.writerow(list(record))
                record = cursor.fetchone()
    finally:
        db.close()
Beispiel #11
0
def do_kmer_comparison(args):
    """ Compare the k-mer spectrums of assigned and not assigned scaffolds.

    The combined sequences of the genera found in the table of k-mer
    comparisons are used as references. Every scaffold of the scaffolds
    table that is not assigned yet is added to the comparer, which is run
    each time it holds 1000 sequences and once more at the end if it still
    holds any. All the results are stored in the table of k-mer comparisons.
    """
    log.info("Performing kmer comparison. Parameters: ")
    log.info("kmer size: %s dist12: %s threshold: %s", args.kmer,
             args.dist12, args.threshold)

    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    comparer = Kmer.KmerComparer(Kmer.KmerCounter(args.kmer))
    comparer.set_kmer_distance_threshold(args.threshold)
    comparer.set_first_to_second_distance_ratio(args.dist12)

    # References: one combined sequence per genus already assigned
    references, already_assigned = database.get_genera_sequences_from(
        database.ScaffoldKmerComparisonTable)
    for genus in references:
        comparer.add_reference_sequence(references[genus], genus)

    query = "SELECT scaffold, sequence FROM {0}".format(
        database.ScaffoldsTable)
    cursor = database.execute(query)
    batch_size = 1000
    assignments = []
    while True:
        record = cursor.fetchone()
        if not record:
            break
        name = record["scaffold"]
        if name not in already_assigned:
            comparer.add_sequence(record["sequence"], name)
        # NOTE(review): presumably run() empties the pending sequences of
        # the comparer -- confirm in Kmer.KmerComparer
        if comparer.get_number_of_sequences() == batch_size:
            assignments.extend(comparer.run())
    # Last, incomplete batch
    if comparer.get_number_of_sequences() > 0:
        assignments.extend(comparer.run())
    database.store_data(database.ScaffoldKmerComparisonTable, assignments)
    database.close()
Beispiel #12
0
def plot_dpgmm(args):
    """ Plot the scaffolds labelled with their DPGMM cluster.

    Reads coverage, GC content and length of the scaffolds joined with the
    cluster and probability stored in the DPGMM results table. A scaffold
    keeps its cluster only when its probability is greater than args.dpgmm;
    otherwise it is labelled with defs.not_assigned.
    The values are handed to Plots.fig2 together with args.fn_plot.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.coverage, {0}.GC, {0}.length, {1}.cluster, {1}.probability
                     FROM {0}
                     INNER JOIN {1}
                     WHERE {0}.scaffold = {1}.scaffold
                  """.format(database.ScaffoldsTable,
                             database.DPGMMResultsTable)
    rows = database.retrieve_data(query)
    database.close()

    coverages, gc_contents, lengths, labels = [], [], [], []
    for row in rows:
        # Only confident assignments keep their cluster
        is_confident = row["probability"] > args.dpgmm
        labels.append(row["cluster"] if is_confident else defs.not_assigned)
        coverages.append(row["coverage"])
        gc_contents.append(row["GC"])
        lengths.append(row["length"])

    Plots.fig2(coverages, gc_contents, lengths, labels, args.fn_plot)
Beispiel #13
0
def kmer_comparison_one_iteration(args):
    """ One-iteration version of the iterative k-mer comparison.

    The table of k-mer comparisons is recreated. The combined sequences of
    the genera found in the table of scaffold assignments are used as
    references, and every scaffold not assigned yet is compared with them in
    batches of 1000 sequences. Only the matches whose second element is not
    False are stored in the table of k-mer comparisons.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    if database.ScaffoldKmerComparisonTable in database.get_tables_names():
        database.drop_table(database.ScaffoldKmerComparisonTable)
    database.create_scaffold_kmer_comparison_table()

    comparer = Kmer.KmerComparer(Kmer.KmerCounter(args.kmer))
    comparer.set_kmer_distance_threshold(args.threshold)
    comparer.set_first_to_second_distance_ratio(args.dist12)

    # References: one combined sequence per genus assigned in the
    # table of scaffold assignments
    references, already_assigned = database.get_genera_sequences_from(
        database.ScaffoldsAssignmentsTable)
    for genus in references:
        comparer.add_reference_sequence(references[genus], genus)

    def run_batch():
        # The comparer returns False when a reliable match has not been
        # found. The comparison is kept as "!= False" on purpose, so the
        # same values as before are filtered out.
        return [m for m in comparer.run() if m[1] != False]

    query = "SELECT scaffold, sequence FROM {0}".format(
        database.ScaffoldsTable)
    cursor = database.execute(query)
    batch_size = 1000
    reliable_matches = []
    while True:
        record = cursor.fetchone()
        if not record:
            break
        name = record["scaffold"]
        if name not in already_assigned:
            comparer.add_sequence(record["sequence"], name)
        if comparer.get_number_of_sequences() == batch_size:
            reliable_matches.extend(run_batch())
    # Last, incomplete batch
    if comparer.get_number_of_sequences() > 0:
        reliable_matches.extend(run_batch())
    database.store_data(database.ScaffoldKmerComparisonTable,
                        reliable_matches)
    database.close()
Beispiel #14
0
def iterative_kmer_comparison(args):
    """ Compares not assigned scaffolds with the scaffolds assigned using
        BLAST using an iterative method. The function do_kmer_comparison
        updates the sequences for each genus based on the scaffolds that
        have been assigned already. This way the most confident assignments
        are done first.

        The table of k-mer comparisons is recreated and initialized with
        db.pass_blast_assigned_scaffolds_to_kmer_table(). The iterations
        stop when the number of records in that table does not change
        after calling do_kmer_comparison.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.ScaffoldKmerComparisonTable in names:
            db.drop_table(db.ScaffoldKmerComparisonTable)
        db.create_scaffold_kmer_comparison_table()
        db.pass_blast_assigned_scaffolds_to_kmer_table()

        n_elements = db.count(db.ScaffoldKmerComparisonTable)
        i = 0
        while True:
            log.info("Iterative comparison. Iteration %s",i)
            i += 1
            # do_kmer_comparison opens its own connection to the database
            do_kmer_comparison(args)
            count = db.count(db.ScaffoldKmerComparisonTable)
            if count == n_elements:
                # Nothing new was stored: the assignments have converged
                break
            n_elements = count
    finally:
        # The connection was never closed before; release it in all cases
        db.close()
Beispiel #15
0
import MetaBinner.paranoid_log as paranoid_log
import MetaBinner.MetagenomeDatabase as MetagenomeDatabase
import MetaBinner.Kmer as Kmer
import MetaBinner.Plots as Plots
import MetaBinner.definitions as defs
import sys
import logging

# Send the log records to the standard output and let the root logger
# emit everything from DEBUG upwards.
logging.basicConfig(stream=sys.stdout)
logging.getLogger().setLevel(logging.DEBUG)

# NOTE(review): presumably adds the 4-mer spectrums of the scaffolds to the
# database (see MetagenomeDatabase.add_scaffold_spectrums) -- confirm.
db = MetagenomeDatabase.MetagenomeDatabase("2061766001_4mers.db")
db.add_scaffold_spectrums(4)
db.close()
 def __init__(self, fn_database):
     """ Create the MetagenomeDatabase for fn_database and keep it in self.db

     fn_database is handed as is to MetagenomeDatabase.MetagenomeDatabase.
     """
     # Use the parameter: "args" is not defined in the scope of the method
     self.db = MetagenomeDatabase.MetagenomeDatabase(fn_database)
Beispiel #17
0
def assign_genus_to_scaffolds(args):
    """ Assign genus to scaffolds in the database

    The function:
    1) Reads the genes in the database that belong to a given COG
    2) Reads the BLAST results for each of the genes.
    3) Recovers the best hit (genus and bit score) for the gene and
    identifies the scaffold where the gene is located
    4) Assigns the genus found in the hit to the scaffold.

    Various scaffolds can have different assignments. To select one assignment,
    1) sum the bit scores for each of the genera assigned to a scaffold.
    2) Choose the genus with the largest total bit score

    Finally, the assignments are passed through
    BiologyBasedRules.filter_genus_assignments and stored in the database.

    The marker COGs are the first column of the space-delimited file
    args.fn_marker_cogs.

    Raises:
        ValueError: if the database lacks the table of genes or the table
            of BLAST results, or if no marker COG is provided.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.GenesTable not in names:
            raise ValueError("The database does not have a table of genes")
        if db.BlastResultsTable not in names:
            raise ValueError(
                "The database does not have a table of BLAST results")
        # Read file marker cogs (the file is closed on leaving the block)
        with open(args.fn_marker_cogs, "rU") as fhandle:
            reader = csv.reader(fhandle, delimiter=" ")
            # csv yields an empty list for blank lines: skip them
            marker_cogs = frozenset([row[0] for row in reader if row])
        if len(marker_cogs) == 0:
            raise ValueError("No marker COGs provided")

        if db.ScaffoldsAssignmentsTable in names:
            db.drop_table(db.ScaffoldsAssignmentsTable)
        db.create_scaffold_assignments_table()

        blast_result = BLASTUtilities.BLASTResult()
        scaffolds_dict = {}
        for cog_id in marker_cogs:
            # read the genes and scaffolds for the cog
            sql_command = """SELECT {0}.gene_id,{0}.scaffold, {0}.dna_length,{1}.titles,{1}.bits
                             FROM {0}
                             INNER JOIN {1}
                             WHERE {0}.cog_id="{2}" AND {0}.gene_id={1}.gene_id
                          """.format(db.GenesTable, db.BlastResultsTable,
                                     cog_id)
            cursor = db.execute(sql_command)
            r = cursor.fetchone()
            while r:
                sc = r["scaffold"]
                organism, bit_score = blast_result.get_best_hit(
                    r["titles"], r["bits"])
                # The genus is the first word of the organism name
                genus = organism.split(" ")[0]
                # NOTE(review): presumably accumulates the bit score of the
                # genus for the scaffold -- confirm in the helper
                add_to_scaffold_dictionary(scaffolds_dict, sc, genus,
                                           float(bit_score))
                r = cursor.fetchone()

        # Assign the genus with the largest bit score
        data = []
        for scaffold in scaffolds_dict:
            # items() is available in both Python 2 and Python 3
            genus, bit_score = max(scaffolds_dict[scaffold].items(),
                                   key=operator.itemgetter(1))
            data.append((scaffold, genus, bit_score))
        data = BiologyBasedRules.filter_genus_assignments(
            data, n_appearances=2, bit_score_threshold=30)
        db.store_data(db.ScaffoldsAssignmentsTable, data)
    finally:
        # Release the database also when a check or a query fails
        db.close()