Ejemplo n.º 1
0
def plot_dpgmm(args):
    """ Plot the scaffolds labelled with their DPGMM cluster

        ScaffoldsTable is joined with DPGMMResultsTable on the scaffold
        name. A scaffold is labelled with its cluster only when its
        probability is strictly greater than args.dpgmm; otherwise it is
        labelled defs.not_assigned. The coverage, GC, length and label
        lists are passed to Plots.fig2 together with args.fn_plot.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.coverage, {0}.GC, {0}.length, {1}.cluster, {1}.probability
                     FROM {0}
                     INNER JOIN {1}
                     WHERE {0}.scaffold = {1}.scaffold
                  """.format(db.ScaffoldsTable, db.DPGMMResultsTable)
    rows = db.retrieve_data(query)
    db.close()

    # One pass over the rows: (coverage, GC, length, label) per scaffold
    points = []
    for row in rows:
        if row["probability"] > args.dpgmm:
            label = row["cluster"]
        else:
            label = defs.not_assigned
        points.append((row["coverage"], row["GC"], row["length"], label))

    coverages = [point[0] for point in points]
    gcs = [point[1] for point in points]
    lengths = [point[2] for point in points]
    labels = [point[3] for point in points]
    Plots.fig2(coverages, gcs, lengths, labels, args.fn_plot)
Ejemplo n.º 2
0
def plot_genus_assignments(args):
    """ Draws a plot of the read coverage for the scaffolds vs their GC content

        Each scaffold is labelled with the genus stored for it in
        ScaffoldKmerComparisonTable; its length, GC and coverage are read
        from ScaffoldsTable (both tables are joined on the scaffold name).
        This version assumes that the ScaffoldKmerComparisonTable
        of final assignments has merged the results from
        ScaffoldsAssignmentsTable (the scaffolds assigned with BLAST).

        args must provide fn_database (database to open) and fn_plot
        (forwarded to Plots.fig2). The number of values collected is
        printed to standard output before plotting.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = """SELECT {1}.scaffold, {1}.genus, {0}.length, {0}.GC, {0}.coverage
                     FROM {1}
                     INNER JOIN {0}
                     WHERE {1}.scaffold = {0}.scaffold

                  """.format(
        db.ScaffoldsTable, db.ScaffoldKmerComparisonTable
    )
    # Release the database handle even if the query fails; the sibling
    # plotting functions also close it before using the retrieved rows.
    try:
        data = db.retrieve_data(sql_command)
    finally:
        db.close()
    coverages = []
    gcs = []
    lengths = []
    genera = []
    for r in data:
        coverages.append(r["coverage"])
        gcs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(r["genus"])
    # A single pre-formatted string prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print("coverages %d gcs %d lengths %d genera %d" % (
        len(coverages), len(gcs), len(lengths), len(genera)))
    Plots.fig2(coverages, gcs, lengths, genera, args.fn_plot)
Ejemplo n.º 3
0
def plot_kmeans_clusters(args):
    """ Plot the scaffolds labelled with their k-means cluster

        ScaffoldsTable is joined with KmeansResultsTable on the scaffold
        name (rows ordered by scaffold). Coverage, GC, length and cluster
        of every row are passed to Plots.fig2 together with args.fn_plot.
    """
    log.info("Plotting the K-means clusters")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length, {1}.cluster
                     FROM {0}
                     INNER JOIN {1}
                     WHERE {0}.scaffold = {1}.scaffold
                     ORDER BY {0}.scaffold
                  """.format(db.ScaffoldsTable, db.KmeansResultsTable)
    rows = db.retrieve_data(query)
    db.close()

    # One list per plotted column, filled in a single pass over the rows
    columns = ("coverage", "GC", "length", "cluster")
    series = dict((name, []) for name in columns)
    for row in rows:
        for name in columns:
            series[name].append(row[name])
    Plots.fig2(series["coverage"], series["GC"], series["length"],
               series["cluster"], args.fn_plot)
Ejemplo n.º 4
0
def plot_genus_assignments(args):
    """ Draws a plot of the read coverage for the scaffolds vs their GC content

        Each scaffold is labelled with the genus stored for it in
        ScaffoldKmerComparisonTable; its length, GC and coverage are read
        from ScaffoldsTable (both tables are joined on the scaffold name).
        This version assumes that the ScaffoldKmerComparisonTable
        of final assignments has merged the results from
        ScaffoldsAssignmentsTable (the scaffolds assigned with BLAST).

        args must provide fn_database (database to open) and fn_plot
        (forwarded to Plots.fig2). The number of values collected is
        printed to standard output before plotting.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = """SELECT {1}.scaffold, {1}.genus, {0}.length, {0}.GC, {0}.coverage
                     FROM {1}
                     INNER JOIN {0}
                     WHERE {1}.scaffold = {0}.scaffold

                  """.format(db.ScaffoldsTable, db.ScaffoldKmerComparisonTable)
    # Release the database handle even if the query fails; the sibling
    # plotting functions also close it before using the retrieved rows.
    try:
        data = db.retrieve_data(sql_command)
    finally:
        db.close()
    coverages = []
    gcs = []
    lengths = []
    genera = []
    for r in data:
        coverages.append(r["coverage"])
        gcs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(r["genus"])
    # A single pre-formatted string prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print("coverages %d gcs %d lengths %d genera %d" % (
        len(coverages), len(gcs), len(lengths), len(genera)))
    Plots.fig2(coverages, gcs, lengths, genera, args.fn_plot)
Ejemplo n.º 5
0
def plot_kmeans_clusters(args):
    """ Plot the scaffolds labelled with their k-means cluster

        Reads coverage, GC and length from ScaffoldsTable and the cluster
        from KmeansResultsTable (joined on the scaffold name, ordered by
        scaffold) and passes the four lists to Plots.fig2 together with
        args.fn_plot.
    """
    log.info("Plotting the K-means clusters")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    statement = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length, {1}.cluster
                     FROM {0}
                     INNER JOIN {1}
                     WHERE {0}.scaffold = {1}.scaffold
                     ORDER BY {0}.scaffold
                  """.format(db.ScaffoldsTable, db.KmeansResultsTable)
    records = db.retrieve_data(statement)
    db.close()

    # Collect (coverage, GC, length, cluster) per scaffold in one pass,
    # then split the tuples into the per-column lists the plot expects.
    points = [(rec["coverage"], rec["GC"], rec["length"], rec["cluster"])
              for rec in records]
    coverages = [point[0] for point in points]
    gcs = [point[1] for point in points]
    lengths = [point[2] for point in points]
    clusters = [point[3] for point in points]
    Plots.fig2(coverages, gcs, lengths, clusters, args.fn_plot)
Ejemplo n.º 6
0
def plot_kmeans_assignments(args):
    """ Plot the scaffolds labelled with the genus assigned to their k-means cluster

        For every cluster in KmeansResultsTable, the bit scores in
        ScaffoldsAssignmentsTable of the scaffolds belonging to the cluster
        are summed per genus, and the genus with the largest sum is given
        to all the scaffolds of the cluster. A cluster without any
        assignment is labelled defs.not_assigned. Coverage, GC and length
        of each scaffold come from ScaffoldsTable, and everything is passed
        to Plots.fig2 together with args.fn_plot.

        Raises ValueError if the number of scaffolds in ScaffoldsTable
        differs from the number of scaffolds found in KmeansResultsTable.
    """
    log.info("Plotting the K-means assignments")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    # Close the database even when one of the queries fails
    try:
        sql_command = """ SELECT DISTINCT cluster FROM {0}
                      """.format(db.KmeansResultsTable)
        data = db.retrieve_data(sql_command)
        clusters = [r["cluster"] for r in data]

        pairs_scaffold_genus = []
        for cluster in clusters:
            # Select the scaffolds assigned to the cluster, sum the bit
            # scores of each of the genera, and sort by the sum. The sort
            # key must be the aggregate: ordering by the plain bits column
            # of a grouped query ranks each genus by the bits of a single
            # row of its group instead of by the total.
            sql_command = """ SELECT {0}.scaffold, {0}.genus, SUM({0}.bits)
                            FROM {0}
                            INNER JOIN {1}
                            WHERE {1}.cluster = {2} AND
                            {0}.scaffold = {1}.scaffold
                            GROUP BY {0}.genus
                            ORDER BY SUM({0}.bits) DESC
                        """.format(db.ScaffoldsAssignmentsTable,
                                   db.KmeansResultsTable, cluster)
            data = db.retrieve_data(sql_command)
            # The genus with the largest number of bits is the first entry
            if len(data) == 0:
                genus = defs.not_assigned
            else:
                genus = data[0]["genus"]
            # Assign the genus to all the scaffolds in the cluster
            sql_command = """ SELECT {0}.scaffold
                            FROM {0}
                            WHERE cluster = {1}
                        """.format(db.KmeansResultsTable, cluster)
            data = db.retrieve_data(sql_command)
            pairs_scaffold_genus.extend(
                [(r["scaffold"], genus) for r in data])
        pairs_scaffold_genus.sort()

        sql_command = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length
                         FROM {0} ORDER BY scaffold
                      """.format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
    finally:
        db.close()
    if len(data) != len(pairs_scaffold_genus):
        raise ValueError("The number of scaffolds in the database is not the " \
         "same as the number of scaffolds assigned with k-means")
    coverages = []
    cgs = []
    lengths = []
    genera = []
    # NOTE(review): rows and pairs are matched by position only. This
    # relies on the SQL ORDER BY and the Python sort producing the same
    # scaffold order and on both tables holding the same scaffolds; only
    # the counts are checked above -- confirm.
    for r, (_scaffold, genus) in zip(data, pairs_scaffold_genus):
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(genus)
    Plots.fig2(coverages, cgs, lengths, genera, args.fn_plot)
Ejemplo n.º 7
0
def plot_dpgmm(args):
    """ Plot the scaffolds labelled with their DPGMM cluster

        Coverage, GC and length are read from ScaffoldsTable, cluster and
        probability from DPGMMResultsTable (joined on the scaffold name).
        The cluster is used as label only if the probability is strictly
        greater than args.dpgmm; defs.not_assigned is used otherwise.
        The resulting lists and args.fn_plot are passed to Plots.fig2.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    statement = """SELECT {0}.coverage, {0}.GC, {0}.length, {1}.cluster, {1}.probability
                     FROM {0}
                     INNER JOIN {1}
                     WHERE {0}.scaffold = {1}.scaffold
                  """.format(db.ScaffoldsTable, db.DPGMMResultsTable)
    records = db.retrieve_data(statement)
    db.close()

    coverages, gc_values, lengths, labels = [], [], [], []
    for record in records:
        above_threshold = record["probability"] > args.dpgmm
        labels.append(
            record["cluster"] if above_threshold else defs.not_assigned)
        coverages.append(record["coverage"])
        gc_values.append(record["GC"])
        lengths.append(record["length"])

    Plots.fig2(coverages, gc_values, lengths, labels, args.fn_plot)
Ejemplo n.º 8
0
def plot_kmeans_assignments(args):
    """ Plot the scaffolds labelled with the genus assigned to their k-means cluster

        For every cluster in KmeansResultsTable, the bit scores in
        ScaffoldsAssignmentsTable of the scaffolds belonging to the cluster
        are summed per genus, and the genus with the largest sum is given
        to all the scaffolds of the cluster. A cluster without any
        assignment is labelled defs.not_assigned. Coverage, GC and length
        of each scaffold come from ScaffoldsTable, and everything is passed
        to Plots.fig2 together with args.fn_plot.

        Raises ValueError if the number of scaffolds in ScaffoldsTable
        differs from the number of scaffolds found in KmeansResultsTable.
    """
    log.info("Plotting the K-means assignments")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    # Close the database even when one of the queries fails
    try:
        sql_command = """ SELECT DISTINCT cluster FROM {0}
                      """.format(
            db.KmeansResultsTable
        )
        data = db.retrieve_data(sql_command)
        clusters = [r["cluster"] for r in data]

        pairs_scaffold_genus = []
        for cluster in clusters:
            # Select the scaffolds assigned to the cluster, sum the bit
            # scores of each of the genera, and sort by the sum. The sort
            # key must be the aggregate: ordering by the plain bits column
            # of a grouped query ranks each genus by the bits of a single
            # row of its group instead of by the total.
            sql_command = """ SELECT {0}.scaffold, {0}.genus, SUM({0}.bits)
                            FROM {0}
                            INNER JOIN {1}
                            WHERE {1}.cluster = {2} AND
                            {0}.scaffold = {1}.scaffold
                            GROUP BY {0}.genus
                            ORDER BY SUM({0}.bits) DESC
                        """.format(
                db.ScaffoldsAssignmentsTable, db.KmeansResultsTable, cluster
            )
            data = db.retrieve_data(sql_command)
            # The genus with the largest number of bits is the first entry
            if len(data) == 0:
                genus = defs.not_assigned
            else:
                genus = data[0]["genus"]
            # Assign the genus to all the scaffolds in the cluster
            sql_command = """ SELECT {0}.scaffold
                            FROM {0}
                            WHERE cluster = {1}
                        """.format(
                db.KmeansResultsTable, cluster
            )
            data = db.retrieve_data(sql_command)
            pairs_scaffold_genus.extend(
                [(r["scaffold"], genus) for r in data])
        pairs_scaffold_genus.sort()

        sql_command = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length
                         FROM {0} ORDER BY scaffold
                      """.format(
            db.ScaffoldsTable
        )
        data = db.retrieve_data(sql_command)
    finally:
        db.close()
    if len(data) != len(pairs_scaffold_genus):
        raise ValueError(
            "The number of scaffolds in the database is not the "
            "same as the number of scaffolds assigned with k-means"
        )
    coverages = []
    cgs = []
    lengths = []
    genera = []
    # NOTE(review): rows and pairs are matched by position only. This
    # relies on the SQL ORDER BY and the Python sort producing the same
    # scaffold order and on both tables holding the same scaffolds; only
    # the counts are checked above -- confirm.
    for r, (_scaffold, genus) in zip(data, pairs_scaffold_genus):
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(genus)
    Plots.fig2(coverages, cgs, lengths, genera, args.fn_plot)