def plot_dpgmm(args):
    """
    Plot the scaffolds labelled with their DPGMM cluster.

    Coverage, GC content, length, cluster and probability are read for
    every scaffold found in both the scaffolds table and the DPGMM results
    table of the database args.fn_database. A scaffold keeps its cluster
    label only when its probability is strictly greater than args.dpgmm;
    otherwise it is labelled with defs.not_assigned. The collected values
    are handed to Plots.fig2 together with args.fn_plot.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query_template = (
        "SELECT {0}.coverage, {0}.GC, {0}.length, {1}.cluster, {1}.probability "
        "FROM {0} INNER JOIN {1} "
        "WHERE {0}.scaffold = {1}.scaffold "
    )
    rows = database.retrieve_data(
        query_template.format(database.ScaffoldsTable, database.DPGMMResultsTable)
    )
    database.close()

    coverage_values, gc_values, length_values, labels = [], [], [], []
    for row in rows:
        # Only trust the cluster when its probability clears the threshold.
        is_confident = row["probability"] > args.dpgmm
        labels.append(row["cluster"] if is_confident else defs.not_assigned)
        coverage_values.append(row["coverage"])
        gc_values.append(row["GC"])
        length_values.append(row["length"])

    Plots.fig2(coverage_values, gc_values, length_values, labels, args.fn_plot)
def plot_genus_assignments(args):
    """
    Draws a plot of the read coverage for the scaffolds vs their GC content.
    Each of the genera is assigned a color.

    This new version assumes that the ScaffoldKmerComparisonTable of final
    assignments has merged the results from ScaffoldsAssignmentsTable
    (the scaffolds assigned with BLAST).

    The function reads args.fn_database (database to query) and
    args.fn_plot (passed to Plots.fig2). It prints the number of values
    collected for each plotted quantity before drawing.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = (
        "SELECT {1}.scaffold, {1}.genus, {0}.length, {0}.GC, {0}.coverage "
        "FROM {1} INNER JOIN {0} "
        "WHERE {1}.scaffold = {0}.scaffold "
    ).format(db.ScaffoldsTable, db.ScaffoldKmerComparisonTable)
    data = db.retrieve_data(sql_command)
    # Release the database once the rows are retrieved, as the other
    # plot_* functions in this file do; previously it was never closed.
    db.close()

    coverages = []
    gcs = []
    lengths = []
    genera = []
    for r in data:
        coverages.append(r["coverage"])
        gcs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(r["genus"])

    # A single pre-formatted string prints the same text whether print is
    # the Python 2 statement or the print function.
    print("coverages %d gcs %d lengths %d genera %d" % (
        len(coverages), len(gcs), len(lengths), len(genera)))
    Plots.fig2(coverages, gcs, lengths, genera, args.fn_plot)
def plot_kmeans_clusters(args):
    """
    Plot of the k-means cluster assigned to each of the scaffolds.

    Reads coverage, GC content, length and cluster for every scaffold found
    in both the scaffolds table and the k-means results table of the
    database args.fn_database, ordered by scaffold, and passes them to
    Plots.fig2 together with args.fn_plot. The cluster identifiers are used
    directly as labels; no genus is looked up here.
    """
    log.info("Plotting the K-means clusters")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = (
        "SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length, {1}.cluster "
        "FROM {0} INNER JOIN {1} "
        "WHERE {0}.scaffold = {1}.scaffold "
        "ORDER BY {0}.scaffold "
    ).format(db.ScaffoldsTable, db.KmeansResultsTable)
    data = db.retrieve_data(sql_command)
    db.close()

    coverages = []
    cgs = []
    lengths = []
    clusters = []
    for r in data:
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        clusters.append(r["cluster"])
    Plots.fig2(coverages, cgs, lengths, clusters, args.fn_plot)
def plot_label_propagation(args):
    """
    Plot the scaffolds labelled with their label-propagation genus.

    Coverage, GC content, length, genus and probability are read for every
    scaffold found in both the scaffolds table and the label propagation
    results table of the database args.fn_database. A scaffold keeps its
    genus only when its probability is strictly greater than
    args.lbl_prob; otherwise it is labelled with defs.not_assigned. The
    collected values are handed to Plots.fig3 together with args.fn_plot.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query_template = (
        "SELECT {0}.coverage, {0}.GC, {0}.length, {1}.genus, {1}.probability "
        "FROM {0} INNER JOIN {1} "
        "WHERE {0}.scaffold = {1}.scaffold "
    )
    rows = database.retrieve_data(
        query_template.format(
            database.ScaffoldsTable, database.LabelPropagationResultsTable
        )
    )
    database.close()

    coverage_values, gc_values, length_values, labels = [], [], [], []
    for row in rows:
        # Only trust the genus when its probability clears the threshold.
        is_confident = row["probability"] > args.lbl_prob
        labels.append(row["genus"] if is_confident else defs.not_assigned)
        coverage_values.append(row["coverage"])
        gc_values.append(row["GC"])
        length_values.append(row["length"])

    # NOTE(review): per the original author's note, fig3 is a test plot
    # (coverage vs coverage/GC) used in place of Plots.fig2 - confirm
    # which figure is wanted here.
    Plots.fig3(coverage_values, gc_values, length_values, labels, args.fn_plot)
def plot_genus_assignments(args):
    """
    Draws a plot of the read coverage for the scaffolds vs their GC content.
    Each of the genera is assigned a color.

    This new version assumes that the ScaffoldKmerComparisonTable of final
    assignments has merged the results from ScaffoldsAssignmentsTable
    (the scaffolds assigned with BLAST).

    The function reads args.fn_database (database to query) and
    args.fn_plot (passed to Plots.fig2). It prints the number of values
    collected for each plotted quantity before drawing.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = (
        "SELECT {1}.scaffold, {1}.genus, {0}.length, {0}.GC, {0}.coverage "
        "FROM {1} INNER JOIN {0} "
        "WHERE {1}.scaffold = {0}.scaffold "
    ).format(db.ScaffoldsTable, db.ScaffoldKmerComparisonTable)
    data = db.retrieve_data(sql_command)
    # Release the database once the rows are retrieved, as the other
    # plot_* functions in this file do; previously it was never closed.
    db.close()

    coverages = []
    gcs = []
    lengths = []
    genera = []
    for r in data:
        coverages.append(r["coverage"])
        gcs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(r["genus"])

    # A single pre-formatted string prints the same text whether print is
    # the Python 2 statement or the print function.
    print("coverages %d gcs %d lengths %d genera %d" % (
        len(coverages), len(gcs), len(lengths), len(genera)))
    Plots.fig2(coverages, gcs, lengths, genera, args.fn_plot)
def plot_label_propagation(args):
    """
    Plot the scaffolds labelled with their label-propagation genus.

    Coverage, GC content, length, genus and probability are read for every
    scaffold found in both the scaffolds table and the label propagation
    results table of the database args.fn_database. A scaffold keeps its
    genus only when its probability is strictly greater than
    args.lbl_prob; otherwise it is labelled with defs.not_assigned. The
    collected values are handed to Plots.fig3 together with args.fn_plot.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query_template = (
        "SELECT {0}.coverage, {0}.GC, {0}.length, {1}.genus, {1}.probability "
        "FROM {0} INNER JOIN {1} "
        "WHERE {0}.scaffold = {1}.scaffold "
    )
    rows = database.retrieve_data(
        query_template.format(
            database.ScaffoldsTable, database.LabelPropagationResultsTable
        )
    )
    database.close()

    coverage_values, gc_values, length_values, labels = [], [], [], []
    for row in rows:
        # Only trust the genus when its probability clears the threshold.
        is_confident = row["probability"] > args.lbl_prob
        labels.append(row["genus"] if is_confident else defs.not_assigned)
        coverage_values.append(row["coverage"])
        gc_values.append(row["GC"])
        length_values.append(row["length"])

    # NOTE(review): per the original author's note, fig3 is a test plot
    # (coverage vs coverage/GC) used in place of Plots.fig2 - confirm
    # which figure is wanted here.
    Plots.fig3(coverage_values, gc_values, length_values, labels, args.fn_plot)
def plot_kmeans_clusters(args):
    """
    Plot of the k-means cluster assigned to each of the scaffolds.

    Reads coverage, GC content, length and cluster for every scaffold found
    in both the scaffolds table and the k-means results table of the
    database args.fn_database, ordered by scaffold, and passes them to
    Plots.fig2 together with args.fn_plot. The cluster identifiers are used
    directly as labels; no genus is looked up here.
    """
    log.info("Plotting the K-means clusters")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = (
        "SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length, {1}.cluster "
        "FROM {0} INNER JOIN {1} "
        "WHERE {0}.scaffold = {1}.scaffold "
        "ORDER BY {0}.scaffold "
    ).format(db.ScaffoldsTable, db.KmeansResultsTable)
    data = db.retrieve_data(sql_command)
    db.close()

    coverages = []
    cgs = []
    lengths = []
    clusters = []
    for r in data:
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        clusters.append(r["cluster"])
    Plots.fig2(coverages, cgs, lengths, clusters, args.fn_plot)
def plot_kmeans_assignments(args):
    """
    Plot of the genus assignments for each of the scaffolds after
    performing k-means clustering.

    Each k-means cluster is labelled with the genus that has the largest
    summed bit score among the assigned scaffolds of that cluster, or with
    defs.not_assigned when the cluster has no assigned scaffold. That label
    is given to every scaffold of the cluster, and the scaffolds are then
    plotted with Plots.fig2 using args.fn_plot.

    Raises ValueError if the number of scaffolds in the scaffolds table
    differs from the number of scaffolds that received a k-means label, or
    if a scaffold of the scaffolds table has no k-means label.
    """
    log.info("Plotting the K-means assignments")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = " SELECT DISTINCT cluster FROM {0} ".format(
        db.KmeansResultsTable)
    data = db.retrieve_data(sql_command)
    clusters = [r["cluster"] for r in data]

    pairs_scaffold_genus = []
    for cluster in clusters:
        # Select the scaffolds assigned in the cluster, sum the bit scores
        # of each of the genera, and sort by the sum. Ordering by the
        # aggregate (not the bare bits column) is what makes the first row
        # the genus with the largest total.
        sql_command = (
            " SELECT {0}.scaffold, {0}.genus, SUM({0}.bits) "
            "FROM {0} INNER JOIN {1} "
            "WHERE cluster = {2} AND {0}.scaffold = {1}.scaffold "
            "GROUP BY {0}.genus "
            "ORDER BY SUM({0}.bits) DESC "
        ).format(db.ScaffoldsAssignmentsTable, db.KmeansResultsTable, cluster)
        data = db.retrieve_data(sql_command)
        # The genus with the largest number of bits is the first entry
        if len(data) == 0:
            genus = defs.not_assigned
        else:
            genus = data[0]["genus"]
        # Assign the genus to all the scaffolds in the cluster
        sql_command = " SELECT {0}.scaffold FROM {0} WHERE cluster = {1} ".format(
            db.KmeansResultsTable, cluster)
        data = db.retrieve_data(sql_command)
        pairs_scaffold_genus.extend([(r["scaffold"], genus) for r in data])

    sql_command = (
        "SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length "
        "FROM {0} "
        "ORDER BY scaffold "
    ).format(db.ScaffoldsTable)
    data = db.retrieve_data(sql_command)
    db.close()

    if len(data) != len(pairs_scaffold_genus):
        raise ValueError("The number of scaffolds in the database is not the "
                         "same as the number of scaffolds assigned with k-means")

    # Look the genus up by scaffold name instead of pairing rows by
    # position, so the result does not depend on the database and Python
    # sorting the scaffold names identically.
    genus_by_scaffold = dict(pairs_scaffold_genus)
    coverages = []
    cgs = []
    lengths = []
    genera = []
    for r in data:
        scaffold = r["scaffold"]
        if scaffold not in genus_by_scaffold:
            raise ValueError(
                "Scaffold {0} has no k-means cluster assignment".format(scaffold))
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(genus_by_scaffold[scaffold])
    Plots.fig2(coverages, cgs, lengths, genera, args.fn_plot)
def plot_dpgmm(args):
    """
    Plot the scaffolds labelled with their DPGMM cluster.

    Coverage, GC content, length, cluster and probability are read for
    every scaffold found in both the scaffolds table and the DPGMM results
    table of the database args.fn_database. A scaffold keeps its cluster
    label only when its probability is strictly greater than args.dpgmm;
    otherwise it is labelled with defs.not_assigned. The collected values
    are handed to Plots.fig2 together with args.fn_plot.
    """
    database = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query_template = (
        "SELECT {0}.coverage, {0}.GC, {0}.length, {1}.cluster, {1}.probability "
        "FROM {0} INNER JOIN {1} "
        "WHERE {0}.scaffold = {1}.scaffold "
    )
    rows = database.retrieve_data(
        query_template.format(database.ScaffoldsTable, database.DPGMMResultsTable)
    )
    database.close()

    coverage_values, gc_values, length_values, labels = [], [], [], []
    for row in rows:
        # Only trust the cluster when its probability clears the threshold.
        is_confident = row["probability"] > args.dpgmm
        labels.append(row["cluster"] if is_confident else defs.not_assigned)
        coverage_values.append(row["coverage"])
        gc_values.append(row["GC"])
        length_values.append(row["length"])

    Plots.fig2(coverage_values, gc_values, length_values, labels, args.fn_plot)
def plot_kmeans_assignments(args):
    """
    Plot of the genus assignments for each of the scaffolds after
    performing k-means clustering.

    Each k-means cluster is labelled with the genus that has the largest
    summed bit score among the assigned scaffolds of that cluster, or with
    defs.not_assigned when the cluster has no assigned scaffold. That label
    is given to every scaffold of the cluster, and the scaffolds are then
    plotted with Plots.fig2 using args.fn_plot.

    Raises ValueError if the number of scaffolds in the scaffolds table
    differs from the number of scaffolds that received a k-means label, or
    if a scaffold of the scaffolds table has no k-means label.
    """
    log.info("Plotting the K-means assignments")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    sql_command = " SELECT DISTINCT cluster FROM {0} ".format(
        db.KmeansResultsTable)
    data = db.retrieve_data(sql_command)
    clusters = [r["cluster"] for r in data]

    pairs_scaffold_genus = []
    for cluster in clusters:
        # Select the scaffolds assigned in the cluster, sum the bit scores
        # of each of the genera, and sort by the sum. Ordering by the
        # aggregate (not the bare bits column) is what makes the first row
        # the genus with the largest total.
        sql_command = (
            " SELECT {0}.scaffold, {0}.genus, SUM({0}.bits) "
            "FROM {0} INNER JOIN {1} "
            "WHERE cluster = {2} AND {0}.scaffold = {1}.scaffold "
            "GROUP BY {0}.genus "
            "ORDER BY SUM({0}.bits) DESC "
        ).format(db.ScaffoldsAssignmentsTable, db.KmeansResultsTable, cluster)
        data = db.retrieve_data(sql_command)
        # The genus with the largest number of bits is the first entry
        if len(data) == 0:
            genus = defs.not_assigned
        else:
            genus = data[0]["genus"]
        # Assign the genus to all the scaffolds in the cluster
        sql_command = " SELECT {0}.scaffold FROM {0} WHERE cluster = {1} ".format(
            db.KmeansResultsTable, cluster)
        data = db.retrieve_data(sql_command)
        pairs_scaffold_genus.extend([(r["scaffold"], genus) for r in data])

    sql_command = (
        "SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length "
        "FROM {0} "
        "ORDER BY scaffold "
    ).format(db.ScaffoldsTable)
    data = db.retrieve_data(sql_command)
    db.close()

    if len(data) != len(pairs_scaffold_genus):
        raise ValueError("The number of scaffolds in the database is not the "
                         "same as the number of scaffolds assigned with k-means")

    # Look the genus up by scaffold name instead of pairing rows by
    # position, so the result does not depend on the database and Python
    # sorting the scaffold names identically.
    genus_by_scaffold = dict(pairs_scaffold_genus)
    coverages = []
    cgs = []
    lengths = []
    genera = []
    for r in data:
        scaffold = r["scaffold"]
        if scaffold not in genus_by_scaffold:
            raise ValueError(
                "Scaffold {0} has no k-means cluster assignment".format(scaffold))
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(genus_by_scaffold[scaffold])
    Plots.fig2(coverages, cgs, lengths, genera, args.fn_plot)