def load_orthofinder_cluster(_run_id, ogfile):
    """Load cluster file from mcl (via orthofinder) into homology database.

    Each line of `ogfile` is treated as one cluster. The line is split on
    '@' and whitespace, and only the all-digit tokens are kept as sequence
    ids. Clusters with at least 4 sequence ids are inserted into the
    `homology` table under `_run_id`; smaller clusters are only counted in
    the size histogram.

    Any existing `homology` rows for `_run_id` are deleted first. The delete
    and all inserts run in one transaction, which is rolled back if anything
    fails (e.g. the file cannot be opened), so a failed load does not leave
    the previous rows half-replaced.

    Args:
        _run_id: run identifier stored in the `run_id` column.
        ogfile: path to the cluster file to read.

    Side effects:
        Reports the cluster-size histogram through `utils.info` and records
        `nseqs` and `histogram` through `diagnostics.log`.
    """
    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    try:
        database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
        with open(ogfile, 'r') as f:
            cluster_id = 0
            for line in f:
                # A list (not a lazy `filter` object) is required: the
                # cluster is measured with len() and then iterated.
                cluster = [s for s in re.split(r'[@\s]', line) if s.isdigit()]
                n = len(cluster)
                hist[n] = hist.get(n, 0) + 1
                if n >= 4:
                    nseqs += n
                    for seq_id in cluster:
                        database.execute("""
                            INSERT INTO homology (run_id, component_id, sequence_id)
                            VALUES (?,?,?);""",
                            (_run_id, cluster_id, seq_id))
                    # Only stored clusters consume a component id.
                    # NOTE(review): original indentation was lost; confirm the
                    # increment belongs inside the size check.
                    cluster_id += 1
    except Exception:
        # Undo the DELETE and any partial inserts before propagating.
        database.execute("ROLLBACK")
        raise
    else:
        database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        '\n '.join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)))

    diagnostics.log('nseqs', nseqs)
    diagnostics.log('histogram', hist)
def load_mcl_cluster(_run_id, cluster_file):
    """Load cluster file from mcl into homology database.

    Each line of `cluster_file` is treated as one cluster. The line is split
    on whitespace, and only tokens whose first character is a digit are kept
    as sequence ids. Clusters with at least 4 sequence ids are inserted into
    the `homology` table under `_run_id`; smaller clusters are only counted
    in the size histogram.

    Any existing `homology` rows for `_run_id` are deleted first. The delete
    and all inserts run in one transaction, which is rolled back if anything
    fails (e.g. the file cannot be opened), so a failed load does not leave
    the previous rows half-replaced.

    Args:
        _run_id: run identifier stored in the `run_id` column.
        cluster_file: path to the mcl cluster file to read.

    Side effects:
        Reports the cluster-size histogram through `utils.info` and records
        `nseqs` and `histogram` through `diagnostics.log`.
    """
    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    try:
        database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
        with open(cluster_file, "r") as f:
            cluster_id = 0
            for line in f:
                # A list (not a lazy `filter` object) is required: the
                # cluster is measured with len() and then iterated.
                # str.split() never yields empty tokens, so s[0] is safe.
                cluster = [s for s in line.rstrip().split() if s[0].isdigit()]
                n = len(cluster)
                hist[n] = hist.get(n, 0) + 1
                if n >= 4:
                    nseqs += n
                    for seq_id in cluster:
                        database.execute(
                            """
                            INSERT INTO homology (run_id, component_id, sequence_id)
                            VALUES (?,?,?);""",
                            (_run_id, cluster_id, seq_id),
                        )
                    # Only stored clusters consume a component id.
                    # NOTE(review): original indentation was lost; confirm the
                    # increment belongs inside the size check.
                    cluster_id += 1
    except Exception:
        # Undo the DELETE and any partial inserts before propagating.
        database.execute("ROLLBACK")
        raise
    else:
        database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        "\n ".join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)),
    )

    diagnostics.log("nseqs", nseqs)
    diagnostics.log("histogram", hist)
def component_histogram(self):
    """
    Distribution of the number of nodes in each cluster.

    Counts the `homology` rows per `component_id` for `self.run_id`, then
    tallies how many components have each size. The tally is passed to
    `self.histogram_overlay` as a numpy array of (size, frequency) rows
    sorted by size.

    Returns:
        A one-element list holding the result of `self.histogram_overlay`,
        or None when the run has no rows in `homology`.
    """
    hist = {}
    sql = """
        SELECT component_id, COUNT(*)
        FROM homology
        WHERE run_id=?
        GROUP BY component_id;"""
    for _, count in database.execute(sql, (self.run_id,)):
        hist[count] = hist.get(count, 0) + 1

    if not hist:
        # Nothing to plot for this run.
        return None

    # sorted(hist) iterates the keys directly; unlike dict.iterkeys() it
    # works on both Python 2 and Python 3.
    hist = numpy.array([(k, hist[k]) for k in sorted(hist)])
    imgname = "%d.component.hist.png" % self.run_id
    props = {
        "title": "Distribution of Cluster Sizes",
        "xlabel": "# Nodes in Cluster",
        "ylabel": "Frequency",
    }
    return [self.histogram_overlay(imgname, [hist], props=props)]