Example #1
0
def load_orthofinder_cluster(_run_id, ogfile):
    """Load cluster file from mcl (via orthofinder) into homology database.

    Each line of `ogfile` is treated as one cluster. The line is split on
    '@' and whitespace, and only the all-digit tokens are kept as sequence
    ids. Clusters with at least 4 sequence ids are inserted into the
    `homology` table under consecutive component ids starting at 0; smaller
    clusters are only counted in the size histogram.

    Existing `homology` rows for `_run_id` are deleted first. The delete and
    all inserts are issued between a single BEGIN and COMMIT; if anything
    fails in between, a ROLLBACK is issued and the exception is re-raised.

    Args:
        _run_id: value stored in the `run_id` column of every inserted row.
        ogfile: path of the cluster file to read.
    """

    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    try:
        database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
        with open(ogfile, 'r') as f:
            cluster_id = 0
            for line in f:
                # A real list is required here: on Python 3 filter() returns
                # a lazy iterator, which has no len().
                cluster = [s for s in re.split(r'[@\s]', line) if s.isdigit()]
                n = len(cluster)
                # Every line contributes to the histogram, even clusters
                # that are too small to be stored.
                hist[n] = hist.get(n, 0) + 1
                if n >= 4:
                    nseqs += n
                    for seq_id in cluster:
                        database.execute("""
                        INSERT INTO homology (run_id, component_id, sequence_id)
                        VALUES (?,?,?);""",
                            (_run_id, cluster_id, seq_id))
                    # Component ids are only consumed by stored clusters.
                    cluster_id += 1
    except Exception:
        # Do not leave the explicit transaction open on failure.
        database.execute("ROLLBACK")
        raise
    database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        '\n '.join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)))

    diagnostics.log('nseqs', nseqs)
    diagnostics.log('histogram', hist)
Example #2
0
def load_mcl_cluster(_run_id, cluster_file):
    """Load cluster file from mcl into homology database.

    Each line of `cluster_file` is treated as one cluster. The line is split
    on whitespace and only tokens whose first character is a digit are kept
    as sequence ids. Clusters with at least 4 sequence ids are inserted into
    the `homology` table under consecutive component ids starting at 0;
    smaller clusters are only counted in the size histogram.

    Existing `homology` rows for `_run_id` are deleted first. The delete and
    all inserts are issued between a single BEGIN and COMMIT; if anything
    fails in between, a ROLLBACK is issued and the exception is re-raised.

    Args:
        _run_id: value stored in the `run_id` column of every inserted row.
        cluster_file: path of the cluster file to read.
    """

    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    try:
        database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
        with open(cluster_file, "r") as f:
            cluster_id = 0
            for line in f:
                # A real list is required here: on Python 3 filter() returns
                # a lazy iterator, which has no len(). split() with no
                # arguments never yields empty tokens, so s[0] is safe.
                cluster = [s for s in line.rstrip().split() if s[0].isdigit()]
                n = len(cluster)
                # Every line contributes to the histogram, even clusters
                # that are too small to be stored.
                hist[n] = hist.get(n, 0) + 1
                if n >= 4:
                    nseqs += n
                    for seq_id in cluster:
                        database.execute(
                            """
						INSERT INTO homology (run_id, component_id, sequence_id)
						VALUES (?,?,?);""",
                            (_run_id, cluster_id, seq_id),
                        )
                    # Component ids are only consumed by stored clusters.
                    cluster_id += 1
    except Exception:
        # Do not leave the explicit transaction open on failure.
        database.execute("ROLLBACK")
        raise
    database.execute("COMMIT")

    utils.info("histogram of gene cluster sizes:\n", "\n ".join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)))

    diagnostics.log("nseqs", nseqs)
    diagnostics.log("histogram", hist)
Example #3
0
    def component_histogram(self):
        """
		Distribution of the number of nodes in each cluster.
		"""
        hist = {}
        sql = """
			SELECT component_id, COUNT(*)
			FROM homology
			WHERE run_id=?
			GROUP BY component_id;"""
        for _, count in database.execute(sql, (self.run_id,)):
            hist[count] = hist.get(count, 0) + 1
        if hist:
            hist = numpy.array([(k, hist[k]) for k in sorted(hist.iterkeys())])
            imgname = "%d.component.hist.png" % self.run_id
            props = {"title": "Distribution of Cluster Sizes", "xlabel": "# Nodes in Cluster", "ylabel": "Frequency"}
            return [self.histogram_overlay(imgname, [hist], props=props)]