Ejemplo n.º 1
0
	def __exit__(self, exc_type, exc_val, exc_tb):
		self.f.close()
		if exc_val is None:
			self.arc.add(self.tmp, arcname=self.arcname)
		remove_temp(self.task, self.tmp)
		return False
Ejemplo n.º 2
0
def combination_recurrences(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info(
        "--- [{0} ({1}) ({2}) ({3})] {4}".format(
            classifier["name"], group_long_name, group_short_name, group_name, "-" * 30
        )
    )

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug("  > {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    create_db(conn)

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()

        sample_total += project_sample_total

        log.info("    Total samples = {0}".format(project_sample_total))

        log.info("    Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
                continue

            start, end, ref, alt = var_to_tab(var)

            try:
                c.execute(
                    "INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                    (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)),
                )
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                c.execute(
                    "SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                    (var.chr, var.strand, start, ref, alt),
                )
                r = c.fetchone()
                var_id = r[0]

            try:
                c.execute(
                    "INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                    (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq),
                )
            except sqlite3.IntegrityError:
                c.execute(
                    """
					UPDATE variant_genes
					SET sample_freq=sample_freq + ?
					WHERE var_id=? AND gene_id=?""",
                    (rec.sample_freq, var_id, afg.gene_id),
                )

            count += 1

        log.info("      {0} variant genes".format(count))

        log.info("    Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)", (gene.id, rec.sample_freq))
            else:
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?", (rec.sample_freq, gene.id))
            count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)", (pathway.id, rec.sample_freq))
            else:
                c.execute(
                    "UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?", (rec.sample_freq, pathway.id)
                )
            count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))

    c.close()
    conn.commit()

    log.info("Saving results ...")

    c = conn.cursor()

    base_path = paths.combination_path("recurrences")

    log.info("  Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(
            f,
            "CHR",
            "STRAND",
            "START",
            "ALLELE",
            "GENE_ID",
            "IMPACT",
            "IMPACT_CLASS",
            "SAMPLE_FREQ",
            "SAMPLE_PROP",
            "PROT_CHANGES",
            "XREFS",
        )
        for r in c.execute(
            "SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"
        ):
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)
            tsv.write_line(
                f,
                r["chr"],
                strand,
                r["start"],
                allele,
                r["gene_id"],
                r["impact"],
                TransFIC.class_name(r["impact"]),
                r["sample_freq"],
                r["sample_prop"],
                r["prot_changes"],
                r["xrefs"],
                null_value="-",
            )

    log.info("  Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info("  Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    conn.close()

    remove_temp(task, db_path)
Ejemplo n.º 3
0
def combination_oncodrivefm(projects_set):
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_values = classifier["group_values"]
	short_values = classifier["group_short_values"]
	long_values = classifier["group_long_values"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	if len(group_values) == 0:
		group_file_prefix = classifier_id
	else:
		group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

	group_file_prefix = normalize_id(group_file_prefix)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
		classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Exporting project data ...")

	base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))

	log.debug("> {0}".format(base_path))

	project_ids = []
	gene_files = []
	pathway_files = []
	for project in projects:
		project_id = project["id"]
		project_ids += [project_id]

		log.info("  Project {0}:".format(project["id"]))

		projdb = ProjectDb(project["db"])

		log.info("    Genes ...")

		count = 0
		file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
		gene_files += [file_path]
		with open(file_path, "w") as f:
			tsv.write_param(f, "classifier", classifier_id)
			tsv.write_param(f, "group_id", group_name)
			tsv.write_param(f, "slice", project_id)
			tsv.write_line(f, "GENE_ID", "PVALUE")
			for gene in projdb.genes():
				if gene.fm_pvalue is not None:
					tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
					count += 1

		log.info("      {0} genes".format(count))

		log.info("    Pathways ...")

		count = 0
		file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
		pathway_files += [file_path]
		with open(file_path, "w") as f:
			tsv.write_param(f, "classifier", classifier_id)
			tsv.write_param(f, "group_id", group_name)
			tsv.write_param(f, "slice", project_id)
			tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
			for pathway in projdb.pathways():
				if pathway.fm_zscore is not None:
					tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
					count += 1

		log.info("      {0} pathways".format(count))

		projdb.close()

	log.info("Combining ...")

	combination_path = paths.combination_path("oncodrivefm")

	log.info("  Genes ...")

	cmd = " ".join([
			"oncodrivefm-combine",
			"-m median-empirical",
			"-o '{0}'".format(combination_path),
			"-n 'gene-{0}'".format(group_file_prefix),
			"-D 'classifier={0}'".format(classifier_id),
			"-D 'group_id={0}'".format(group_name),
			"-D 'group_short_name={0}'".format(group_short_name),
			"-D 'group_long_name={0}'".format(group_long_name),
			"--output-format tsv.gz"
	] + ["'{0}'".format(name) for name in gene_files])

	log.debug(cmd)

	ret_code = subprocess.call(cmd, shell=True)
	if ret_code != 0:
		#log.error("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))
		#return -1
		raise Exception("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))

	log.info("  Pathways ...")

	cmd = " ".join([
			"oncodrivefm-combine",
			"-m median-zscore",
			"-o '{0}'".format(combination_path),
			"-n 'pathway-{0}'".format(group_file_prefix),
			"-D 'classifier={0}'".format(classifier_id),
			"-D 'group_id={0}'".format(group_name),
			"-D 'group_short_name={0}'".format(group_short_name),
			"-D 'group_long_name={0}'".format(group_long_name),
			"--output-format tsv.gz"
	] + ["'{0}'".format(name) for name in pathway_files])

	log.debug(cmd)

	ret_code = subprocess.call(cmd, shell=True)
	if ret_code != 0:
		#log.error("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))
		#return -1
		raise Exception("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))

	remove_temp(task, base_path)
Ejemplo n.º 4
0
def scan_files(project):
	log = task.logger
	conf = task.conf

	config = GlobalConfig(conf)
	paths = PathsConfig(config)

	projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

	project_id = project["id"]
	temp_path = project["temp_path"]
	project_path = project["path"]
	projdb_path = project["db"]
	assembly = project["assembly"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	if assembly == "hg18":
		out_port = liftover_projects_port
	elif assembly == "hg19":
		out_port = projects_port
	else:
		raise Exception("Unexpected assembly: {0}".format(assembly))

	#if os.path.exists(projdb_path):
	#	log.warn("Variations database already created, skipping this step.")
	#	out_port.send(project)
	#	return

	if os.path.exists(projdb_path):
		os.remove(projdb_path)

	log.info("Creating variants database ...")

	projdb_tmp_path = make_temp_file(task, suffix=".db")

	log.debug(projdb_tmp_path)

	projdb = ProjectDb(projdb_tmp_path).create()

	data_path = config.data_path

	log.info("Loading genes ...")

	projdb.load_genes(paths.data_ensembl_genes_path())

	log.info("Loading pathways ...")

	projdb.load_pathways(
		paths.data_kegg_def_path(),
		paths.data_kegg_ensg_map_path())

	log.info("Parsing variants ...")

	for obj_name in project["storage_objects"]:
		log.info("Downloading {} ...".format(obj_name))
		dst_path = os.path.join(project_path, "sources", os.path.basename(obj_name))
		dst_dirname = os.path.dirname(dst_path)
		if not os.path.exists(dst_dirname):
			os.makedirs(dst_dirname)
		# TODO: do not copy the source file (do not specify dst_path)
		task.storage.get_object(obj_name).get_data(dst_path)

		for container_name, path, name, ext, f in archived_files(dst_path):
			fname = os.path.join(path, name + ext)
			if container_name is not None:
				source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
			else:
				source_name = name + ext

			log.info("=> {0} ...".format(source_name))

			sample_id = os.path.basename(name)

			if ext.lower() in _SUPPORTED_EXTENSIONS:
				parser_type = ext[1:]
			else:
				parser_type = "tab"

			parser = create_variants_parser(parser_type, f, source_name, sample_id)

			source_id = projdb.add_source(source_name)

			var_ids = set()
			for var in parser:
				for line_num, text in parser.read_lines():
					projdb.add_source_line(source_id, line_num, text)

				var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
				var_ids.add(var_id)

			for line_num, text in parser.read_lines():
				projdb.add_source_line(source_id, line_num, text)

			num_variants = len(var_ids)
			log.info("   {0} variants".format(num_variants))

			if num_variants == 0:
				raise Exception("No variants found in source '{}'. "
								"Please check the documentation for the expected input for '{}' format.".format(
								source_name, parser.name))

	projdb.commit()
	projdb.close()

	log.info("Copying variants database ...")

	log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

	shutil.copy(projdb_tmp_path, projdb_path)

	remove_temp(task, projdb_tmp_path)

	out_port.send(project)
Ejemplo n.º 5
0
def liftover(project):
    log = task.logger
    conf = task.conf

    config = GlobalConfig(conf)

    lifted_project_port = task.ports("lifted_projects")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    log.info("Preparing liftOver files ...")

    in_path = make_temp_file(task, suffix=".bed")
    in_file = open(in_path, "w")
    out_path = make_temp_file(task, suffix=".bed")
    unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")

    projdb = ProjectDb(project["db"])

    for var in projdb.variants(order_by="position"):
        in_file.write(tsv.line_text("chr" + var.chr, var.start, var.start + len(var.ref), var.id))

    in_file.close()

    log.info("Running liftOver ...")

    project["from_assembly"] = project["assembly"]
    project["assembly"] = "hg19"

    cmd = " ".join(
        [
            conf["liftover_bin"],
            in_path,
            os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
            out_path,
            unmapped_path,
        ]
    )

    log.debug(cmd)

    subprocess.call(cmd, shell=True)

    log.info("Annotating unmapped variants ...")

    count = 0
    with open(unmapped_path, "r") as f:
        for line in f:
            if line.lstrip().startswith("#"):
                continue
            fields = line.rstrip().split("\t")
            var_id = int(fields[3])
            projdb.update_variant_start(var_id, start=None)
            count += 1

    log.info("  {0} unmapped variants annotated".format(count))

    log.info("Updating variants ...")

    count = 0
    with open(out_path, "r") as f:
        for line in f:
            fields = line.rstrip().split("\t")
            chr, start, end, var_id = fields
            projdb.update_variant_start(var_id, start=start)
            count += 1

    log.info("  {0} variants".format(count))

    remove_temp(task, in_path, out_path)

    projdb.commit()
    projdb.close()

    lifted_project_port.send(project)