Example #1
0
def main():
    parser = argparse.ArgumentParser(description="Extract mutations in VCF and save as simple tabulated file")

    parser.add_argument("vcf_paths", metavar="PATH", nargs="+", help="The VCF files")

    parser.add_argument("-o", dest="out_path", metavar="PATH", help="Output file. Use - for standard output.")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("vcf-to-snvs")

    if args.out_path is None:
        names = []
        for path in args.vcf_paths:
            if path != "-":
                base_path, name, ext = tsv.split_path(path)
                names += [name]

        prefix = os.path.commonprefix(names) if len(names) > 0 else ""
        prefix = prefix.rstrip(".")
        if len(prefix) == 0:
            prefix = "genome"
        args.out_path = "{}.tsv.gz".format(prefix)

    with tsv.open(args.out_path, "w") as outf:
        tsv.write_line(outf, "CHR", "POS", "REF", "ALT")

        for path in args.vcf_paths:
            log.info("Reading {} ...".format(path))

            with tsv.open(path) as inf:
                types = (str, str, str, str)
                columns = [0, 1, 3, 4]
                for fields in tsv.lines(inf, types, columns=columns):
                    chrom, pos, ref, alt = fields

                    # ref = ref.upper().strip("N")
                    # alt = alt.upper().strip("N")

                    ref_len = len(ref)
                    alt_len = len(alt)

                    if ref_len != alt_len or ref_len == 0 or alt_len == 0:
                        continue

                    try:
                        pos = int(pos)
                    except ValueError:
                        continue

                    if ref_len == 1:
                        tsv.write_line(outf, chrom, pos, ref, alt)
                    else:
                        for i in range(ref_len):
                            tsv.write_line(outf, chrom, pos + i, ref[i], alt[i])
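A quick worked illustration of the per-position expansion done at the end of Example #1 (a sketch only; the chromosome, position and alleles below are made up):

# Illustrative only: an equal-length multi-base substitution becomes one row per position.
chrom, pos, ref, alt = "7", 100, "ACT", "GTA"
rows = [(chrom, pos + i, ref[i], alt[i]) for i in range(len(ref))]
# rows == [("7", 100, "A", "G"), ("7", 101, "C", "T"), ("7", 102, "T", "A")]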
Example #2
0
	def save_matrix(self, output_path, analysis_name, output_format,
						 row_names, col_names, data,
						 suffix="", params=None, valid_row=lambda row: True):

		if len(suffix) > 0:
			suffix = "-{0}".format(suffix)

		if params is None:
			params = []

		path = os.path.join(output_path, "{0}{1}.{2}".format(analysis_name, suffix, output_format))
		self.log.debug("  > {0}".format(path))

		with tsv.open(path, 'w') as f:
			tsv.write_line(f, "## version={0}".format(VERSION))
			tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			for key, value in params + self.parameters:
				tsv.write_line(f, "## {0}={1}".format(key, value))
			tsv.write_line(f, "ID", *col_names)
			for row_index, row_name in enumerate(row_names):
				if len(row_name) == 0:
					self.log.warn("Empty identifier detected")
					continue

				row = data[row_index, :]
				if valid_row(row):
					values = [v if not np.isnan(v) else None for v in row]
					tsv.write_line(f, row_name, *values, null_value="-")
Example #3
0
def extract_snvs(fanns_db, data_path, logger=None):

	logger = logger or logging.getLogger("perf-cosmic")

	snvs = dict()

	logger.info("Reading mutations ...")
	
	progress = RatedProgress(logger, name="mutations")
	
	with tsv.open(data_path, "r") as df:
		columns = [
			"Genome-wide screen",
			"Mutation Description",
			"Mutation CDS",
			"Mutation AA",
			"Mutation GRCh37 genome position",
			"Mutation GRCh37 strand",
			"Accession Number",
			"ID_sample"]

		total_rows = queried_rows = dbfound_rows = 0
		for fields in tsv.rows(df, columns=columns, header=True):
			total_rows += 1
			wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields

			# wide_screen != "y"
			if mut_desc != "Substitution - Missense":
				continue

			queried_rows += 1
			for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
				dbfound_rows += 1
				k = tuple([row[k] for k in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
				if k not in snvs:
					snvs[k] = snv = dict(
						transcript=row["transcript"],
						symbol=row["xrefs"]["symbol"],
						msamples=set(), wsamples=set())
				else:
					snv = snvs[k]
				
				if wide_screen == "y":
					snv["wsamples"].add(sample_id)
				else:
					snv["msamples"].add(sample_id)
			
			progress.update()

		progress.log_totals()

	logger.info("Counting the number of samples per mutation ...")
	
	for data in snvs.itervalues():
		data["msamples"] = len(data["msamples"])
		data["wsamples"] = len(data["wsamples"])
    
	logger.info("Total: total_rows={}, queried_rows={}, found_rows={}, protein_changes={}".format(total_rows, queried_rows, dbfound_rows, len(snvs)))

	return snvs
Example #4
0
def update_db(project):
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	oclust = project["oncodriveclust"]
	del project["oncodriveclust"]

	if not os.path.exists(oclust["results"]):
		log.warn("No results have been found. Skipping it.")
		return

	log.info("Updating the project database ...")

	projdb = ProjectDb(project["db"])

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("  Excluded gene causes ...")
	log.debug("    > {0}".format(exc_path))

	count = 0
	with tsv.open(exc_path, "r") as exf:
		for gene, cause in tsv.lines(exf, (str, str), header=True):
			projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
			count += 1

	log.debug("    {0} genes excluded".format(count))

	log.info("  OncodriveCLUST results ...")

	with tsv.open(oclust["results"], "r") as f:
		types = (str, str, float, float, float)
		columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
		for gene, coords, zscore, pvalue, qvalue in tsv.lines(f, types, columns=columns, header=True, null_value="NA"):
			projdb.update_gene(Gene(id=gene, clust_coords=coords, clust_zscore=zscore, clust_pvalue=pvalue,
									clust_qvalue=qvalue, clust_exc_cause=ProjectDb.NO_GENE_EXC))

	projdb.commit()

	projdb.close()

	projects_out_port.send(project)
Example #5
0
	def load_cds_len(self, path):

		self.logger.info("Loading transcripts CDS length ...")
		self.logger.debug("> {}".format(path))

		cds_len = {}
		with tsv.open(path, "r") as f:
			for gene, transcript, transcript_len in tsv.lines(f, (str, str, int), header=True):
				cds_len[transcript] = transcript_len
		return cds_len
Example #6
0
def main():
    parser = argparse.ArgumentParser(description="Add annotations")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("id", metavar="ID", help="Annotation identifier.")

    parser.add_argument("name", metavar="NAME", help="Annotation name.")

    parser.add_argument(
        "type", metavar="TYPE", choices=["transcript", "protein"], help="Annotation type: transcript, protein"
    )

    parser.add_argument("path", metavar="PATH", help="Annotation items")

    parser.add_argument(
        "--priority",
        dest="priority",
        default=0,
        help="Priority for translating input annotations. 0 means not considered for translation. Default 0.",
    )

    parser.add_argument(
        "--header",
        dest="header",
        action="store_true",
        default=False,
        help="Specify that the annotation items file have a header.",
    )

    args, logger = cmd.parse_args("ann-add")

    db = cmd.open_db()

    try:
        logger.info("Creating annotation {} ...".format(args.name))

        db.add_map(args.id, args.name, args.type, args.priority)

        logger.info("Loading items ...")

        with tsv.open(args.path) as f:
            for source, value in tsv.lines(f, (str, str), header=args.header):
                if len(source) > 0 and len(value) > 0:
                    db.add_map_item(args.id, source, value)

        db.commit()
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
Example #7
0
	def load_data(self, data_paths, method=None):
		columns = []
		col_names = []
		row_name_index = {}
		for col_index, data_file in enumerate(data_paths):
			self.log.debug("  > {0}".format(data_file))
			names = []
			values = []
			with tsv.open(data_file, "r") as f:
				col_name, ext = os.path.splitext(os.path.basename(data_file))
				params = tsv.params(f)
				if "slice" in params:
					col_name = params["slice"]
				if "method" in params:
					if method is None:
						method = params["method"]
					elif method != params["method"]:
						self.log.warn("Different method of computation used for file {0}".format(data_file))

				for name, value in tsv.lines(f, (str, float), header=True, null_value="-"):
					if len(name) == 0:
						self.log.warn("Empty identifier detected")
						continue

					if name not in row_name_index:
						row_name_index[name] = len(row_name_index)

					names += [name]
					values += [value]
			col_names += [col_name]
			columns += [(names, values)]

		num_cols = len(columns)
		num_rows = len(row_name_index)
		row_names = [None] * num_rows
		for name, index in row_name_index.items():
			row_names[index] = name

		data = np.empty((num_rows, num_cols))
		data[:] = np.nan

		for col_index, (names, values) in enumerate(columns):
			for i, name in enumerate(names):
				data[row_name_index[name], col_index] = values[i]

		return row_names, col_names, data, method
Example #8
0
def open_dataset(project_id, base_path, datasets_path, name, mode, logger):
	name, ext = os.path.splitext(name)
	ext = ext.lower()
	if len(ext) == 0:
		ext = ".gz"
		name = "{0}.tsv{1}".format(name, ext)
	else:
		name = name + ext

	path = os.path.join(datasets_path, name)
	logger.debug("> {0}".format(os.path.relpath(path, base_path)))

	f = tsv.open(path, mode)

	tsv.write_param(f, "version", VERSION)
	tsv.write_param(f, "date", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
	tsv.write_param(f, "PROJECT_ID", project_id)

	return f
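The datasets opened above start with parameter header lines. Below is a minimal sketch of reading such parameters back, assuming tsv.write_param emits the same "## key=value" comment style that Examples #2 and #9 write by hand; this is not the actual tsv.params implementation.

def read_params(f):
    # Collect leading "## key=value" lines and leave the file positioned
    # at the first non-parameter line.
    params = {}
    pos = f.tell()
    line = f.readline()
    while line.startswith("##"):
        key, _, value = line[2:].strip().partition("=")
        params[key] = value
        pos = f.tell()
        line = f.readline()
    f.seek(pos)
    return params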
Example #9
0
	def save_combined_results(self, output_path, analysis_name, output_format,
								method, row_names, col_names, data, suffix="combination"):

		self.log.info("Saving combination results ...")

		path = os.path.join(output_path,
							"{0}-{1}.{2}".format(analysis_name, suffix, output_format))

		self.log.debug("  > {0}".format(path))
		with tsv.open(path, 'w') as f:
			tsv.write_line(f, "## slices={0}".format(",".join(col_names)))
			tsv.write_line(f, "## method={0}".format(method.name))
			tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			for key, value in self.parameters:
				tsv.write_line(f, "## {0}={1}".format(key, value))
			tsv.write_line(f, "ID", *method.combination_columns)
			for row_index, row_name in enumerate(row_names):
				if not np.isnan(data[row_index, 0]):
					values = [v if not np.isnan(v) else None for v in data[row_index, :]]
					tsv.write_line(f, row_name, *values, null_value="-")
Example #10
0
def fetch(db, muts_path, out_path, params=None, columns=None, maps=None, predictors=None,
		  labels=None, calc_labels=None, muts_header=False, logger=None):
	
	params = params or {}
	columns = columns or [c.lower() for c in COORD_COLUMNS]
	maps = maps or []
	predictors = predictors or []
	labels = labels or []
	
	state = {}
	
	with tsv.open(out_path, "w") as wf:
		
		metadata = db.metadata
		if "version" in metadata:
			tsv.write_param(wf, "db-version", db.metadata["version"])
		tsv.write_param(wf, "fetched", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
		for k, v in params.items():
			tsv.write_param(wf, k, v)
	
		tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [m.upper() for m in maps] + predictors + labels)
	
		for row in fetch_iter(db, muts_path, maps=maps, predictors=predictors,
							  muts_header=muts_header, state=state, logger=logger):
			
			if calc_labels is not None:
				row_labels = calc_labels(row) or {}
			else:
				row_labels = {}
	
			xrefs = row["xrefs"]
			scores = row["scores"]

			tsv.write_line(wf, state[STATE_MUTATION].identifier,
				   *[row[c] for c in columns]
				   + [xrefs[m] for m in maps]
				   + [scores[p] for p in predictors]
				   + [row_labels.get(l, "") for l in labels])
	
	return {k : state[k] for k in [STATE_HITS, STATE_FAILS]}
Example #11
0
def add_map(db, id, name, type, priority, path, header=True):
	"""
	:param id: map identifier
	:param name: map name
	:param type: the feature type the xrefs map to: transcript or protein
	:param path: map file
	:param priority: priority for translating input xrefs. 0 means not considered for translation. Default 0.
	:param header: specify whether the map file has a header.
	"""

	logger = logging.getLogger("fannsdb.map-add")

	logger.info("Creating map {} ...".format(name))

	db.add_map(id, name, type, priority)

	logger.info("Loading items ...")

	with tsv.open(path) as f:
		for source, value in tsv.lines(f, (str, str), header=header):
			if len(source) > 0 and len(value) > 0:
				db.add_map_item(id, source, value)
Example #12
0
def main():
	parser = argparse.ArgumentParser(
		description="Export SNV's")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("dest_path", metavar="DEST",
						help="The destination file. Use - for standard output.")

	args, logger = cmd.parse_args("export-snvs")

	db = cmd.open_db()

	logger.info("Exporting SNV's ...")

	total_count = 0
	total_start_time = time.time()

	try:
		progress = RatedProgress(logger, name="SNVs")
		rows_count = 0
		with tsv.open(args.dest_path, "w") as f:
			for snv in db.snvs():
				rows_count += 1

				tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"], "{}>{}".format(snv["ref"], snv["alt"]), "S")

				progress.update()

		log.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Example #13
0
	def save_splited_results(self, output_path, analysis_name, output_format,
								matrix, mapping, method, results, slices, suffix=""):

		if len(suffix) > 0:
			suffix = "-{0}".format(suffix)

		for slice_results_index, slice in enumerate(slices):
			slice_name = matrix.slice_names[slice]
			path = os.path.join(output_path, "{0}{1}-{2}.{3}".format(
				analysis_name, suffix, slice_name, output_format))
			self.log.debug("  > {0}".format(path))

			with tsv.open(path, 'w') as f:
				tsv.write_line(f, "## version={0}".format(VERSION))
				tsv.write_line(f, "## slice={0}".format(slice_name))
				tsv.write_line(f, "## method={0}".format(method.name))
				tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
				for key, value in self.parameters:
					tsv.write_line(f, "## {0}={1}".format(key, value))
				tsv.write_line(f, "ID", *method.results_columns)
				for row_index, row_name in enumerate(mapping.group_names):
					value = results[slice_results_index, row_index]
					if not np.isnan(value):
						tsv.write_line(f, row_name, value, null_value="-")
Example #14
0
def combination_recurrences(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info(
        "--- [{0} ({1}) ({2}) ({3})] {4}".format(
            classifier["name"], group_long_name, group_short_name, group_name, "-" * 30
        )
    )

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug("  > {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    create_db(conn)

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()

        sample_total += project_sample_total

        log.info("    Total samples = {0}".format(project_sample_total))

        log.info("    Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
                continue

            start, end, ref, alt = var_to_tab(var)

            try:
                c.execute(
                    "INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                    (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)),
                )
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                c.execute(
                    "SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                    (var.chr, var.strand, start, ref, alt),
                )
                r = c.fetchone()
                var_id = r[0]

            try:
                c.execute(
                    "INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                    (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq),
                )
            except sqlite3.IntegrityError:
                c.execute(
                    """
					UPDATE variant_genes
					SET sample_freq=sample_freq + ?
					WHERE var_id=? AND gene_id=?""",
                    (rec.sample_freq, var_id, afg.gene_id),
                )

            count += 1

        log.info("      {0} variant genes".format(count))

        log.info("    Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)", (gene.id, rec.sample_freq))
            else:
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?", (rec.sample_freq, gene.id))
            count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)", (pathway.id, rec.sample_freq))
            else:
                c.execute(
                    "UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?", (rec.sample_freq, pathway.id)
                )
            count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))

    c.close()
    conn.commit()

    log.info("Saving results ...")

    c = conn.cursor()

    base_path = paths.combination_path("recurrences")

    log.info("  Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(
            f,
            "CHR",
            "STRAND",
            "START",
            "ALLELE",
            "GENE_ID",
            "IMPACT",
            "IMPACT_CLASS",
            "SAMPLE_FREQ",
            "SAMPLE_PROP",
            "PROT_CHANGES",
            "XREFS",
        )
        for r in c.execute(
            "SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"
        ):
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)
            tsv.write_line(
                f,
                r["chr"],
                strand,
                r["start"],
                allele,
                r["gene_id"],
                r["impact"],
                TransFIC.class_name(r["impact"]),
                r["sample_freq"],
                r["sample_prop"],
                r["prot_changes"],
                r["xrefs"],
                null_value="-",
            )

    log.info("  Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info("  Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    conn.close()

    remove_temp(task, db_path)
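The variants table above is filled with an insert-or-accumulate pattern: try the INSERT and, on an IntegrityError, fall back to an UPDATE that adds to the stored frequency. A stripped-down sketch of that pattern with a hypothetical table and column names:

import sqlite3

conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("CREATE TABLE counts (id TEXT PRIMARY KEY, freq INTEGER)")

def accumulate(c, id_, freq):
    # Insert a new row, or add to the existing frequency when the id already exists.
    try:
        c.execute("INSERT INTO counts (id, freq) VALUES (?, ?)", (id_, freq))
    except sqlite3.IntegrityError:
        c.execute("UPDATE counts SET freq = freq + ? WHERE id = ?", (freq, id_))

accumulate(c, "TP53", 3)
accumulate(c, "TP53", 2)
print(c.execute("SELECT freq FROM counts WHERE id = 'TP53'").fetchone()[0])  # 5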
Example #15
0
def prepare_files(project):
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_results = ProjectResults(project)

	mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

	log.info("Loading transcripts CDS length ...")

	cds_len = load_cds_len(conf)

	log.info("Retrieving gene alterations ...")

	projdb = ProjectDb(project["db"])

	data = retrieve_data(projdb, cds_len)

	projdb.close()

	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	df = [tsv.open(path, "w") for path in data_paths]

	gene_sample_count = {}

	for key, value in data.items():
		findex, gene, sample = key
		transcript, transcript_len, protein_pos = value

		if findex == NON_SYN:
			if gene not in gene_sample_count:
				gene_sample_count[gene] = 1
			else:
				gene_sample_count[gene] += 1

			if genes_filter_enabled and not filt.valid(gene):
				continue

		tsv.write_line(df[findex], gene, sample, protein_pos)

	for f in df:
		f.close()

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes += [ProjectDb.GENE_EXC_FILTER]
			if sample_count < mutations_threshold:
				causes += [ProjectDb.GENE_EXC_THRESHOLD]
			if len(causes) > 0:
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

	projects_out_port.send(dict(project,
								oncodriveclust=dict(
									data_paths=data_paths,
									mutations_threshold=mutations_threshold,
									genes_filter_enabled=genes_filter_enabled, # not used
									genes_filter=genes_filter))) # not used
Example #16
0
	def __enter__(self):
		self.f = tsv.open(self.name, "w")
		self._size = 0
		return self
Example #17
0
def main():
	parser = argparse.ArgumentParser(
		description="Fetch Condel scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("muts_path", metavar="SNVS_PATH",
						help="SNV's to check. Use - for standard input.")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="The results path. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	args, logger = cmd.parse_args("fetch")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

	try:
		progress = RatedProgress(logger, name="SNVs")

		with tsv.open(args.muts_path) as f:
			with tsv.open(args.out_path, "w") as wf:
				tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

				hit = fail = 0

				mut = DnaAndProtMutationParser()
				for line_num, line in enumerate(f, start=1):
					line = line.rstrip(" \n\r")
					if len(line) == 0 or line.startswith("#"):
						continue

					try:
						mut.parse(line)
					except PrematureEnd:
						logger.error("Missing fields at line {}".format(line_num))
						fail += 1
						continue
					except UnexpectedToken as ex:
						logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
						fail += 1
						continue

					exists = False
					for row in query_mutation(logger, db, mut, annotations, predictors):

						exists = True

						ann = row["annotations"]
						scores = row["scores"]

						tsv.write_line(wf, mut.identifier,
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])

						"""
						if logger.isEnabledFor(logging.DEBUG):
							logger.debug("    --> {} {} {} {} {} {} {} {} {} {}".format(
										row["chr"], row["start"], row["ref"], row["alt"], row["transcript"],
										row["protein"], row["aa_pos"], row["aa_ref"], row["aa_alt"],
										mut.identifier or "*"))
						"""

					progress.update()

					if exists:
						hit += 1
					else:
						fail += 1

		progress.log_totals()

		logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hit + fail, hit, fail, progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()
Example #18
0
def main():
    parser = argparse.ArgumentParser(description="Calculate Baseline Tolerance statistics per gene")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH", help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP", help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH", help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH", help="Partial gene statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH", help="Output gene statistics")

    parser.add_argument(
        "-c",
        "--count-threshold",
        dest="count_threshold",
        metavar="N",
        type=int,
        default=DEFAULT_COUNT_THRESHOLD,
        help="Minimum number of features per group",
    )

    parser.add_argument(
        "--stdev-threshold",
        dest="stdev_threshold",
        metavar="V",
        type=float,
        default=DEFAULT_STDEV_THRESHOLD,
        help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)",
    )

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    group_children = defaultdict(set)
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_children[group] |= children

    logger.info("Loading mappings between groups and features ...")

    group_genes = defaultdict(set)
    with tsv.open(args.group_genes_path) as f:
        for group, genes in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_genes[group] |= genes

    logger.info("Loading partial statistics ...")

    partial_stats = {}
    with tsv.open(args.stats_path) as f:
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        for line in f:
            fields = line.rstrip("\n").split("\t")
            gene = fields[0]
            gene_stats = [[float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))] for ss in fields[1:]]
            partial_stats[gene] = gene_stats

    logger.info("  Predictors: {}".format(", ".join(predictors)))
    logger.info("  Features: {}".format(len(partial_stats.keys())))

    logger.info("Calculating features ...")

    stats = {}

    feat_count = 0
    feat_partial_count = [0] * num_predictors
    for feature, feat_partial_stats in partial_stats.items():
        feat_with_stats = False
        feat_stats = [None] * (num_predictors + 1)
        for i in range(num_predictors):
            s0, s1, s2 = feat_partial_stats[i]

            if s0 == 0.0:
                continue

            if s0 < args.count_threshold:
                continue

            x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1))
            if x < -1e-12:
                continue

            mean = s1 / s0
            std = math.sqrt(abs(x))
            if std < args.stdev_threshold:
                continue

            feat_stats[i] = (int(s0), mean, std)
            feat_partial_count[i] += 1
            feat_with_stats = True

        if feat_with_stats:
            feat_count += 1
            stats[feature] = feat_stats
            # print feature, "\t".join(["/".join([str(v) for v in feat_stats[i] or []]) for i in range(num_predictors)])

    logger.info(
        "  {} ({}) features out of {} calculated directly from partial statistics".format(
            feat_count, "/".join(map(str, feat_partial_count)), len(partial_stats)
        )
    )

    logger.info("Calculating groups ...")

    calculate_group(
        logger, args.root_group, args.count_threshold, group_children, group_genes, partial_stats, num_predictors, stats
    )

    logger.info("  {} features calculated in total".format(len(stats)))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "GENE", "GROUP", *predictors)
        for gene in sorted(stats.keys()):
            gene_stats = stats[gene]
            sb = [gene]
            stats_group = gene_stats[num_predictors]
            if stats_group is not None:
                sb += [stats_group]
            else:
                sb += ["|" + ("-" * num_predictors)]

            for i in range(num_predictors):
                if gene_stats[i] is not None:
                    sb += ["/".join([str(v) for v in gene_stats[i]])]
                else:
                    sb += ["-/-/-"]
            tsv.write_line(of, *sb)

    return 0
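Each s0/s1/s2 triple parsed above is a running count, sum and sum of squares per predictor. A small sketch of how the loop turns them into the mean and standard deviation (the example numbers are made up):

import math

def mean_std(s0, s1, s2):
    # Sample variance from aggregate sums, mirroring the formula used above.
    x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1))
    return s1 / s0, math.sqrt(abs(x))

print(mean_std(4, 2.4, 1.64))  # four scores with sum 2.4 and sum of squares 1.64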
Example #19
0
def main():
	parser = argparse.ArgumentParser(
		description="Import scores into the database")

	cmd = Command.withtraits(DbTrait, PredictorsTrait)(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
						help="The source file. Use - for standard input.")

	#TODO: which are the coordinates column

	cmd.add_selected_predictors_args()

	parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
						help="Skip SNV's where all the scores are empty")

	parser.add_argument("--skip-update-predictors", dest="skip_update_predictors", action="store_true", default=False,
						help="Skip the update of the predictors.")

	parser.add_argument("--skip-create-index", dest="skip_create_index", action="store_true", default=False,
						help="Skip the creation of the database indices.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
						help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("import")

	db = cmd.open_db()

	try:
		progress = RatedProgress(logger, name="SNVs")

		total_lines = 0

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header
			hdr_line = f.readline()
			hdr = {}
			for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
				hdr[name] = index

			# Predictors to update from the user selection and source availability
			db_predictors = set([p["id"] for p in db.predictors()])
			src_predictors = [name for name in hdr if name not in COORD_COLUMNS]
			predictors = cmd.get_selected_predictors(available_predictors=src_predictors)
			for predictor in predictors:
				if predictor not in db_predictors:
					logger.info("Creating non existing predictor: {}".format(predictor))
					db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			logger.info("Predictors: {}".format(", ".join(predictors)))

			all_columns = COORD_COLUMNS + predictors
			types = COORD_TYPES + ([score_value] * len(predictors))

			missing_columns = [name for name in all_columns if name not in hdr]
			if len(missing_columns) > 0:
				raise Exception("The following columns are missing: {}".format(", ".join(missing_columns)))

			columns = [hdr[name] for name in all_columns]
			max_column = max(columns)

			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				if len(fields) <= max_column:
					logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise Exception("Missing columns at line {}".format(line_num))
					continue

				try:
					fields = [type_cast(fields[index]) for type_cast, index in zip(types, columns)]
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					continue

				(chr, strand, start, ref, alt, transcript,
				 aa_pos, aa_ref, aa_alt, protein) = fields[:10]

				scores = fields[10:]

				if args.skip_empty_scores and sum([0 if s is None else 1 for s in scores]) == 0:
					continue

				try:
					db.add_snv(
								chr=chr, strand=strand, start=start, ref=ref, alt=alt, transcript=transcript,
								protein=protein, aa_pos=aa_pos, aa_ref=aa_ref, aa_alt=aa_alt,
								scores=dict(zip(predictors, scores)))
				except Exception as ex:
					logger.error("Error importing SNV at line {}: {}".format(line_num, str(ex)))
					if not args.ignore_errors:
						raise

				progress.update()

			total_lines += line_num

		progress.log_totals()

		logger.info("Finalizing database ...")

		if not args.skip_update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		if not args.skip_create_index:
			logger.info("Creating indices ...")
			db.create_indices()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Example #20
0
def drivers():
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	db_path = paths.results_path("drivers.db")
	db = SigDb(db_path)
	db.open()

	log.info("Variants ...")

	path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
	with tsv.open(path, "r") as f:
		types = (str, str, int, str)
		for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
			chr, strand, start, allele = fields[:4]
			db.add_variant(chr, start)

	log.info("Genes ...")

	gene_sites = {}

	gene_fm = set()
	gene_clust = set()

	#SPECIAL_THRESHOLD = ["C18", "C34"]
	SPECIAL_THRESHOLD = []

	log.info("  OncodriveFM ...")

	filename_re = re.compile(r"gene-cancer_site-(.+)\.tsv.gz")
	base_path = paths.combination_path("oncodrivefm")
	for path in os.listdir(base_path):
		m = filename_re.match(path)
		if not m:
			continue

		cancer_site_code = m.group(1)

		if cancer_site_code in SPECIAL_THRESHOLD:
			threshold = 1e-6
		else:
			threshold = 0.01

		with tsv.open(os.path.join(base_path, path), "r") as f:
			params = tsv.params(f)
			cancer_site_name = params["group_long_name"]
			for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
				gene, qvalue = fields
				if qvalue < threshold:
					add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

					gene_fm.add(gene)

	log.info("  OncodriveCLUST ...")

	filename_re = re.compile(r"cancer_site-(.+)\.tsv.gz")
	base_path = paths.combination_path("oncodriveclust")
	for path in os.listdir(base_path):
		m = filename_re.match(path)
		if not m:
			continue

		cancer_site_code = m.group(1)

		with tsv.open(os.path.join(base_path, path), "r") as f:
			params = tsv.params(f)
			cancer_site_name = params["group_long_name"]
			for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
				gene, qvalue = fields
				if qvalue < 0.05:
					add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

					gene_clust.add(gene)

	log.info("  Updating db ...")
	sig_genes = gene_fm | gene_clust
	for gene in sig_genes:
		db.add_gene(gene, gene in gene_fm, gene in gene_clust)

	log.info("Saving driver genes cancer sites dataset ...")
	path = paths.results_path("gene-driver_cancer_sites.tsv")
	log.debug("> {}".format(path))
	with open(path, "w") as f:
		tsv.write_param(f, "date", datetime.now())
		tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT", "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
		for gene, sites in gene_sites.items():
			tsv.write_line(f, gene,
						   1 if gene in gene_fm else 0,
						   1 if gene in gene_clust else 0,
						   len(sites),
						   ", ".join(sorted([code for code, name in sites])),
						   ", ".join(sorted([name for code, name in sites])))

	db.commit()
	db.close()
Example #21
0
	def __open(self):
		self.__f = tsv.open(self.path)
		self.line_num = 0
Example #22
0
def main():
	parser = argparse.ArgumentParser(
		description="Export Scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("dest_path", metavar="OUTPUT_PATH",
						help="The output file. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	parser.add_argument("--json", dest="to_json", action="store_true", default=False,
						help="Export the results in json format")

	parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
						help="Export a random sample of PCT %%")

	parser.add_argument("--start", dest="start", type=int, metavar="N",
						help="Start to export from the SNV number N")

	parser.add_argument("--limit", dest="limit", type=int, metavar="N",
						help="Limit the number of SNVs to export to N")

	args, logger = cmd.parse_args("export")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Exporting ...")

	random.seed(time.time())

	total_count = 0
	total_start_time = time.time()

	try:
		progress = RatedProgress(logger, name="SNVs")

		to_json = args.to_json
		sample = args.sample
		start = args.start or 0
		limit = args.limit

		doc = None
		last_pos = None
		rows_count = 0
		snvs_count = 0
		with tsv.open(args.dest_path, "w") as f:

			if not to_json:
				tsv.write_line(f, *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

			for row in db.query_scores(predictors=predictors, maps=annotations):

				if not to_json:
					if start > 0:
						start -= 1
						continue

					if sample is not None and random.randint(1, 100) > sample:
						continue

				pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
				if last_pos != pos:
					if to_json:
						if start > 0:
							start -= 1
							continue

						if limit is not None and snvs_count >= limit:
							if doc is not None:
								json.dump(doc, f)
								f.write("\n")
							break

					snvs_count += 1

				rows_count += 1

				ann = row["annotations"]
				scores = row["scores"]

				if to_json:
					tdoc = dict([(k,row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]] +
										[(k,scores[k]) for k in predictors])

					if pos != last_pos:
						if doc is not None:
							if sample is None or random.randint(1, 100) <= sample:
								json.dump(doc, f)
								f.write("\n")
							else:
								snvs_count -= 1

						doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]] +
										[("transcripts", [tdoc])])
					else:
						doc["transcripts"] += [tdoc]

				else:
					tsv.write_line(f,
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])

				progress.update()

				last_pos = pos

				if not to_json and limit is not None and rows_count >= limit:
					break

		progress.log_totals()

		logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(rows_count, snvs_count, progress.elapsed_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
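For reference, a --json export groups every transcript of one SNV into a single record per line. A hypothetical record shaped like the doc built above (the values are invented and "SIFT" only stands in for whichever predictors were selected):

doc = {
    "chr": "17", "strand": "+", "start": 7578406, "ref": "C", "alt": "T",
    "transcripts": [
        {"transcript": "ENST00000269305", "protein": "ENSP00000269305",
         "aa_pos": 175, "aa_ref": "R", "aa_alt": "H", "SIFT": 0.01},
    ],
}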
Example #23
0
def main():
	parser = argparse.ArgumentParser(
		description="Calculate Baseline Tolerance partial statistics per feature")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("scores_path", metavar="SCORES_PATH",
						help="The scores file")

	parser.add_argument("predictors", metavar="PREDICTORS",
						help="Comma separated list of predictors")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="Output file.")

	cmd.add_transform_args()

	args, logger = cmd.parse_args("blt-partial")

	predictors = [p.strip() for p in args.predictors.split(",") if len(p.strip()) > 0]
	num_predictors = len(predictors)

	if len(predictors) == 0:
		logger.error("At least one predictor is needed")
		exit(-1)

	logger.info("Selected predictors: {}".format(", ".join(predictors)))

	transforms = cmd.get_transforms()

	stats = {}

	lost_snvs = 0
	scores_path = args.scores_path

	logger.info("Reading scores from {} ...".format(
		os.path.basename(scores_path) if scores_path != "-" else "standard input"))

	with tsv.open(scores_path) as sf:
		for line_num, line in enumerate(sf, start=1):
			fields = line.rstrip("\n").split("\t")
			chrom, pos, ref, alt, feature = fields[:5]

			if len(feature) == 0:
				lost_snvs += 1
				continue

			scores = fields[5:]

			if len(scores) != num_predictors:
				line_error(logger, scores_path, line_num, "Number of score columns does not match the number of predictors")

			try:
				scores = [float(v) if len(v) > 0 else None for v in scores]
			except:
				line_error(logger, scores_path, line_num, "Scores should be real numbers: {}".format(scores))

			if feature not in stats:
				stats[feature] = tuple([[0, 0.0, 0.0] for p in predictors])

			feature_stats = stats[feature]

			for i, score in enumerate(scores):
				if score is not None:
					predictor = predictors[i]
					if predictor in transforms:
						for name, func in transforms[predictor]:
							try:
								score = func(score)
							except:
								logger.error("Error transforming the {} score {} with {}".format(predictor, score, name))
								exit(-1)

					feature_stats[i][0] += 1
					feature_stats[i][1] += score
					feature_stats[i][2] += score * score

	logger.info("Saving results into {} ...".format(
		os.path.basename(args.out_path) if args.out_path != "-" else "standard output"))

	with tsv.open(args.out_path, "w") as of:
		tsv.write_line(of, "FEATURE", *predictors)
		for feature in sorted(stats.keys()):
			sb = [feature]
			feature_stats = stats[feature]
			for i in range(num_predictors):
				sb += ["/".join([repr(v) for v in feature_stats[i]])]
			tsv.write_line(of, *sb)

	logger.info("Number of SNV's = {}, lost SNV's = {}, number of features = {}".format(line_num, lost_snvs, len(stats)))

	return 0
Example #24
0
def main():
	parser = argparse.ArgumentParser(
		description="Generate datasets needed to evaluate performance from Cosmic mutations")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("data_path", metavar="PATH",
						help="The CosmicMutantExport tsv file")

	parser.add_argument("cgc_path", metavar="PATH",
						help="The list of CGC genes")

	parser.add_argument("drivers_path", metavar="PATH",
						help="The list of CHASM drivers (drivers.tmps)")

	parser.add_argument("-o", dest="prefix", metavar="PREFIX",
						help="Output prefix.")

	args, logger = cmd.parse_args("perf-cosmic")

	prefix = args.prefix or "cosmic-"

	fanns_db = cmd.open_db()

	try:
		snvs = dict()

		logger.info("Counting the number of samples per mutation ...")
		with tsv.open(args.data_path, "r") as df:
			columns = [
				#"Genome-wide screen",
				"Mutation Description",
				"Mutation CDS",
				"Mutation AA",
				"Mutation GRCh37 genome position",
				"Mutation GRCh37 strand",
				"Accession Number",
				"ID_sample"]

			total_rows = queried_rows = 0
			for fields in tsv.rows(df, columns=columns, header=True):
				total_rows += 1
				#wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
				mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
				wide_screen = "y"		
				if wide_screen != "y" or mut_desc != "Substitution - Missense":
					continue

				queried_rows += 1
				for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
					k = tuple([row[k] for k in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
					if k not in snvs:
						symbol = row["xrefs"]["symbol"]
						snvs[k] = dict(
							transcript=row["transcript"],
							symbol=symbol,
							samples=set([sample_id]))
					else:
						snvs[k]["samples"].add(sample_id)

		logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(total_rows, queried_rows, len(snvs)))

		logger.info("Loading CGC genes ...")
		cgc_genes = set()
		with open(args.cgc_path, "r") as f:
			for line in f:
				cgc_genes.add(line.rstrip("\n"))

		logger.info("Loading CHASM drivers ...")
		drivers = set()
		with open(args.drivers_path, "r") as f:
			for line in f:
				drivers.add(line.rstrip("\n").split("\t")[0])

		logger.info("Creating datasets ...")

		progress = RatedProgress(logger, name="mutations")

		with Dataset(prefix + "1") as rec1,\
			Dataset(prefix + "2") as rec2,\
			Dataset(prefix + "4") as rec4,\
			Dataset(prefix + "CGC") as cgc,\
			Dataset(prefix + "noCGC") as nocgc,\
			Dataset(prefix + "D") as drv,\
			Dataset(prefix + "O") as nodrv:

			for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
				num_samples = len(snv["samples"])
				line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])
				if num_samples == 1:
					rec1.write(line)
				if num_samples >= 2:
					rec2.write(line)
				if num_samples >= 4:
					rec4.write(line)
				
				symbol = snv["symbol"]
				if symbol is not None and ((isinstance(symbol, basestring) and symbol in cgc_genes) or len(set(symbol) & cgc_genes) > 0):
					cgc.write(line)
				elif num_samples == 1:
					nocgc.write(line)
			
				if snv["transcript"] in drivers:
					drv.write(line)
				elif num_samples == 1:
					nodrv.write(line)
                    
				progress.update()

			progress.log_totals()

			logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
				rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))

	except:
		cmd.handle_error()

	return 0
Example #25
0
def prepare_files(project):
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	config = GlobalConfig(conf)
	paths = PathsConfig(config) # avoid that project conf override path configurations
	config = GlobalConfig(conf, project["conf"])

	oclust = OncodriveClust(config.oncodriveclust, paths, log)

	project_results = ProjectResults(project)

	projdb = ProjectDb(project["db"])

	data = oclust.retrieve_data(projdb)

	projdb.close()

	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	df = [tsv.open(path, "w") for path in data_paths]

	gene_sample_count = defaultdict(int)

	for key, value in data.items():
		findex, gene, sample = key
		transcript, transcript_len, protein_pos = value

		if findex == NON_SYN:
			gene_sample_count[gene] += 1

			if oclust.filter_enabled and not oclust.filter.valid(gene):
				continue

		tsv.write_line(df[findex], gene, sample, protein_pos)

	for f in df:
		f.close()

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if oclust.filter_enabled and not oclust.filter.valid(gene):
				causes += [ProjectDb.GENE_EXC_FILTER]
			if sample_count < oclust.samples_threshold:
				causes += [ProjectDb.GENE_EXC_THRESHOLD]
			if len(causes) > 0:
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

	projects_out_port.send(dict(project,
								oncodriveclust=dict(
									data_paths=data_paths,
									samples_threshold=oclust.samples_threshold)))
Example #26
0
def main():
	parser = argparse.ArgumentParser(
		description="Update scores in the database")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
						help="The source file. Use - for standard input.")

	cmd.add_selected_predictors_args()

	parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
						help="Update of the predictors.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
						help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("update")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors(check_missing=False)

	try:
		progress = RatedProgress(logger, name="SNVs")

		total_lines = 0

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header
			hdr_line = f.readline()
			hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))])

			db_predictors = set([p["id"] for p in db.predictors()])

			if len(predictors) == 0:
				predictors = [name for name in hdr if name in db_predictors]
				if len(predictors) == 0:
					raise Exception("Any input file header match the available predictors in the database. Please specify them using -p.")

			logger.info("Predictors: {}".format(", ".join(predictors)))

			for predictor in filter(lambda p: p not in db_predictors, predictors):
				logger.info("Creating predictor {} ...".format(predictor))
				db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
			use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

			if not use_genome_coords and not use_protein_coords:
				raise Exception("No coordinate columns found. "
								"Use {} for genomic coordinates or {} for protein coordinates.".format(
									GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
			elif use_genome_coords and use_protein_coords:
				logger.warn("Both, genomic and protein coordinates columns found. Using genomic coordinates by default")

			if use_genome_coords:
				coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
				coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_dna
			elif use_protein_coords:
				coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
				coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_protein

			coord_column_indices = [hdr[n] for n in coord_column_names]
			score_indices = [hdr[n] for n in predictors]
			max_column_index = max(coord_column_indices + score_indices)

			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				if len(fields) <= max_column_index:
					logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise Exception("Missing columns at line {}".format(line_num))
					continue

				try:
					coords = dict([(name.lower(), type_cast(fields[index])) for name, type_cast, index in zip(
													coord_column_names, coord_column_types, coord_column_indices)])

					scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					continue

				try:
					for row in db.query_scores(fields=[], **coords):
						db.update_scores(row["id"], scores)
				except Exception as ex:
					logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
					logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
					if not args.ignore_errors:
						raise

				progress.update()

		progress.log_totals()

		logger.info("Finalizing database ...")

		if args.update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Example #27
0
def main():
	parser = argparse.ArgumentParser(
		description="Filter for the longest transcript")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("len_path", metavar="PATH",
						help="The tsv containing the transcripts length")

	parser.add_argument("data_path", metavar="PATH",
						help="The data file")

	parser.add_argument("out_path", metavar="PATH",
						help="Output file. Use - for standard output.")

	parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
						help="List of columns that conforms the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

	args, logger = cmd.parse_args("filter-transcript")

	try:
		logger.info("Loading transcripts length ...")
		trslen = defaultdict(int)
		with tsv.open(args.len_path) as f:
			for name, length in tsv.rows(f):
				trslen[name] = int(length)

		logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

		total_count = filter_count = 0

		progress = RatedProgress(logger, name="mutations")

		key_columns = args.key.split(",")
		with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
			hdr_line = df.readline()
			of.write(hdr_line)
			_, hdr = tsv.header_from_line(hdr_line)
			key_indices = [hdr[name] for name in key_columns]
			trs_index = hdr["TRANSCRIPT"]

			last_key = None
			longest = (0, "")

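			# NOTE: rows are assumed to be grouped by key; for each group only the line with the
			# longest transcript is written out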
			for line in df:
				total_count += 1

				fields = line.rstrip("\n").split("\t")
				key = tuple([fields[index] for index in key_indices])
				trs = fields[trs_index]

				tl = trslen[trs]

				if last_key != key:
					if last_key is not None:
						of.write(longest[1])
						filter_count += 1
					longest = (tl, line)
					last_key = key

				elif tl > longest[0]:
					longest = (tl, line)

				progress.update()

			if last_key is not None:
				filter_count += 1
				of.write(longest[1])

		progress.log_totals()

		logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
			total_count, filter_count, total_count - filter_count, progress.elapsed_time))
	except:
		return cmd.handle_error()

	return 0
Example #28
0
	def __open(self):
		self.__f = tsv.open(self.path)
		self.line_num = 0
		if self.header:
			tsv.skip_comments_and_empty(self.__f)
def main():
	parser = OptionParser(usage = "usage: %prog [options] <Variants gvf.gz file> ...")

	parser.add_option("--db", dest="db_path",
		help="Database path")

	parser.add_option("-L", "--log-level", dest="log_level",
		default="info", choices=["debug", "info", "warn", "error", "critical", "notset"],
		help="Which log level: debug, info, warn, error, critical, notset")

	(options, args) = parser.parse_args()

	logging.basicConfig(
		level=LOG_LEVEL[options.log_level],
		format="%(asctime)s %(levelname)-5s : %(message)s")

	log = logging.getLogger("var_db")

	if len(args) < 1:
		log.error("At least one variants file is required")
		parser.print_help()
		exit(-1)

	if options.db_path is None:
		log.error("A database path must be specified (--db)")
		parser.print_help()
		exit(-1)

	db_path = options.db_path

	log.info("Opening database ...")

	db = VarXrefsDb(db_path)

	db.open()

	db.begin()

	total_count = 0
	total_start_time = time.time()

	src_var_count = {}
	src_ratio = {}

	chromosomes = set()
	chr_var_count = {}
	strands = set()
	
	try:
		partial_count = 0
		partial_start_time = time.time()
		for xref_path in args:
			log.info("Reading {0} ...".format(xref_path))

			if not os.path.isfile(xref_path):
				log.error("File not found: {0}".format(xref_path))
				exit(-1)

			mtime = datetime.fromtimestamp(os.path.getmtime(xref_path))

			f = tsv.open(xref_path, "r")

			src_count = 0
			src_start_time = time.time()

			line_num = 1

			# discard headers
			line = f.readline()
			while line.startswith("#"):
				line = f.readline()
				line_num += 1

			src_var_count[xref_path] = 0

			# `line` already holds the first data record read while skipping the headers; chain it
			# back in so it is not silently dropped (itertools is standard library; imported here
			# in case it is not already imported at the top of this file)
			import itertools
			for line in itertools.chain([line] if line else [], f):
				try:
					fields = [x if len(x) > 0 else None for x in line.rstrip("\n").split("\t")]

					chr, source, type, start, end, _1, strand, _2, extra = fields

					start = int(start)
					end = int(end)

					ref = None
					alt = None
					xref = None
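					# parse the GVF attributes column: ';'-separated key=value pairs
					# (Dbxref gives the external id, Reference_seq/Variant_seq the alleles)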
					try:
						for var in extra.split(";"):
							try:
								key, value = var.split("=")
								if key == "Dbxref":
									pos = value.index(":")
									xref = value[pos + 1:]
								elif key == "Reference_seq":
									ref = value
								elif key == "Variant_seq":
									alt = value
							except:
								continue
					except:
						pass

					if any(x is None for x in (chr, start, strand, ref, alt, source, xref)):
						log.warn("Discarding incomplete variant: {0}".format(
							",".join([str(x) for x in (chr, start, strand, ref, alt, source, xref)])))
						continue

					src_var_count[xref_path] += 1

					chromosomes.add(chr)
					if chr in chr_var_count:
						chr_var_count[chr] += 1
					else:
						chr_var_count[chr] = 1
					
					strands.add(strand)

					db.add(chr, start, ref, alt, source, xref, strand)

					total_count += 1
					src_count += 1

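					# report throughput roughly every 10 seconds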
					partial_count += 1
					elapsed_time = time.time() - partial_start_time
					if elapsed_time >= 10.0:
						ratio = float(partial_count) / elapsed_time
						log.debug("  {0:.1f} variants/second, {1} variants, {2} total variants".format(ratio,
								hsize(src_count), hsize(total_count)))
						partial_count = 0
						partial_start_time = time.time()

				except Exception as ex:
					log.error("Error at line {0}:\n{1}".format(line_num, line.rstrip("\n")))
					import sys
					import traceback
					traceback.print_exc(file=sys.stdout)
					continue
				finally:
					line_num += 1

			elapsed_time = time.time() - src_start_time
			ratio = float(src_count) / elapsed_time
			src_ratio[xref_path] = ratio
			log.info("  {0:.1f} variants/second, {1} variants, {2} total variants".format(ratio,
					hsize(src_count), hsize(total_count)))

			f.close()

		db.commit()
	except KeyboardInterrupt:
		db.commit()
		log.warn("Interrupted by the user with Ctrl-C")
		exit(-1)
	except:
		db.rollback()
		raise
	finally:
		db.close()

	elapsed_time = time.time() - total_start_time
	total_ratio = float(total_count) / elapsed_time

	log.info("Statistics:")
	log.info("  Sources:")
	for xref_path in args:
		log.info("    {0}: {1} variants".format(os.path.basename(xref_path), src_var_count[xref_path]))

	log.info("  Chromosomes:")
	for chr in chromosomes:
		log.info("    {0:>2}: {1:>7} variants".format(chr, str(chr_var_count[chr])))
	log.info("  Strands: {0}".format(", ".join(strands)))

	log.info("  Total {0} variants ({1:.1f} variants/sec)".format(hsize(total_count), total_ratio))
Example #30
0
def main():
	parser = argparse.ArgumentParser(
		description="Calculate weights")

	parser.add_argument("ranges_path", metavar="RANGES_PATH",
						help="JSON file generated with pred-list containing predictor stats. Only min and max are used.")

	parser.add_argument("training_path", metavar="TRAINING_PATH",
						help="The training set scores. ID column should be POS/NEG for positive/negative sets.")

	parser.add_argument("-o", dest="out_path", metavar="OUT_PATH",
						help="The file where weights will be saved. Use - for standard output.")

	parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
						help="Comma separated list of predictors to fetch")

	parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISION", type=int, default=3,
						help="Distribution precision (number of decimal digits used for binning)")

	parser.add_argument("-f", "--full-state", dest="full_state", action="store_true", default=False,
						help="Save intermediate calculations to allow further exploration and plotting")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	logger = bglogging.get_logger("weights")
	
	if args.out_path is None:
		prefix = os.path.splitext(os.path.basename(args.training_path))[0]
		if prefix.endswith("-scores"):
			prefix = prefix[:-7]
		args.out_path = os.path.join(os.getcwd(), "{}-weights.json".format(prefix))

	if args.predictors is not None:
		args.predictors = [p.strip() for p in args.predictors.split(",")]

	logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path)))

	with open(args.ranges_path) as f:
		pred_stats = json.load(f)

	predictor_range = {}
	for pid, pstats in pred_stats.items():
		predictor_range[pid] = (pstats["min"], pstats["max"])

	logger.info("Reading training set {} ...".format(args.training_path if args.training_path != "-" else "from standard input"))

	with tsv.open(args.training_path) as f:

		# Select predictors from the available predictors in the dataset or user selection

		column_names, column_indices = tsv.header(f)
		available_predictors = [c for c in column_names if c not in set(COORD_COLUMNS)]
		if args.predictors is None:
			predictors = available_predictors
		else:
			missing_predictors = [p for p in args.predictors if p not in set(available_predictors)]
			if len(missing_predictors) > 0:
				logger.error("Missing predictors: {}".format(", ".join(missing_predictors)))
				exit(-1)
			predictors = args.predictors

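	# load only the ID column (POS/NEG labels parsed as booleans) and the selected predictor scores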
	data = pd.read_csv(args.training_path, sep="\t", index_col=False,
					   usecols=["ID"] + predictors,
					   true_values=["POS"], false_values=["NEG"])

	data.rename(columns={"ID" : "EVT"}, inplace=True)

	# Initialize statistics

	logger.info("Initializing metrics ...")

	step = 1.0 / 10**args.precision

	stats = dict()

	state = dict(
		predictor_names = predictors,
		precision = args.precision,
		step = step,
		stats = stats)

	for predictor in predictors:
		d = data[["EVT", predictor]]
		d = d[np.isfinite(d.iloc[:, 1])]

		nump = d.iloc[:, 0].sum()
		numn = d.shape[0] - nump

		rmin, rmax = d.iloc[:, 1].min(), d.iloc[:, 1].max()

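		# discretize the observed score range into bins of width `step`; values[i] is the lower
		# edge of bin i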
		dim = rmax - rmin
		size = int(dim / step) + 1
		values = [(x * step) + rmin for x in xrange(size)]

		logger.info("  {:10}: p={}, n={}, min={}, max={}, bins={}".format(predictor, nump, numn, rmin, rmax, size))

		stats[predictor] = dict(
			rmin = rmin,
			rmax = rmax,
			dim = dim,
			values = values,
			size = size,
			vmin = rmin,
			vmax = rmax,
			dp = [0] * size,
			dn = [0] * size,
			cdp = [0] * size,
			cdn = [0] * size,
			cump = 0,
			cumn = 0,
			tp = [0] * size,
			tn = [0] * size,
			fp = [0] * size,
			fn = [0] * size,
			mcc = [0] * size,
			acc = [0] * size,
			auc = [0] * size,
			cutoff = None,
			cutoff_index = None,
			cutoff_mcc = None,
			cutoff_acc = None,
			cutoff_auc = None)

	positive_count = data["EVT"].sum()
	negative_count = data.shape[0] - positive_count

	logger.info("  TOTAL     : positive={}, negative={}".format(positive_count, negative_count))

	logger.info("Calculating scores distribution and confusion matrices ...")
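
	# NOTE: the pass that fills the per-bin score distributions (dp/dn) and the per-cutoff
	# confusion matrices (tp/fp/fn/tn) is missing from this snippet. The block below is a
	# minimal sketch of what it presumably computes, assuming that a score greater than or
	# equal to the cutoff values[i] predicts the positive class; it is not the original code.
	for predictor in predictors:
		predictor_stats = stats[predictor]
		rmin, size, dp, dn, tp, tn, fp, fn = [predictor_stats[k] for k in [
											"rmin", "size", "dp", "dn", "tp", "tn", "fp", "fn"]]

		d = data[["EVT", predictor]]
		d = d[np.isfinite(d.iloc[:, 1])]

		# histogram of positive/negative scores per bin
		for evt, score in zip(d.iloc[:, 0], d.iloc[:, 1]):
			i = min(int((score - rmin) / step), size - 1)
			if evt:
				dp[i] += 1
			else:
				dn[i] += 1

		# confusion matrix for every candidate cutoff values[i]
		above_p, above_n = sum(dp), sum(dn)
		below_p = below_n = 0
		for i in xrange(size):
			tp[i], fp[i] = above_p, above_n
			fn[i], tn[i] = below_p, below_n
			above_p -= dp[i]
			above_n -= dn[i]
			below_p += dp[i]
			below_n += dn[i]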

	logger.info("Calculating cumulative distribution ...")

	for predictor in predictors:
		predictor_stats = stats[predictor]
		dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]]
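		# accumulate from the highest score bin downwards so that cdp[i]/cdn[i] count the
		# positives/negatives whose score is >= values[i]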
		cump = 0
		cumn = 0
		i = len(dp) - 1
		while i >= 0:
			cdp[i] = dp[i] + cump
			cump += dp[i]

			cdn[i] = dn[i] + cumn
			cumn += dn[i]

			i -= 1

		predictor_stats["cump"] = cump
		predictor_stats["cumn"] = cumn

		logger.info("  {}: cump={}, cumn={}".format(predictor, cump, cumn))

	logger.info("Calculating accuracy and cutoff ...")

	for predictor in predictors:
		predictor_stats = stats[predictor]
		values, size, tp, tn, fp, fn, mcc, acc = [predictor_stats[k] for k in [
													"values", "size", "tp", "tn", "fp", "fn", "mcc", "acc"]]

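		# scan every candidate cutoff and keep the one that maximizes the MCC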
		cutoff = -1
		cutoff_index = -1
		best_mcc = -1e6
		for i in xrange(size):
			try:
				#http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
				mcc[i] = (tp[i] * tn[i] - fp[i] * fn[i]) / sqrt((tp[i] + fp[i]) * (tp[i] + fn[i]) * (tn[i] + fp[i]) * (tn[i] + fn[i]))

				#http://en.wikipedia.org/wiki/Accuracy
				acc[i] = (tp[i] + tn[i]) / float(tp[i] + fp[i] + fn[i] + tn[i])
			except ZeroDivisionError:
				mcc[i] = 0
				acc[i] = 0

			if mcc[i] > best_mcc:
				cutoff = values[i]
				cutoff_index = i
				best_mcc = mcc[i]

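		# note: cutoff_acc stores the best accuracy over all cutoffs, which may correspond to a
		# different cutoff than the MCC-optimal one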
		best_acc = max(acc)

		predictor_stats["cutoff"] = cutoff
		predictor_stats["cutoff_index"] = cutoff_index
		predictor_stats["cutoff_mcc"] = best_mcc
		predictor_stats["cutoff_acc"] = best_acc

		logger.info("  {}: cutoff={:.3f}, mcc={:.2f}, accuracy={:.2f}".format(
			predictor, cutoff, best_mcc * 100.0, best_acc * 100.0))

	if args.full_state:
		logger.info("Saving weights with full state ...")

		out_path = args.out_path
		save_weights(out_path, state)

	else:
		logger.info("Saving weights ...")

		stats = {}

		reduced_state = dict(
			predictor_names=state["predictor_names"],
			precision=state["precision"],
			step=state["step"],
			stats=stats)

		for predictor in state["predictor_names"]:
			predictor_stats = state["stats"][predictor]
			stats[predictor] = dict(
				rmin=predictor_stats["rmin"],
				rmax=predictor_stats["rmax"],
				dim=predictor_stats["dim"],
				values=predictor_stats["values"],
				size=predictor_stats["size"],
				cdp=predictor_stats["cdp"],
				cdn=predictor_stats["cdn"],
				cutoff=predictor_stats["cutoff"],
				cutoff_index=predictor_stats["cutoff_index"])

		save_weights(args.out_path, reduced_state)

	return 0