def main():
    parser = argparse.ArgumentParser(
        description="Extract mutations in VCF and save as simple tabulated file")

    parser.add_argument("vcf_paths", metavar="PATH", nargs="+",
                        help="The VCF files")

    parser.add_argument("-o", dest="out_path", metavar="PATH",
                        help="Output file. Use - for standard output.")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("vcf-to-snvs")

    if args.out_path is None:
        # derive the output name from the common prefix of the input names
        names = []
        for path in args.vcf_paths:
            if path != "-":
                base_path, name, ext = tsv.split_path(path)
                names += [name]
        prefix = os.path.commonprefix(names) if len(names) > 0 else ""
        prefix = prefix.rstrip(".")
        if len(prefix) == 0:
            prefix = "genome"
        args.out_path = "{}.tsv.gz".format(prefix)

    with tsv.open(args.out_path, "w") as outf:
        tsv.write_line(outf, "CHR", "POS", "REF", "ALT")

        for path in args.vcf_paths:
            log.info("Reading {} ...".format(path))

            with tsv.open(path) as inf:
                types = (str, str, str, str)
                columns = [0, 1, 3, 4]
                for fields in tsv.lines(inf, types, columns=columns):
                    chrom, pos, ref, alt = fields

                    # ref = ref.upper().strip("N")
                    # alt = alt.upper().strip("N")

                    ref_len = len(ref)
                    alt_len = len(alt)

                    # only same-length, non-empty substitutions are kept
                    if ref_len != alt_len or ref_len == 0 or alt_len == 0:
                        continue

                    try:
                        pos = int(pos)
                    except ValueError:
                        continue

                    if ref_len == 1:
                        tsv.write_line(outf, chrom, pos, ref, alt)
                    else:
                        # split a multi-nucleotide substitution into per-position SNVs
                        for i in range(ref_len):
                            tsv.write_line(outf, chrom, pos + i, ref[i], alt[i])
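# A minimal, self-contained sketch of the MNV-to-SNV split used above, kept
# free of the tsv/bglogging dependencies so the behavior is easy to verify.
# The helper name `split_mnv` is illustrative, not part of the original code.
def split_mnv(chrom, pos, ref, alt):
    """Yield (chrom, pos, ref, alt) SNV tuples for a same-length substitution."""
    assert len(ref) == len(alt) and len(ref) > 0
    for i, (r, a) in enumerate(zip(ref, alt)):
        yield (chrom, pos + i, r, a)

# Example: list(split_mnv("1", 100, "ACG", "TCA"))
# -> [("1", 100, "A", "T"), ("1", 101, "C", "C"), ("1", 102, "G", "A")]
# (like the original loop, positions where ref and alt agree are also emitted)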
def save_matrix(self, output_path, analysis_name, output_format,
                row_names, col_names, data, suffix="", params=None,
                valid_row=lambda row: True):

    if len(suffix) > 0:
        suffix = "-{0}".format(suffix)

    if params is None:
        params = []

    path = os.path.join(output_path, "{0}{1}.{2}".format(analysis_name, suffix, output_format))

    self.log.debug("  > {0}".format(path))

    with tsv.open(path, 'w') as f:
        tsv.write_line(f, "## version={0}".format(VERSION))
        tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

        for key, value in params + self.parameters:
            tsv.write_line(f, "## {0}={1}".format(key, value))

        tsv.write_line(f, "ID", *col_names)

        for row_index, row_name in enumerate(row_names):
            if len(row_name) == 0:
                self.log.warn("Empty identifier detected")
                continue

            row = data[row_index, :]
            if valid_row(row):
                values = [v if not np.isnan(v) else None for v in row]
                tsv.write_line(f, row_name, *values, null_value="-")
def extract_snvs(fanns_db, data_path, logger=None):
    logger = logger or logging.getLogger("perf-cosmic")

    snvs = dict()

    logger.info("Reading mutations ...")

    progress = RatedProgress(logger, name="mutations")

    with tsv.open(data_path, "r") as df:
        columns = [
            "Genome-wide screen",
            "Mutation Description",
            "Mutation CDS",
            "Mutation AA",
            "Mutation GRCh37 genome position",
            "Mutation GRCh37 strand",
            "Accession Number",
            "ID_sample"]

        total_rows = queried_rows = dbfound_rows = 0
        for fields in tsv.rows(df, columns=columns, header=True):
            total_rows += 1
            wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields

            # wide_screen != "y"
            if mut_desc != "Substitution - Missense":
                continue

            queried_rows += 1
            for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
                dbfound_rows += 1
                key = tuple([row[k] for k in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
                if key not in snvs:
                    snvs[key] = snv = dict(
                        transcript=row["transcript"],
                        symbol=row["xrefs"]["symbol"],
                        msamples=set(),
                        wsamples=set())
                else:
                    snv = snvs[key]

                if wide_screen == "y":
                    snv["wsamples"].add(sample_id)
                else:
                    snv["msamples"].add(sample_id)

            progress.update()

    progress.log_totals()

    logger.info("Counting the number of samples per mutation ...")

    for data in snvs.itervalues():
        data["msamples"] = len(data["msamples"])
        data["wsamples"] = len(data["wsamples"])

    logger.info("Total: total_rows={}, queried_rows={}, found_rows={}, protein_changes={}".format(
        total_rows, queried_rows, dbfound_rows, len(snvs)))

    return snvs
def update_db(project):
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    oclust = project["oncodriveclust"]
    del project["oncodriveclust"]

    if not os.path.exists(oclust["results"]):
        log.warn("No results have been found. Skipping it.")
        return

    log.info("Updating the project database ...")

    projdb = ProjectDb(project["db"])

    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

    log.info("  Excluded gene causes ...")
    log.debug("    > {0}".format(exc_path))

    count = 0
    with tsv.open(exc_path, "r") as exf:
        for gene, cause in tsv.lines(exf, (str, str), header=True):
            projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
            count += 1

    log.debug("    {0} genes excluded".format(count))

    log.info("  OncodriveCLUST results ...")

    with tsv.open(oclust["results"], "r") as f:
        types = (str, str, float, float, float)
        columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
        for gene, coords, zscore, pvalue, qvalue in tsv.lines(f, types, columns=columns, header=True, null_value="NA"):
            projdb.update_gene(Gene(id=gene, clust_coords=coords,
                                    clust_zscore=zscore, clust_pvalue=pvalue, clust_qvalue=qvalue,
                                    clust_exc_cause=ProjectDb.NO_GENE_EXC))

    projdb.commit()
    projdb.close()

    projects_out_port.send(project)
def load_cds_len(self, path):
    self.logger.info("Loading transcripts CDS length ...")
    self.logger.debug("> {}".format(path))

    cds_len = {}
    with tsv.open(path, "r") as f:
        for gene, transcript, transcript_len in tsv.lines(f, (str, str, int), header=True):
            cds_len[transcript] = transcript_len
    return cds_len
def main():
    parser = argparse.ArgumentParser(
        description="Add annotations")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("id", metavar="ID",
                        help="Annotation identifier.")

    parser.add_argument("name", metavar="NAME",
                        help="Annotation name.")

    parser.add_argument("type", metavar="TYPE", choices=["transcript", "protein"],
                        help="Annotation type: transcript, protein")

    parser.add_argument("path", metavar="PATH",
                        help="Annotation items")

    parser.add_argument("--priority", dest="priority", default=0,
                        help="Priority for translating input annotations. 0 means not considered for translation. Default 0.")

    parser.add_argument("--header", dest="header", action="store_true", default=False,
                        help="Specify that the annotation items file has a header.")

    args, logger = cmd.parse_args("ann-add")

    db = cmd.open_db()

    try:
        logger.info("Creating annotation {} ...".format(args.name))

        db.add_map(args.id, args.name, args.type, args.priority)

        logger.info("Loading items ...")

        with tsv.open(args.path) as f:
            for source, value in tsv.lines(f, (str, str), header=args.header):
                if len(source) > 0 and len(value) > 0:
                    db.add_map_item(args.id, source, value)

        db.commit()
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def load_data(self, data_paths, method=None):
    columns = []
    col_names = []
    row_name_index = {}

    for col_index, data_file in enumerate(data_paths):
        self.log.debug("  > {0}".format(data_file))

        names = []
        values = []

        with tsv.open(data_file, "r") as f:
            col_name, ext = os.path.splitext(os.path.basename(data_file))

            params = tsv.params(f)
            if "slice" in params:
                col_name = params["slice"]

            if "method" in params:
                if method is None:
                    method = params["method"]
                elif method != params["method"]:
                    self.log.warn("Different method of computation used for file {0}".format(data_file))

            for name, value in tsv.lines(f, (str, float), header=True, null_value="-"):
                if len(name) == 0:
                    self.log.warn("Empty identifier detected")
                    continue

                if name not in row_name_index:
                    row_name_index[name] = len(row_name_index)

                names += [name]
                values += [value]

        col_names += [col_name]
        columns += [(names, values)]

    num_cols = len(columns)
    num_rows = len(row_name_index)

    row_names = [None] * num_rows
    for name, index in row_name_index.items():
        row_names[index] = name

    data = np.empty((num_rows, num_cols))
    data[:] = np.nan

    for col_index, (names, values) in enumerate(columns):
        for i, name in enumerate(names):
            data[row_name_index[name], col_index] = values[i]

    return row_names, col_names, data, method
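# A small, dependency-light sketch of the matrix assembly performed by
# load_data: ragged (names, values) columns are aligned into one dense
# row-by-column array with NaN for the missing cells. The helper name and
# input layout here are illustrative only.
import numpy as np

def assemble_matrix(columns):
    row_index = {}
    for names, _ in columns:
        for name in names:
            row_index.setdefault(name, len(row_index))
    data = np.full((len(row_index), len(columns)), np.nan)
    for j, (names, values) in enumerate(columns):
        for name, value in zip(names, values):
            data[row_index[name], j] = value
    return sorted(row_index, key=row_index.get), data

# Example: assemble_matrix([(["g1", "g2"], [0.1, 0.2]), (["g2"], [0.9])])
# -> (["g1", "g2"], array([[0.1, nan], [0.2, 0.9]]))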
def open_dataset(project_id, base_path, datasets_path, name, mode, logger):
    name, ext = os.path.splitext(name)
    ext = ext.lower()
    if len(ext) == 0:
        # default to a gzipped tsv when no extension is given
        ext = ".gz"
        name = "{0}.tsv{1}".format(name, ext)
    else:
        name = name + ext

    path = os.path.join(datasets_path, name)

    logger.debug("> {0}".format(os.path.relpath(path, base_path)))

    f = tsv.open(path, mode)

    tsv.write_param(f, "version", VERSION)
    tsv.write_param(f, "date", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    tsv.write_param(f, "PROJECT_ID", project_id)

    return f
def save_combined_results(self, output_path, analysis_name, output_format,
                          method, row_names, col_names, data, suffix="combination"):

    self.log.info("Saving combination results ...")

    # use the parameters passed in rather than self.args, so the method
    # honors its own signature
    path = os.path.join(output_path, "{0}-{1}.{2}".format(analysis_name, suffix, output_format))

    self.log.debug("  > {0}".format(path))

    with tsv.open(path, 'w') as f:
        tsv.write_line(f, "## slices={0}".format(",".join(col_names)))
        tsv.write_line(f, "## method={0}".format(method.name))
        tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        for key, value in self.parameters:
            tsv.write_line(f, "## {0}={1}".format(key, value))

        tsv.write_line(f, "ID", *method.combination_columns)

        for row_index, row_name in enumerate(row_names):
            if not np.isnan(data[row_index, 0]):
                values = [v if not np.isnan(v) else None for v in data[row_index, :]]
                tsv.write_line(f, row_name, *values, null_value="-")
def fetch(db, muts_path, out_path, params=None, columns=None, maps=None,
          predictors=None, labels=None, calc_labels=None, muts_header=False, logger=None):

    params = params or {}
    columns = columns or [c.lower() for c in COORD_COLUMNS]
    maps = maps or []
    predictors = predictors or []
    labels = labels or []

    state = {}

    with tsv.open(out_path, "w") as wf:
        metadata = db.metadata
        if "version" in metadata:
            tsv.write_param(wf, "db-version", metadata["version"])
        tsv.write_param(wf, "fetched", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
        for k, v in params.items():
            tsv.write_param(wf, k, v)

        tsv.write_line(wf, "ID",
                       *[c.upper() for c in columns]
                       + [m.upper() for m in maps]
                       + predictors + labels)

        for row in fetch_iter(db, muts_path, maps=maps, predictors=predictors,
                              muts_header=muts_header, state=state, logger=logger):

            # use a distinct name so the per-row labels do not shadow the
            # `labels` column list that drives the header and row layout
            if calc_labels is not None:
                row_labels = calc_labels(row) or {}
            else:
                row_labels = {}

            xrefs = row["xrefs"]
            scores = row["scores"]

            tsv.write_line(wf, state[STATE_MUTATION].identifier,
                           *[row[c] for c in columns]
                           + [xrefs[m] for m in maps]
                           + [scores[p] for p in predictors]
                           + [row_labels.get(l, "") for l in labels])

    return {k: state[k] for k in [STATE_HITS, STATE_FAILS]}
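# The calc_labels hook above receives a fetched row and returns a dict keyed
# by label name (or None for no labels). A minimal sketch of such a callback,
# under the assumption that rows expose per-predictor scores as shown; the
# "SIFT" key, the "DAMAGING" label and the 0.05 cutoff are illustrative,
# not from the original code.
def example_calc_labels(row):
    scores = row["scores"]
    sift = scores.get("SIFT")
    if sift is None:
        return None  # no labels for this row
    return {"DAMAGING": "yes" if sift < 0.05 else "no"}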
def add_map(db, id, name, type, priority, path, header=True):
    """
    :param id: map identifier
    :param name: map name
    :param type: type the xrefs map to: transcript, protein
    :param path: map file
    :param priority: priority for translating input xrefs. 0 means not considered for translation. Default 0.
    :param header: specify that the map file has a header.
    """

    logger = logging.getLogger("fannsdb.map-add")

    logger.info("Creating map {} ...".format(name))

    db.add_map(id, name, type, priority)

    logger.info("Loading items ...")

    with tsv.open(path) as f:
        for source, value in tsv.lines(f, (str, str), header=header):
            if len(source) > 0 and len(value) > 0:
                db.add_map_item(id, source, value)
def main():
    parser = argparse.ArgumentParser(
        description="Export SNV's")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("dest_path", metavar="DEST",
                        help="The destination file. Use - for standard output.")

    args, logger = cmd.parse_args("export-snvs")

    db = cmd.open_db()

    logger.info("Exporting SNV's ...")

    try:
        progress = RatedProgress(logger, name="SNVs")

        rows_count = 0
        with tsv.open(args.dest_path, "w") as f:
            for snv in db.snvs():
                rows_count += 1
                tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"],
                               "{}>{}".format(snv["ref"], snv["alt"]), "S")
                progress.update()

        logger.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def save_splited_results(self, output_path, analysis_name, output_format,
                         matrix, mapping, method, results, slices, suffix=""):

    if len(suffix) > 0:
        suffix = "-{0}".format(suffix)

    for slice_results_index, slice in enumerate(slices):
        slice_name = matrix.slice_names[slice]

        path = os.path.join(output_path, "{0}{1}-{2}.{3}".format(
            analysis_name, suffix, slice_name, output_format))

        self.log.debug("  > {0}".format(path))

        with tsv.open(path, 'w') as f:
            tsv.write_line(f, "## version={0}".format(VERSION))
            tsv.write_line(f, "## slice={0}".format(slice_name))
            tsv.write_line(f, "## method={0}".format(method.name))
            tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            for key, value in self.parameters:
                tsv.write_line(f, "## {0}={1}".format(key, value))

            tsv.write_line(f, "ID", *method.results_columns)

            for row_index, row_name in enumerate(mapping.group_names):
                value = results[slice_results_index, row_index]
                if not np.isnan(value):
                    tsv.write_line(f, row_name, value, null_value="-")
def combination_recurrences(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug("  > {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    create_db(conn)

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()
        sample_total += project_sample_total

        log.info("    Total samples = {0}".format(project_sample_total))

        log.info("    Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
                continue

            start, end, ref, alt = var_to_tab(var)

            try:
                c.execute("INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                          (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                # the variant already exists: reuse its id
                c.execute("SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                          (var.chr, var.strand, start, ref, alt))
                r = c.fetchone()
                var_id = r[0]

            try:
                c.execute("INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                          (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
            except sqlite3.IntegrityError:
                c.execute("UPDATE variant_genes SET sample_freq=sample_freq + ? WHERE var_id=? AND gene_id=?",
                          (rec.sample_freq, var_id, afg.gene_id))

            count += 1

        log.info("      {0} variant genes".format(count))

        log.info("    Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)",
                          (gene.id, rec.sample_freq))
            else:
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?",
                          (rec.sample_freq, gene.id))

            count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)",
                          (pathway.id, rec.sample_freq))
            else:
                c.execute("UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?",
                          (rec.sample_freq, pathway.id))

            count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))

    c.close()
    conn.commit()

    log.info("Saving results ...")

    c = conn.cursor()

    base_path = paths.combination_path("recurrences")

    log.info("  Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)

        tsv.write_line(f, "CHR", "STRAND", "START", "ALLELE",
                       "GENE_ID", "IMPACT", "IMPACT_CLASS",
                       "SAMPLE_FREQ", "SAMPLE_PROP",
                       "PROT_CHANGES", "XREFS")

        for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)
            tsv.write_line(f, r["chr"], strand, r["start"], allele,
                           r["gene_id"], r["impact"], TransFIC.class_name(r["impact"]),
                           r["sample_freq"], r["sample_prop"],
                           r["prot_changes"], r["xrefs"], null_value="-")

    log.info("  Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)

        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")

        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info("  Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)

        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")

        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    conn.close()

    remove_temp(task, db_path)
def prepare_files(project):
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_results = ProjectResults(project)

    mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

    log.info("Loading transcripts CDS length ...")

    cds_len = load_cds_len(conf)

    log.info("Retrieving gene alterations ...")

    projdb = ProjectDb(project["db"])

    data = retrieve_data(projdb, cds_len)

    projdb.close()

    data_paths = [
        os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
        os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

    log.info("Saving data ...")
    log.debug("> {0}".format(data_paths[NON_SYN]))
    log.debug("> {0}".format(data_paths[SYN]))

    df = [tsv.open(path, "w") for path in data_paths]

    gene_sample_count = {}

    for key, value in data.items():
        findex, gene, sample = key
        transcript, transcript_len, protein_pos = value

        if findex == NON_SYN:
            if gene not in gene_sample_count:
                gene_sample_count[gene] = 1
            else:
                gene_sample_count[gene] += 1

            if genes_filter_enabled and not filt.valid(gene):
                continue

        tsv.write_line(df[findex], gene, sample, protein_pos)

    for f in df:
        f.close()

    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))

    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if genes_filter_enabled and not filt.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < mutations_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                tsv.write_line(exf, gene, "".join(causes))

    log.info("Sending project ...")

    projects_out_port.send(dict(project,
                                oncodriveclust=dict(
                                    data_paths=data_paths,
                                    mutations_threshold=mutations_threshold,
                                    genes_filter_enabled=genes_filter_enabled,  # not used
                                    genes_filter=genes_filter)))  # not used
def __enter__(self):
    self.f = tsv.open(self.name, "w")
    self._size = 0
    return self
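# For context: __enter__ above belongs to the Dataset helper used by the
# perf-cosmic script (`with Dataset(...) as d: d.write(line)`, plus d.size
# and d.name). A plausible reconstruction of the whole class, offered only
# as an assumption about the parts not shown in this excerpt; it assumes
# the module-level `tsv` import used throughout this codebase.
class Dataset(object):
    def __init__(self, name):
        self.name = name
        self.f = None
        self._size = 0

    def __enter__(self):
        self.f = tsv.open(self.name, "w")
        self._size = 0
        return self

    def write(self, line):
        # hypothetical: one record per line, counting written records
        self.f.write(line + "\n")
        self._size += 1

    @property
    def size(self):
        return self._size

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.f.close()
        return False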
def main():
    parser = argparse.ArgumentParser(
        description="Fetch Condel scores")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("muts_path", metavar="SNVS_PATH",
                        help="SNV's to check. Use - for standard input.")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="The results path. Use - for standard output.")

    cmd.add_selected_predictors_args()

    cmd.add_selected_annotations_args()

    cmd.add_selected_columns_args()

    args, logger = cmd.parse_args("fetch")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors()

    annotations = cmd.get_selected_annotations()

    columns = cmd.get_selected_columns()

    logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

    try:
        progress = RatedProgress(logger, name="SNVs")

        with tsv.open(args.muts_path) as f:
            with tsv.open(args.out_path, "w") as wf:
                tsv.write_line(wf, "ID",
                               *[c.upper() for c in columns]
                               + [a.upper() for a in annotations]
                               + predictors)

                hit = fail = 0

                mut = DnaAndProtMutationParser()
                for line_num, line in enumerate(f, start=1):
                    line = line.rstrip(" \n\r")
                    if len(line) == 0 or line.startswith("#"):
                        continue

                    try:
                        mut.parse(line)
                    except PrematureEnd:
                        logger.error("Missing fields at line {}".format(line_num))
                        fail += 1
                        continue
                    except UnexpectedToken as ex:
                        logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
                        fail += 1
                        continue

                    exists = False
                    for row in query_mutation(logger, db, mut, annotations, predictors):
                        exists = True

                        ann = row["annotations"]
                        scores = row["scores"]

                        tsv.write_line(wf, mut.identifier,
                                       *[row[c] for c in columns]
                                       + [ann[a] for a in annotations]
                                       + [scores[p] for p in predictors])

                        # if logger.isEnabledFor(logging.DEBUG):
                        #     logger.debug("    --> {} {} {} {} {} {} {} {} {} {}".format(
                        #         row["chr"], row["start"], row["ref"], row["alt"],
                        #         row["transcript"], row["protein"], row["aa_pos"],
                        #         row["aa_ref"], row["aa_alt"], mut.identifier or "*"))

                    progress.update()

                    if exists:
                        hit += 1
                    else:
                        fail += 1

        progress.log_totals()

        logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(
            hit + fail, hit, fail, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Baseline Tolerance statistics per gene")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH",
                        help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP",
                        help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH",
                        help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH",
                        help="Partial gene statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="Output gene statistics")

    # declare the numeric types so user-supplied values are not left as strings
    parser.add_argument("-c", "--count-threshold", dest="count_threshold", metavar="N",
                        type=int, default=DEFAULT_COUNT_THRESHOLD,
                        help="Minimum number of features per group")

    parser.add_argument("--stdev-threshold", dest="stdev_threshold", metavar="V",
                        type=float, default=DEFAULT_STDEV_THRESHOLD,
                        help="Skip feature statistics with a standard deviation less than V"
                             " (it will be calculated at the level of groups)")

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    group_children = defaultdict(set)
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_children[group] |= children

    logger.info("Loading mappings between groups and features ...")

    group_genes = defaultdict(set)
    with tsv.open(args.group_genes_path) as f:
        for group, genes in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_genes[group] |= genes

    logger.info("Loading partial statistics ...")

    partial_stats = {}
    with tsv.open(args.stats_path) as f:
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        for line in f:
            fields = line.rstrip("\n").split("\t")
            gene = fields[0]
            # each cell is "s0/s1/s2": count, sum and sum of squares
            gene_stats = [[float(v) if i > 0 else int(v)
                           for i, v in enumerate(ss.split("/"))] for ss in fields[1:]]
            partial_stats[gene] = gene_stats

    logger.info("  Predictors: {}".format(", ".join(predictors)))
    logger.info("  Features: {}".format(len(partial_stats.keys())))

    logger.info("Calculating features ...")

    stats = {}

    feat_count = 0
    feat_partial_count = [0] * num_predictors
    for feature, feat_partial_stats in partial_stats.items():
        feat_with_stats = False
        feat_stats = [None] * (num_predictors + 1)
        for i in range(num_predictors):
            s0, s1, s2 = feat_partial_stats[i]

            if s0 == 0.0:
                continue

            if s0 < args.count_threshold:
                continue

            # sample variance recovered from the partial sums (Bessel corrected)
            x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1))
            if x < -1e-12:
                continue

            mean = s1 / s0
            std = math.sqrt(abs(x))

            if std < args.stdev_threshold:
                continue

            feat_stats[i] = (int(s0), mean, std)
            feat_partial_count[i] += 1
            feat_with_stats = True

        if feat_with_stats:
            feat_count += 1
            stats[feature] = feat_stats
            # print feature, "\t".join(["/".join([str(v) for v in feat_stats[i] or []]) for i in range(num_predictors)])

    logger.info("  {} ({}) features out of {} calculated directly from partial statistics".format(
        feat_count, "/".join(map(str, feat_partial_count)), len(partial_stats)))

    logger.info("Calculating groups ...")

    calculate_group(logger, args.root_group, args.count_threshold,
                    group_children, group_genes, partial_stats, num_predictors, stats)

    logger.info("  {} features calculated in total".format(len(stats)))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "GENE", "GROUP", *predictors)
        for gene in sorted(stats.keys()):
            gene_stats = stats[gene]
            sb = [gene]
            stats_group = gene_stats[num_predictors]
            if stats_group is not None:
                sb += [stats_group]
            else:
                sb += ["|" + ("-" * num_predictors)]
            for i in range(num_predictors):
                if gene_stats[i] is not None:
                    sb += ["/".join([str(v) for v in gene_stats[i]])]
                else:
                    sb += ["-/-/-"]
            tsv.write_line(of, *sb)

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Import scores into the database")

    cmd = Command.withtraits(DbTrait, PredictorsTrait)(parser)

    cmd.add_db_args()

    parser.add_argument("source_path", metavar="SOURCE",
                        help="The source file. Use - for standard input.")

    # TODO: which are the coordinates column

    cmd.add_selected_predictors_args()

    parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
                        help="Skip SNV's where all the scores are empty")

    parser.add_argument("--skip-update-predictors", dest="skip_update_predictors", action="store_true", default=False,
                        help="Skip the update of the predictors.")

    parser.add_argument("--skip-create-index", dest="skip_create_index", action="store_true", default=False,
                        help="Skip the creation of the database indices.")

    parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
                        help="When errors on the input file, report them but continue processing the input.")

    args, logger = cmd.parse_args("import")

    db = cmd.open_db()

    try:
        progress = RatedProgress(logger, name="SNVs")

        total_lines = 0

        logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

        with tsv.open(args.source_path) as f:
            # Parse header
            hdr_line = f.readline()
            hdr = {}
            for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
                hdr[name] = index

            # Predictors to update from the user selection and source availability
            db_predictors = set([p["id"] for p in db.predictors()])
            src_predictors = [name for name in hdr if name not in COORD_COLUMNS]
            predictors = cmd.get_selected_predictors(available_predictors=src_predictors)
            for predictor in predictors:
                if predictor not in db_predictors:
                    logger.info("Creating non existing predictor: {}".format(predictor))
                    db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

            logger.info("Predictors: {}".format(", ".join(predictors)))

            all_columns = COORD_COLUMNS + predictors
            types = COORD_TYPES + ([score_value] * len(predictors))

            missing_columns = [name for name in all_columns if name not in hdr]
            if len(missing_columns) > 0:
                raise Exception("The following columns are missing: {}".format(", ".join(missing_columns)))

            columns = [hdr[name] for name in all_columns]
            max_column = max(columns)

            for line_num, line in enumerate(f, start=2):
                fields = line.rstrip("\n").split("\t")

                # max_column is an index, so a row is short when it has
                # max_column or fewer fields
                if len(fields) <= max_column:
                    logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise Exception("Missing columns for line {}".format(line_num))
                    continue

                try:
                    fields = [type_cast(fields[index]) for type_cast, index in zip(types, columns)]
                except Exception as ex:
                    logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise
                    continue

                (chr, strand, start, ref, alt, transcript,
                 aa_pos, aa_ref, aa_alt, protein) = fields[:10]

                scores = fields[10:]

                if args.skip_empty_scores and sum([0 if s is None else 1 for s in scores]) == 0:
                    continue

                try:
                    db.add_snv(
                        chr=chr, strand=strand, start=start, ref=ref, alt=alt,
                        transcript=transcript, protein=protein,
                        aa_pos=aa_pos, aa_ref=aa_ref, aa_alt=aa_alt,
                        scores=dict(zip(predictors, scores)))
                except Exception as ex:
                    logger.error("Error importing SNV at line {}: {}".format(line_num, str(ex)))
                    if not args.ignore_errors:
                        raise

                progress.update()

            total_lines += line_num

        progress.log_totals()

        logger.info("Finalizing database ...")

        if not args.skip_update_predictors:
            logger.info("Updating predictors ...")
            db.update_predictors()

        logger.info("Committing ...")
        db.commit()

        if not args.skip_create_index:
            logger.info("Creating indices ...")
            db.create_indices()

        logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def drivers():
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    db_path = paths.results_path("drivers.db")
    db = SigDb(db_path)
    db.open()

    log.info("Variants ...")

    path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
    with tsv.open(path, "r") as f:
        types = (str, str, int, str)
        for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
            chr, strand, start, allele = fields[:4]
            db.add_variant(chr, start)

    log.info("Genes ...")

    gene_sites = {}

    gene_fm = set()
    gene_clust = set()

    # SPECIAL_THRESHOLD = ["C18", "C34"]
    SPECIAL_THRESHOLD = []

    log.info("  OncodriveFM ...")

    filename_re = re.compile(r"gene-cancer_site-(.+)\.tsv.gz")
    base_path = paths.combination_path("oncodrivefm")
    for path in os.listdir(base_path):
        m = filename_re.match(path)
        if not m:
            continue

        cancer_site_code = m.group(1)

        if cancer_site_code in SPECIAL_THRESHOLD:
            threshold = 1e-6
        else:
            threshold = 0.01

        with tsv.open(os.path.join(base_path, path), "r") as f:
            params = tsv.params(f)
            cancer_site_name = params["group_long_name"]
            for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
                gene, qvalue = fields
                if qvalue < threshold:
                    add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
                    gene_fm.add(gene)

    log.info("  OncodriveCLUST ...")

    filename_re = re.compile(r"cancer_site-(.+)\.tsv.gz")
    base_path = paths.combination_path("oncodriveclust")
    for path in os.listdir(base_path):
        m = filename_re.match(path)
        if not m:
            continue

        cancer_site_code = m.group(1)

        with tsv.open(os.path.join(base_path, path), "r") as f:
            params = tsv.params(f)
            cancer_site_name = params["group_long_name"]
            for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
                gene, qvalue = fields
                if qvalue < 0.05:
                    add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
                    gene_clust.add(gene)

    log.info("  Updating db ...")

    sig_genes = gene_fm | gene_clust
    for gene in sig_genes:
        db.add_gene(gene, gene in gene_fm, gene in gene_clust)

    log.info("Saving driver genes cancer sites dataset ...")

    path = paths.results_path("gene-driver_cancer_sites.tsv")
    log.debug("> {}".format(path))
    with open(path, "w") as f:
        tsv.write_param(f, "date", datetime.now())
        tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT", "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
        for gene, sites in gene_sites.items():
            tsv.write_line(f, gene,
                           1 if gene in gene_fm else 0,
                           1 if gene in gene_clust else 0,
                           len(sites),
                           ", ".join(sorted([code for code, name in sites])),
                           ", ".join(sorted([name for code, name in sites])))

    db.commit()
    db.close()
def __open(self):
    self.__f = tsv.open(self.path)
    self.line_num = 0
def main():
    parser = argparse.ArgumentParser(
        description="Export Scores")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("dest_path", metavar="OUTPUT_PATH",
                        help="The output file. Use - for standard output.")

    cmd.add_selected_predictors_args()

    cmd.add_selected_annotations_args()

    cmd.add_selected_columns_args()

    parser.add_argument("--json", dest="to_json", action="store_true", default=False,
                        help="Export the results in json format")

    parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
                        help="Export a random sample of PCT %%")

    parser.add_argument("--start", dest="start", type=int, metavar="N",
                        help="Start to export from the SNV number N")

    parser.add_argument("--limit", dest="limit", type=int, metavar="N",
                        help="Limit the number of SNVs to export to N")

    args, logger = cmd.parse_args("export")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors()

    annotations = cmd.get_selected_annotations()

    columns = cmd.get_selected_columns()

    logger.info("Exporting ...")

    random.seed(time.time())

    try:
        progress = RatedProgress(logger, name="SNVs")

        to_json = args.to_json
        sample = args.sample
        start = args.start or 0
        limit = args.limit

        doc = None
        last_pos = None
        rows_count = 0
        snvs_count = 0

        with tsv.open(args.dest_path, "w") as f:
            if not to_json:
                tsv.write_line(f, *[c.upper() for c in columns]
                                  + [a.upper() for a in annotations]
                                  + predictors)

            for row in db.query_scores(predictors=predictors, maps=annotations):
                if not to_json:
                    if start > 0:
                        start -= 1
                        continue

                    if sample is not None and random.randint(1, 100) > sample:
                        continue

                pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
                if last_pos != pos:
                    if to_json:
                        if start > 0:
                            start -= 1
                            continue

                        if limit is not None and snvs_count >= limit:
                            if doc is not None:
                                json.dump(doc, f)
                                f.write("\n")
                            break

                    snvs_count += 1

                rows_count += 1

                ann = row["annotations"]
                scores = row["scores"]

                if to_json:
                    tdoc = dict([(k, row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]]
                                + [(k, scores[k]) for k in predictors])

                    if pos != last_pos:
                        if doc is not None:
                            if sample is None or random.randint(1, 100) <= sample:
                                json.dump(doc, f)
                                f.write("\n")
                            else:
                                snvs_count -= 1

                        doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]]
                                   + [("transcripts", [tdoc])])
                    else:
                        doc["transcripts"] += [tdoc]
                else:
                    tsv.write_line(f, *[row[c] for c in columns]
                                      + [ann[a] for a in annotations]
                                      + [scores[p] for p in predictors])

                progress.update()

                last_pos = pos

                if not to_json and limit is not None and rows_count >= limit:
                    break

        progress.log_totals()

        logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(
            rows_count, snvs_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Baseline Tolerance partial statistics per feature")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("scores_path", metavar="SCORES_PATH",
                        help="The scores file")

    parser.add_argument("predictors", metavar="PREDICTORS",
                        help="Comma separated list of predictors")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="Output file.")

    cmd.add_transform_args()

    args, logger = cmd.parse_args("blt-partial")

    predictors = [p.strip() for p in args.predictors.split(",") if len(p.strip()) > 0]
    num_predictors = len(predictors)
    if num_predictors == 0:
        logger.error("At least one predictor is needed")
        exit(-1)

    logger.info("Selected predictors: {}".format(", ".join(predictors)))

    transforms = cmd.get_transforms()

    stats = {}

    lost_snvs = 0
    line_num = 0

    scores_path = args.scores_path

    logger.info("Reading scores from {} ...".format(
        os.path.basename(scores_path) if scores_path != "-" else "standard input"))

    with tsv.open(scores_path) as sf:
        for line_num, line in enumerate(sf, start=1):
            fields = line.rstrip("\n").split("\t")
            chrom, pos, ref, alt, feature = fields[:5]
            if len(feature) == 0:
                lost_snvs += 1
                continue

            scores = fields[5:]
            if len(scores) != num_predictors:
                line_error(logger, scores_path, line_num,
                           "Number of score columns does not match the number of predictors")

            try:
                scores = [float(v) if len(v) > 0 else None for v in scores]
            except:
                line_error(logger, scores_path, line_num,
                           "Scores should be real numbers: {}".format(scores))

            if feature not in stats:
                # per predictor: [count, sum, sum of squares]
                stats[feature] = tuple([[0, 0.0, 0.0] for p in predictors])

            feature_stats = stats[feature]

            for i, score in enumerate(scores):
                if score is not None:
                    predictor = predictors[i]
                    if predictor in transforms:
                        for name, func in transforms[predictor]:
                            try:
                                score = func(score)
                            except:
                                logger.error("Error transforming the {} score {} with {}".format(
                                    predictor, score, name))
                                exit(-1)

                    feature_stats[i][0] += 1
                    feature_stats[i][1] += score
                    feature_stats[i][2] += score * score

    logger.info("Saving results into {} ...".format(
        os.path.basename(args.out_path) if args.out_path != "-" else "standard output"))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "FEATURE", *predictors)
        for feature in sorted(stats.keys()):
            sb = [feature]
            feature_stats = stats[feature]
            for i in range(num_predictors):
                sb += ["/".join([repr(v) for v in feature_stats[i]])]
            tsv.write_line(of, *sb)

    logger.info("Number of SNV's = {}, lost SNV's = {}, number of features = {}".format(
        line_num, lost_snvs, len(stats)))

    return 0
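# The per-feature accumulation above is a one-pass update of the
# (count, sum, sum-of-squares) triple that the blt-groups step later reduces
# to mean and standard deviation. A standalone sketch (helper name illustrative):
def accumulate(triple, score):
    """Update [n, sum(x), sum(x*x)] in place with one score."""
    triple[0] += 1
    triple[1] += score
    triple[2] += score * score

# Example: t = [0, 0.0, 0.0]; for x in (1.0, 2.0, 3.0): accumulate(t, x)
# leaves t == [3, 6.0, 14.0], the triple consumed by moments_from_sums above.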
def main():
    parser = argparse.ArgumentParser(
        description="Generate datasets needed to evaluate performance from Cosmic mutations")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("data_path", metavar="PATH",
                        help="The CosmicMutantExport tsv file")

    parser.add_argument("cgc_path", metavar="PATH",
                        help="The list of CGC genes")

    parser.add_argument("drivers_path", metavar="PATH",
                        help="The list of CHASM drivers (drivers.tmps)")

    parser.add_argument("-o", dest="prefix", metavar="PREFIX",
                        help="Output prefix.")

    args, logger = cmd.parse_args("perf-cosmic")

    prefix = args.prefix or "cosmic-"

    fanns_db = cmd.open_db()

    try:
        snvs = dict()

        logger.info("Counting the number of samples per mutation ...")

        with tsv.open(args.data_path, "r") as df:
            columns = [
                # "Genome-wide screen",
                "Mutation Description",
                "Mutation CDS",
                "Mutation AA",
                "Mutation GRCh37 genome position",
                "Mutation GRCh37 strand",
                "Accession Number",
                "ID_sample"]

            total_rows = queried_rows = 0
            for fields in tsv.rows(df, columns=columns, header=True):
                total_rows += 1
                # wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
                mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
                wide_screen = "y"

                if wide_screen != "y" or mut_desc != "Substitution - Missense":
                    continue

                queried_rows += 1
                for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
                    key = tuple([row[k] for k in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
                    if key not in snvs:
                        symbol = row["xrefs"]["symbol"]
                        snvs[key] = dict(
                            transcript=row["transcript"],
                            symbol=symbol,
                            samples=set([sample_id]))
                    else:
                        snvs[key]["samples"].add(sample_id)

        logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(
            total_rows, queried_rows, len(snvs)))

        logger.info("Loading CGC genes ...")

        cgc_genes = set()
        with open(args.cgc_path, "r") as f:
            for line in f:
                cgc_genes.add(line.rstrip("\n"))

        logger.info("Loading CHASM drivers ...")

        drivers = set()
        with open(args.drivers_path, "r") as f:
            for line in f:
                drivers.add(line.rstrip("\n").split("\t")[0])

        logger.info("Creating datasets ...")

        progress = RatedProgress(logger, name="mutations")

        with Dataset(prefix + "1") as rec1,\
             Dataset(prefix + "2") as rec2,\
             Dataset(prefix + "4") as rec4,\
             Dataset(prefix + "CGC") as cgc,\
             Dataset(prefix + "noCGC") as nocgc,\
             Dataset(prefix + "D") as drv,\
             Dataset(prefix + "O") as nodrv:

            for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
                num_samples = len(snv["samples"])

                line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])

                if num_samples == 1:
                    rec1.write(line)
                if num_samples >= 2:
                    rec2.write(line)
                if num_samples >= 4:
                    rec4.write(line)

                symbol = snv["symbol"]
                if symbol is not None and ((isinstance(symbol, basestring) and symbol in cgc_genes)
                                           or len(set(symbol) & cgc_genes) > 0):
                    cgc.write(line)
                elif num_samples == 1:
                    nocgc.write(line)

                if snv["transcript"] in drivers:
                    drv.write(line)
                elif num_samples == 1:
                    nodrv.write(line)

                progress.update()

        progress.log_totals()

        logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size)
                                                     for d in [rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))
    except:
        cmd.handle_error()

    return 0
def prepare_files(project):
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    config = GlobalConfig(conf)
    paths = PathsConfig(config)

    # avoid that project conf override path configurations
    config = GlobalConfig(conf, project["conf"])

    oclust = OncodriveClust(config.oncodriveclust, paths, log)

    project_results = ProjectResults(project)

    projdb = ProjectDb(project["db"])

    data = oclust.retrieve_data(projdb)

    projdb.close()

    data_paths = [
        os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
        os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

    log.info("Saving data ...")
    log.debug("> {0}".format(data_paths[NON_SYN]))
    log.debug("> {0}".format(data_paths[SYN]))

    df = [tsv.open(path, "w") for path in data_paths]

    gene_sample_count = defaultdict(int)

    for key, value in data.items():
        findex, gene, sample = key
        transcript, transcript_len, protein_pos = value

        if findex == NON_SYN:
            gene_sample_count[gene] += 1

            if oclust.filter_enabled and not oclust.filter.valid(gene):
                continue

        tsv.write_line(df[findex], gene, sample, protein_pos)

    for f in df:
        f.close()

    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))

    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if oclust.filter_enabled and not oclust.filter.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < oclust.samples_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                tsv.write_line(exf, gene, "".join(causes))

    log.info("Sending project ...")

    projects_out_port.send(dict(project,
                                oncodriveclust=dict(
                                    data_paths=data_paths,
                                    samples_threshold=oclust.samples_threshold)))
def main():
    parser = argparse.ArgumentParser(
        description="Update scores in the database")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("source_path", metavar="SOURCE",
                        help="The source file. Use - for standard input.")

    cmd.add_selected_predictors_args()

    parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
                        help="Update of the predictors.")

    parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
                        help="When errors on the input file, report them but continue processing the input.")

    args, logger = cmd.parse_args("update")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors(check_missing=False)

    try:
        progress = RatedProgress(logger, name="SNVs")

        logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

        with tsv.open(args.source_path) as f:
            # Parse header
            hdr_line = f.readline()
            hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))])

            db_predictors = set([p["id"] for p in db.predictors()])

            if len(predictors) == 0:
                predictors = [name for name in hdr if name in db_predictors]
                if len(predictors) == 0:
                    raise Exception("No input file headers match the available predictors in the database. Please specify them using -p.")

            logger.info("Predictors: {}".format(", ".join(predictors)))

            for predictor in filter(lambda p: p not in db_predictors, predictors):
                logger.info("Creating predictor {} ...".format(predictor))
                db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

            use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
            use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

            if not use_genome_coords and not use_protein_coords:
                raise Exception("No coordinate columns found. "
                                "Use {} for genomic coordinates or {} for protein coordinates.".format(
                                    GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
            elif use_genome_coords and use_protein_coords:
                logger.warn("Both genomic and protein coordinates columns found. Using genomic coordinates by default")

            if use_genome_coords:
                coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
                coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
                # get_rows = db.get_transcripts_by_dna
            elif use_protein_coords:
                coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
                coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]
                # get_rows = db.get_transcripts_by_protein

            coord_column_indices = [hdr[n] for n in coord_column_names]

            score_indices = [hdr[n] for n in predictors]

            max_column_index = max(coord_column_indices + score_indices)

            for line_num, line in enumerate(f, start=2):
                fields = line.rstrip("\n").split("\t")

                # max_column_index is an index, so a short row has
                # max_column_index or fewer fields
                if len(fields) <= max_column_index:
                    logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise Exception("Missing columns for line {}".format(line_num))
                    continue

                try:
                    coords = dict([(name.lower(), type_cast(fields[index]))
                                   for name, type_cast, index in zip(
                                       coord_column_names, coord_column_types, coord_column_indices)])

                    scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
                except Exception as ex:
                    logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise
                    continue

                try:
                    for row in db.query_scores(fields=[], **coords):
                        db.update_scores(row["id"], scores)
                except Exception as ex:
                    logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
                    logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
                    if not args.ignore_errors:
                        raise

                progress.update()

        progress.log_totals()

        logger.info("Finalizing database ...")

        if args.update_predictors:
            logger.info("Updating predictors ...")
            db.update_predictors()

        logger.info("Committing ...")
        db.commit()

        logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Filter for the longest transcript")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("len_path", metavar="PATH",
                        help="The tsv containing the transcripts length")

    parser.add_argument("data_path", metavar="PATH",
                        help="The data file")

    parser.add_argument("out_path", metavar="PATH",
                        help="Output file. Use - for standard output.")

    parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
                        help="List of columns that conforms the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

    args, logger = cmd.parse_args("filter-transcript")

    try:
        logger.info("Loading transcripts length ...")

        trslen = defaultdict(int)
        with tsv.open(args.len_path) as f:
            for name, length in tsv.rows(f):
                # lengths come in as strings; store ints so comparisons work
                trslen[name] = int(length)

        logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

        total_count = filter_count = 0

        progress = RatedProgress(logger, name="mutations")

        key_columns = args.key.split(",")
        with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
            hdr_line = df.readline()
            of.write(hdr_line)
            _, hdr = tsv.header_from_line(hdr_line)
            key_indices = [hdr[name] for name in key_columns]
            trs_index = hdr["TRANSCRIPT"]

            last_key = None
            longest = (0, "")
            for line in df:
                total_count += 1

                fields = line.rstrip("\n").split("\t")
                key = tuple([fields[index] for index in key_indices])
                trs = fields[trs_index]

                tl = trslen[trs]

                if last_key != key:
                    if last_key is not None:
                        of.write(longest[1])
                        filter_count += 1
                    longest = (tl, line)
                    last_key = key
                elif tl > longest[0]:
                    longest = (tl, line)

                progress.update()

            # flush the last group (only if there was any input)
            if last_key is not None:
                filter_count += 1
                of.write(longest[1])

        progress.log_totals()

        logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
            total_count, filter_count, total_count - filter_count, progress.elapsed_time))
    except:
        cmd.handle_error()

    return 0
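# The single-pass "keep the longest transcript per key" logic above assumes
# rows with the same key are adjacent in the input. A compact standalone
# sketch of the same idea over (key, length, payload) triples (names
# illustrative):
def longest_per_key(rows):
    """Yield the payload with the largest length for each run of equal keys."""
    last_key, best = None, None
    for key, length, payload in rows:
        if key != last_key:
            if last_key is not None:
                yield best[1]
            last_key, best = key, (length, payload)
        elif length > best[0]:
            best = (length, payload)
    if last_key is not None:
        yield best[1]

# Example: list(longest_per_key([("a", 3, "x"), ("a", 7, "y"), ("b", 1, "z")]))
# -> ["y", "z"]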
def __open(self):
    self.__f = tsv.open(self.path)
    self.line_num = 0
    if self.header:
        tsv.skip_comments_and_empty(self.__f)
def main():
    parser = OptionParser(usage="usage: %prog [options] <Variants gvf.gz file> ...")

    parser.add_option("--db", dest="db_path",
                      help="Database path")

    parser.add_option("-L", "--log-level", dest="log_level", default="info",
                      choices=["debug", "info", "warn", "error", "critical", "notset"],
                      help="Which log level: debug, info, warn, error, critical, notset")

    (options, args) = parser.parse_args()

    logging.basicConfig(
        level=LOG_LEVEL[options.log_level],
        format="%(asctime)s %(levelname)-5s : %(message)s")

    log = logging.getLogger("var_db")

    if len(args) < 1:
        log.error("At least one variants file is required")
        parser.print_help()
        exit(-1)

    if options.db_path is None:
        log.error("The database path should be specified")
        parser.print_help()
        exit(-1)

    db_path = options.db_path

    log.info("Opening database ...")

    db = VarXrefsDb(db_path)

    db.open()

    db.begin()

    total_count = 0
    total_start_time = time.time()

    src_var_count = {}
    src_ratio = {}

    chromosomes = set()
    chr_var_count = {}
    strands = set()

    try:
        partial_count = 0
        partial_start_time = time.time()

        for xref_path in args:
            log.info("Reading {0} ...".format(xref_path))

            if not os.path.isfile(xref_path):
                log.error("File not found: {0}".format(xref_path))
                exit(-1)

            mtime = datetime.fromtimestamp(os.path.getmtime(xref_path))

            f = tsv.open(xref_path, "r")

            src_count = 0
            src_start_time = time.time()

            line_num = 1

            # discard headers
            line = f.readline()
            while line.startswith("#"):
                line = f.readline()
                line_num += 1

            src_var_count[xref_path] = 0

            for line in f:
                try:
                    fields = [x if len(x) > 0 else None for x in line.rstrip("\n").split("\t")]

                    chr, source, type, start, end, _1, strand, _2, extra = fields

                    start = int(start)
                    end = int(end)

                    ref = None
                    alt = None
                    xref = None

                    # parse the GVF attributes column: ";"-separated key=value pairs
                    try:
                        for var in extra.split(";"):
                            try:
                                key, value = var.split("=")
                                if key == "Dbxref":
                                    pos = value.index(":")
                                    xref = value[pos + 1:]
                                elif key == "Reference_seq":
                                    ref = value
                                elif key == "Variant_seq":
                                    alt = value
                            except:
                                continue
                    except:
                        pass

                    if sum([1 if x is None else 0 for x in [chr, start, strand, ref, alt, source, xref]]) > 0:
                        # str() each value since some of them may be None
                        log.warn("Discarding incomplete variant: {0}".format(
                            ",".join([str(x) for x in [chr, start, strand, ref, alt, source, xref]])))
                        continue

                    src_var_count[xref_path] += 1

                    chromosomes.add(chr)
                    if chr in chr_var_count:
                        chr_var_count[chr] += 1
                    else:
                        chr_var_count[chr] = 1

                    strands.add(strand)

                    db.add(chr, start, ref, alt, source, xref, strand)

                    total_count += 1
                    src_count += 1
                    partial_count += 1

                    elapsed_time = time.time() - partial_start_time
                    if elapsed_time >= 10.0:
                        ratio = float(partial_count) / elapsed_time
                        log.debug("  {0:.1f} variants/second, {1} variants, {2} total variants".format(
                            ratio, hsize(src_count), hsize(total_count)))
                        partial_count = 0
                        partial_start_time = time.time()

                except Exception:
                    log.error("Error at line {0}:\n{1}".format(line_num, line.rstrip("\n")))
                    import sys
                    import traceback
                    traceback.print_exc(file=sys.stdout)
                    continue
                finally:
                    line_num += 1

            elapsed_time = time.time() - src_start_time
            ratio = float(src_count) / elapsed_time
            src_ratio[xref_path] = ratio

            log.info("  {0:.1f} variants/second, {1} variants, {2} total variants".format(
                ratio, hsize(src_count), hsize(total_count)))

            f.close()

        db.commit()
    except KeyboardInterrupt:
        db.commit()
        log.warn("Interrupted by the user with Ctrl-C")
        exit(-1)
    except:
        db.rollback()
        raise
    finally:
        db.close()

    elapsed_time = time.time() - total_start_time
    total_ratio = float(total_count) / elapsed_time

    log.info("Statistics:")

    log.info("  Sources:")
    for xref_path in args:
        log.info("    {0}: {1} variants".format(os.path.basename(xref_path), src_var_count[xref_path]))

    log.info("  Chromosomes:")
    for chr in chromosomes:
        log.info("    {0:>2}: {1:>7} variants".format(chr, str(chr_var_count[chr])))

    log.info("  Strands: {0}".format(", ".join(strands)))

    log.info("  Total {0} variants ({1:.1f} variants/sec)".format(hsize(total_count), total_ratio))
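# The attribute parsing buried in the loop above extracts Dbxref,
# Reference_seq and Variant_seq from a GVF attributes column. A standalone
# sketch of that extraction, easier to test in isolation (helper name
# illustrative):
def parse_gvf_attributes(extra):
    """Return (ref, alt, xref) from a "key1=value1;key2=value2" string."""
    ref = alt = xref = None
    for item in extra.split(";"):
        if "=" not in item:
            continue
        key, value = item.split("=", 1)
        if key == "Dbxref":           # e.g. "dbSNP_137:rs123" -> "rs123"
            xref = value.split(":", 1)[-1]
        elif key == "Reference_seq":
            ref = value
        elif key == "Variant_seq":
            alt = value
    return ref, alt, xref

# Example: parse_gvf_attributes("Dbxref=dbSNP:rs1;Reference_seq=A;Variant_seq=G")
# -> ("A", "G", "rs1")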
def main():
    parser = argparse.ArgumentParser(
        description="Calculate weights")

    parser.add_argument("ranges_path", metavar="RANGES_PATH",
                        help="JSON file generated with pred-list containing predictors stats. Only min and max are used.")

    parser.add_argument("training_path", metavar="TRAINING_PATH",
                        help="The training set scores. ID column should be POS/NEG for positive/negative sets.")

    parser.add_argument("-o", dest="out_path", metavar="OUT_PATH",
                        help="The file where weights will be saved. Use - for standard output.")

    parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
                        help="Comma separated list of predictors to fetch")

    parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISION", type=int, default=3,
                        help="Distribution precision")

    parser.add_argument("-f", "--full-state", dest="full_state", action="store_true", default=False,
                        help="Save intermediate calculations to allow further exploration and plotting")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    logger = bglogging.get_logger("weights")

    if args.out_path is None:
        prefix = os.path.splitext(os.path.basename(args.training_path))[0]
        if prefix.endswith("-scores"):
            prefix = prefix[:-7]
        args.out_path = os.path.join(os.getcwd(), "{}-weights.json".format(prefix))

    if args.predictors is not None:
        args.predictors = [p.strip() for p in args.predictors.split(",")]

    logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path)))

    with open(args.ranges_path) as f:
        pred_stats = json.load(f)

    predictor_range = {}
    for pid, pstats in pred_stats.items():
        predictor_range[pid] = (pstats["min"], pstats["max"])

    logger.info("Reading training set {} ...".format(args.training_path if args.training_path != "-" else "from standard input"))

    with tsv.open(args.training_path) as f:
        # Select predictors from the available predictors in the dataset or user selection
        column_names, column_indices = tsv.header(f)
        available_predictors = [c for c in column_names if c not in set(COORD_COLUMNS)]
        if args.predictors is None:
            predictors = available_predictors
        else:
            missing_predictors = [p for p in args.predictors if p not in set(available_predictors)]
            if len(missing_predictors) > 0:
                logger.error("Missing predictors: {}".format(", ".join(missing_predictors)))
                exit(-1)
            predictors = args.predictors

    data = pd.read_csv(args.training_path, sep="\t", index_col=False,
                       usecols=["ID"] + predictors,
                       true_values=["POS"], false_values=["NEG"])

    data.rename(columns={"ID": "EVT"}, inplace=True)

    # Initialize statistics

    logger.info("Initializing metrics ...")

    step = 1.0 / 10**args.precision

    stats = dict()

    state = dict(
        predictor_names=predictors,
        precision=args.precision,
        step=step,
        stats=stats)

    for predictor in predictors:
        d = data[["EVT", predictor]]
        d = d[np.isfinite(d.iloc[:, 1])]

        nump = d.iloc[:, 0].sum()
        numn = d.shape[0] - nump

        rmin, rmax = d.iloc[:, 1].min(), d.iloc[:, 1].max()
        dim = rmax - rmin
        size = int(dim / step) + 1
        values = [(x * step) + rmin for x in xrange(size)]

        logger.info("  {:10}: p={}, n={}, min={}, max={}, bins={}".format(predictor, nump, numn, rmin, rmax, size))

        stats[predictor] = dict(
            rmin=rmin, rmax=rmax,
            dim=dim, values=values, size=size,
            vmin=rmin, vmax=rmax,
            dp=[0] * size, dn=[0] * size,
            cdp=[0] * size, cdn=[0] * size,
            cump=0, cumn=0,
            tp=[0] * size, tn=[0] * size,
            fp=[0] * size, fn=[0] * size,
            mcc=[0] * size, acc=[0] * size, auc=[0] * size,
            cutoff=None, cutoff_index=None,
            cutoff_mcc=None, cutoff_acc=None, cutoff_auc=None)

    positive_count = data.iloc[:, 0].sum()
    negative_count = data.shape[0] - positive_count

    logger.info("  TOTAL     : positive={}, negative={}".format(positive_count, negative_count))

    logger.info("Calculating scores distribution and confusion matrices ...")

    logger.info("Calculating cumulative distribution ...")

    for predictor in predictors:
        predictor_stats = stats[predictor]

        dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]]

        cump = 0
        cumn = 0

        i = len(dp) - 1
        while i >= 0:
            cdp[i] = dp[i] + cump
            cump += dp[i]

            cdn[i] = dn[i] + cumn
            cumn += dn[i]

            i -= 1

        predictor_stats["cump"] = cump
        predictor_stats["cumn"] = cumn

        logger.info("  {}: cump={}, cumn={}".format(predictor, cump, cumn))

    logger.info("Calculating accuracy and cutoff ...")

    for predictor in predictors:
        predictor_stats = stats[predictor]

        values, size, tp, tn, fp, fn, mcc, acc = [predictor_stats[k] for k in [
            "values", "size", "tp", "tn", "fp", "fn", "mcc", "acc"]]

        cutoff = -1
        cutoff_index = -1
        best_mcc = -1e6

        for i in xrange(size):
            try:
                # http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
                mcc[i] = (tp[i] * tn[i] - fp[i] * fn[i]) / sqrt(
                    (tp[i] + fp[i]) * (tp[i] + fn[i]) * (tn[i] + fp[i]) * (tn[i] + fn[i]))
                # http://en.wikipedia.org/wiki/Accuracy
                acc[i] = (tp[i] + tn[i]) / float(tp[i] + fp[i] + fn[i] + tn[i])
            except ZeroDivisionError:
                mcc[i] = 0
                acc[i] = 0

            if mcc[i] > best_mcc:
                cutoff = values[i]
                cutoff_index = i
                best_mcc = mcc[i]

        best_acc = max(acc)

        predictor_stats["cutoff"] = cutoff
        predictor_stats["cutoff_index"] = cutoff_index
        predictor_stats["cutoff_mcc"] = best_mcc
        predictor_stats["cutoff_acc"] = best_acc

        logger.info("  {}: cutoff={:.3f}, mcc={:.2f}, accuracy={:.2f}".format(
            predictor, cutoff, best_mcc * 100.0, best_acc * 100.0))

    if args.full_state:
        logger.info("Saving weights with full state ...")

        save_weights(args.out_path, state)
    else:
        logger.info("Saving weights ...")

        stats = {}
        reduced_state = dict(
            predictor_names=state["predictor_names"],
            precision=state["precision"],
            step=state["step"],
            stats=stats)

        for predictor in state["predictor_names"]:
            predictor_stats = state["stats"][predictor]
            stats[predictor] = dict(
                rmin=predictor_stats["rmin"],
                rmax=predictor_stats["rmax"],
                dim=predictor_stats["dim"],
                values=predictor_stats["values"],
                size=predictor_stats["size"],
                cdp=predictor_stats["cdp"],
                cdn=predictor_stats["cdn"],
                cutoff=predictor_stats["cutoff"],
                cutoff_index=predictor_stats["cutoff_index"])

        save_weights(args.out_path, reduced_state)

    return 0
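# The cutoff search above scores each candidate threshold with the Matthews
# correlation coefficient. A standalone version of that metric with the same
# zero-denominator convention (helper name illustrative):
from math import sqrt

def matthews_corr(tp, tn, fp, fn):
    """MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)); 0 when undefined."""
    denom = sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
    if denom == 0.0:
        return 0.0
    return (tp * tn - fp * fn) / denom

# Example: matthews_corr(40, 40, 10, 10) -> 0.6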