def save_matrix(self, output_path, analysis_name, output_format,
                row_names, col_names, data, suffix="", params=None,
                valid_row=lambda row: True):

    if len(suffix) > 0:
        suffix = "-{0}".format(suffix)

    if params is None:
        params = []

    path = os.path.join(output_path, "{0}{1}.{2}".format(analysis_name, suffix, output_format))

    self.log.debug("  > {0}".format(path))

    with tsv.open(path, 'w') as f:
        tsv.write_line(f, "## version={0}".format(VERSION))
        tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        for key, value in params + self.parameters:
            tsv.write_line(f, "## {0}={1}".format(key, value))

        tsv.write_line(f, "ID", *col_names)

        for row_index, row_name in enumerate(row_names):
            if len(row_name) == 0:
                self.log.warn("Empty identifier detected")
                continue

            row = data[row_index, :]
            if valid_row(row):
                values = [v if not np.isnan(v) else None for v in row]
                tsv.write_line(f, row_name, *values, null_value="-")
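# A minimal usage sketch (hypothetical caller; argument values are assumed,
# not taken from this module): save_matrix writes "##"-prefixed parameter
# header lines, then an "ID" + column-names header, then one row per
# non-empty identifier that passes valid_row, with NaN cells serialized as "-".
#
#   self.save_matrix("/tmp/out", "oncodrivefm", "tsv",
#                    row_names=["GENE1", "GENE2"],
#                    col_names=["SIFT", "PPH2", "MA"],
#                    data=scores,                      # a numpy 2-D array
#                    suffix="genes",
#                    params=[("slices", "SIFT,PPH2,MA")],
#                    valid_row=lambda row: not np.all(np.isnan(row)))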
def ma_run(partition):
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    offline = conf["offline"]
    if offline == "yes":
        log.info("Running Mutation assessor in local mode.")
        ma = MaLocal(conf["ma_cache_path"])
    else:
        log.info("Running Mutation assessor using web services.")
        ma = MaService(project["assembly"], cache_path=os.path.join(conf["cache_path"], "ma.db"))

    results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))

    if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):
        log.info("Querying Mutation assessor for 'missense_variant' consequences ...")

        projdb = ProjectDb(project["db"])

        missense_variants = set()

        with open(partition["vep_path"], "r") as f:
            for line in f:
                fields = line.rstrip().split("\t")
                var_id = int(fields[0])
                ctypes = fields[3].split(",")
                if so.match(ctypes, so.NON_SYNONYMOUS):
                    missense_variants.add(var_id)

        with open(results_path, "w") as mf:
            for var_id in missense_variants:
                var = projdb.get_variant(var_id)
                start, end, ref, alt = var_to_tab(var)
                r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
                if r is not None:
                    tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")

        projdb.close()
    else:
        log.warn("Skipping MA, results already exist.")
        log.debug("MA results: {0}".format(results_path))

    ma.close()

    # Send results to the next module
    partition["ma_path"] = results_path
    results_port.send(partition)
def datasets(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    group_file_prefix = normalize_id(classifier_id)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Reading number of samples per project ...")

    project_ids = []
    total_samples = 0
    for project in projects:
        project_id = project["id"]
        project_ids += [project_id]

        log.info("  Project {0}".format(project["id"]))

        projdb = ProjectDb(project["db"])
        num_samples = projdb.get_total_affected_samples()
        total_samples += num_samples
        log.debug("    {0} samples".format(num_samples))
        projdb.close()

    log.debug("  {0} samples in total".format(total_samples))

    log.info("Updating ...")

    combination_path = paths.combination_path()

    path = os.path.join(combination_path, "{0}.tsv".format(group_file_prefix))
    if not os.path.exists(path):
        with open(path, "w") as f:
            tsv.write_line(f, "NAME", "SHORT_NAME", "LONG_NAME", "SAMPLES_TOTAL", "PROJECT_IDS")

    with open(path, "a") as f:
        tsv.write_line(f, group_name, group_short_name, group_long_name, total_samples, ",".join(project_ids))
def vep_run(partition):
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project_id = partition["project"]["id"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project_id, partition["index"]))

    offline = conf["offline"]
    if offline == "yes":
        log.info("Running VEP in local mode.")
        vep = VepLocal(
            perl_path=conf["perl_bin"],
            lib_path=conf["perl_lib"],
            script_path=os.path.join(conf["ext_bin_path"], "variant_effect_predictor", "variant_effect_predictor.pl"),
            cache_path=os.path.join(conf["data_path"], "vep_cache"))
    else:
        log.info("Running VEP using web services.")
        vep = VepService(cache_path=os.path.join(conf["cache_path"], "vep.db"))

    results_path = os.path.join(partition["base_path"], "{0:08d}.vep".format(partition["index"]))

    if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):
        # Run VEP
        vep.run(partition["bed_path"])

        log.info("Saving results ...")
        log.debug("VEP results: {0}".format(vep.results_path))

        # Save results
        with open(results_path, "w") as f:
            for r in vep.results():
                tsv.write_line(f, r.var_id, r.gene, r.transcript,
                               ",".join(r.consequences),
                               r.protein_pos, r.aa_change, r.protein,
                               r.sift, r.polyphen, null_value="-")
    else:
        log.warn("Skipping VEP, results already exist.")
        log.debug("VEP results: {0}".format(results_path))

    vep.close()

    # Send results to the next module
    partition["vep_path"] = results_path
    results_port.send(partition)
def split_variants(project):
    log = task.logger

    config = GlobalConfig(task.conf)

    partition_port = task.ports("partitions")

    log.info("--- [{}] --------------------------------------------".format(project["id"]))

    projdb = ProjectDb(project["db"])

    log.info("Preparing variants for VEP ...")

    base_path = os.path.join(project["temp_path"], "consequences")
    ensure_path_exists(base_path)

    project["csq_path"] = base_path

    partition_size = config.vep_partition_size
    partition = -1
    f = None
    count = 0
    for var in projdb.variants(order_by="position"):
        start, end, ref, alt = var_to_tab(var)

        if count % partition_size == 0:
            if f is not None:
                f.close()

            partition += 1
            partition_path = os.path.join(base_path, "{0:08d}.vep_in".format(partition))
            f = open(partition_path, "w")

            partition_port.send({
                "project": project,
                "index": partition,
                "bed_path": partition_path,
                "base_path": base_path})

        tsv.write_line(f, var.chr, start, end, ref + "/" + alt, var.strand, var.id)
        count += 1

    if f is not None:
        f.close()

    log.info("{} variants split into {} partitions".format(count, partition + 1))

    projdb.close()
def fetch(db, muts_path, out_path, params=None, columns=None, maps=None, predictors=None,
          labels=None, calc_labels=None, muts_header=False, logger=None):

    params = params or {}
    columns = columns or [c.lower() for c in COORD_COLUMNS]
    maps = maps or []
    predictors = predictors or []
    labels = labels or []

    state = {}

    with tsv.open(out_path, "w") as wf:
        metadata = db.metadata
        if "version" in metadata:
            tsv.write_param(wf, "db-version", db.metadata["version"])
        tsv.write_param(wf, "fetched", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
        for k, v in params.items():
            tsv.write_param(wf, k, v)

        tsv.write_line(wf, "ID", *[c.upper() for c in columns]
                                 + [m.upper() for m in maps]
                                 + predictors + labels)

        for row in fetch_iter(db, muts_path, maps=maps, predictors=predictors,
                              muts_header=muts_header, state=state, logger=logger):

            # keep the per-row label values in their own variable instead of
            # shadowing the `labels` column list, otherwise the values would be
            # looked up against the wrong label names
            if calc_labels is not None:
                row_labels = calc_labels(row) or {}
            else:
                row_labels = {}

            xrefs = row["xrefs"]
            scores = row["scores"]

            tsv.write_line(wf, state[STATE_MUTATION].identifier,
                           *[row[c] for c in columns]
                            + [xrefs[m] for m in maps]
                            + [scores[p] for p in predictors]
                            + [row_labels.get(l, "") for l in labels])

    return {k: state[k] for k in [STATE_HITS, STATE_FAILS]}
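# A minimal calling sketch (hypothetical paths, predictor names and label
# values; only the signature above is real): fetch streams the mutations in
# `muts_path` through the scores database, writes one annotated line per
# mutation, and returns the hit/fail counters accumulated in `state`.
#
#   counters = fetch(
#       db, "muts.tsv", "scores-out.tsv",
#       params={"cmd": "fetch"},
#       predictors=["SIFT", "PPH2", "MA"],
#       labels=["BLT"],
#       calc_labels=lambda row: {"BLT": "yes"},   # toy per-row labels
#       muts_header=True, logger=logger)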
def main():
    parser = argparse.ArgumentParser(
        description="Extract mutations in VCF and save as simple tabulated file")

    parser.add_argument("vcf_paths", metavar="PATH", nargs="+",
                        help="The VCF files")

    parser.add_argument("-o", dest="out_path", metavar="PATH",
                        help="Output file. Use - for standard output.")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("vcf-to-snvs")

    if args.out_path is None:
        names = []
        for path in args.vcf_paths:
            if path != "-":
                base_path, name, ext = tsv.split_path(path)
                names += [name]
        # commonprefix expects the list itself, not unpacked arguments
        prefix = os.path.commonprefix(names) if len(names) > 0 else ""
        prefix = prefix.rstrip(".")
        if len(prefix) == 0:
            prefix = "genome"
        args.out_path = "{}.tsv.gz".format(prefix)

    with tsv.open(args.out_path, "w") as outf:
        tsv.write_line(outf, "CHR", "POS", "REF", "ALT")

        for path in args.vcf_paths:
            log.info("Reading {} ...".format(path))

            with tsv.open(path) as inf:
                types = (str, str, str, str)
                columns = [0, 1, 3, 4]
                for fields in tsv.lines(inf, types, columns=columns):
                    chrom, pos, ref, alt = fields

                    # ref = ref.upper().strip("N")
                    # alt = alt.upper().strip("N")

                    ref_len = len(ref)
                    alt_len = len(alt)

                    if ref_len != alt_len or ref_len == 0 or alt_len == 0:
                        continue

                    try:
                        pos = int(pos)
                    except ValueError:
                        continue

                    if ref_len == 1:
                        tsv.write_line(outf, chrom, pos, ref, alt)
                    else:
                        for i in range(ref_len):
                            tsv.write_line(outf, chrom, pos + i, ref[i], alt[i])
def main():
    parser = argparse.ArgumentParser(
        description="Export SNV's")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("dest_path", metavar="DEST",
                        help="The destination file. Use - for standard output.")

    args, log = cmd.parse_args("export-snvs")

    db = cmd.open_db()

    log.info("Exporting SNV's ...")

    total_count = 0
    total_start_time = time.time()

    try:
        progress = RatedProgress(log, name="SNVs")

        rows_count = 0
        with tsv.open(args.dest_path, "w") as f:
            for snv in db.snvs():
                rows_count += 1
                tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"],
                               "{}>{}".format(snv["ref"], snv["alt"]), "S")
                progress.update()

        log.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def create_datasets(project):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_path = project["path"]
    temp_path = project["temp_path"]

    datasets_path = paths.project_results_path(project_path)
    ensure_path_exists(datasets_path)

    sigdb = SigDb(config.sigdb_path)
    sigdb.open()

    projdb = ProjectDb(project["db"])

    gene_sym = projdb.get_gene_symbols()

    total_samples = projdb.get_total_affected_samples()

    log.info("Exporting variant genes ...")

    vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
    tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

    sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
    tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

    count = 0
    for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
        var = afg.var
        rec = afg.rec

        start, end, ref, alt = var_to_tab(var)

        allele = "{0}/{1}".format(ref, alt)

        xrefs = [xref for xref in var.xrefs]
        if sigdb.exists_variant(var.chr, start):
            xrefs += ["I:1"]
        xrefs = ",".join(xrefs)

        intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

        tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
                       afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
                       rec.sample_freq, rec.sample_prop,
                       afg.coding_region, afg.prot_changes, intogen_driver, xrefs, null_value="\N")

        for sample in var.samples:
            tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value="\N")

        count += 1

    vf.close()
    sf.close()

    log.info("  {0} variant genes".format(count))

    log.info("Exporting consequences ...")

    cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
    tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT", "GENE_ID", "SYMBOL",
                   "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                   "IMPACT", "IMPACT_CLASS")

    count = 0
    for csq in projdb.consequences(join_variant=True):
        var = csq.var
        start, end, ref, alt = var_to_tab(var)

        allele = "{0}/{1}".format(ref, alt)

        uniprot = protein = protein_pos = aa_change = None
        sift_score = sift_tfic = sift_tfic_class = None
        pph2_score = pph2_tfic = pph2_tfic_class = None
        ma_score = ma_tfic = ma_tfic_class = None

        if so.match(csq.ctypes, so.ONCODRIVEFM):
            uniprot, protein = csq.uniprot, csq.protein

        if so.match(csq.ctypes, so.NON_SYNONYMOUS):
            protein_pos, aa_change = csq.protein_pos, csq.aa_change
            sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
            pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
            ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

        tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
                       ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                       uniprot, protein, protein_pos, aa_change,
                       sift_score, sift_tfic, sift_tfic_class,
                       pph2_score, pph2_tfic, pph2_tfic_class,
                       ma_score, ma_tfic, ma_tfic_class,
                       csq.impact, TransFIC.class_name(csq.impact),
                       null_value="\N")

        count += 1

    cf.close()

    log.info("  {0} consequences".format(count))

    log.info("Exporting genes ...")

    gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
    tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                   "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")

    for gene in projdb.genes(join_rec=True):
        rec = gene.rec

        if rec.sample_freq is None or rec.sample_freq == 0:
            continue

        intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

        tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                       gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause,
                       gene.clust_coords,
                       rec.sample_freq or 0, rec.sample_prop or 0,
                       intogen_driver, null_value="\N")

    gf.close()

    log.info("Exporting pathways ...")

    pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
    tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

    for pathway in projdb.pathways(join_rec=True):
        rec = pathway.rec

        if rec.sample_freq is None or rec.sample_freq == 0:
            continue

        tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                       rec.sample_freq or 0, rec.sample_prop or 0, rec.gene_freq or 0,
                       pathway.gene_count, rec.gene_prop or 0, null_value="\N")

    pf.close()

    if not config.skip_oncodrivefm:
        log.info("Exporting genes per sample functional impact ...")

        with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
            tsv.write_line(f, "GENE_ID", "SAMPLE",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

            for fields in projdb.sample_gene_fimpacts():
                (gene, sample,
                 sift_score, sift_tfic, sift_tfic_class,
                 pph2_score, pph2_tfic, pph2_tfic_class,
                 ma_score, ma_tfic, ma_tfic_class) = fields
                tsv.write_line(f, gene, sample,
                               sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                               pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                               ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class),
                               null_value="\N")

    projdb.close()
    sigdb.close()

    log.info("Saving project configuration ...")

    projres = ProjectResults(project)
    with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
        names = ["ASSEMBLY", "SAMPLES_TOTAL"]
        values = [project["assembly"], total_samples]
        names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"],
                                                        names=names, values=values)
        tsv.write_line(f, *names)
        tsv.write_line(f, *values, null_value="\N")

    projects_port = task.ports("projects_out")
    projects_port.send(project)
def save_splited_results(self, output_path, analysis_name, output_format,
                         matrix, mapping, method, results, slices, suffix=""):

    if len(suffix) > 0:
        suffix = "-{0}".format(suffix)

    for slice_results_index, slice in enumerate(slices):
        slice_name = matrix.slice_names[slice]

        path = os.path.join(output_path, "{0}{1}-{2}.{3}".format(
            analysis_name, suffix, slice_name, output_format))

        self.log.debug("  > {0}".format(path))

        with tsv.open(path, 'w') as f:
            tsv.write_line(f, "## version={0}".format(VERSION))
            tsv.write_line(f, "## slice={0}".format(slice_name))
            tsv.write_line(f, "## method={0}".format(method.name))
            tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            for key, value in self.parameters:
                tsv.write_line(f, "## {0}={1}".format(key, value))

            tsv.write_line(f, "ID", *method.results_columns)

            for row_index, row_name in enumerate(mapping.group_names):
                value = results[slice_results_index, row_index]
                if not np.isnan(value):
                    tsv.write_line(f, row_name, value, null_value="-")
def drivers():
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    db_path = paths.results_path("drivers.db")
    db = SigDb(db_path)
    db.open()

    log.info("Variants ...")
    path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
    with tsv.open(path, "r") as f:
        types = (str, str, int, str)
        for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
            chr, strand, start, allele = fields[:4]
            db.add_variant(chr, start)

    log.info("Genes ...")

    gene_sites = {}

    gene_fm = set()
    gene_clust = set()

    #SPECIAL_THRESHOLD = ["C18", "C34"]
    SPECIAL_THRESHOLD = []

    log.info("  OncodriveFM ...")

    filename_re = re.compile(r"gene-cancer_site-(.+)\.tsv.gz")
    base_path = paths.combination_path("oncodrivefm")
    for path in os.listdir(base_path):
        m = filename_re.match(path)
        if not m:
            continue

        cancer_site_code = m.group(1)

        if cancer_site_code in SPECIAL_THRESHOLD:
            threshold = 1e-6
        else:
            threshold = 0.01

        with tsv.open(os.path.join(base_path, path), "r") as f:
            params = tsv.params(f)
            cancer_site_name = params["group_long_name"]
            for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
                gene, qvalue = fields
                if qvalue < threshold:
                    add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
                    gene_fm.add(gene)

    log.info("  OncodriveCLUST ...")

    filename_re = re.compile(r"cancer_site-(.+)\.tsv.gz")
    base_path = paths.combination_path("oncodriveclust")
    for path in os.listdir(base_path):
        m = filename_re.match(path)
        if not m:
            continue

        cancer_site_code = m.group(1)

        with tsv.open(os.path.join(base_path, path), "r") as f:
            params = tsv.params(f)
            cancer_site_name = params["group_long_name"]
            for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
                gene, qvalue = fields
                if qvalue < 0.05:
                    add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
                    gene_clust.add(gene)

    log.info("  Updating db ...")

    sig_genes = gene_fm | gene_clust
    for gene in sig_genes:
        db.add_gene(gene, gene in gene_fm, gene in gene_clust)

    log.info("Saving driver genes cancer sites dataset ...")

    path = paths.results_path("gene-driver_cancer_sites.tsv")
    log.debug("> {}".format(path))
    with open(path, "w") as f:
        tsv.write_param(f, "date", datetime.now())
        tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT", "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
        for gene, sites in gene_sites.items():
            tsv.write_line(f, gene,
                           1 if gene in gene_fm else 0,
                           1 if gene in gene_clust else 0,
                           len(sites),
                           ", ".join(sorted([code for code, name in sites])),
                           ", ".join(sorted([name for code, name in sites])))

    db.commit()
    db.close()
def gene_impact(project):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    projects_port = task.ports("projects")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    partitions = project["partitions"]

    log.info("Reading {} partitions ...".format(len(partitions)))

    aff_gene_attrs = {}

    for partition in partitions:
        log.info("  Partition {} ...".format(partition["index"]))
        with open(partition["tfi_path"], "r") as f:
            bool_type = lambda val: bool(int(val)) if val is not None else False
            types = (int, str, str, bool_type, int, int, int, int)
            columns = [0, 2, 4, 5, 6, 10, 14, 18]
            for fields in tsv.lines(f, types, columns=columns, null_value="-"):
                (var_id, gene, prot_change, coding_region, tr_impact,
                 sift_impact, pph2_impact, ma_impact) = fields

                coding_region = coding_region == 1

                aff_gene = (var_id, gene)

                # update aggregated impact for all the predictors
                update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
                update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
                update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

                # update whether the affected gene is a coding region or not
                update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
                            update=lambda prev_value, value: prev_value or value)

                # aggregate protein changes per affected_gene
                if prot_change is not None:
                    update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
                                new=lambda value: set([value]),
                                update=lambda prev_value, value: prev_value | set([value]))

    num_vars = len(set([var_id for var_id, gene in aff_gene_attrs.keys()]))
    num_genes = len(set([gene for var_id, gene in aff_gene_attrs.keys()]))

    log.info("Saving {} variant-gene impacts ({} variants and {} genes) ...".format(
        len(aff_gene_attrs), num_vars, num_genes))

    gfi_path = os.path.join(project["csq_path"], "variant-gene_impact.tsv")
    with open(gfi_path, "w") as vf:
        for aff_gene, attrs in aff_gene_attrs.items():
            var_id, gene = aff_gene

            # get the impact by trust priority: ma, pph2, sift
            impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = attrs.get("coding_region", False)
            coding_region = 1 if coding_region else 0

            prot_changes = attrs.get("prot_changes")
            prot_changes = ",".join(prot_changes) if prot_changes is not None else None

            tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

    # Send results to the next module
    project["gfi_path"] = gfi_path
    projects_port.send(project)
def write_line(f, *v):
    tsv.write_line(f, *v, null_value="-")
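# A quick illustration (hypothetical file handle and values): this wrapper
# just pins the null marker, so None values are serialized as "-".
#
#   with tsv.open("example.tsv", "w") as f:
#       write_line(f, "GENE1", 0.42, None)   # written as: GENE1 <tab> 0.42 <tab> -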
def prepare_files(project):
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_results = ProjectResults(project)

    projdb = ProjectDb(project["db"])

    log.info("Retrieving functional impact scores for genes ...")

    data = retrieve_data(projdb)

    projdb.close()

    # save data matrix
    dst_path = os.path.join(project["temp_path"], "oncodrivefm-data.tdm")
    sgfi_path = os.path.join(project["temp_path"], "sample_gene-fimpact.tsv.gz")
    project["sample_gene_fi_data"] = sgfi_path

    log.info("Saving functional impact scores ...")
    log.debug("> {0}".format(dst_path))

    with open(dst_path, "w") as f:
        sgff = tsv.open(sgfi_path, "w")

        tsv.write_line(f, "SAMPLE", "GENE", "SIFT", "PPH2", "MA")
        tsv.write_line(sgff, "SAMPLE", "GENE",
                       "SIFT_SCORE", "SIFT_TFIC", "SIFT_TFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TFIC", "PPH2_TFIC_CLASS",
                       "MA_SCORE", "MA_TFIC", "MA_TFIC_CLASS")

        for key, values in data.iteritems():
            sample, gene = key
            (sift_score, sift_tfic, sift_tfic_class,
             pph2_score, pph2_tfic, pph2_tfic_class,
             ma_score, ma_tfic, ma_tfic_class) = values

            tsv.write_line(f, sample, gene, sift_score, pph2_score, ma_score)
            tsv.write_line(sgff, sample, gene,
                           sift_score, sift_tfic, sift_tfic_class,
                           pph2_score, pph2_tfic, pph2_tfic_class,
                           ma_score, ma_tfic, ma_tfic_class,
                           null_value="-")

        sgff.close()

    # count samples
    samples = set()
    gene_sample_count = {}
    for sample, gene in data.keys():
        samples.add(sample)
        if gene not in gene_sample_count:
            gene_sample_count[gene] = 1
        else:
            gene_sample_count[gene] += 1
    num_samples = len(samples)

    if num_samples == 0:
        log.warn("There are no samples data, skipping OncodriveFM for this project")
        return

    (num_cores, estimator, genes_num_samplings, genes_threshold,
     genes_filter_enabled, genes_filter, filt,
     pathways_num_samplings, pathways_threshold) = get_oncodrivefm_configuration(log, conf, project, num_samples)

    # Create a dataset with information on why some genes are not considered
    # for calculation in OncodriveFM. There are basically two possible reasons:
    #   - It does not pass the filter
    #   - There are less samples mutated than the threshold

    exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")
    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))

    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if genes_filter_enabled and not filt.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < genes_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                tsv.write_line(exf, gene, "".join(causes))

    ofm = dict(
        data=dst_path,
        num_cores=num_cores,
        estimator=estimator)

    for slice_name in ["SIFT", "PPH2", "MA"]:
        projects_out_port.send(dict(project, oncodrivefm=dict(ofm,
                                    feature="genes",
                                    slice=slice_name,
                                    num_samplings=genes_num_samplings,
                                    threshold=genes_threshold,
                                    filter_enabled=genes_filter_enabled,
                                    filter=genes_filter)))

    for slice_name in ["SIFT", "PPH2", "MA"]:
        projects_out_port.send(dict(project, oncodrivefm=dict(ofm,
                                    feature="pathways",
                                    slice=slice_name,
                                    num_samplings=pathways_num_samplings,
                                    threshold=pathways_threshold,
                                    filter_enabled=genes_filter_enabled,
                                    filter=genes_filter)))
def prepare_files(project):
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    config = GlobalConfig(conf)
    paths = PathsConfig(config)

    # avoid letting the project conf override path configurations
    config = GlobalConfig(conf, project["conf"])

    oclust = OncodriveClust(config.oncodriveclust, paths, log)

    project_results = ProjectResults(project)

    projdb = ProjectDb(project["db"])

    data = oclust.retrieve_data(projdb)

    projdb.close()

    data_paths = [
        os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
        os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

    log.info("Saving data ...")
    log.debug("> {0}".format(data_paths[NON_SYN]))
    log.debug("> {0}".format(data_paths[SYN]))

    df = [tsv.open(path, "w") for path in data_paths]

    gene_sample_count = defaultdict(int)

    for key, value in data.items():
        findex, gene, sample = key
        transcript, transcript_len, protein_pos = value

        if findex == NON_SYN:
            gene_sample_count[gene] += 1

            if oclust.filter_enabled and not oclust.filter.valid(gene):
                continue

        tsv.write_line(df[findex], gene, sample, protein_pos)

    for f in df:
        f.close()

    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))

    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if oclust.filter_enabled and not oclust.filter.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < oclust.samples_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                tsv.write_line(exf, gene, "".join(causes))

    log.info("Sending project ...")

    projects_out_port.send(dict(project, oncodriveclust=dict(
        data_paths=data_paths,
        samples_threshold=oclust.samples_threshold)))
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Baseline Tolerance partial statistics per feature")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("scores_path", metavar="SCORES_PATH",
                        help="The scores file")

    parser.add_argument("predictors", metavar="PREDICTORS",
                        help="Comma separated list of predictors")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="Output file.")

    cmd.add_transform_args()

    args, logger = cmd.parse_args("blt-partial")

    predictors = [p.strip() for p in args.predictors.split(",") if len(p.strip()) > 0]
    num_predictors = len(predictors)
    if len(predictors) == 0:
        logger.error("At least one predictor is needed")
        exit(-1)

    logger.info("Selected predictors: {}".format(", ".join(predictors)))

    transforms = cmd.get_transforms()

    stats = {}

    lost_snvs = 0
    scores_path = args.scores_path

    logger.info("Reading scores from {} ...".format(
        os.path.basename(scores_path) if scores_path != "-" else "standard input"))

    with tsv.open(scores_path) as sf:
        for line_num, line in enumerate(sf):
            fields = line.rstrip("\n").split("\t")
            chrom, pos, ref, alt, feature = fields[:5]
            if len(feature) == 0:
                lost_snvs += 1
                continue

            scores = fields[5:]
            if len(scores) != num_predictors:
                line_error(logger, scores_path, line_num,
                           "Number of score columns does not match the number of predictors")

            try:
                scores = [float(v) if len(v) > 0 else None for v in scores]
            except:
                line_error(logger, scores_path, line_num,
                           "Scores should be real numbers: {}".format(scores))

            if feature not in stats:
                stats[feature] = tuple([[0, 0.0, 0.0] for p in predictors])

            feature_stats = stats[feature]

            for i, score in enumerate(scores):
                if score is not None:
                    predictor = predictors[i]
                    if predictor in transforms:
                        for name, func in transforms[predictor]:
                            try:
                                score = func(score)
                            except:
                                logger.error("Error transforming the {} score {} with {}".format(predictor, score, name))
                                exit(-1)

                    # accumulate count, sum and sum of squares per predictor
                    feature_stats[i][0] += 1
                    feature_stats[i][1] += score
                    feature_stats[i][2] += score * score

    logger.info("Saving results into {} ...".format(
        os.path.basename(args.out_path) if args.out_path != "-" else "standard output"))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "FEATURE", *predictors)
        for feature in sorted(stats.keys()):
            sb = [feature]
            feature_stats = stats[feature]
            for i in range(num_predictors):
                sb += ["/".join([repr(v) for v in feature_stats[i]])]
            tsv.write_line(of, *sb)

    logger.info("Number of SNV's = {}, lost SNV's = {}, number of features = {}".format(
        line_num, lost_snvs, len(stats)))

    return 0
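# For reference, each cell written above packs the running statistics
# (count, sum, sum of squares) as "n/s1/s2". A downstream consumer can recover
# the mean and standard deviation from those two sums; a minimal sketch (the
# helper name and values are illustrative, not part of this module):
def _mean_stdev(n, s1, s2):
    import math
    mean = s1 / n
    # population variance derived from the accumulated sums
    return mean, math.sqrt(max(s2 / n - mean * mean, 0.0))

# e.g. scores 0.4, 0.6, 0.8 accumulate to (3, 1.8, 1.16):
#   _mean_stdev(3, 1.8, 1.16)  ->  (0.6, ~0.1633)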
def prepare_files(project):
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_results = ProjectResults(project)

    mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

    log.info("Loading transcripts CDS length ...")

    cds_len = load_cds_len(conf)

    log.info("Retrieving gene alterations ...")

    projdb = ProjectDb(project["db"])

    data = retrieve_data(projdb, cds_len)

    projdb.close()

    data_paths = [
        os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
        os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

    log.info("Saving data ...")
    log.debug("> {0}".format(data_paths[NON_SYN]))
    log.debug("> {0}".format(data_paths[SYN]))

    df = [tsv.open(path, "w") for path in data_paths]

    gene_sample_count = {}

    for key, value in data.items():
        findex, gene, sample = key
        transcript, transcript_len, protein_pos = value

        if findex == NON_SYN:
            if gene not in gene_sample_count:
                gene_sample_count[gene] = 1
            else:
                gene_sample_count[gene] += 1

            if genes_filter_enabled and not filt.valid(gene):
                continue

        tsv.write_line(df[findex], gene, sample, protein_pos)

    for f in df:
        f.close()

    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))

    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if genes_filter_enabled and not filt.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < mutations_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                tsv.write_line(exf, gene, "".join(causes))

    log.info("Sending project ...")

    projects_out_port.send(dict(project, oncodriveclust=dict(
        data_paths=data_paths,
        mutations_threshold=mutations_threshold,
        genes_filter_enabled=genes_filter_enabled,  # not used
        genes_filter=genes_filter)))  # not used
def fimpact_run(partition):
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    ma_uniprot = {}
    ma_scores = {}
    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
    cf = open(tfi_path, "w")

    aff_gene_attrs = {}

    with open(partition["vep_path"], "r") as f:
        for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
            (var_id, gene, transcript, ct,
             protein_pos, aa_change, protein, sift_score, pph2_score) = fields

            if ct is not None:
                ct = ct.split(",")
            else:
                ct = []

            # Invert sift score
            if sift_score is not None:
                sift_score = 1.0 - sift_score

            ma_score = None

            uniprot = ma_uniprot[var_id] if var_id in ma_uniprot else None

            sift_impact = pph2_impact = ma_impact = None  # TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = so.match(ct, so.CODING_REGION)

            calculate_transfic = True
            ct_type = None
            if so.match(ct, so.NON_SYNONYMOUS):  # missense
                ct_type = TransFIC.CT_NON_SYNONYMOUS
                ma_score = ma_scores[var_id] if var_id in ma_scores else None
            elif so.match(ct, so.STOP):  # stop
                ct_type = TransFIC.CT_STOP
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.FRAMESHIFT):  # frameshift
                ct_type = TransFIC.CT_FRAMESHIFT
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE):  # splice
                ct_type = "splice"
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS if so.match(ct, so.SPLICE_JUNCTION) else TransFIC.UNKNOWN_IMPACT_CLASS
                calculate_transfic = False
            elif so.match(ct, so.SYNONYMOUS):  # synonymous
                ct_type = TransFIC.CT_SYNONYMOUS
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                sift_score = pph2_score = 0.0
                ma_score = -2
            else:
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                calculate_transfic = False

            if calculate_transfic:
                (sift_tfic, sift_class,
                 pph2_tfic, pph2_class,
                 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

                # if the impact was not preassigned get it from the transFIC calculated class
                sift_impact = sift_class if sift_impact is None and sift_class in IMPACT_CLASSES else sift_impact
                pph2_impact = pph2_class if pph2_impact is None and pph2_class in IMPACT_CLASSES else pph2_impact
                ma_impact = ma_class if ma_impact is None and ma_class in IMPACT_CLASSES else ma_impact
            else:
                sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

            aff_gene = (var_id, gene)

            # update aggregated impact for all the predictors
            update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
            update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
            update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

            # update whether the affected gene is a coding region or not
            update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
                        update=lambda prev_value, value: prev_value or value)

            # aggregate protein changes per affected_gene
            # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
            prot_change = None
            if ct_type == TransFIC.CT_FRAMESHIFT:
                if protein_pos is None:
                    prot_change = "fs"
                else:
                    prot_change = "fs {0}".format(protein_pos)
                #log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif ct_type == "splice":
                prot_change = "r.spl?"
                #log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif protein_pos is not None and aa_change is not None:
                rc = ReContext()
                if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                    prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                    prot_change = "{0} {1}".format(aa_change, protein_pos)
                else:
                    log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                        gene, protein, protein_pos, aa_change, ", ".join(ct)))

            if prot_change is not None:
                update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
                            new=lambda value: set([value]),
                            update=lambda prev_value, value: prev_value | set([value]))

            impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

            tsv.write_line(cf, var_id, transcript, uniprot,
                           sift_score, sift_tfic, sift_class,
                           pph2_score, pph2_tfic, pph2_class,
                           ma_score, ma_tfic, ma_class,
                           impact, null_value="-")

    cf.close()

    log.info("Saving variant impacts ...")

    gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))
    vf = open(gfi_path, "w")
    for aff_gene, attrs in aff_gene_attrs.items():
        var_id, gene = aff_gene

        # get the impact by trust priority: ma, pph2, sift
        impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS

        coding_region = attrs.get("coding_region", False)
        coding_region = 1 if coding_region else 0

        prot_changes = attrs.get("prot_changes")
        prot_changes = ",".join(prot_changes) if prot_changes is not None else None

        tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")
    vf.close()

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    partition["gfi_path"] = gfi_path
    results_port.send(partition)
def fimpact_run(partition):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    ma_uniprot = {}
    ma_scores = {}
    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=paths.data_transfic_path())

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
    cf = open(tfi_path, "w")

    with open(partition["vep_path"], "r") as f:
        for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
            (var_id, gene, transcript, ct,
             protein_pos, aa_change, protein, sift_score, pph2_score) = fields

            ct = (ct or "").split(",")

            # Invert sift score
            if sift_score is not None:
                sift_score = 1.0 - sift_score

            ma_score = None

            uniprot = ma_uniprot.get(var_id)

            sift_impact = pph2_impact = ma_impact = None  # TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

            sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

            ct_type = None
            if so.match(ct, so.NON_SYNONYMOUS):  # missense
                ct_type = TransFIC.CT_NON_SYNONYMOUS
                ma_score = ma_scores.get(var_id)

                (sift_tfic, sift_class,
                 pph2_tfic, pph2_class,
                 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

                sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
                pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
                ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
            elif so.match(ct, so.STOP):  # stop
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.FRAMESHIFT):  # frameshift
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE_JUNCTION):  # splice junction
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE_REGION):  # splice region
                sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SYNONYMOUS):  # synonymous
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                sift_score = pph2_score = 0.0
                ma_score = -2
            else:
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

            aff_gene = (var_id, gene)

            # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
            prot_change = None
            if ct_type == TransFIC.CT_FRAMESHIFT:
                if protein_pos is None:
                    prot_change = "fs"
                else:
                    prot_change = "fs {0}".format(protein_pos)
                #log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif ct_type == "splice":
                prot_change = "r.spl?"
                #log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif protein_pos is not None and aa_change is not None:
                rc = ReContext()
                if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                    prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                    prot_change = "{0} {1}".format(aa_change, protein_pos)
                else:
                    log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                        gene, protein, protein_pos, aa_change, ", ".join(ct)))

            tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

            tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
                           sift_score, sift_tfic, sift_class, sift_impact,
                           pph2_score, pph2_tfic, pph2_class, pph2_impact,
                           ma_score, ma_tfic, ma_class, ma_impact,
                           null_value="-")

    cf.close()

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    results_port.send(partition)
def save_combined_results(self, output_path, analysis_name, output_format,
                          method, row_names, col_names, data, suffix="combination"):

    self.log.info("Saving combination results ...")

    # use the explicit arguments; otherwise the output_path, analysis_name
    # and output_format parameters would be silently ignored
    path = os.path.join(output_path, "{0}-{1}.{2}".format(analysis_name, suffix, output_format))

    self.log.debug("  > {0}".format(path))

    with tsv.open(path, 'w') as f:
        tsv.write_line(f, "## slices={0}".format(",".join(col_names)))
        tsv.write_line(f, "## method={0}".format(method.name))
        tsv.write_line(f, "## date={0}".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        for key, value in self.parameters:
            tsv.write_line(f, "## {0}={1}".format(key, value))

        tsv.write_line(f, "ID", *method.combination_columns)

        for row_index, row_name in enumerate(row_names):
            if not np.isnan(data[row_index, 0]):
                values = [v if not np.isnan(v) else None for v in data[row_index, :]]
                tsv.write_line(f, row_name, *values, null_value="-")
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Baseline Tolerance statistics")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH",
                        help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP",
                        help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH",
                        help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH",
                        help="Partial feature statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="Output feature statistics")

    parser.add_argument("--tsv", dest="tsv_path", metavar="PATH",
                        help="Store baseline tolerance in tsv format too.")

    # declare the numeric types explicitly so values passed on the command
    # line compare correctly against the loaded statistics
    parser.add_argument("-c", "--count-threshold", dest="count_threshold", metavar="N",
                        type=int, default=DEFAULT_COUNT_THRESHOLD,
                        help="Minimum number of features per group")

    parser.add_argument("--stdev-threshold", dest="stdev_threshold", metavar="V",
                        type=float, default=DEFAULT_STDEV_THRESHOLD,
                        help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)")

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    tree = Tree()
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            tree.add_node(group, children)

    logger.info("  Nodes: {}".format(tree.node_count))

    logger.info("Loading mappings between groups and features ...")

    all_groups = set()
    all_features = set()
    with tsv.open(args.group_genes_path) as f:
        for group, features in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            tree.add_node(group, features)
            all_groups.add(group)
            all_features.update(features)

    logger.info("  Nodes: {}".format(tree.node_count))
    logger.info("  Groups: {}".format(len(all_groups)))
    logger.info("  Features: {}".format(len(all_features)))

    logger.info("Loading partial statistics ...")

    with tsv.open(args.stats_path) as f:
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        num_features = 0
        for line in f:
            try:
                fields = line.rstrip("\n").split("\t")
                feature = fields[0]
                node = tree.get_or_create_node(feature)
                for p, ss in zip(predictors, fields[1:]):
                    try:
                        s0, s1, s2 = [float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))]
                        node.set_pblt(p, PartialBLT(s0, s1, s2, sources=set([feature])))
                    except:
                        import traceback
                        traceback.print_exc()
                        logger.warn("Failed to parse partial baseline tolerance"
                                    " for {}/{} from {}".format(feature, p, ss))
                        exit(-1)
                num_features += 1
            except:
                logger.warn("Failed to parse partial baseline tolerance"
                            " for {} from {}".format(feature, line))
                continue

    logger.info("  Nodes: {}".format(tree.node_count))
    logger.info("  Features: {}".format(num_features))
    logger.info("  Predictors: {}".format(", ".join(predictors)))

    logger.info("Calculating baseline tolerance ...")

    for predictor in predictors:
        logger.info("For {} ...".format(predictor))

        calculate_blt(
            parent=None,
            node=tree.get_or_create_node(args.root_group),
            predictor=predictor,
            count_threshold=args.count_threshold,
            stdev_threshold=args.stdev_threshold,
            logger=logger)

    # TODO log summary info

    logger.info("Writing results into {} ...".format(os.path.basename(args.out_path)))

    if args.tsv_path is not None:
        with tsv.open(args.tsv_path, "w") as of:
            tsv.write_line(of, "FEATURE", *predictors)
            for feature in all_features:
                sb = [feature]
                node = tree.get_node(feature)
                predictors_with_blt = 0
                for predictor in predictors:
                    blt = node.get_blt(predictor)
                    if blt is None or blt.n < args.count_threshold:
                        sb += ["/".join(["-"] * 5)]
                        continue

                    predictors_with_blt += 1
                    sb += ["/".join(map(str, [blt.from_node, blt.scope, blt.n, blt.mean, blt.stdev]))]

                if predictors_with_blt > 0:
                    tsv.write_line(of, *sb)

    with tsv.open(args.out_path, "w") as of:
        tree_blt = {}
        for node_name, node in tree.nodes.items():
            predictors_blt = {}
            for predictor in predictors:
                pred_blt = node.get_blt(predictor)
                if pred_blt is None or pred_blt.n < args.count_threshold:
                    continue
                predictors_blt[predictor] = dict(
                    from_node=pred_blt.from_node,
                    scope=pred_blt.scope,
                    N=pred_blt.n,
                    mean=pred_blt.mean,
                    stdev=pred_blt.stdev)
            if len(predictors_blt) > 0:
                tree_blt[node.name] = predictors_blt

        doc = dict(
            created=str(datetime.now()),
            predictors=predictors,
            count_threshold=args.count_threshold,
            stdev_threshold=args.stdev_threshold,
            tree=None,  # tree relations
            features=list(all_features),
            pblt=None,  # TODO
            blt=tree_blt)

        json.dump(doc, of, indent=True)

    return 0
def combination_recurrences(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug("  > {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    create_db(conn)

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()

        sample_total += project_sample_total

        log.info("    Total samples = {0}".format(project_sample_total))

        log.info("    Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
                continue

            start, end, ref, alt = var_to_tab(var)

            try:
                c.execute("INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                          (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                c.execute("SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                          (var.chr, var.strand, start, ref, alt))
                r = c.fetchone()
                var_id = r[0]

            try:
                c.execute("INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                          (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
            except sqlite3.IntegrityError:
                c.execute("""
                    UPDATE variant_genes SET sample_freq=sample_freq + ?
                    WHERE var_id=? AND gene_id=?""",
                          (rec.sample_freq, var_id, afg.gene_id))

            count += 1

        log.info("      {0} variant genes".format(count))

        log.info("    Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)",
                          (gene.id, rec.sample_freq))
            else:
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?",
                          (rec.sample_freq, gene.id))

            count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)",
                          (pathway.id, rec.sample_freq))
            else:
                c.execute("UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?",
                          (rec.sample_freq, pathway.id))

            count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))

    c.close()
    conn.commit()

    log.info("Saving results ...")

    c = conn.cursor()

    base_path = paths.combination_path("recurrences")

    log.info("  Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
                       "SAMPLE_FREQ", "SAMPLE_PROP", "PROT_CHANGES", "XREFS")
        for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)
            tsv.write_line(f, r["chr"], strand, r["start"], allele,
                           r["gene_id"], r["impact"], TransFIC.class_name(r["impact"]),
                           r["sample_freq"], r["sample_prop"], r["prot_changes"], r["xrefs"],
                           null_value="-")

    log.info("  Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info("  Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    conn.close()

    remove_temp(task, db_path)
def main(): parser = argparse.ArgumentParser( description="Export dbNSFP scores") cmd = DefaultCommandHelper(parser) parser.add_argument("source_path", metavar="SOURCE", help="The original zip file") parser.add_argument("ensp_map_path", metavar="MAP", help="The mapping between Ensembl protein id's and Ensembl transcript id's and Uniprot id's") parser.add_argument("uniprot_map_path", metavar="MAP", help="The mapping between Ensembl protein id's and Uniprot id's") parser.add_argument("-o", "--output", dest="out_path", metavar="OUT_PATH", help="The output file") parser.add_argument("--temp", dest="temp_path", metavar="TEMP_PATH", help="A temporary path for zip extraction") parser.add_argument("--chr", dest="chr", metavar="CHROMOSOMES", help="Chromosomes to include: list separated by commas.") parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False, help="Skip SNV's where all the scores are empty") args, logger = cmd.parse_args("dbnsfp-export") if args.out_path is None: basename = os.path.basename(args.source_path) prefix = os.path.splitext(basename)[0] args.out_path = "{}.tsv.gz".format(prefix) logger.info("Loading maps ...") uniprot_map = {} trs_map = {} with tsv.open(args.ensp_map_path) as f: for ensp, enst in tsv.lines(f, (str, str)): if len(enst) > 0: trs_map[enst] = ensp with tsv.open(args.uniprot_map_path) as f: for ensp, uniprot_id in tsv.lines(f, (str, str)): if len(uniprot_id) > 0: uniprot_map[uniprot_id] = ensp logger.info("Opening {} ...".format(args.source_path)) chromosomes = None if args.chr is not None: chromosomes = [c.strip().upper() for c in args.chr.split(",") if len(c.strip()) > 0] logger.info("Selected chromosomes: {}".format(", ".join(chromosomes))) chromosomes = set(chromosomes) name_pattern = re.compile(r"dbNSFP.+_variant.chr(.+)") COLUMNS = [ "#chr", "pos(1-coor)", "ref", "alt", "cds_strand", "genename", "Uniprot_id", "Uniprot_aapos", "aaref", "aaalt", "Ensembl_geneid", "Ensembl_transcriptid", "aapos", "SIFT_score", "Polyphen2_HVAR_score", "MutationAssessor_score", "FATHMM_score", "MutationTaster_score", # "GERP_RS", "GERP++_RS", # "PhyloP_score" "phyloP" ] tmp_prefix = args.temp_path or tempfile.gettempdir() if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) if tmp_prefix[-1] != "/": tmp_prefix += "/" extract_path = tempfile.mkdtemp(prefix=tmp_prefix) try: logger.info("Output: {}".format(args.out_path if args.out_path != "-" else "standard output")) total_start_time = time.time() total_lines = 0 with ZipFile(args.source_path, "r") as zf,\ tsv.open(args.out_path, "w") as of: #,\ #tsv.open(args.noprot_path, "w") as npf: tsv.write_line(of, "CHR", "STRAND", "START", "REF", "ALT", "TRANSCRIPT", "PROTEIN", "AA_POS", "AA_REF", "AA_ALT", "SIFT", "PPH2", "MA", "FATHMM", "MT", "GERPRS", "PHYLOP") #tsv.write_line(npf, "#CHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") entries = [] for entry in zf.infolist(): m = name_pattern.match(entry.filename) if not m: continue chr = m.group(1) index = CHR_INDEX[chr] if chr in CHR_INDEX else 99 if chromosomes is not None and chr not in chromosomes: logger.debug("Skipping chromosome {} ...".format(chr)) continue entries += [(index, chr, entry)] for index, chr, entry in sorted(entries, key=lambda x: x[0]): logger.info("Reading chromosome {} ...".format(chr)) zf.extract(entry, extract_path) fpath = os.path.join(extract_path, entry.filename) with open(fpath) as f: # Parse header hdr_line = f.readline() hdr = {} for index, name in enumerate(hdr_line.rstrip("\n").split("\t")): 
                        hdr[name] = col_index

                    columns = [hdr[name] if name in hdr else None for name in COLUMNS]

                    read = set()

                    start_time = time.time()
                    partial_start_time = start_time
                    for line_num, line in enumerate(f, start=2):
                        fields = line.rstrip("\n").split("\t")

                        try:
                            fields = [fields[i] if i is not None and i < len(fields) else None for i in columns]

                            (chr, start, ref, alt, strand, symbol, uniprot, uniprot_aapos,
                             aa_ref, aa_alt, gene, transcript, aapos,
                             sift, pph2, ma, fathmm, mt, gerprs, phylop) = fields

                            start = safe_int(start)
                            ref = ref.upper() if ref is not None else None
                            alt = alt.upper() if alt is not None else None
                            aa_ref = aa_ref.upper() if aa_ref is not None else None
                            aa_alt = aa_alt.upper() if aa_alt is not None else None
                            sift = safe_float(sift)
                            ma = safe_float(ma)
                            fathmm = safe_float(fathmm)
                            mt = safe_float(mt)
                            gerprs = safe_float(gerprs)
                            phylop = safe_float(phylop)

                            if start is None or ref is None or alt is None:
                                logger.warn("None value for pos, ref or alt at line {}: {}".format(line_num, fields))
                                continue
                            elif ref not in BASE_INDEX or alt not in BASE_INDEX:
                                logger.warn("Unknown ref or alt at line {}: {}".format(line_num, fields))
                                continue
                            elif len(ref) != 1 or len(alt) != 1:
                                logger.warn("Length != 1 for ref or alt at line {}: {}".format(line_num, fields))
                                continue
                            # elif aa_ref not in AA_INDEX or aa_alt not in AA_INDEX:
                            #     logger.warn("Unknown aa_ref or aa_alt at line {}: {}".format(line_num, fields))
                            #     continue
                            elif transcript is None or aapos is None or uniprot is None or uniprot_aapos is None:
                                logger.warn("None value for transcript, aapos, uniprot or uniprot_aapos at line {}: {}".format(line_num, fields))
                                continue

                            if aa_ref not in AA_INDEX:
                                aa_ref = None
                            if aa_alt not in AA_INDEX:
                                aa_alt = None

                            # dbNSFP packs one value per transcript separated by ";";
                            # pad the shorter lists by repeating their last value
                            trs_values = transcript.split(";")

                            aapos_values = [safe_int(v) for v in aapos.split(";")]
                            l = len(trs_values) - len(aapos_values)
                            if l > 0:
                                aapos_values += [aapos_values[-1]] * l

                            uniprot_values = uniprot.split(";")
                            uniprot_aapos_values = [safe_int(v) for v in uniprot_aapos.split(";")]
                            l = len(uniprot_values) - len(uniprot_aapos_values)
                            if l > 0:
                                uniprot_aapos_values += [uniprot_aapos_values[-1]] * l

                            pph2_values = [safe_float(v) for v in pph2.split(";")] if pph2 is not None else [None]
                            l = len(uniprot_values) - len(pph2_values)
                            if l > 0:
                                pph2_values += [pph2_values[-1]] * l

                            uniprot_index = {}
                            for i, id in enumerate(uniprot_values):
                                if uniprot_aapos_values[i] is not None:
                                    uniprot_index[uniprot_aapos_values[i]] = i

                            for i, trs in enumerate(trs_values):
                                pos = aapos_values[i]
                                if pos is not None and pos < 0:
                                    pos = None

                                if pos is not None and pos in uniprot_index:
                                    j = uniprot_index[pos]
                                    uniprot_value = uniprot_values[j]
                                    pph2_value = pph2_values[j]
                                else:
                                    uniprot_value = pph2_value = None

                                if trs in trs_map:
                                    prot_id = trs_map[trs]
                                elif uniprot_value in uniprot_map:
                                    prot_id = uniprot_map[uniprot_value]
                                else:
                                    logger.warn("Could not map either protein {} or transcript {} at line {}: {}".format(
                                        uniprot_value, trs, line_num, "|".join([str(v) for v in fields])))
                                    continue

                                # if pos < 0:
                                #     logger.warn("Negative protein position at line {}: {}".format(line_num, pos))
                                #     continue
                                # elif ...
                                if pph2_value is not None and (pph2_value < 0.0 or pph2_value > 1.0):
                                    logger.warn("PPH2 score {} out of range at line {}: {}".format(pph2_value, line_num, fields))
                                    continue

                                if aa_alt == "X":  # fix stop codons having a SIFT score
                                    sift = None

                                if args.skip_empty_scores and sift is None and pph2_value is None and ma is None \
                                        and mt is None and gerprs is None and phylop is None:
                                    continue

                                # log.info((chr, strand, start, ref, alt, aapos_values[i], aa_ref, aa_alt, trs, sift, pph2_value, ma))

                                if pos is None or aa_ref is None or aa_alt is None:
                                    pass
                                    # tsv.write_line(npf, chr, start, ".", ref, alt, ".", "PASS",
                                    #                "dbNSFP={}|{}|{}|{}|{}|{}".format(trs, prot_id,
                                    #                sift or "", pph2_value or "", ma or "", fathmm or ""))
                                else:
                                    tsv.write_line(of, chr, strand, start, ref, alt, trs, prot_id, pos,
                                                   aa_ref, aa_alt, sift, pph2_value, ma, fathmm, mt, gerprs, phylop)

                        except KeyboardInterrupt:
                            raise
                        except:
                            logger.warn("Malformed line {}: {}".format(line_num, "|".join([str(v) for v in fields])))
                            raise  # continue

                        # Log throughput at most once every 5 seconds
                        partial_time = time.time() - partial_start_time
                        if partial_time >= 5.0:
                            partial_start_time = time.time()
                            elapsed_time = time.time() - start_time
                            logger.debug("  {} lines, {:.1f} lines/second".format(
                                hsize(line_num - 1), (line_num - 1) / float(elapsed_time)))

                total_lines += line_num

                logger.info("  > {} lines, {:.1f} lines/second".format(
                    hsize(line_num), line_num / float(time.time() - start_time)))
                logger.info("  >> {} lines, {:.1f} lines/second".format(
                    hsize(total_lines), total_lines / float(time.time() - total_start_time)))

                os.remove(fpath)

        total_elapsed_time = timedelta(seconds=time.time() - total_start_time)
        logger.info("Finished successfully. Elapsed time: {}".format(total_elapsed_time))

    except:
        return cmd.handle_error()
    finally:
        shutil.rmtree(extract_path)

    return 0
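# A minimal sketch of the safe_int/safe_float helpers the parser above relies on;
# they are defined elsewhere in the module, so this is their assumed behavior:
# dbNSFP marks missing values with "." or an empty field, and anything
# non-numeric should map to None instead of raising.
def safe_int(value):
    try:
        return int(value)
    except (TypeError, ValueError):
        return None

def safe_float(value):
    try:
        return float(value)
    except (TypeError, ValueError):
        return None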
def main(): parser = argparse.ArgumentParser( description="Fetch Condel scores") cmd = DefaultCommandHelper(parser) cmd.add_db_args() parser.add_argument("muts_path", metavar="SNVS_PATH", help="SNV's to check. Use - for standard input.") parser.add_argument("out_path", metavar="OUTPUT_PATH", help="The results path. Use - for standard output.") cmd.add_selected_predictors_args() cmd.add_selected_annotations_args() cmd.add_selected_columns_args() args, logger = cmd.parse_args("fetch") db = cmd.open_db() predictors = cmd.get_selected_predictors() annotations = cmd.get_selected_annotations() columns = cmd.get_selected_columns() logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input")) try: progress = RatedProgress(logger, name="SNVs") with tsv.open(args.muts_path) as f: with tsv.open(args.out_path, "w") as wf: tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors) hit = fail = 0 mut = DnaAndProtMutationParser() for line_num, line in enumerate(f, start=1): line = line.rstrip(" \n\r") if len(line) == 0 or line.startswith("#"): continue try: mut.parse(line) except PrematureEnd: logger.error("Missing fields at line {}".format(line_num)) fail += 1 continue except UnexpectedToken as ex: logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num)) fail += 1 continue exists = False for row in query_mutation(logger, db, mut, annotations, predictors): exists = True ann = row["annotations"] scores = row["scores"] tsv.write_line(wf, mut.identifier, *[row[c] for c in columns] + [ann[a] for a in annotations] + [scores[p] for p in predictors]) """ if logger.isEnabledFor(logging.DEBUG): logger.debug(" --> {} {} {} {} {} {} {} {} {} {}".format( row["chr"], row["start"], row["ref"], row["alt"], row["transcript"], row["protein"], row["aa_pos"], row["aa_ref"], row["aa_alt"], mut.identifier or "*")) """ progress.update() if exists: hit += 1 else: fail += 1 progress.log_totals() logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hit + fail, hit, fail, progress.elapsed_time)) except: return cmd.handle_error() finally: db.close()
def main(): parser = argparse.ArgumentParser( description="Export Scores") cmd = DefaultCommandHelper(parser) cmd.add_db_args() parser.add_argument("dest_path", metavar="OUTPUT_PATH", help="The output file. Use - for standard output.") cmd.add_selected_predictors_args() cmd.add_selected_annotations_args() cmd.add_selected_columns_args() parser.add_argument("--json", dest="to_json", action="store_true", default=False, help="Export the results in json format") parser.add_argument("--sample", dest="sample", type=int, metavar="PCT", help="Export a random sample of PCT %%") parser.add_argument("--start", dest="start", type=int, metavar="N", help="Start to export from the SNV number N") parser.add_argument("--limit", dest="limit", type=int, metavar="N", help="Limit the number of SNVs to export to N") args, logger = cmd.parse_args("export") db = cmd.open_db() predictors = cmd.get_selected_predictors() annotations = cmd.get_selected_annotations() columns = cmd.get_selected_columns() logger.info("Exporting ...") random.seed(time.time()) total_count = 0 total_start_time = time.time() try: progress = RatedProgress(logger, name="SNVs") to_json = args.to_json sample = args.sample start = args.start or 0 limit = args.limit doc = None last_pos = None rows_count = 0 snvs_count = 0 with tsv.open(args.dest_path, "w") as f: if not to_json: tsv.write_line(f, *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors) for row in db.query_scores(predictors=predictors, maps=annotations): if not to_json: if start > 0: start -= 1 continue if sample is not None and random.randint(1, 100) > sample: continue pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"]) if last_pos != pos: if to_json: if start > 0: start -= 1 continue if limit is not None and snvs_count >= limit: if doc is not None: json.dump(doc, f) f.write("\n") break snvs_count += 1 rows_count += 1 ann = row["annotations"] scores = row["scores"] if to_json: tdoc = dict([(k,row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]] + [(k,scores[k]) for k in predictors]) if pos != last_pos: if doc is not None: if sample is None or random.randint(1, 100) <= sample: json.dump(doc, f) f.write("\n") else: snvs_count -= 1 doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]] + [("transcripts", [tdoc])]) else: doc["transcripts"] += [tdoc] else: tsv.write_line(f, *[row[c] for c in columns] + [ann[a] for a in annotations] + [scores[p] for p in predictors]) progress.update() last_pos = pos if not to_json and limit is not None and rows_count >= limit: break progress.log_totals() logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(rows_count, snvs_count, progress.elapsed_time)) except: return cmd.handle_error() finally: db.close() return 0
def main(): parser = argparse.ArgumentParser(description="Calculate Baseline Tolerance statistics per gene") cmd = DefaultCommandHelper(parser) parser.add_argument("tree_path", metavar="TREE_PATH", help="The groups descendant tree") parser.add_argument("root_group", metavar="ROOT_GROUP", help="Tree root group") parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH", help="Map between groups and features") parser.add_argument("stats_path", metavar="STATS_PATH", help="Partial gene statistics") parser.add_argument("out_path", metavar="OUTPUT_PATH", help="Output gene statistics") parser.add_argument( "-c", "--count-threshold", dest="count_threshold", metavar="N", default=DEFAULT_COUNT_THRESHOLD, help="Minimum number of features per group", ) parser.add_argument( "--stdev-threshold", dest="stdev_threshold", metavar="V", default=DEFAULT_STDEV_THRESHOLD, help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)", ) args, logger = cmd.parse_args("blt-groups") logger.info("Loading groups tree ...") group_children = defaultdict(set) with tsv.open(args.tree_path) as f: for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))): group_children[group] |= children logger.info("Loading mappings between groups and features ...") group_genes = defaultdict(set) with tsv.open(args.group_genes_path) as f: for group, genes in tsv.lines(f, (str, lambda v: set(v.split(",")))): group_genes[group] |= genes logger.info("Loading partial statistics ...") partial_stats = {} with tsv.open(args.stats_path) as f: predictors = f.readline().rstrip("\n").split("\t")[1:] num_predictors = len(predictors) for line in f: fields = line.rstrip("\n").split("\t") gene = fields[0] gene_stats = [[float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))] for ss in fields[1:]] partial_stats[gene] = gene_stats logger.info(" Predictors: {}".format(", ".join(predictors))) logger.info(" Features: {}".format(len(partial_stats.keys()))) logger.info("Calculating features ...") stats = {} feat_count = 0 feat_partial_count = [0] * num_predictors for feature, feat_partial_stats in partial_stats.items(): feat_with_stats = False feat_stats = [None] * (num_predictors + 1) for i in range(num_predictors): s0, s1, s2 = feat_partial_stats[i] if s0 == 0.0: continue if s0 < args.count_threshold: continue x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1)) if x < -1e-12: continue mean = s1 / s0 std = math.sqrt(abs(x)) if std < args.stdev_threshold: continue feat_stats[i] = (int(s0), mean, std) feat_partial_count[i] += 1 feat_with_stats = True if feat_with_stats: feat_count += 1 stats[feature] = feat_stats # print feature, "\t".join(["/".join([str(v) for v in feat_stats[i] or []]) for i in range(num_predictors)]) logger.info( " {} ({}) features out of {} calculated directly from partial statistics".format( feat_count, "/".join(map(str, feat_partial_count)), len(partial_stats) ) ) logger.info("Calculating groups ...") calculate_group( logger, args.root_group, args.count_threshold, group_children, group_genes, partial_stats, num_predictors, stats ) logger.info(" {} features calculated in total".format(len(stats))) with tsv.open(args.out_path, "w") as of: tsv.write_line(of, "GENE", "GROUP", *predictors) for gene in sorted(stats.keys()): gene_stats = stats[gene] sb = [gene] stats_group = gene_stats[num_predictors] if stats_group is not None: sb += [stats_group] else: sb += ["|" + ("-" * num_predictors)] for i in range(num_predictors): if gene_stats[i] is not None: sb += 
["/".join([str(v) for v in gene_stats[i]])] else: sb += ["-/-/-"] tsv.write_line(of, *sb) return 0
def combination_oncodrivefm(projects_set):
    log = task.logger
    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Exporting project data ...")

    base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))

    log.debug("> {0}".format(base_path))

    project_ids = []
    gene_files = []
    pathway_files = []
    for project in projects:
        project_id = project["id"]
        project_ids += [project_id]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        log.info("    Genes ...")

        count = 0
        file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
        gene_files += [file_path]
        with open(file_path, "w") as f:
            tsv.write_param(f, "classifier", classifier_id)
            tsv.write_param(f, "group_id", group_name)
            tsv.write_param(f, "slice", project_id)
            tsv.write_line(f, "GENE_ID", "PVALUE")
            for gene in projdb.genes():
                if gene.fm_pvalue is not None:
                    tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
                    count += 1

        log.info("    {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
        pathway_files += [file_path]
        with open(file_path, "w") as f:
            tsv.write_param(f, "classifier", classifier_id)
            tsv.write_param(f, "group_id", group_name)
            tsv.write_param(f, "slice", project_id)
            tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
            for pathway in projdb.pathways():
                if pathway.fm_zscore is not None:
                    tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
                    count += 1

        log.info("    {0} pathways".format(count))

        projdb.close()

    log.info("Combining ...")

    combination_path = paths.combination_path("oncodrivefm")

    log.info("  Genes ...")

    cmd = " ".join([
        "oncodrivefm-combine",
        "-m median-empirical",
        "-o '{0}'".format(combination_path),
        "-n 'gene-{0}'".format(group_file_prefix),
        "-D 'classifier={0}'".format(classifier_id),
        "-D 'group_id={0}'".format(group_name),
        "-D 'group_short_name={0}'".format(group_short_name),
        "-D 'group_long_name={0}'".format(group_long_name),
        "--output-format tsv.gz"
    ] + ["'{0}'".format(name) for name in gene_files])

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)
    if ret_code != 0:
        # log.error("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))
        # return -1
        raise Exception("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))

    log.info("  Pathways ...")

    cmd = " ".join([
        "oncodrivefm-combine",
        "-m median-zscore",
        "-o '{0}'".format(combination_path),
        "-n 'pathway-{0}'".format(group_file_prefix),
        "-D 'classifier={0}'".format(classifier_id),
        "-D 'group_id={0}'".format(group_name),
        "-D 'group_short_name={0}'".format(group_short_name),
        "-D 'group_long_name={0}'".format(group_long_name),
        "--output-format tsv.gz"
    ] + ["'{0}'".format(name) for name in pathway_files])

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)
    if ret_code != 0:
        # log.error("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))
        # return -1
        raise Exception("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))

    remove_temp(task, base_path)
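# A sketch, under the assumption that oncodrivefm-combine accepts the same flags
# either way: the command lines above are assembled as single strings with manual
# quoting and shell=True, so a quote inside a group name would break them.
# Passing an argument list avoids shell quoting entirely.
import subprocess

def run_oncodrivefm_combine(combination_path, name, params, input_files, method):
    cmd = ["oncodrivefm-combine", "-m", method, "-o", combination_path, "-n", name]
    for key, value in params:
        cmd += ["-D", "{0}={1}".format(key, value)]
    cmd += ["--output-format", "tsv.gz"] + list(input_files)
    # Each list element is one argv entry, so no quoting is needed
    if subprocess.call(cmd) != 0:
        raise Exception("OncodriveFM error:\n{0}".format(" ".join(cmd)))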
def pack_datasets(project):
    log = task.logger
    config = GlobalConfig(task.conf)

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    if not config.results.create_zip:
        log.info("Creation of the results compressed file is deactivated. Skipped.")
        return

    project_path = project["path"]
    temp_path = project["temp_path"]

    dest_path = os.path.join(project_path, "results.zip")

    sigdb = SigDb(config.sigdb_path)
    sigdb.open()

    projdb = ProjectDb(project["db"])

    projres = ProjectResults(project)

    gene_sym = projdb.get_gene_symbols()

    total_samples = projdb.get_total_affected_samples()

    log.info("Compressing files ...")

    arc = None
    try:
        arc = Archive(dest_path, mode="w", fmt="zip")

        log.info("  Variant genes ...")

        with ArcFile(task, arc, project_id, "variant_genes", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "SYMBOL",
                       "VAR_IMPACT", "VAR_IMPACT_DESC", "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")
            for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
                var = afg.var
                rec = afg.rec

                start, end, ref, alt = var_to_tab(var)

                xrefs = [xref for xref in var.xrefs]
                if sigdb.exists_variant(var.chr, start):
                    xrefs += ["I:1"]
                xrefs = ",".join(xrefs)

                intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           afg.gene_id, gene_sym.get(afg.gene_id),
                           afg.impact, TransFIC.class_name(afg.impact),
                           rec.sample_freq or 0, total_samples, rec.sample_prop or 0,
                           afg.coding_region, afg.prot_changes, intogen_driver, xrefs)

        log.info("  Variant samples ...")

        with ArcFile(task, arc, project_id, "variant_samples", "w") as vf:
            write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLES")
            for var in projdb.variants(join_samples=True):
                start, end, ref, alt = var_to_tab(var)
                write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
                           ",".join([s.name for s in var.samples]))

        log.info("  Consequences ...")

        with ArcFile(task, arc, project_id, "consequences", "w") as cf:
            write_line(cf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",
                       "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                       "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                       "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                       "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                       "IMPACT", "IMPACT_CLASS")
            for csq in projdb.consequences(join_variant=True):
                var = csq.var
                start, end, ref, alt = var_to_tab(var)
                allele = "{0}/{1}".format(ref, alt)

                uniprot = protein = protein_pos = aa_change = None
                sift_score = sift_tfic = sift_tfic_class = None
                pph2_score = pph2_tfic = pph2_tfic_class = None
                ma_score = ma_tfic = ma_tfic_class = None

                if so.match(csq.ctypes, so.ONCODRIVEFM):
                    uniprot, protein = csq.uniprot, csq.protein

                if so.match(csq.ctypes, so.NON_SYNONYMOUS):
                    protein_pos, aa_change = csq.protein_pos, csq.aa_change
                    sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
                    pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
                    ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

                write_line(cf, project_id, var.chr, var.strand, start, allele, csq.transcript,
                           ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                           uniprot, protein, protein_pos, aa_change,
                           sift_score, sift_tfic, sift_tfic_class,
                           pph2_score, pph2_tfic, pph2_tfic_class,
                           ma_score, ma_tfic, ma_tfic_class,
                           csq.impact, TransFIC.class_name(csq.impact))

        log.info("  Genes ...")

        with ArcFile(task, arc, project_id, "genes", "w") as gf:
            write_line(gf, "PROJECT_ID", "GENE_ID", "SYMBOL", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                       "INTOGEN_DRIVER", "XREFS")
            for gene in projdb.genes(join_xrefs=True, join_rec=True):
                if gene.rec.sample_freq is not None and gene.rec.sample_freq > 0:
                    intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
                    write_line(gf, project_id, gene.id, gene.symbol,
                               gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                               gene.rec.sample_freq, total_samples, gene.rec.sample_prop or 0,
                               gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                               gene.clust_exc_cause, gene.clust_coords,
                               intogen_driver, ",".join(gene.xrefs))

        log.info("  Pathways ...")

        with ArcFile(task, arc, project_id, "pathways", "w") as pf:
            write_line(pf, "PROJECT_ID", "PATHWAY_ID", "GENE_COUNT",
                       "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                       "SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
                       "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")
            for pathway in projdb.pathways(join_rec=True):
                if pathway.rec.sample_freq is not None and pathway.rec.sample_freq > 0:
                    write_line(pf, project_id, pathway.id, pathway.gene_count,
                               pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
                               pathway.rec.sample_freq or 0, total_samples, pathway.rec.sample_prop or 0,
                               pathway.rec.gene_freq or 0, pathway.gene_count, pathway.rec.gene_prop or 0)

        if not config.skip_oncodrivefm:
            log.info("  Genes per sample functional impact ...")

            with ArcFile(task, arc, project_id, "fimpact.gitools.tdm", "w") as f:
                write_line(f, "SAMPLE", "GENE_ID",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

                for fields in projdb.sample_gene_fimpacts():
                    (gene, sample,
                     sift_score, sift_tfic, sift_tfic_class,
                     pph2_score, pph2_tfic, pph2_tfic_class,
                     ma_score, ma_tfic, ma_tfic_class) = fields
                    write_line(f, sample, gene,
                               sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                               pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                               ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class))

        log.info("Saving project configuration ...")

        with ArcFile(task, arc, project_id, "project", "w") as f:
            names = ["PROJECT_ID", "ASSEMBLY", "SAMPLES_TOTAL"]
            values = [project_id, project["assembly"], total_samples]
            names, values = projres.get_annotations_to_save(
                config.project.annotations, project["annotations"], names=names, values=values)
            tsv.write_line(f, *names)
            tsv.write_line(f, *values, null_value="-")

    finally:
        if arc is not None:
            arc.close()
        projdb.close()
        sigdb.close()
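# Archive and ArcFile are project wrappers whose implementation is not shown
# here. For reference, a stand-alone sketch of the same pattern (one TSV member
# per dataset inside results.zip) using only the standard library; it is not the
# wrappers' actual code.
import io
import zipfile

def write_zip_member(zf, member_name, rows, null_value="-"):
    # Render each row as a tab-separated line, mapping None to the null marker
    buf = io.StringIO()
    for row in rows:
        buf.write("\t".join(null_value if v is None else str(v) for v in row) + "\n")
    zf.writestr(member_name, buf.getvalue())

def _pack_example():
    with zipfile.ZipFile("results.zip", "w", zipfile.ZIP_DEFLATED) as zf:
        write_zip_member(zf, "project1.genes.tsv",
                         [("GENE_ID", "SAMPLE_FREQ"), ("ENSG00000157764", 12)])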
def main(): parser = argparse.ArgumentParser( description="Prepare SNV's dataset from individual training sets") parser.add_argument("pos_path", metavar="POS_SET", help="The positive training set file") parser.add_argument("neg_path", metavar="NEG_SET", help="The negative training set file") parser.add_argument("-m", "--map", dest="map_path", metavar="MAP", help="Optional mapping file for feature id's. Format: DST SRC") parser.add_argument("-o", dest="out_path", metavar="PATH", help="Output file. Use - for standard output.") bglogging.add_logging_arguments(parser) args = parser.parse_args() bglogging.initialize(args) logger = bglogging.get_logger("training-sets") if args.out_path is None: prefix = os.path.commonprefix([ os.path.splitext(os.path.basename(args.pos_path))[0], os.path.splitext(os.path.basename(args.neg_path))[0]]) prefix = prefix.rstrip(".") args.out_path = os.path.join(os.getcwd(), "{}-training.tsv".format(prefix)) if args.map_path is not None: logger.info("Loading map ...") prot_map = {} with tsv.open(args.map_path) as f: for dst_feature, src_feature in tsv.lines(f, (str, str)): if len(src_feature) > 0: if src_feature not in prot_map: prot_map[src_feature] = set([dst_feature]) else: prot_map[src_feature].add(dst_feature) else: prot_map = None logger.info("Processing ...") hits = dict(POS=0, NEG=0) fails = dict(POS=0, NEG=0) start_time = datetime.now() with tsv.open(args.out_path, "w") as wf: for event_type, path in (("POS", args.pos_path), ("NEG", args.neg_path)): logger.info(" [{}] Reading {} ...".format(event_type, path)) with tsv.open(path) as f: types = (str, int, str, str) for protein, pos, aa1, aa2 in tsv.lines(f, types): protein = protein.strip() if prot_map is not None: if protein not in prot_map: logger.debug("[{}] Unmapped protein: {}".format(event_type, protein)) fails[event_type] += 1 continue proteins = prot_map[protein] else: proteins = [protein] hits[event_type] += 1 for p in proteins: tsv.write_line(wf, p, pos, aa1.strip(), aa2.strip(), event_type) logger.info(" POS NEG") logger.info("SNVs {POS:>8} {NEG:>8}".format(**hits)) if args.map_path is not None: logger.info("unmapped {POS:>8} {NEG:>8}".format(**fails)) logger.info("Finished. Elapsed time: {}".format(datetime.now() - start_time))