def get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
    """Yield the score rows found in *fanns_db* for a substitution.

    The mutation is interpreted in one of two ways, tried in this order:

    1. ``mut_cds`` matches ``MUT_CDS_RE``: groups 1 and 2 are the reference
       and alternate alleles. They are passed through
       ``complementary_sequence`` when ``mut_strand`` is ``"-"``. The
       chromosome and start are taken from groups 1 and 2 of ``MUT_POS_RE``
       applied to ``mut_pos``. One query is issued per paired base, at
       ``start + offset``, with ``acc`` as the transcript.
    2. ``mut_aa`` matches ``MUT_AA_RE``: groups 1, 2 and 3 are the reference
       residues, the position and the alternate residues. One query is
       issued per paired residue, at ``aa_pos + offset``, with ``acc`` as the
       protein.

    Nothing is yielded when neither expression matches, when ``mut_pos``
    cannot be parsed, or when the chromosome is ``"25"``.

    A warning is logged when the reference and alternate have different
    lengths. Only the positions present in *both* are queried, so a shorter
    alternate no longer raises ``IndexError`` part-way through the generator.

    :param fanns_db: object exposing ``query_scores(**filters)``.
    :param mut_cds: CDS level description of the mutation.
    :param mut_aa: protein level description of the mutation.
    :param mut_pos: genomic position description of the mutation.
    :param mut_strand: strand of the mutation; ``"-"`` triggers the
        complement of both alleles.
    :param acc: accession used as transcript (CDS branch) or protein
        (amino acid branch) in the queries.
    :param logger: logger used for the warnings (``logger.warn``).
    :return: generator over the rows returned by ``fanns_db.query_scores``.
    """
    # Only used to give context in the warning messages.
    fields = [mut_cds, mut_aa, mut_pos, mut_strand, acc]

    cds_ctx = ReContext(mut_cds)
    aa_ctx = ReContext(mut_aa)

    if cds_ctx.match(MUT_CDS_RE):
        ref, alt = cds_ctx.group(1), cds_ctx.group(2)
        if len(ref) != len(alt):
            logger.warn("Found substitution with different alleles: {}".format(fields))

        if mut_strand == "-":
            ref = complementary_sequence(ref)
            alt = complementary_sequence(alt)

        pos_ctx = ReContext(mut_pos)
        if not pos_ctx.match(MUT_POS_RE):
            logger.warn("Unexpected mutation position: {}".format(fields))
            return

        chrom, start = pos_ctx.group(1), pos_ctx.group(2)
        # NOTE(review): chromosome "25" is deliberately skipped; presumably it
        # is a contig the scores database does not cover - confirm.
        if chrom == "25":
            return

        start = int(start)

        # zip() stops at the shorter allele, which keeps the behaviour for
        # well formed input and avoids indexing past the end of alt.
        for offset, (ref_base, alt_base) in enumerate(zip(ref, alt)):
            for row in fanns_db.query_scores(chr=chrom, start=start + offset,
                                             ref=ref_base, alt=alt_base,
                                             strand=mut_strand, transcript=acc,
                                             maps=["symbol"]):
                yield row

    elif aa_ctx.match(MUT_AA_RE):
        aa_ref, aa_pos, aa_alt = aa_ctx.group(1), aa_ctx.group(2), aa_ctx.group(3)
        if len(aa_ref) != len(aa_alt):
            logger.warn("Found substitution with different alleles: {}".format(fields))

        aa_pos = int(aa_pos)

        # Same pairing rule as in the CDS branch.
        for offset, (ref_res, alt_res) in enumerate(zip(aa_ref, aa_alt)):
            for row in fanns_db.query_scores(protein=acc, aa_pos=aa_pos + offset,
                                             aa_ref=ref_res, aa_alt=alt_res,
                                             maps=["symbol", "prot_transcript"]):
                yield row
def fimpact_run(partition):
    """Calculate the functional impact of the variants of one partition.

    Steps:

    1. Load the MA results from ``partition["ma_path"]`` into two
       dictionaries keyed by variant id (uniprot and score).
    2. Read the VEP results from ``partition["vep_path"]`` and, for each row,
       derive the SIFT / PPH2 / MA scores, TransFIC values, classes and
       impacts according to the consequence types of the row.
    3. Write one line per VEP row to ``<base_path>/<index as 8 digits>.tfi``.
    4. Store that path in ``partition["tfi_path"]`` and send the partition
       through the ``results`` port.

    Only missense rows go through ``TransFIC.calculate``; the other
    consequence types get fixed scores and impacts.

    :param partition: dict with at least the keys ``project`` (a dict with
        an ``id``), ``index``, ``base_path``, ``ma_path`` and ``vep_path``.
        It is updated in place with ``tfi_path``.
    """
    log = task.logger
    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    ma_uniprot = {}
    ma_scores = {}

    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=paths.data_transfic_path())

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

    # Both files are context managed so the output handle is released even
    # when a row fails to be processed.
    with open(tfi_path, "w") as cf, open(partition["vep_path"], "r") as f:
        for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
            (var_id, gene, transcript, ct,
             protein_pos, aa_change, protein,
             sift_score, pph2_score) = fields

            # Consequence types come as a comma separated list.
            ct = (ct or "").split(",")

            # Invert sift score
            if sift_score is not None:
                sift_score = 1.0 - sift_score

            ma_score = None

            uniprot = ma_uniprot.get(var_id)

            sift_impact = pph2_impact = ma_impact = None  # TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

            sift_tfic = sift_class = None
            pph2_tfic = pph2_class = None
            ma_tfic = ma_class = None

            ct_type = None
            if so.match(ct, so.NON_SYNONYMOUS):  # missense
                ct_type = TransFIC.CT_NON_SYNONYMOUS
                ma_score = ma_scores.get(var_id)

                (sift_tfic, sift_class,
                 pph2_tfic, pph2_class,
                 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type,
                                                     sift_score, pph2_score, ma_score)

                # Only classes listed in IMPACT_CLASSES count as an impact.
                if sift_class in IMPACT_CLASSES:
                    sift_impact = sift_class
                if pph2_class in IMPACT_CLASSES:
                    pph2_impact = pph2_class
                if ma_class in IMPACT_CLASSES:
                    ma_impact = ma_class
            elif so.match(ct, so.STOP):  # stop
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.FRAMESHIFT):  # frameshift
                # ct_type has to be set here, otherwise the "fs" protein
                # change below can never be produced.
                ct_type = TransFIC.CT_FRAMESHIFT
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE_JUNCTION):  # splice junction
                # Same reason as for frameshift: enables "r.spl?" below.
                ct_type = "splice"
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE_REGION):  # splice region
                ct_type = "splice"
                sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SYNONYMOUS):  # synonymous
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                sift_score = pph2_score = 0.0
                ma_score = -2
            else:
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

            # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
            prot_change = None
            if ct_type == TransFIC.CT_FRAMESHIFT:
                if protein_pos is None:
                    prot_change = "fs"
                else:
                    prot_change = "fs {0}".format(protein_pos)
            elif ct_type == "splice":
                prot_change = "r.spl?"
            elif protein_pos is not None and aa_change is not None:
                rc = ReContext()
                if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                    # An empty second group is written as "=".
                    prot_change = "{ref}{pos}{alt}".format(
                        pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                    prot_change = "{0} {1}".format(aa_change, protein_pos)
                else:
                    log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                        gene, protein, protein_pos, aa_change, ", ".join(ct)))

            # First available impact in the order ma, pph2, sift.
            tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

            tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
                           sift_score, sift_tfic, sift_class, sift_impact,
                           pph2_score, pph2_tfic, pph2_class, pph2_impact,
                           ma_score, ma_tfic, ma_class, ma_impact,
                           null_value="-")

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    results_port.send(partition)
def fimpact_run(partition):
    """Calculate transcript and gene level functional impact for a partition.

    Steps:

    1. Load the MA results from ``partition["ma_path"]`` into two
       dictionaries keyed by variant id (uniprot and score).
    2. Read the VEP results from ``partition["vep_path"]``. For each row the
       SIFT / PPH2 / MA scores and impacts are derived from the consequence
       types. ``TransFIC.calculate`` is called for every type except splice
       and the fallback case.
    3. Write one line per VEP row to ``<base_path>/<index as 8 digits>.tfi``
       while aggregating, per ``(var_id, gene)``, the impacts, the coding
       region flag and the set of protein changes.
    4. Write one line per ``(var_id, gene)`` to
       ``<base_path>/<index as 8 digits>.gfi``.
    5. Store both paths in the partition (``tfi_path``, ``gfi_path``) and
       send it through the ``results`` port.

    Protein changes of a gene are written sorted so the output does not
    depend on set iteration order.

    :param partition: dict with at least the keys ``project`` (a dict with
        an ``id``), ``index``, ``base_path``, ``ma_path`` and ``vep_path``.
        It is updated in place with ``tfi_path`` and ``gfi_path``.
    """
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    ma_uniprot = {}
    ma_scores = {}

    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

    # (var_id, gene) -> {attribute name: aggregated value}
    aff_gene_attrs = {}

    # Both files are context managed so the output handle is released even
    # when a row fails to be processed.
    with open(tfi_path, "w") as cf, open(partition["vep_path"], "r") as f:
        for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
            (var_id, gene, transcript, ct,
             protein_pos, aa_change, protein,
             sift_score, pph2_score) = fields

            # Consequence types come as a comma separated list.
            ct = ct.split(",") if ct is not None else []

            # Invert sift score
            if sift_score is not None:
                sift_score = 1.0 - sift_score

            ma_score = None

            uniprot = ma_uniprot.get(var_id)

            sift_impact = pph2_impact = ma_impact = None  # TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = so.match(ct, so.CODING_REGION)

            calculate_transfic = True
            ct_type = None
            if so.match(ct, so.NON_SYNONYMOUS):  # missense
                ct_type = TransFIC.CT_NON_SYNONYMOUS
                ma_score = ma_scores.get(var_id)
            elif so.match(ct, so.STOP):  # stop
                ct_type = TransFIC.CT_STOP
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.FRAMESHIFT):  # frameshift
                ct_type = TransFIC.CT_FRAMESHIFT
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE):  # splice
                ct_type = "splice"
                if so.match(ct, so.SPLICE_JUNCTION):
                    splice_impact = TransFIC.HIGH_IMPACT_CLASS
                else:
                    splice_impact = TransFIC.UNKNOWN_IMPACT_CLASS
                sift_impact = pph2_impact = ma_impact = splice_impact
                calculate_transfic = False
            elif so.match(ct, so.SYNONYMOUS):  # synonymous
                ct_type = TransFIC.CT_SYNONYMOUS
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                sift_score = pph2_score = 0.0
                ma_score = -2
            else:
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                calculate_transfic = False

            if calculate_transfic:
                (sift_tfic, sift_class,
                 pph2_tfic, pph2_class,
                 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type,
                                                     sift_score, pph2_score, ma_score)

                # if the impact was not preassigned get it from the transFIC calculated class
                if sift_impact is None and sift_class in IMPACT_CLASSES:
                    sift_impact = sift_class
                if pph2_impact is None and pph2_class in IMPACT_CLASSES:
                    pph2_impact = pph2_class
                if ma_impact is None and ma_class in IMPACT_CLASSES:
                    ma_impact = ma_class
            else:
                sift_tfic = sift_class = None
                pph2_tfic = pph2_class = None
                ma_tfic = ma_class = None

            aff_gene = (var_id, gene)

            # update aggregated impact for all the predictors
            update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
            update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
            update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

            # update whether the affected gene is a coding region or not
            update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
                        update=lambda prev_value, value: prev_value or value)

            # aggregate protein changes per affected_gene
            # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
            prot_change = None
            if ct_type == TransFIC.CT_FRAMESHIFT:
                if protein_pos is None:
                    prot_change = "fs"
                else:
                    prot_change = "fs {0}".format(protein_pos)
            elif ct_type == "splice":
                prot_change = "r.spl?"
            elif protein_pos is not None and aa_change is not None:
                rc = ReContext()
                if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                    # An empty second group is written as "=".
                    prot_change = "{ref}{pos}{alt}".format(
                        pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                    prot_change = "{0} {1}".format(aa_change, protein_pos)
                else:
                    log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                        gene, protein, protein_pos, aa_change, ", ".join(ct)))

            if prot_change is not None:
                update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
                            new=lambda value: {value},
                            update=lambda prev_value, value: prev_value | {value})

            # get the impact by trust priority: ma, pph2, sift
            impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

            tsv.write_line(cf, var_id, transcript, uniprot,
                           sift_score, sift_tfic, sift_class,
                           pph2_score, pph2_tfic, pph2_class,
                           ma_score, ma_tfic, ma_class,
                           impact, null_value="-")

    log.info("Saving variant impacts ...")

    gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))

    with open(gfi_path, "w") as vf:
        for aff_gene, attrs in aff_gene_attrs.items():
            var_id, gene = aff_gene

            # get the impact by trust priority: ma, pph2, sift
            impact = (attrs.get("ma_impact")
                      or attrs.get("pph2_impact")
                      or attrs.get("sift_impact")
                      or TransFIC.UNKNOWN_IMPACT_CLASS)

            coding_region = 1 if attrs.get("coding_region", False) else 0

            prot_changes = attrs.get("prot_changes")
            if prot_changes is not None:
                # Sorted: a set has no stable iteration order.
                prot_changes = ",".join(sorted(prot_changes))

            tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    partition["gfi_path"] = gfi_path
    results_port.send(partition)