Beispiel #1
0
def get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
	fields = [mut_cds, mut_aa, mut_pos, mut_strand, acc]

	cds_ctx = ReContext(mut_cds)
	aa_ctx = ReContext(mut_aa)

	if cds_ctx.match(MUT_CDS_RE):
		ref, alt = [cds_ctx.group(i) for i in xrange(1, 3)]
		aa_ref_len = len(ref)
		if aa_ref_len != len(alt):
			logger.warn("Found substitution with different alleles: {}".format(fields))

		if mut_strand == "-":
			ref = complementary_sequence(ref)
			alt = complementary_sequence(alt)

		pos_ctx = ReContext(mut_pos)
		if not pos_ctx.match(MUT_POS_RE):
			logger.warn("Unexpected mutation position: {}".format(fields))
			return

		chrom, start = [pos_ctx.group(i) for i in xrange(1, 3)]
		if chrom == "25":
			return

		start = int(start)
		for i in xrange(aa_ref_len):
			#logger.info("{}{}:{}:{}/{}:{} ({}, {})".format(chrom, mut_strand, start+i, ref[i], alt[i], acc, mut_cds, mut_aa))
			for row in fanns_db.query_scores(chr=chrom, start=start + i, ref=ref[i], alt=alt[i],
												  strand=mut_strand, transcript=acc, maps=["symbol"]):
				#logger.info("  -> {}".format(row))
				yield row

	elif aa_ctx.match(MUT_AA_RE):
		aa_ref, aa_pos, aa_alt = [aa_ctx.group(i) for i in xrange(1, 4)]
		aa_ref_len = len(aa_ref)
		if aa_ref_len != len(aa_alt):
			logger.warn("Found substitution with different alleles: {}".format(fields))

		aa_pos = int(aa_pos)
		for i in xrange(aa_ref_len):
			for row in fanns_db.query_scores(protein=acc, aa_pos=aa_pos + i, aa_ref=aa_ref[i], aa_alt=aa_alt[i],
												  maps=["symbol", "prot_transcript"]):
				yield row
def fimpact_run(partition):
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=paths.data_transfic_path())

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
	cf = open(tfi_path, "w")

	with open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			ct = (ct or "").split(",")

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot.get(var_id)

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

			sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores.get(var_id)

				(sift_tfic, sift_class,
				 pph2_tfic, pph2_class,
				 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
			elif so.match(ct, so.STOP):               # stop
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_JUNCTION):    # splice junction
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_REGION):      # splice region
				sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

			aff_gene = (var_id, gene)

			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
				#log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif ct_type == "splice":
				prot_change = "r.spl?"
				#log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
													gene, protein, protein_pos, aa_change, ", ".join(ct)))

			tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
						   sift_score, sift_tfic, sift_class, sift_impact,
						   pph2_score, pph2_tfic, pph2_class, pph2_impact,
						   ma_score, ma_tfic, ma_class, ma_impact,
						   null_value="-")

	cf.close()

	# Send results to the next module
	partition["tfi_path"] = tfi_path
	results_port.send(partition)
Beispiel #3
0
def fimpact_run(partition):
	log = task.logger
	conf = task.conf

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
	cf = open(tfi_path, "w")

	aff_gene_attrs = {}

	with open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			if ct is not None:
				ct = ct.split(",")
			else:
				ct = []

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot[var_id] if var_id in ma_uniprot else None

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = so.match(ct, so.CODING_REGION)

			calculate_transfic = True

			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores[var_id] if var_id in ma_scores else None
			elif so.match(ct, so.STOP):               # stop
				ct_type = TransFIC.CT_STOP
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				ct_type = TransFIC.CT_FRAMESHIFT
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE):             # splice
				ct_type = "splice"
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS if so.match(ct, so.SPLICE_JUNCTION) else TransFIC.UNKNOWN_IMPACT_CLASS
				calculate_transfic = False
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				ct_type = TransFIC.CT_SYNONYMOUS
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				calculate_transfic = False

			if calculate_transfic:
				(sift_tfic, sift_class,
				 pph2_tfic, pph2_class,
				 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				# if the impact was not preassigned get it from the transFIC calculated class
				sift_impact = sift_class if sift_impact is None and sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_impact is None and pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_impact is None and ma_class in IMPACT_CLASSES else ma_impact
			else:
				sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			aff_gene = (var_id, gene)

			# update aggregated impact for all the predictors
			update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

			# update whether the affected gene is a coding region or not
			update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
						update=lambda prev_value, value: prev_value or value)

			# aggregate protein changes per affected_gene
			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
				#log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif ct_type == "splice":
				prot_change = "r.spl?"
				#log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
													gene, protein, protein_pos, aa_change, ", ".join(ct)))

			if prot_change is not None:
				update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
								 new=lambda value: set([value]),
								 update=lambda prev_value, value: prev_value | set([value]))

			impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			tsv.write_line(cf, var_id, transcript, uniprot,
						   sift_score, sift_tfic, sift_class,
						   pph2_score, pph2_tfic, pph2_class,
						   ma_score, ma_tfic, ma_class,
						   impact, null_value="-")

	cf.close()

	log.info("Saving variant impacts ...")

	gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))
	vf = open(gfi_path, "w")
	for aff_gene, attrs in aff_gene_attrs.items():
		var_id, gene = aff_gene
		# get the impact by trust priority: ma, pph2, sift
		impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS
		coding_region = attrs.get("coding_region", False)
		coding_region = 1 if coding_region else 0
		prot_changes = attrs.get("prot_changes")
		prot_changes = ",".join(prot_changes) if prot_changes is not None else None

		tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")
	vf.close()

	# Send results to the next module
	partition["tfi_path"] = tfi_path
	partition["gfi_path"] = gfi_path
	results_port.send(partition)