コード例 #1
0
ファイル: service.py プロジェクト: chris-zen/phd-thesis
	def run(self, variants_path):
		"""
		Run the VEP service and save results in a temporary file.

		Every variant read from variants_path is looked up with self.get() and each
		returned result is appended as one tab separated line to self.results_path.
		Variants for which self.get() returns None are skipped.

		:param variants_path: File with variants. In BED format. http://www.ensembl.org/info/docs/variation/vep/vep_script.html#custom_formats
		:return: None. The results are written into self.results_path.
		"""

		if self.results_path is None:
			import os

			# mkstemp() returns an already open OS level descriptor together with
			# the path. Close it here (the file is reopened by path below),
			# otherwise one descriptor is leaked on every call.
			fd, self.results_path = tempfile.mkstemp()
			os.close(fd)

		with open(self.results_path, "w") as rf:
			with open(variants_path, "r") as vf:
				column_types = (str, int, int, str, str, int)
				for fields in tsv.lines(vf, column_types):
					chrom, start, end, allele, strand, var_id = fields

					# Keep only what follows the first "/" of the allele string
					# (the whole string when there is no "/").
					alt = allele[allele.find("/") + 1:]

					results = self.get(chrom, start, end, strand, alt, var_id)
					if results is None:
						continue

					for r in results:
						rf.write(tsv.line_text(
							var_id, chrom, start, allele,
							r.gene, r.transcript, ",".join(sorted(r.consequences)),
							r.protein_pos, r.aa_change, r.protein,
							r.sift, r.polyphen, null_value="-"))
コード例 #2
0
ファイル: update_db.py プロジェクト: chris-zen/phd-thesis
def update_db(project):
	"""
	Store the OncodriveCLUST results of a project into its database.

	The "oncodriveclust" entry is removed from the project dict. When its
	results file does not exist the project is skipped and nothing is sent
	to the output port. Otherwise the excluded gene causes and the results
	are loaded into the project database and the project is sent through
	the "projects_out" port.

	:param project: dict with at least the keys "id", "oncodriveclust"
		(a dict with a "results" path), "db" and "temp_path"
	"""
	log = task.logger

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	oclust = project["oncodriveclust"]
	del project["oncodriveclust"]

	if not os.path.exists(oclust["results"]):
		log.warn("No results have been found. Skipping it.")
		return

	log.info("Updating the project database ...")

	projdb = ProjectDb(project["db"])

	# Make sure the database is closed even if loading any of the files fails
	try:
		exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

		log.info("  Excluded gene causes ...")
		log.debug("    > {0}".format(exc_path))

		count = 0
		with tsv.open(exc_path, "r") as exf:
			for gene, cause in tsv.lines(exf, (str, str), header=True):
				projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
				count += 1

		log.debug("    {0} genes excluded".format(count))

		log.info("  OncodriveCLUST results ...")

		with tsv.open(oclust["results"], "r") as f:
			types = (str, str, float, float, float)
			columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
			for gene, coords, zscore, pvalue, qvalue in tsv.lines(f, types, columns=columns, header=True, null_value="NA"):
				# Genes having results are marked as not excluded
				projdb.update_gene(Gene(id=gene, clust_coords=coords, clust_zscore=zscore, clust_pvalue=pvalue,
										clust_qvalue=qvalue, clust_exc_cause=ProjectDb.NO_GENE_EXC))

		projdb.commit()
	finally:
		projdb.close()

	projects_out_port.send(project)
コード例 #3
0
ファイル: vcf_to_snvs.py プロジェクト: chris-zen/phd-thesis
def main():
    """
    Extract the mutations found in VCF files and save them as a simple tabulated file.

    Columns 0, 1, 3 and 4 of every input line are read as chromosome, position,
    reference and alternate alleles. Only substitutions where both alleles have
    the same non zero length are kept; substitutions longer than one base are
    written as one line per position.

    When no output path is given it is derived from the common prefix of the
    input file names, falling back to "genome".
    """
    parser = argparse.ArgumentParser(description="Extract mutations in VCF and save as simple tabulated file")

    parser.add_argument("vcf_paths", metavar="PATH", nargs="+", help="The VCF files")

    parser.add_argument("-o", dest="out_path", metavar="PATH", help="Output file. Use - for standard output.")

    # main() is a plain function: there is no self, use the local parser and args
    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("vcf-to-snvs")

    if args.out_path is None:
        names = []
        for path in args.vcf_paths:
            if path != "-":
                base_path, name, ext = tsv.split_path(path)
                names.append(name)

        # commonprefix() expects one list of strings and returns "" for an empty list
        prefix = os.path.commonprefix(names).rstrip(".")
        if len(prefix) == 0:
            prefix = "genome"
        args.out_path = "{}.tsv.gz".format(prefix)

    with tsv.open(args.out_path, "w") as outf:
        tsv.write_line(outf, "CHR", "POS", "REF", "ALT")

        for path in args.vcf_paths:
            log.info("Reading {} ...".format(path))

            with tsv.open(path) as inf:
                types = (str, str, str, str)
                columns = [0, 1, 3, 4]
                for fields in tsv.lines(inf, types, columns=columns):
                    chrom, pos, ref, alt = fields

                    ref_len = len(ref)
                    alt_len = len(alt)

                    # Skip anything that is not a same length substitution
                    if ref_len != alt_len or ref_len == 0 or alt_len == 0:
                        continue

                    # Skip lines whose position is not a number
                    try:
                        pos = int(pos)
                    except (TypeError, ValueError):
                        continue

                    if ref_len == 1:
                        tsv.write_line(outf, chrom, pos, ref, alt)
                    else:
                        # Split multiple base substitutions into single positions
                        for i in range(ref_len):
                            tsv.write_line(outf, chrom, pos + i, ref[i], alt[i])
コード例 #4
0
	def load_cds_len(self, path):
		"""
		Load the CDS length of the transcripts from a tabulated file with a header.

		Each line is read as (gene, transcript, length); the gene is ignored.

		:param path: path of the file to read
		:return: dict mapping each transcript to its length
		"""

		log = self.logger
		log.info("Loading transcripts CDS length ...")
		log.debug("> {}".format(path))

		with tsv.open(path, "r") as f:
			rows = tsv.lines(f, (str, str, int), header=True)
			# When a transcript is repeated the last line wins
			return {transcript: length for _gene, transcript, length in rows}
コード例 #5
0
ファイル: ann_add.py プロジェクト: chris-zen/phd-thesis
def main():
    """
    Command line entry point that adds an annotation and its items to the database.

    The annotation is registered with db.add_map() and then every (source, value)
    pair read from the items file is added with db.add_map_item(). Pairs having
    an empty source or value are ignored.

    Returns 0 on success, otherwise whatever cmd.handle_error() returns.
    """
    parser = argparse.ArgumentParser(description="Add annotations")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("id", metavar="ID", help="Annotation identifier.")

    parser.add_argument("name", metavar="NAME", help="Annotation name.")

    parser.add_argument(
        "type", metavar="TYPE", choices=["transcript", "protein"], help="Annotation type: transcript, protein"
    )

    parser.add_argument("path", metavar="PATH", help="Annotation items")

    # type=int so that a priority given on the command line has the same type
    # as the default (argparse would otherwise hand it over as a string).
    parser.add_argument(
        "--priority",
        dest="priority",
        type=int,
        default=0,
        help="Priority for translating input annotations. 0 means not considered for translation. Default 0.",
    )

    parser.add_argument(
        "--header",
        dest="header",
        action="store_true",
        default=False,
        help="Specify that the annotation items file have a header.",
    )

    args, logger = cmd.parse_args("ann-add")

    db = cmd.open_db()

    try:
        logger.info("Creating annotation {} ...".format(args.name))

        db.add_map(args.id, args.name, args.type, args.priority)

        logger.info("Loading items ...")

        with tsv.open(args.path) as f:
            for source, value in tsv.lines(f, (str, str), header=args.header):
                if len(source) > 0 and len(value) > 0:
                    db.add_map_item(args.id, source, value)

        db.commit()
    except:
        # Top level boundary: any failure is delegated to the command helper
        return cmd.handle_error()
    finally:
        db.close()

    return 0
コード例 #6
0
ファイル: service.py プロジェクト: chris-zen/phd-thesis
	def results(self):
		"""
		Generator that reads the results temporary file (self.results_path)
		and yields one VepResult for each of its lines.
		"""

		column_types = (
			int, str, int, str,           # var_id, chr, start, allele
			str, str, _ctype,             # gene, transcript, consequences
			str, str, str,                # protein_pos, aa_change, protein
			float, float)                 # sift, polyphen

		with open(self.results_path, "r") as f:
			for row in tsv.lines(f, column_types, null_value="-"):
				(var_id, chr, start, allele,
					gene, transcript, consequences,
					protein_pos, aa_change, protein,
					sift, polyphen) = row

				attrs = dict(
					var_id=var_id, chr=chr, start=start, allele=allele,
					gene=gene, transcript=transcript, consequences=consequences,
					protein_pos=protein_pos, aa_change=aa_change, protein=protein,
					sift=sift, polyphen=polyphen)

				yield VepResult(**attrs)
コード例 #7
0
ファイル: combine.py プロジェクト: chris-zen/phd-thesis
	def load_data(self, data_paths, method=None):
		"""
		Load several (name, value) files into a single matrix with one column per file.

		The column name is the "slice" parameter of the file when present, otherwise
		the file base name without extension. Rows are the distinct names found,
		in order of first appearance. Cells without a value are left as NaN.

		:param data_paths: paths of the files to load
		:param method: expected computation method; when None it is taken from the
			first file declaring a "method" parameter. A warning is logged for
			files declaring a different one.
		:return: tuple (row_names, col_names, data, method)
		"""
		log = self.log

		col_names = []
		columns = []
		row_name_index = {}

		for data_file in data_paths:
			log.debug("  > {0}".format(data_file))

			names, values = [], []
			with tsv.open(data_file, "r") as f:
				base_name = os.path.basename(data_file)
				col_name = os.path.splitext(base_name)[0]

				params = tsv.params(f)
				if "slice" in params:
					col_name = params["slice"]

				if "method" in params:
					file_method = params["method"]
					if method is None:
						method = file_method
					elif method != file_method:
						log.warn("Different method of computation used for file {0}".format(data_file))

				for name, value in tsv.lines(f, (str, float), header=True, null_value="-"):
					if len(name) == 0:
						log.warn("Empty identifier detected")
						continue

					# Assign the next row index the first time a name is seen
					if name not in row_name_index:
						row_name_index[name] = len(row_name_index)

					names.append(name)
					values.append(value)

			col_names.append(col_name)
			columns.append((names, values))

		num_rows, num_cols = len(row_name_index), len(columns)

		# Row names ordered by their assigned row index
		row_names = sorted(row_name_index, key=row_name_index.get)

		data = np.empty((num_rows, num_cols))
		data[:] = np.nan

		for col_index, (names, values) in enumerate(columns):
			for name, value in zip(names, values):
				data[row_name_index[name], col_index] = value

		return row_names, col_names, data, method
コード例 #8
0
ファイル: maps.py プロジェクト: chris-zen/phd-thesis
def add_map(db, id, name, type, priority, path, header=True):
	"""
	Create a map in the database and load its items from a file.

	:param db: database receiving the add_map() and add_map_item() calls
	:param id: map identifier
	:param name: map name
	:param type: xref maps to type: transcript, protein
	:param priority: priority for translating input xrefs. 0 means not considered for translation.
	:param path: map file, read as (source, value) pairs
	:param header: specify that the map file have a header.
	"""

	log = logging.getLogger("fannsdb.map-add")

	log.info("Creating map {} ...".format(name))
	db.add_map(id, name, type, priority)

	log.info("Loading items ...")
	with tsv.open(path) as f:
		for source, value in tsv.lines(f, (str, str), header=header):
			# Ignore pairs with an empty source or an empty value
			if len(source) == 0 or len(value) == 0:
				continue
			db.add_map_item(id, source, value)
コード例 #9
0
ファイル: dbnsfp_export.py プロジェクト: chris-zen/phd-thesis
def main():
	"""
	Command line entry point that exports dbNSFP scores into a tabulated file.

	Every entry of the source zip file whose name matches dbNSFP.+_variant.chr(.+)
	is extracted into a temporary folder and parsed line by line. For each
	transcript of each SNV one output line is written with the genomic and
	protein coordinates plus the predictor scores. The protein identifier is
	resolved from the transcript map or, failing that, from the Uniprot map.

	Returns 0 on success, otherwise whatever cmd.handle_error() returns.
	"""
	parser = argparse.ArgumentParser(
		description="Export dbNSFP scores")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("source_path", metavar="SOURCE",
						help="The original zip file")

	parser.add_argument("ensp_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Ensembl transcript id's and Uniprot id's")

	parser.add_argument("uniprot_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Uniprot id's")

	parser.add_argument("-o", "--output", dest="out_path", metavar="OUT_PATH",
						help="The output file")

	parser.add_argument("--temp", dest="temp_path", metavar="TEMP_PATH",
						help="A temporary path for zip extraction")

	parser.add_argument("--chr", dest="chr", metavar="CHROMOSOMES",
						help="Chromosomes to include: list separated by commas.")

	parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
						help="Skip SNV's where all the scores are empty")

	args, logger = cmd.parse_args("dbnsfp-export")

	if args.out_path is None:
		basename = os.path.basename(args.source_path)
		prefix = os.path.splitext(basename)[0]
		args.out_path = "{}.tsv.gz".format(prefix)

	logger.info("Loading maps ...")

	# Both maps end up translating into the Ensembl protein id:
	# trs_map from the transcript id and uniprot_map from the Uniprot id.
	uniprot_map = {}
	trs_map = {}
	with tsv.open(args.ensp_map_path) as f:
		for ensp, enst in tsv.lines(f, (str, str)):
			if len(enst) > 0:
				trs_map[enst] = ensp

	with tsv.open(args.uniprot_map_path) as f:
		for ensp, uniprot_id in tsv.lines(f, (str, str)):
			if len(uniprot_id) > 0:
				uniprot_map[uniprot_id] = ensp

	logger.info("Opening {} ...".format(args.source_path))

	chromosomes = None
	if args.chr is not None:
		chromosomes = [c.strip().upper() for c in args.chr.split(",") if len(c.strip()) > 0]
		logger.info("Selected chromosomes: {}".format(", ".join(chromosomes)))
		chromosomes = set(chromosomes)

	name_pattern = re.compile(r"dbNSFP.+_variant.chr(.+)")

	# Names of the source columns to read, in the order they are unpacked below.
	# "GERP_RS" and "PhyloP_score" were previously used for the last two.
	COLUMNS = [
		"#chr", "pos(1-coor)", "ref", "alt", "cds_strand",
		"genename", "Uniprot_id", "Uniprot_aapos", "aaref", "aaalt",
		"Ensembl_geneid", "Ensembl_transcriptid", "aapos",
		"SIFT_score",
		"Polyphen2_HVAR_score",
		"MutationAssessor_score",
		"FATHMM_score",
		"MutationTaster_score",
		"GERP++_RS",
		"phyloP"
	]

	tmp_base = args.temp_path or tempfile.gettempdir()
	if not os.path.exists(tmp_base):
		os.makedirs(tmp_base)

	# Create the extraction folder inside tmp_base using dir= instead of a
	# path-like prefix: a relative prefix would be resolved against the
	# default temporary folder and not against the folder created above.
	extract_path = tempfile.mkdtemp(dir=tmp_base)

	try:
		logger.info("Output: {}".format(args.out_path if args.out_path != "-" else "standard output"))

		total_start_time = time.time()
		total_lines = 0
		with ZipFile(args.source_path, "r") as zf, tsv.open(args.out_path, "w") as of:

			tsv.write_line(of, "CHR", "STRAND", "START", "REF", "ALT", "TRANSCRIPT",
						   "PROTEIN", "AA_POS", "AA_REF", "AA_ALT",
						   "SIFT", "PPH2", "MA", "FATHMM", "MT", "GERPRS", "PHYLOP")

			# Select the zip entries to process
			entries = []
			for entry in zf.infolist():
				m = name_pattern.match(entry.filename)
				if not m:
					continue

				chr = m.group(1)
				index = CHR_INDEX[chr] if chr in CHR_INDEX else 99

				if chromosomes is not None and chr not in chromosomes:
					logger.debug("Skipping chromosome {} ...".format(chr))
					continue

				entries += [(index, chr, entry)]

			# Process the chromosomes ordered by their CHR_INDEX (unknown ones last)
			for index, chr, entry in sorted(entries, key=lambda x: x[0]):
				logger.info("Reading chromosome {} ...".format(chr))

				zf.extract(entry, extract_path)
				fpath = os.path.join(extract_path, entry.filename)
				with open(fpath) as f:
					# Parse header
					hdr_line = f.readline()
					hdr = {}
					for hdr_index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
						hdr[name] = hdr_index
					# Position of each wanted column, None when the file lacks it
					columns = [hdr[name] if name in hdr else None for name in COLUMNS]

					read = set()

					start_time = time.time()
					partial_start_time = start_time

					# Line number of the last line read. Starts at the header so
					# that a file without data lines still has a defined count.
					line_num = 1
					for line_num, line in enumerate(f, start=2):
						fields = line.rstrip("\n").split("\t")

						try:
							fields = [fields[i] if i is not None and i < len(fields) else None for i in columns]

							(chr, start, ref, alt, strand,
							 symbol, uniprot, uniprot_aapos, aa_ref, aa_alt,
							 gene, transcript, aapos,
							 sift, pph2, ma, fathmm,
							 mt, gerprs, phylop) = fields

							start = safe_int(start)
							ref = ref.upper() if ref is not None else None
							alt = alt.upper() if alt is not None else None
							aa_ref = aa_ref.upper() if aa_ref is not None else None
							aa_alt = aa_alt.upper() if aa_alt is not None else None
							sift = safe_float(sift)
							ma = safe_float(ma)
							fathmm = safe_float(fathmm)
							mt = safe_float(mt)
							gerprs = safe_float(gerprs)
							phylop = safe_float(phylop)

							if start is None or ref is None or alt is None:
								logger.warn("None value for pos or ref or alt at line {}: {}".format(line_num, fields))
								continue
							elif ref not in BASE_INDEX or alt not in BASE_INDEX:
								logger.warn("Unknown ref or alt at line {}: {}".format(line_num, fields))
								continue
							elif len(ref) != 1 or len(alt) != 1:
								logger.warn("Length != 1 for ref or alt len at line {}: {}".format(line_num, fields))
								continue
							elif transcript is None or aapos is None or uniprot is None or uniprot_aapos is None:
								logger.warn("None value for transcript or aapos or uniprot or uniprot_aapos at line {}: {}".format(line_num, fields))
								continue

							# Unknown aminoacids are not an error, they are just discarded
							if aa_ref not in AA_INDEX:
								aa_ref = None
							if aa_alt not in AA_INDEX:
								aa_alt = None

							# Several fields hold one value per transcript / protein
							# separated by ";". Lists that are too short are padded
							# by repeating their last value.
							trs_values = transcript.split(";")

							aapos_values = [safe_int(v) for v in aapos.split(";")]
							l = len(trs_values) - len(aapos_values)
							if l > 0:
								aapos_values += [aapos_values[-1]] * l

							uniprot_values = uniprot.split(";")
							uniprot_aapos_values = [safe_int(v) for v in uniprot_aapos.split(";")]
							l = len(uniprot_values) - len(uniprot_aapos_values)
							if l > 0:
								uniprot_aapos_values += [uniprot_aapos_values[-1]] * l

							pph2_values = [safe_float(v) for v in pph2.split(";")] if pph2 is not None else [None]
							l = len(uniprot_values) - len(pph2_values)
							if l > 0:
								pph2_values += [pph2_values[-1]] * l

							# Index of the Uniprot entries by their protein position
							uniprot_index = {}
							for i, id in enumerate(uniprot_values):
								if uniprot_aapos_values[i] is not None:
									uniprot_index[uniprot_aapos_values[i]] = i

							for i, trs in enumerate(trs_values):
								pos = aapos_values[i]
								# safe_int() results are checked against None elsewhere,
								# so guard before comparing: negative positions are
								# treated as unknown.
								if pos is not None and pos < 0:
									pos = None

								# The Uniprot entry (and its PPH2 score) is matched
								# with the transcript by protein position
								if pos is not None and pos in uniprot_index:
									j = uniprot_index[pos]
									uniprot_value = uniprot_values[j]
									pph2_value = pph2_values[j]
								else:
									uniprot_value = pph2_value = None

								if trs in trs_map:
									prot_id = trs_map[trs]
								elif uniprot_value in uniprot_map:
									prot_id = uniprot_map[uniprot_value]
								else:
									logger.warn("Couldn't map neither protein {} or transcript {} at line {}: {}".format(uniprot_value, trs, line_num, "|".join([str(v) for v in fields])))
									continue

								if pph2_value is not None and (pph2_value < 0.0 or pph2_value > 1.0):
									logger.warn("PPH2 score {} out of range at line {}: {}".format(pph2_value, line_num, fields))
									continue

								if aa_alt == "X": # fix stop codons having a sift score
									sift = None

								if args.skip_empty_scores and sift is None and pph2_value is None and ma is None \
										and mt is None and gerprs is None and phylop is None:
									continue

								# Lines without protein position or aminoacids are not exported
								if pos is None or aa_ref is None or aa_alt is None:
									pass
								else:
									tsv.write_line(of, chr, strand, start, ref, alt, trs,
												   prot_id, pos, aa_ref, aa_alt,
												   sift, pph2_value, ma, fathmm,
												   mt, gerprs, phylop)

						except KeyboardInterrupt:
							raise
						except:
							# Log which line failed and abort the export
							logger.warn("Malformed line {}: {}".format(line_num, "|".join([str(v) for v in fields])))
							raise

						# Report the progress every 5 seconds or more
						partial_time = time.time() - partial_start_time
						if partial_time >= 5.0:
							partial_start_time = time.time()
							elapsed_time = time.time() - start_time
							logger.debug("  {} lines, {:.1f} lines/second".format(hsize(line_num-1), (line_num-1) / float(elapsed_time)))

					total_lines += line_num

					logger.info("  >  {} lines, {:.1f} lines/second".format(hsize(line_num), line_num / float(time.time() - start_time)))
					logger.info("  >> {} lines, {:.1f} lines/second".format(hsize(total_lines), total_lines / float(time.time() - total_start_time)))

				os.remove(fpath)

		total_elapsed_time = timedelta(seconds=time.time() - total_start_time)
		logger.info("Finished successfully. Elapsed time: {}".format(total_elapsed_time))

	except:
		# Top level boundary: any failure is delegated to the command helper
		return cmd.handle_error()
	finally:
		shutil.rmtree(extract_path)

	return 0
コード例 #10
0
ファイル: blt_groups.py プロジェクト: chris-zen/phd-thesis
def main():
	"""
	Command line entry point that calculates the Baseline Tolerance statistics.

	It loads the groups tree, the mapping between groups and features and the
	partial statistics of the features, runs calculate_blt() from the root group
	for every predictor and writes the results as a JSON document (and
	optionally as a tabulated file too).

	Returns 0.
	"""
	parser = argparse.ArgumentParser(
		description="Calculate Baseline Tolerance statistics")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("tree_path", metavar="TREE_PATH",
						help="The groups descendant tree")

	parser.add_argument("root_group", metavar="ROOT_GROUP",
						help="Tree root group")

	parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH",
						help="Map between groups and features")

	parser.add_argument("stats_path", metavar="STATS_PATH",
						help="Partial feature statistics")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="Output feature statistics")

	parser.add_argument("--tsv", dest="tsv_path", metavar="PATH",
						help="Store baseline tolerance in tsv format too.")

	# Both thresholds are used as numbers below (i.e. blt.n < count_threshold),
	# so they are converted by argparse instead of arriving as strings when
	# they are given on the command line.
	parser.add_argument("-c", "--count-threshold", dest="count_threshold", metavar="N", type=int, default=DEFAULT_COUNT_THRESHOLD,
						help="Minimum number of features per group")

	parser.add_argument("--stdev-threshold", dest="stdev_threshold", metavar="V", type=float, default=DEFAULT_STDEV_THRESHOLD,
						help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)")

	args, logger = cmd.parse_args("blt-groups")

	logger.info("Loading groups tree ...")

	tree = Tree()
	with tsv.open(args.tree_path) as f:
		# Second column is a comma separated list of children
		for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
			tree.add_node(group, children)

	logger.info("  Nodes: {}".format(tree.node_count))

	logger.info("Loading mappings between groups and features ...")

	all_groups = set()
	all_features = set()
	with tsv.open(args.group_genes_path) as f:
		# Features are added to the tree as children of their group
		for group, features in tsv.lines(f, (str, lambda v: set(v.split(",")))):
			tree.add_node(group, features)
			all_groups.add(group)
			all_features.update(features)

	logger.info("  Nodes: {}".format(tree.node_count))
	logger.info("  Groups: {}".format(len(all_groups)))
	logger.info("  Features: {}".format(len(all_features)))

	logger.info("Loading partial statistics ...")

	with tsv.open(args.stats_path) as f:
		# The header holds the predictor names after the first column
		predictors = f.readline().rstrip("\n").split("\t")[1:]
		num_features = 0
		for line in f:
			try:
				fields = line.rstrip("\n").split("\t")
				feature = fields[0]
				node = tree.get_or_create_node(feature)
				for p, ss in zip(predictors, fields[1:]):
					try:
						# Each cell holds three values separated by "/":
						# the first one is an integer and the others are floats
						s0, s1, s2 = [float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))]
						node.set_pblt(p, PartialBLT(s0, s1, s2, sources=set([feature])))
					except:
						import traceback
						traceback.print_exc()
						logger.warn("Failed to parse partial baseline tolerance"
									" for {}/{} from {}".format(feature, p, ss))
						# NOTE(review): the SystemExit raised by exit() is caught by
						# the enclosing bare except, so the program does not stop
						# here: the rest of the line is skipped and the feature is
						# not counted. The continue below is never reached.
						# Confirm which of the two behaviours is the intended one.
						exit(-1)
						continue
				num_features += 1
			except:
				logger.warn("Failed to parse partial baseline tolerance"
									" for {} from {}".format(feature, line))
				continue

	logger.info("  Nodes: {}".format(tree.node_count))
	logger.info("  Features: {}".format(num_features))
	logger.info("  Predictors: {}".format(", ".join(predictors)))

	logger.info("Calculating baseline tolerance ...")

	for predictor in predictors:
		logger.info("For {} ...".format(predictor))

		calculate_blt(
			parent=None, node=tree.get_or_create_node(args.root_group), predictor=predictor,
			count_threshold=args.count_threshold, stdev_threshold=args.stdev_threshold, logger=logger)

	# TODO log summary info

	logger.info("Writing results into {} ...".format(os.path.basename(args.out_path)))

	if args.tsv_path is not None:
		with tsv.open(args.tsv_path, "w") as of:
			tsv.write_line(of, "FEATURE", *predictors)
			for feature in all_features:
				sb = [feature]
				node = tree.get_node(feature)
				predictors_with_blt = 0
				for predictor in predictors:
					blt = node.get_blt(predictor)
					if blt is None or blt.n < args.count_threshold:
						# Placeholder for the five values of a missing predictor
						sb += ["/".join(["-"] * 5)]
						continue

					predictors_with_blt += 1
					sb += ["/".join(map(str, [blt.from_node, blt.scope, blt.n, blt.mean, blt.stdev]))]

				# Features without any predictor statistics are not written
				if predictors_with_blt > 0:
					tsv.write_line(of, *sb)

	with tsv.open(args.out_path, "w") as of:
		tree_blt = {}
		for node_name, node in tree.nodes.items():
			predictors_blt = {}
			for predictor in predictors:
				pred_blt = node.get_blt(predictor)
				if pred_blt is None or pred_blt.n < args.count_threshold:
					continue

				predictors_blt[predictor] = dict(
					from_node=pred_blt.from_node, scope=pred_blt.scope,
					N=pred_blt.n, mean=pred_blt.mean, stdev=pred_blt.stdev)

			if len(predictors_blt) > 0:
				tree_blt[node.name] = predictors_blt

		doc = dict(
			created=str(datetime.now()),
			predictors=predictors,
			count_threshold=args.count_threshold,
			stdev_threshold=args.stdev_threshold,
			tree=None, # tree relations
			features=list(all_features),
			pblt=None, # TODO
			blt=tree_blt
		)
		json.dump(doc, of, indent=True)


	return 0
コード例 #11
0
ファイル: update_db.py プロジェクト: chris-zen/phd-thesis
def end():
	"""
	Store the OncodriveFM results of all the projects into their databases.

	task.context maps each project id to a list of projects. For every project
	the excluded gene causes and the results for genes and pathways are loaded
	into its database; the sample/gene functional impact is reloaded only for
	the first project of each list. The first project of each list is then
	sent through the "projects_out" port.
	"""
	log = task.logger

	projects_out_port = task.ports("projects_out")

	log.info("Updating the projects database ...")

	for project_id, projects in task.context.items():

		log.info("[{0}]".format(project_id))

		for index, project in enumerate(projects):
			projdb = ProjectDb(project["db"])

			# Make sure the database is closed even if loading any file fails
			try:
				if index == 0:
					log.info("  Functional impact ...")

					projdb.delete_sample_gene_fimpact()

					with tsv.open(project["sample_gene_fi_data"], "r") as f:
						types = (int, str, float, float, int, float, float, int, float, float, int)
						for fields in tsv.lines(f, types, header=True, null_value="-"):
							projdb.add_sample_gene_fimpact(*fields)

				ofm = project["oncodrivefm"]
				del project["oncodrivefm"]

				exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

				log.info("  Excluded gene causes ...")
				log.debug("    > {0}".format(exc_path))

				count = 0
				with tsv.open(exc_path, "r") as exf:
					for gene, cause in tsv.lines(exf, (str, str), header=True):
						projdb.update_gene(Gene(id=gene, fm_exc_cause=cause))
						count += 1

				log.debug("    {0} genes excluded".format(count))

				# ofm holds (feature, results path) pairs; features other than
				# "genes" and "pathways" are only logged
				for feature, results_path in ofm:

					log.info("  {0} ...".format(feature))
					log.debug("    > {0}".format(results_path))

					if feature == "genes":
						with tsv.open(results_path, "r") as f:
							count = 0
							for gene, pvalue, qvalue in tsv.lines(f, (str, float, float), header=True):
								# Genes having results are marked as not excluded
								projdb.update_gene(Gene(id=gene, fm_pvalue=pvalue,
														fm_qvalue=qvalue, fm_exc_cause=ProjectDb.NO_GENE_EXC))
								count += 1
							log.info("    {0} genes".format(count))
					elif feature == "pathways":
						with tsv.open(results_path, "r") as f:
							count = 0
							for pathway, zscore, pvalue, qvalue in tsv.lines(f, (str, float, float, float), header=True):
								projdb.update_pathway(Pathway(id=pathway, fm_zscore=zscore,
															  fm_pvalue=pvalue, fm_qvalue=qvalue))
								count += 1
							log.info("    {0} pathways".format(count))

				projdb.commit()
			finally:
				projdb.close()

		projects_out_port.send(projects[0])
コード例 #12
0
ファイル: training_sets.py プロジェクト: chris-zen/phd-thesis
def main():
	"""
	Command line entry point that merges the positive and negative training sets
	into a single SNV's dataset.

	Every input line is read as (protein, position, aa1, aa2) and written with
	an extra column telling the set it comes from (POS or NEG). When a mapping
	file is given the protein is replaced by each of the features it maps to,
	and lines with an unmapped protein are counted and discarded.
	"""
	parser = argparse.ArgumentParser(
		description="Prepare SNV's dataset from individual training sets")

	parser.add_argument(
		"pos_path", metavar="POS_SET",
		help="The positive training set file")

	parser.add_argument(
		"neg_path", metavar="NEG_SET",
		help="The negative training set file")

	parser.add_argument(
		"-m", "--map", dest="map_path", metavar="MAP",
		help="Optional mapping file for feature id's. Format: DST SRC")

	parser.add_argument(
		"-o", dest="out_path", metavar="PATH",
		help="Output file. Use - for standard output.")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	logger = bglogging.get_logger("training-sets")

	if args.out_path is None:
		# Default output name: common prefix of both input names without extension
		pos_name = os.path.splitext(os.path.basename(args.pos_path))[0]
		neg_name = os.path.splitext(os.path.basename(args.neg_path))[0]
		prefix = os.path.commonprefix([pos_name, neg_name]).rstrip(".")

		args.out_path = os.path.join(os.getcwd(), "{}-training.tsv".format(prefix))

	prot_map = None
	if args.map_path is not None:
		logger.info("Loading map ...")

		# source feature --> set of destination features
		prot_map = {}
		with tsv.open(args.map_path) as f:
			for dst_feature, src_feature in tsv.lines(f, (str, str)):
				if len(src_feature) == 0:
					continue
				prot_map.setdefault(src_feature, set()).add(dst_feature)

	logger.info("Processing ...")

	hits = {"POS": 0, "NEG": 0}
	fails = {"POS": 0, "NEG": 0}

	start_time = datetime.now()

	training_sets = (("POS", args.pos_path), ("NEG", args.neg_path))

	with tsv.open(args.out_path, "w") as wf:

		for event_type, path in training_sets:

			logger.info("  [{}] Reading {} ...".format(event_type, path))

			with tsv.open(path) as f:
				for protein, pos, aa1, aa2 in tsv.lines(f, (str, int, str, str)):
					protein = protein.strip()

					if prot_map is None:
						proteins = [protein]
					elif protein in prot_map:
						proteins = prot_map[protein]
					else:
						logger.debug("[{}] Unmapped protein: {}".format(event_type, protein))
						fails[event_type] += 1
						continue

					hits[event_type] += 1

					for p in proteins:
						tsv.write_line(wf, p, pos, aa1.strip(), aa2.strip(), event_type)

	logger.info("               POS       NEG")
	logger.info("SNVs      {POS:>8}  {NEG:>8}".format(**hits))
	if args.map_path is not None:
		logger.info("unmapped  {POS:>8}  {NEG:>8}".format(**fails))

	elapsed_time = datetime.now() - start_time
	logger.info("Finished. Elapsed time: {}".format(elapsed_time))
コード例 #13
0
def load_cds_len(conf):
	"""
	Load the CDS length of the transcripts.

	The file located by get_data_ensembl_gene_transcripts_path(conf) is read as
	(gene, transcript, length) lines after a header; the gene is ignored.

	:param conf: configuration used to locate the file
	:return: dict mapping each transcript to its length
	"""
	path = get_data_ensembl_gene_transcripts_path(conf)
	with tsv.open(path, "r") as f:
		rows = tsv.lines(f, (str, str, int), header=True)
		# When a transcript is repeated the last line wins
		return {transcript: length for _gene, transcript, length in rows}
コード例 #14
0
ファイル: drivers.py プロジェクト: chris-zen/phd-thesis
def drivers():
	"""
	Build the drivers database and the driver genes cancer sites dataset.

	All the variants of the global recurrences file are added to the database.
	Genes are then selected from the OncodriveFM and OncodriveCLUST combination
	results of every cancer site by their q-value, added to the database and
	saved into gene-driver_cancer_sites.tsv together with the cancer sites
	where they were selected.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	db_path = paths.results_path("drivers.db")
	db = SigDb(db_path)
	db.open()

	# Make sure the database is closed even if any of the steps fails
	try:
		log.info("Variants ...")

		path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
		with tsv.open(path, "r") as f:
			types = (str, str, int, str)
			for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
				# Only the chromosome and the start are stored
				chrom, start = fields[0], fields[2]
				db.add_variant(chrom, start)

		log.info("Genes ...")

		gene_sites = {}

		gene_fm = set()
		gene_clust = set()

		# Cancer sites requiring a stricter OncodriveFM threshold (i.e. "C18", "C34")
		SPECIAL_THRESHOLD = []

		log.info("  OncodriveFM ...")

		# The dot before gz is escaped so that only a literal ".tsv.gz" matches
		filename_re = re.compile(r"gene-cancer_site-(.+)\.tsv\.gz")
		base_path = paths.combination_path("oncodrivefm")
		for path in os.listdir(base_path):
			m = filename_re.match(path)
			if not m:
				continue

			cancer_site_code = m.group(1)

			if cancer_site_code in SPECIAL_THRESHOLD:
				threshold = 1e-6
			else:
				threshold = 0.01

			with tsv.open(os.path.join(base_path, path), "r") as f:
				params = tsv.params(f)
				cancer_site_name = params["group_long_name"]
				for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
					gene, qvalue = fields
					if qvalue < threshold:
						add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

						gene_fm.add(gene)

		log.info("  OncodriveCLUST ...")

		filename_re = re.compile(r"cancer_site-(.+)\.tsv\.gz")
		base_path = paths.combination_path("oncodriveclust")
		for path in os.listdir(base_path):
			m = filename_re.match(path)
			if not m:
				continue

			cancer_site_code = m.group(1)

			with tsv.open(os.path.join(base_path, path), "r") as f:
				params = tsv.params(f)
				cancer_site_name = params["group_long_name"]
				for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
					gene, qvalue = fields
					if qvalue < 0.05:
						add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

						gene_clust.add(gene)

		log.info("  Updating db ...")
		sig_genes = gene_fm | gene_clust
		for gene in sig_genes:
			db.add_gene(gene, gene in gene_fm, gene in gene_clust)

		log.info("Saving driver genes cancer sites dataset ...")
		path = paths.results_path("gene-driver_cancer_sites.tsv")
		log.debug("> {}".format(path))
		with open(path, "w") as f:
			tsv.write_param(f, "date", datetime.now())
			tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT", "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
			for gene, sites in gene_sites.items():
				# sites holds (code, name) pairs; codes and names are sorted independently
				tsv.write_line(f, gene,
							   1 if gene in gene_fm else 0,
							   1 if gene in gene_clust else 0,
							   len(sites),
							   ", ".join(sorted([code for code, name in sites])),
							   ", ".join(sorted([name for code, name in sites])))

		db.commit()
	finally:
		db.close()
コード例 #15
0
ファイル: fimpact.py プロジェクト: chris-zen/phd-thesis
def fimpact_run(partition):
	"""
	Calculate the functional impact of the VEP consequences of a partition.

	The MA scores and the VEP results of the partition are read, the TransFIC
	scores are calculated for the consequence types that support it, and the
	results are saved in two files in the partition base path: one line per
	consequence (.tfi) and one line per variant and gene with the aggregated
	impact, the coding region flag and the protein changes (.gfi).

	:param partition: Dictionary describing the partition. The keys "project", "index",
		"ma_path", "vep_path" and "base_path" are read, and the keys "tfi_path" and
		"gfi_path" are added before the partition is sent through the "results" port.
	"""

	log = task.logger
	conf = task.conf

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

	aff_gene_attrs = {}

	# Both files are handled by the with statement so that they are closed
	# even when a line fails to be parsed or calculated.
	with open(tfi_path, "w") as cf, open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			ct = ct.split(",") if ct is not None else []

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot.get(var_id)

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = so.match(ct, so.CODING_REGION)

			calculate_transfic = True

			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores.get(var_id)
			elif so.match(ct, so.STOP):               # stop
				ct_type = TransFIC.CT_STOP
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				ct_type = TransFIC.CT_FRAMESHIFT
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE):             # splice
				ct_type = "splice"
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS if so.match(ct, so.SPLICE_JUNCTION) else TransFIC.UNKNOWN_IMPACT_CLASS
				calculate_transfic = False
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				ct_type = TransFIC.CT_SYNONYMOUS
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				calculate_transfic = False

			if calculate_transfic:
				(sift_tfic, sift_class,
					pph2_tfic, pph2_class,
					ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				# if the impact was not preassigned get it from the transFIC calculated class
				sift_impact = sift_class if sift_impact is None and sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_impact is None and pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_impact is None and ma_class in IMPACT_CLASSES else ma_impact
			else:
				sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			aff_gene = (var_id, gene)

			# update aggregated impact for all the predictors
			update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

			# update whether the affected gene is a coding region or not
			update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
				update=lambda prev_value, value: prev_value or value)

			# aggregate protein changes per affected_gene
			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
			elif ct_type == "splice":
				prot_change = "r.spl?"
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
						gene, protein, protein_pos, aa_change, ", ".join(ct)))

			if prot_change is not None:
				update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
					new=lambda value: {value},
					update=lambda prev_value, value: prev_value | {value})

			# impact of this consequence by trust priority: ma, pph2, sift
			impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			tsv.write_line(cf, var_id, transcript, uniprot,
				sift_score, sift_tfic, sift_class,
				pph2_score, pph2_tfic, pph2_class,
				ma_score, ma_tfic, ma_class,
				impact, null_value="-")

	log.info("Saving variant impacts ...")

	gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))
	with open(gfi_path, "w") as vf:
		for aff_gene, attrs in aff_gene_attrs.items():
			var_id, gene = aff_gene
			# get the impact by trust priority: ma, pph2, sift
			impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS
			coding_region = attrs.get("coding_region", False)
			coding_region = 1 if coding_region else 0
			prot_changes = attrs.get("prot_changes")
			prot_changes = ",".join(prot_changes) if prot_changes is not None else None

			tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

	# Send results to the next module
	partition["tfi_path"] = tfi_path
	partition["gfi_path"] = gfi_path
	results_port.send(partition)
コード例 #16
0
ファイル: gene_impact.py プロジェクト: chris-zen/phd-thesis
def gene_impact(project):
	"""
	Aggregate the transcript impacts of all the project partitions per variant and gene.

	Every partition .tfi file is read and, for each (variant, gene) pair, the
	impact of each predictor, the coding region flag and the set of protein
	changes are accumulated. The aggregation is saved as
	"variant-gene_impact.tsv" in the project consequences path.

	:param project: Dictionary describing the project. The keys "id", "partitions" and
		"csq_path" are read, and the key "gfi_path" is added before the project is sent
		through the "projects" port.
	"""

	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	projects_port = task.ports("projects")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	partitions = project["partitions"]

	log.info("Reading {} partitions ...".format(len(partitions)))

	def bool_type(val):
		# a missing value is read as False
		if val is None:
			return False
		return bool(int(val))

	# columns of the .tfi file that are read and the type of each one of them
	types = (int, str, str, bool_type, int, int, int, int)
	columns = [0, 2, 4, 5, 6, 10, 14, 18]

	predictor_attrs = ("sift_impact", "pph2_impact", "ma_impact")

	aff_gene_attrs = {}

	for partition in partitions:
		log.info(" Partition {} ...".format(partition["index"]))
		with open(partition["tfi_path"], "r") as f:
			for row in tsv.lines(f, types, columns=columns, null_value="-"):
				var_id, gene, prot_change, coding_region, tr_impact = row[:5]
				predictor_impacts = row[5:]

				is_coding = coding_region == 1

				aff_gene = (var_id, gene)

				# keep the aggregated impact of every predictor (sift, pph2, ma)
				for attr_name, attr_value in zip(predictor_attrs, predictor_impacts):
					update_attr(aff_gene_attrs, aff_gene, attr_name, attr_value, update=TransFIC.higher_impact)

				# the gene is flagged as coding region when any of its consequences is
				update_attr(aff_gene_attrs, aff_gene, "coding_region", is_coding,
					update=lambda prev_value, value: prev_value or value)

				# collect the different protein changes of the affected gene
				if prot_change is None:
					continue

				update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
					new=lambda value: set([value]),
					update=lambda prev_value, value: prev_value | set([value]))

	seen_vars = set()
	seen_genes = set()
	for var_id, gene in aff_gene_attrs.keys():
		seen_vars.add(var_id)
		seen_genes.add(gene)

	log.info("Saving {} variant-gene impacts ({} variants and {} genes) ...".format(
		len(aff_gene_attrs), len(seen_vars), len(seen_genes)))

	gfi_path = os.path.join(project["csq_path"], "variant-gene_impact.tsv")
	with open(gfi_path, "w") as vf:
		for (var_id, gene), attrs in aff_gene_attrs.items():
			# the first available impact is taken by trust priority: ma, pph2, sift
			impact = TransFIC.UNKNOWN_IMPACT_CLASS
			for attr_name in ("ma_impact", "pph2_impact", "sift_impact"):
				candidate = attrs.get(attr_name)
				if candidate:
					impact = candidate
					break

			if attrs.get("coding_region", False):
				coding_region = 1
			else:
				coding_region = 0

			prot_changes = attrs.get("prot_changes")
			if prot_changes is not None:
				prot_changes = ",".join(prot_changes)

			tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

	# Send results to the next module
	project["gfi_path"] = gfi_path
	projects_port.send(project)
コード例 #17
0
def fimpact_run(partition):
	"""
	Calculate the functional impact of the VEP consequences of a partition.

	The MA scores and the VEP results of the partition are read, the TransFIC
	scores are calculated for the non synonymous consequences, and one line per
	consequence is saved in a .tfi file in the partition base path.

	:param partition: Dictionary describing the partition. The keys "project", "index",
		"ma_path", "vep_path" and "base_path" are read, and the key "tfi_path" is added
		before the partition is sent through the "results" port.
	"""

	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=paths.data_transfic_path())

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

	# Both files are handled by the with statement so that they are closed
	# even when a line fails to be parsed or calculated.
	with open(tfi_path, "w") as cf, open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			# a missing value means no consequence terms (splitting "" would give [""])
			ct = ct.split(",") if ct else []

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot.get(var_id)

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

			sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			# ct_type has to be assigned for frameshift and splice too, otherwise
			# the protein change branches below for them can never be reached
			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores.get(var_id)

				(sift_tfic, sift_class,
					pph2_tfic, pph2_class,
					ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
			elif so.match(ct, so.STOP):               # stop
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				ct_type = TransFIC.CT_FRAMESHIFT
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_JUNCTION):    # splice junction
				ct_type = "splice"
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_REGION):      # splice region
				ct_type = "splice"
				sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
			elif ct_type == "splice":
				prot_change = "r.spl?"
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
						gene, protein, protein_pos, aa_change, ", ".join(ct)))

			# impact of this transcript by trust priority: ma, pph2, sift
			tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
				sift_score, sift_tfic, sift_class, sift_impact,
				pph2_score, pph2_tfic, pph2_class, pph2_impact,
				ma_score, ma_tfic, ma_class, ma_impact,
				null_value="-")

	# Send results to the next module
	partition["tfi_path"] = tfi_path
	results_port.send(partition)
コード例 #18
0
def main():
    """
    Calculate Baseline Tolerance statistics per gene.

    The groups tree, the mapping between groups and features and the partial
    statistics are loaded from the paths given in the command line. Features
    with enough data get their statistics directly from their own partial
    statistics, calculate_group() is then called with the root group to fill
    in the rest, and everything is saved in the output path.

    :return: 0, to be used as the process exit code
    """
    parser = argparse.ArgumentParser(description="Calculate Baseline Tolerance statistics per gene")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH", help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP", help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH", help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH", help="Partial gene statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH", help="Output gene statistics")

    # Without an explicit type the values given in the command line would be
    # strings and the numeric comparisons below would not work as expected.
    parser.add_argument(
        "-c",
        "--count-threshold",
        dest="count_threshold",
        metavar="N",
        type=int,
        default=DEFAULT_COUNT_THRESHOLD,
        help="Minimum number of features per group",
    )

    parser.add_argument(
        "--stdev-threshold",
        dest="stdev_threshold",
        metavar="V",
        type=float,
        default=DEFAULT_STDEV_THRESHOLD,
        help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)",
    )

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    group_children = defaultdict(set)
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_children[group] |= children

    logger.info("Loading mappings between groups and features ...")

    group_genes = defaultdict(set)
    with tsv.open(args.group_genes_path) as f:
        for group, genes in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_genes[group] |= genes

    logger.info("Loading partial statistics ...")

    partial_stats = {}
    with tsv.open(args.stats_path) as f:
        # the header has the name of the predictors after the first column
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        for line in f:
            fields = line.rstrip("\n").split("\t")
            gene = fields[0]
            # each predictor column has three values separated by "/", the first one is an integer count
            gene_stats = [[float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))] for ss in fields[1:]]
            partial_stats[gene] = gene_stats

    logger.info("  Predictors: {}".format(", ".join(predictors)))
    logger.info("  Features: {}".format(len(partial_stats)))

    logger.info("Calculating features ...")

    stats = {}

    feat_count = 0
    feat_partial_count = [0] * num_predictors
    for feature, feat_partial_stats in partial_stats.items():
        feat_with_stats = False
        # one slot per predictor plus a last one that is left empty here and checked when saving
        feat_stats = [None] * (num_predictors + 1)
        for i in range(num_predictors):
            s0, s1, s2 = feat_partial_stats[i]

            if s0 == 0.0:
                continue

            if s0 < args.count_threshold:
                continue

            # with a single value the denominator below would be zero
            if s0 < 2:
                continue

            x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1))
            if x < -1e-12:
                continue

            mean = s1 / s0
            std = math.sqrt(abs(x))
            if std < args.stdev_threshold:
                continue

            feat_stats[i] = (int(s0), mean, std)
            feat_partial_count[i] += 1
            feat_with_stats = True

        if feat_with_stats:
            feat_count += 1
            stats[feature] = feat_stats

    logger.info(
        "  {} ({}) features out of {} calculated directly from partial statistics".format(
            feat_count, "/".join(map(str, feat_partial_count)), len(partial_stats)
        )
    )

    logger.info("Calculating groups ...")

    calculate_group(
        logger, args.root_group, args.count_threshold, group_children, group_genes, partial_stats, num_predictors, stats
    )

    logger.info("  {} features calculated in total".format(len(stats)))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "GENE", "GROUP", *predictors)
        for gene in sorted(stats):
            gene_stats = stats[gene]
            sb = [gene]

            # the slot after the predictors is written in the GROUP column
            stats_group = gene_stats[num_predictors]
            if stats_group is not None:
                sb.append(stats_group)
            else:
                sb.append("|" + ("-" * num_predictors))

            for pred_stats in gene_stats[:num_predictors]:
                if pred_stats is not None:
                    sb.append("/".join([str(v) for v in pred_stats]))
                else:
                    sb.append("-/-/-")
            tsv.write_line(of, *sb)

    return 0
コード例 #19
0
ファイル: update_db.py プロジェクト: chris-zen/phd-thesis
def update_db(project):
    """
    Update the project database with the functional impact results.

    For every partition the VEP results and the transcript functional impacts
    are loaded into the project database, then the variant-gene functional
    impacts and the external variant references collected per gene. The
    changes are committed at the end.

    :param project: Dictionary describing the project. The keys "id", "partitions", "db"
        and "gfi_path" are read, and the key "partitions" is removed before the project
        is sent through the "projects_out" port.
    """
    log = task.logger

    config = GlobalConfig(task.conf)

    projects_port = task.ports("projects_out")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    partitions = project["partitions"]

    if not os.path.exists(config.vardb_path):
        log.warn("Database for variation external references not found")
        # the path comes from config, there is no conf variable in this function
        log.debug("> {0}".format(config.vardb_path))

    varxdb = VarXrefsDb(config.vardb_path)
    varxdb.open()

    projdb = ProjectDb(project["db"])

    plen = len(partitions)

    gene_xrefs = defaultdict(set)

    # the consequence types column is a comma separated list
    ctype = lambda v: v.split(",")

    for part in partitions:
        log.info("Updating database with partition data ({0} out of {1}) ...".format(part["index"] + 1, plen))

        log.info("  VEP results ...")

        with open(part["vep_path"], "r") as vf:
            for fields in tsv.lines(vf, (int, str, str, ctype, str, str, str, float, float), null_value="-"):
                (
                    var_id,
                    gene,
                    transcript,
                    consequences,
                    protein_pos,
                    aa_change,
                    protein,
                    sift_score,
                    pph2_score,
                ) = fields

                var = projdb.get_variant(var_id)

                xrefs = varxdb.get_xrefs(var.chr, var.start, var.ref, var.alt, var.strand)

                if xrefs is not None:
                    xrefs = ["{0}:{1}".format(source, xref) for source, xref in xrefs]
                    gene_xrefs[gene].update(xrefs)

                    # an empty list is saved as no references
                    if len(xrefs) == 0:
                        xrefs = None

                projdb.update_variant(Variant(id=var_id, xrefs=xrefs))

                projdb.add_consequence(
                    Consequence(
                        var=Variant(id=var_id),
                        transcript=transcript,
                        gene=gene,
                        ctypes=consequences,
                        protein_pos=protein_pos,
                        aa_change=aa_change,
                        protein=protein,
                    )
                )

        log.info("  Transcript functional impacts ...")

        with open(part["tfi_path"], "r") as f:
            # only the columns required for the consequences are read from the .tfi file
            types = (int, str, str, int, float, float, int, float, float, int, float, float, int)
            columns = [0, 1, 3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17]
            for fields in tsv.lines(f, types, columns=columns, null_value="-"):
                (
                    var_id,
                    transcript,
                    uniprot,
                    impact,
                    sift_score,
                    sift_tfic,
                    sift_class,
                    pph2_score,
                    pph2_tfic,
                    pph2_class,
                    ma_score,
                    ma_tfic,
                    ma_class,
                ) = fields

                projdb.update_consequence(
                    Consequence(
                        var=Variant(id=var_id),
                        transcript=transcript,
                        uniprot=uniprot,
                        sift_score=sift_score,
                        sift_tfic=sift_tfic,
                        sift_tfic_class=sift_class,
                        pph2_score=pph2_score,
                        pph2_tfic=pph2_tfic,
                        pph2_tfic_class=pph2_class,
                        ma_score=ma_score,
                        ma_tfic=ma_tfic,
                        ma_tfic_class=ma_class,
                        impact=impact,
                    )
                )

    log.info("Updating variant-gene functional impacts ...")

    with open(project["gfi_path"], "r") as f:
        types = (int, str, float, int, str)
        for var_id, gene, impact, coding_region, prot_changes in tsv.lines(f, types, null_value="-"):
            projdb.add_affected_gene(
                AffectedGene(
                    var=Variant(id=var_id),
                    gene_id=gene,
                    impact=impact,
                    coding_region=coding_region,
                    prot_changes=prot_changes,
                )
            )

    log.info("Updating database with gene external variant references ...")

    for gene, xrefs in gene_xrefs.items():
        projdb.update_gene(Gene(id=gene, xrefs=xrefs))

    projdb.commit()
    projdb.close()

    varxdb.close()

    del project["partitions"]

    projects_port.send(project)