def _parse_gvf_attributes(extra):
	"""Extract (ref, alt, xref) from the attributes column of a variant line.

	The column is split on ";" and each piece on "=". Only the keys
	"Reference_seq", "Variant_seq" and "Dbxref" are used; for "Dbxref" the
	text after the first ":" is kept. Pieces that do not contain exactly one
	"=" (or a Dbxref without ":") are ignored.

	Returns a 3-tuple whose items are None when the key was not found or
	when *extra* itself is None.
	"""
	ref = alt = xref = None
	if extra is None:
		return ref, alt, xref

	for attr in extra.split(";"):
		try:
			key, value = attr.split("=")
			if key == "Dbxref":
				xref = value[value.index(":") + 1:]
			elif key == "Reference_seq":
				ref = value
			elif key == "Variant_seq":
				alt = value
		except ValueError:
			# Raised by the tuple unpacking or by str.index(); skip this piece
			continue

	return ref, alt, xref


def _rate(count, seconds):
	"""Return *count* per second, or 0.0 when no measurable time has elapsed."""
	if seconds > 0:
		return float(count) / seconds
	return 0.0


def main():
	"""Command line entry point: load variant files into a VarXrefsDb database.

	Parses the command line (--db is mandatory, at least one variants file is
	required), then reads every file line by line and adds one database entry
	per complete variant. Lines starting with "#" are skipped. Lines that
	cannot be parsed are reported with their line number and skipped.

	The transaction is committed when all files have been read or when the
	user interrupts with Ctrl-C, and rolled back on any other failure.
	Exits with status -1 on invalid arguments, on a missing input file and
	on Ctrl-C. Finally logs per-source, per-chromosome and total statistics.
	"""
	import sys
	import traceback

	parser = OptionParser(usage = "usage: %prog [options] <Variants gvf.gz file> ...")

	parser.add_option("--db", dest="db_path",
		help="Database path")

	parser.add_option("-L", "--log-level", dest="log_level",
		default="info", choices=["debug", "info", "warn", "error", "critical", "notset"],
		help="Which log level: debug, info, warn, error, critical, notset")

	(options, args) = parser.parse_args()

	logging.basicConfig(
		level=LOG_LEVEL[options.log_level],
		format="%(asctime)s %(levelname)-5s : %(message)s")

	log = logging.getLogger("var_db")

	if len(args) < 1:
		log.error("At least one variants file is required")
		parser.print_help()
		sys.exit(-1)

	if options.db_path is None:
		log.error("The database path should be specified")
		parser.print_help()
		sys.exit(-1)

	db_path = options.db_path

	log.info("Opening database ...")

	db = VarXrefsDb(db_path)

	db.open()

	db.begin()

	total_count = 0
	total_start_time = time.time()

	src_var_count = {}
	chr_var_count = {}
	strands = set()

	try:
		# partial_* track throughput between two progress messages and
		# deliberately span file boundaries
		partial_count = 0
		partial_start_time = time.time()
		for xref_path in args:
			log.info("Reading {0} ...".format(xref_path))

			if not os.path.isfile(xref_path):
				log.error("File not found: {0}".format(xref_path))
				sys.exit(-1)

			src_count = 0
			src_start_time = time.time()
			src_var_count[xref_path] = 0

			f = tsv.open(xref_path, "r")
			try:
				# A single pass keeps line_num aligned with the line being
				# parsed and makes sure the first record after the header
				# is not lost
				for line_num, line in enumerate(f, 1):
					if line.startswith("#"):
						continue

					try:
						# Empty columns are normalised to None
						fields = [x if len(x) > 0 else None for x in line.rstrip("\n").split("\t")]

						# Exactly nine columns are expected; anything else
						# raises and is reported below
						chrom, source, _type, start, end, _1, strand, _2, extra = fields

						start = int(start)
						int(end) # not stored, but a malformed end still rejects the line

						ref, alt, xref = _parse_gvf_attributes(extra)

						required = (chrom, start, strand, ref, alt, source, xref)
						if any(x is None for x in required):
							# str() is needed because at least one item is None here
							log.warning("Discarding incomplete variant: {0}".format(
								",".join(str(x) for x in required)))
							continue

						chr_var_count[chrom] = chr_var_count.get(chrom, 0) + 1
						strands.add(strand)

						db.add(chrom, start, ref, alt, source, xref, strand)

						total_count += 1
						src_count += 1
						partial_count += 1

						elapsed_time = time.time() - partial_start_time
						if elapsed_time >= 10.0:
							log.debug("  {0:.1f} variants/second, {1} variants, {2} total variants".format(
									_rate(partial_count, elapsed_time),
									hsize(src_count), hsize(total_count)))
							partial_count = 0
							partial_start_time = time.time()

					except Exception:
						# Best effort: report the offending line and keep going
						log.error("Error at line {0}:\n{1}".format(line_num, line.rstrip("\n")))
						traceback.print_exc(file=sys.stdout)
			finally:
				f.close()

			src_var_count[xref_path] = src_count

			log.info("  {0:.1f} variants/second, {1} variants, {2} total variants".format(
					_rate(src_count, time.time() - src_start_time),
					hsize(src_count), hsize(total_count)))

		db.commit()
	except KeyboardInterrupt:
		# Keep what has been loaded so far
		db.commit()
		log.warning("Interrupted by the user with Ctrl-C")
		sys.exit(-1)
	except:
		# Deliberately bare so that SystemExit also rolls back; always re-raised
		db.rollback()
		raise
	finally:
		db.close()

	total_ratio = _rate(total_count, time.time() - total_start_time)

	log.info("Statistics:")
	log.info("  Sources:")
	for xref_path in args:
		log.info("    {0}: {1} variants".format(os.path.basename(xref_path), src_var_count[xref_path]))

	log.info("  Chromosomes:")
	for chrom, count in chr_var_count.items():
		log.info("    {0:>2}: {1:>7} variants".format(chrom, str(count)))
	log.info("  Strands: {0}".format(", ".join(strands)))

	log.info("  Total {0} variants ({1:.1f} variants/sec)".format(hsize(total_count), total_ratio))
# Example #2
# 0
def _split_consequences(value):
    """Split a comma separated consequence types column into a list."""
    return value.split(",")


def _load_vep_results(part, projdb, varxdb, gene_xrefs):
    """Store the VEP consequences of one partition and attach external references to its variants.

    Every xref found for a variant is also accumulated into
    gene_xrefs[gene] (a mapping of gene to set of "source:xref" strings).
    """
    types = (int, str, str, _split_consequences, str, str, str, float, float)

    with open(part["vep_path"], "r") as vf:
        for fields in tsv.lines(vf, types, null_value="-"):
            # The two trailing score columns are read but not used here
            (var_id, gene, transcript, consequences,
             protein_pos, aa_change, protein, _sift_score, _pph2_score) = fields

            var = projdb.get_variant(var_id)

            xrefs = varxdb.get_xrefs(var.chr, var.start, var.ref, var.alt, var.strand)

            if xrefs is not None:
                xrefs = ["{0}:{1}".format(source, xref) for source, xref in xrefs]
                gene_xrefs[gene].update(xrefs)

                # An empty result is stored as None rather than as an empty list
                if not xrefs:
                    xrefs = None

            projdb.update_variant(Variant(id=var_id, xrefs=xrefs))

            projdb.add_consequence(
                Consequence(
                    var=Variant(id=var_id),
                    transcript=transcript,
                    gene=gene,
                    ctypes=consequences,
                    protein_pos=protein_pos,
                    aa_change=aa_change,
                    protein=protein,
                )
            )


def _load_transcript_impacts(part, projdb):
    """Update the consequences of one partition with the transcript functional impact columns."""
    types = (int, str, str, int, float, float, int, float, float, int, float, float, int)
    columns = [0, 1, 3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17]

    with open(part["tfi_path"], "r") as f:
        for fields in tsv.lines(f, types, columns=columns, null_value="-"):
            (var_id, transcript, uniprot, impact,
             sift_score, sift_tfic, sift_class,
             pph2_score, pph2_tfic, pph2_class,
             ma_score, ma_tfic, ma_class) = fields

            projdb.update_consequence(
                Consequence(
                    var=Variant(id=var_id),
                    transcript=transcript,
                    uniprot=uniprot,
                    sift_score=sift_score,
                    sift_tfic=sift_tfic,
                    sift_tfic_class=sift_class,
                    pph2_score=pph2_score,
                    pph2_tfic=pph2_tfic,
                    pph2_tfic_class=pph2_class,
                    ma_score=ma_score,
                    ma_tfic=ma_tfic,
                    ma_tfic_class=ma_class,
                    impact=impact,
                )
            )


def _load_gene_impacts(project, projdb):
    """Store the variant-gene functional impacts read from project["gfi_path"]."""
    types = (int, str, float, int, str)

    with open(project["gfi_path"], "r") as f:
        for var_id, gene, impact, coding_region, prot_changes in tsv.lines(f, types, null_value="-"):
            projdb.add_affected_gene(
                AffectedGene(
                    var=Variant(id=var_id),
                    gene_id=gene,
                    impact=impact,
                    coding_region=coding_region,
                    prot_changes=prot_changes,
                )
            )


def update_db(project):
    """Load the results computed for *project* into its project database.

    For every entry of project["partitions"] the VEP results ("vep_path")
    and the transcript functional impacts ("tfi_path") are loaded; then the
    variant-gene functional impacts (project["gfi_path"]) and the external
    references collected per gene are stored. The project database is
    committed once at the end.

    On success the "partitions" key is removed from *project* and the
    project is sent through the "projects_out" port. Both databases are
    closed even when loading fails; in that case nothing is committed by
    this function and the exception propagates.
    """
    log = task.logger

    config = GlobalConfig(task.conf)

    projects_port = task.ports("projects_out")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    partitions = project["partitions"]

    # Only a warning: the database is opened below regardless
    if not os.path.exists(config.vardb_path):
        log.warn("Database for variation external references not found")
        log.debug("> {0}".format(config.vardb_path))

    varxdb = VarXrefsDb(config.vardb_path)
    varxdb.open()
    try:
        projdb = ProjectDb(project["db"])
        try:
            plen = len(partitions)

            gene_xrefs = defaultdict(set)

            for part in partitions:
                log.info("Updating database with partition data ({0} out of {1}) ...".format(part["index"] + 1, plen))

                log.info("  VEP results ...")
                _load_vep_results(part, projdb, varxdb, gene_xrefs)

                log.info("  Transcript functional impacts ...")
                _load_transcript_impacts(part, projdb)

            log.info("Updating variant-gene functional impacts ...")
            _load_gene_impacts(project, projdb)

            log.info("Updating database with gene external variant references ...")

            for gene, xrefs in gene_xrefs.items():
                projdb.update_gene(Gene(id=gene, xrefs=xrefs))

            projdb.commit()
        finally:
            # NOTE(review): assumes ProjectDb.close() does not commit pending
            # changes on its own -- confirm against ProjectDb
            projdb.close()
    finally:
        varxdb.close()

    del project["partitions"]

    projects_port.send(project)