def _parse_gvf_attributes(extra):
    """Pull the alleles and the external reference out of an attributes column.

    *extra* is the raw ``key=value;key=value`` text of the last column (or
    ``None`` when that column was empty). Only three keys are looked at:

    - ``Reference_seq`` -> reference allele
    - ``Variant_seq``   -> variant allele
    - ``Dbxref``        -> external reference; the text after the first ``:``
      is kept, and a value without ``:`` is ignored

    Entries without an ``=`` are skipped. Returns a ``(ref, alt, xref)``
    tuple in which any element that was not found is ``None``.
    """
    ref = alt = xref = None
    if extra is None:
        return ref, alt, xref

    for attr in extra.split(";"):
        key, sep, value = attr.partition("=")
        if not sep:
            continue
        if key == "Dbxref":
            _prefix, colon, ident = value.partition(":")
            if colon:
                xref = ident
        elif key == "Reference_seq":
            ref = value
        elif key == "Variant_seq":
            alt = value

    return ref, alt, xref


def main():
    """Command line entry point: load variant cross references into a database.

    Every positional argument is a variants file with nine tab separated
    columns (chromosome, source, type, start, end, <ignored>, strand,
    <ignored>, attributes). Lines starting with ``#`` are skipped. Each
    complete record is stored through ``VarXrefsDb.add``; records missing the
    chromosome, strand, alleles, source or external reference are discarded
    with a warning, and lines that cannot be parsed are logged and skipped.

    The whole load runs inside one transaction: it is committed on success
    and on Ctrl-C, and rolled back on any other error. The process exits with
    status -1 on bad arguments, on a missing input file or when interrupted.
    """
    # Local imports keep this block self-contained regardless of what the
    # module header imports.
    import sys
    import traceback

    parser = OptionParser(usage="usage: %prog [options] <Variants gvf.gz file> ...")

    parser.add_option("--db", dest="db_path", help="Database path")

    parser.add_option(
        "-L", "--log-level", dest="log_level", default="info",
        choices=["debug", "info", "warn", "error", "critical", "notset"],
        help="Which log level: debug, info, warn, error, critical, notset")

    (options, args) = parser.parse_args()

    logging.basicConfig(
        level=LOG_LEVEL[options.log_level],
        format="%(asctime)s %(levelname)-5s : %(message)s")

    log = logging.getLogger("var_db")

    if len(args) < 1:
        log.error("At least one variants file is required")
        parser.print_help()
        sys.exit(-1)

    if options.db_path is None:
        log.error("The database path should be specified")
        parser.print_help()
        sys.exit(-1)

    db_path = options.db_path

    log.info("Opening database ...")

    db = VarXrefsDb(db_path)
    db.open()
    db.begin()

    total_count = 0
    total_start_time = time.time()

    src_var_count = {}
    chromosomes = set()
    chr_var_count = {}
    strands = set()

    try:
        partial_count = 0
        partial_start_time = time.time()

        for xref_path in args:
            log.info("Reading {0} ...".format(xref_path))

            if not os.path.isfile(xref_path):
                log.error("File not found: {0}".format(xref_path))
                sys.exit(-1)

            src_count = 0
            src_start_time = time.time()
            src_var_count[xref_path] = 0

            f = tsv.open(xref_path, "r")
            try:
                # Header/comment lines are skipped inside the loop so that the
                # first data record is not consumed while looking for the end
                # of the header.
                for line_num, line in enumerate(f, 1):
                    if line.startswith("#"):
                        continue

                    try:
                        fields = [x or None for x in line.rstrip("\n").split("\t")]
                        chrom, source, _type, start, end, _1, strand, _2, extra = fields

                        start = int(start)
                        int(end)  # not stored, but a malformed end rejects the line

                        ref, alt, xref = _parse_gvf_attributes(extra)

                        record = [chrom, start, strand, ref, alt, source, xref]
                        if any(value is None for value in record):
                            # str() each value: at least one of them is None here
                            log.warning("Discarding incomplete variant: {0}".format(
                                ",".join(str(value) for value in record)))
                            continue

                        src_var_count[xref_path] += 1
                        chromosomes.add(chrom)
                        chr_var_count[chrom] = chr_var_count.get(chrom, 0) + 1
                        strands.add(strand)

                        db.add(chrom, start, ref, alt, source, xref, strand)

                        total_count += 1
                        src_count += 1
                        partial_count += 1

                        # Report throughput roughly every ten seconds
                        elapsed_time = time.time() - partial_start_time
                        if elapsed_time >= 10.0:
                            ratio = float(partial_count) / elapsed_time
                            log.debug(" {0:.1f} variants/second, {1} variants, {2} total variants".format(
                                ratio, hsize(src_count), hsize(total_count)))
                            partial_count = 0
                            partial_start_time = time.time()
                    except Exception:
                        # A bad line must not abort the whole load
                        log.error("Error at line {0}:\n{1}".format(line_num, line.rstrip("\n")))
                        traceback.print_exc(file=sys.stdout)
            finally:
                f.close()

            elapsed_time = time.time() - src_start_time
            ratio = float(src_count) / elapsed_time if elapsed_time > 0 else 0.0
            log.info(" {0:.1f} variants/second, {1} variants, {2} total variants".format(
                ratio, hsize(src_count), hsize(total_count)))

        db.commit()
    except KeyboardInterrupt:
        # Keep what has been loaded so far
        db.commit()
        log.warning("Interrupted by the user with Ctrl-C")
        sys.exit(-1)
    except:
        # Deliberately bare: SystemExit raised above must also roll back.
        db.rollback()
        raise
    finally:
        db.close()

    elapsed_time = time.time() - total_start_time
    total_ratio = float(total_count) / elapsed_time if elapsed_time > 0 else 0.0

    log.info("Statistics:")
    log.info(" Sources:")
    for xref_path in args:
        log.info(" {0}: {1} variants".format(os.path.basename(xref_path), src_var_count[xref_path]))
    log.info(" Chromosomes:")
    for chrom in chromosomes:
        log.info(" {0:>2}: {1:>7} variants".format(chrom, str(chr_var_count[chrom])))
    log.info(" Strands: {0}".format(", ".join(strands)))
    log.info(" Total {0} variants ({1:.1f} variants/sec)".format(hsize(total_count), total_ratio))
def _load_vep_results(projdb, varxdb, vep_path, gene_xrefs):
    """Store the consequences of one VEP results file and annotate variants.

    For each row the variant is looked up in *projdb*, its external
    references are fetched from *varxdb* and written back to the variant, and
    a consequence is added. References found are also accumulated per gene in
    *gene_xrefs* (a mapping of gene -> set of ``"source:xref"`` strings).
    """
    def split_consequences(value):
        return value.split(",")

    types = (int, str, str, split_consequences, str, str, str, float, float)

    with open(vep_path, "r") as vf:
        for fields in tsv.lines(vf, types, null_value="-"):
            (var_id, gene, transcript, consequences, protein_pos,
             aa_change, protein, _sift_score, _pph2_score) = fields

            var = projdb.get_variant(var_id)

            xrefs = varxdb.get_xrefs(var.chr, var.start, var.ref, var.alt, var.strand)
            if xrefs is not None:
                xrefs = ["{0}:{1}".format(source, xref) for source, xref in xrefs]
                gene_xrefs[gene].update(xrefs)
                if len(xrefs) == 0:
                    xrefs = None

            # NOTE(review): the variant is updated for every row, with
            # xrefs=None when nothing was found -- confirm this matches the
            # intended nesting.
            projdb.update_variant(Variant(id=var_id, xrefs=xrefs))

            projdb.add_consequence(Consequence(
                var=Variant(id=var_id), transcript=transcript, gene=gene,
                ctypes=consequences, protein_pos=protein_pos,
                aa_change=aa_change, protein=protein))


def _load_transcript_impacts(projdb, tfi_path):
    """Update consequences with the scores read from a transcript impact file."""
    types = (int, str, str, int, float, float, int, float, float, int, float, float, int)
    columns = [0, 1, 3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17]

    with open(tfi_path, "r") as f:
        for fields in tsv.lines(f, types, columns=columns, null_value="-"):
            (var_id, transcript, uniprot, impact,
             sift_score, sift_tfic, sift_class,
             pph2_score, pph2_tfic, pph2_class,
             ma_score, ma_tfic, ma_class) = fields

            projdb.update_consequence(Consequence(
                var=Variant(id=var_id), transcript=transcript, uniprot=uniprot,
                sift_score=sift_score, sift_tfic=sift_tfic, sift_tfic_class=sift_class,
                pph2_score=pph2_score, pph2_tfic=pph2_tfic, pph2_tfic_class=pph2_class,
                ma_score=ma_score, ma_tfic=ma_tfic, ma_tfic_class=ma_class,
                impact=impact))


def _load_gene_impacts(projdb, gfi_path):
    """Add one affected-gene record per row of a variant-gene impact file."""
    types = (int, str, float, int, str)

    with open(gfi_path, "r") as f:
        for var_id, gene, impact, coding_region, prot_changes in tsv.lines(f, types, null_value="-"):
            projdb.add_affected_gene(AffectedGene(
                var=Variant(id=var_id), gene_id=gene, impact=impact,
                coding_region=coding_region, prot_changes=prot_changes))


def update_db(project):
    """Load a project's partition results into its database and forward it.

    *project* is a dict that must provide ``id``, ``db``, ``gfi_path`` and
    ``partitions``; every partition provides ``index``, ``vep_path`` and
    ``tfi_path``. For each partition the VEP results and the transcript
    functional impacts are stored; afterwards the variant-gene impacts and
    the external references collected per gene are written and the project
    database is committed.

    Both databases are closed even when loading fails (nothing is committed
    in that case). On success the ``partitions`` key is removed from
    *project* and the project is sent through the ``projects_out`` port.
    """
    log = task.logger
    config = GlobalConfig(task.conf)

    projects_port = task.ports("projects_out")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    partitions = project["partitions"]

    if not os.path.exists(config.vardb_path):
        log.warn("Database for variation external references not found")
        log.debug("> {0}".format(config.vardb_path))

    varxdb = VarXrefsDb(config.vardb_path)
    varxdb.open()
    try:
        projdb = ProjectDb(project["db"])
        try:
            plen = len(partitions)
            gene_xrefs = defaultdict(set)

            for part in partitions:
                log.info("Updating database with partition data ({0} out of {1}) ...".format(
                    part["index"] + 1, plen))

                log.info(" VEP results ...")
                _load_vep_results(projdb, varxdb, part["vep_path"], gene_xrefs)

                log.info(" Transcript functional impacts ...")
                _load_transcript_impacts(projdb, part["tfi_path"])

            log.info("Updating variant-gene functional impacts ...")
            _load_gene_impacts(projdb, project["gfi_path"])

            log.info("Updating database with gene external variant references ...")
            for gene, xrefs in gene_xrefs.items():
                projdb.update_gene(Gene(id=gene, xrefs=xrefs))

            projdb.commit()
        finally:
            projdb.close()
    finally:
        varxdb.close()

    # The partitions are no longer needed by whatever receives the project
    del project["partitions"]

    projects_port.send(project)