# stdlib imports required by the excerpts below; project helpers
# (AlgorithmInvocation, BinIndex, Database, VepJsonParser, VcfEntryParser,
# CADDUpdater, warning, die, xstr, json2str, truncate, ...) are imported
# from the project's utility modules in the original scripts
import copy
import gzip
import io
import json
import mmap
import sys
from datetime import datetime
from os import path


def load_annotation(chromosome):
    ''' parse over a JSON file, extract position, frequencies, ids, and
    ADSP-ranked most severe consequence; bulk load using COPY '''

    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0
    skipCount = 0

    resume = args.resumeAfter is None
    if not resume:
        warning("--resumeAfter flag specified; skipping until",
                args.resumeAfter, file=lfh, flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            # memory-map the file so gzip can stream it without extra copies
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount += 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp, refSnpId, file=lfh, flush=True)
                            warning("Resuming after:", args.resumeAfter,
                                    "- SKIPPED", skipCount, "lines.",
                                    file=lfh, flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount += 1
                            continue

                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # load one row per alternate allele
                        for alt in altAllele:
                            variantCount += 1
                            metaseqId = ':'.join((chrom, xstr(position),
                                                  refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)
                            binIndex = indexer.find_bin_index(chrom, position,
                                                              positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate
                            # the variant_allele
                            nRef, nAlt = entry.normalize_alleles(refAllele, alt,
                                                                 snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                                else get_allele_frequencies(nAlt, frequencies)
                            msConseq = get_most_severe_consequence(nAlt, vepParser)

                            valueStr = '#'.join((
                                'chr' + xstr(chrom),
                                xstr(position),
                                xstr(vepParser.get('is_multi_allelic')),
                                binIndex,
                                refSnpId,
                                metaseqId,
                                json2str(alleleFreq),
                                json2str(msConseq),
                                json2str(get_adsp_ranked_allele_consequences(
                                    nAlt, vepParser)),
                                json.dumps(vepResult),
                                algInvocId))

                            copyObj.write(valueStr + '\n')

                        if variantCount % args.logAfter == 0 \
                                and variantCount % args.commitAfter != 0:
                            warning("PARSED", variantCount, file=lfh, flush=True)

                        if variantCount % args.commitAfter == 0:
                            tendw = datetime.now()
                            warning('Copy object prepared in '
                                    + str(tendw - tstart) + '; '
                                    + str(copyObj.tell())
                                    + ' bytes; transferring to database',
                                    file=lfh, flush=True)

                            copyObj.seek(0)
                            cursor.copy_from(copyObj, 'variant', sep='#',
                                             null="NULL", columns=VARIANT_COLUMNS)

                            message = '{:,}'.format(variantCount)
                            if args.commit:
                                database.commit()
                                message = "COMMITTED " + message
                            else:
                                database.rollback()
                                message = "PARSED " + message + " -- rolling back"

                            if variantCount % args.logAfter == 0:
                                warning(message, "; up to = ", refSnpId,
                                        file=lfh, flush=True)

                            tend = datetime.now()
                            warning('Database copy time: ' + str(tend - tendw),
                                    file=lfh, flush=True)
                            warning('        Total time: ' + str(tend - tstart),
                                    file=lfh, flush=True)

                            if args.test:
                                die("Test complete")

                            copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)
            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
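# ---------------------------------------------------------------------------
# The loaders in these excerpts read their options from a module-level `args`
# namespace. Below is a minimal, hypothetical sketch of the CLI wiring the
# function above assumes -- flag names are inferred from its args.* accesses;
# defaults are illustrative only, and the other snippets add flags of their
# own (e.g., --vcfFile, --inputFile, --logDir).
import argparse

parser = argparse.ArgumentParser(description='load VEP results by chromosome')
parser.add_argument('--dir', required=True,
                    help='directory containing the chr*.json.gz files')
parser.add_argument('--rankingFile', required=True,
                    help='ADSP consequence-ranking file')
parser.add_argument('--gusConfigFile', required=True,
                    help='GUS database configuration file')
parser.add_argument('--resumeAfter',
                    help='refSNP id after which to resume a partial load')
parser.add_argument('--commitAfter', type=int, default=500,
                    help='rows per COPY/commit batch')
parser.add_argument('--logAfter', type=int, default=10000,
                    help='log progress every N variants')
parser.add_argument('--commit', action='store_true',
                    help='commit changes; otherwise roll back')
parser.add_argument('--test', action='store_true',
                    help='stop after the first batch')
args = parser.parse_args()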
def load_annotation():
    ''' extract basic SNP information from VCF line; check against
    AnnotatedDB; if missing, load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    variantCount = 0

    with database.cursor() as cursor:
        with open(args.vcfFile, 'r') as fh:
            with open(args.logFileName, 'w') as lfh:
                warning("Parsing", args.vcfFile, file=lfh, flush=True)
                for line in fh:
                    lineCount += 1
                    # will just be the input string, so it matches
                    # the annotated variants
                    vepResult = {}

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assuming not multiallelic

                    if refAllele == '0':  # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    # metaseqId = ':'.join((chrom, xstr(position),
                    #                       truncatedRef, truncatedAlt))

                    # assuming b/c of how the VCF files were generated
                    isMultiAllelic = False

                    metaseqId = ':'.join((chrom, xstr(position), refAllele,
                                          altAllele))

                    try:
                        if variant_is_missing(database, 'chr' + chrom, metaseqId):
                            warning(metaseqId, "- MISSING -- LOADING",
                                    file=lfh, flush=True)
                            variantCount += 1

                            positionEnd = entry.infer_variant_end_location(altAllele)
                            binIndex = indexer.find_bin_index(chrom, position,
                                                              positionEnd)

                            cursor.execute(INSERT_SQL,
                                           ('chr' + chrom, position,
                                            isMultiAllelic, binIndex, metaseqId,
                                            json.dumps(vepResult), algInvocId))

                            if args.commit:
                                database.commit()
                            else:
                                database.rollback()

                        if lineCount % 50 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants",
                                    file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", metaseqId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                warning("DONE - Loaded", variantCount, "missing variants",
                        file=lfh, flush=True)

    database.close()
    indexer.close()
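# ---------------------------------------------------------------------------
# variant_is_missing() is called above but not shown in these excerpts. A
# minimal sketch, assuming it checks an identifier against the chromosome-
# partitioned variant table; table and column names are guesses, not the
# confirmed schema. (The dbSNP loader further down calls it with a refSNP id
# instead of a metaseq id, so the real helper may be keyed differently.)
def variant_is_missing(database, chromosome, identifier):
    ''' return True if no variant with this identifier has been loaded yet
    (hypothetical implementation) '''
    sql = """SELECT 1 FROM variant
             WHERE chromosome = %s AND metaseq_id = %s LIMIT 1"""
    with database.cursor() as cursor:
        cursor.execute(sql, (chromosome, identifier))
        return cursor.fetchone() is None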
def update_variant_records_from_vcf():
    ''' look up and update variant records from a VCF file; assumes the
    load-by-file was called by a plugin, so each variant has already been
    verified to be new to the resource -- no need to check alternative
    metaseq IDs '''

    cupdater = CADDUpdater(args.logFile, args.databaseDir)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    with database.cursor() as updateCursor, \
            open(args.vcfFile, 'r') as fh:
        try:
            for line in fh:
                if line.startswith("#"):
                    continue

                lineCount += 1
                entry = VcfEntryParser(line.rstrip())

                refAllele = entry.get('ref')
                altAllele = entry.get('alt')
                chrom = xstr(entry.get('chrom'))
                if chrom == 'MT':
                    chrom = 'M'
                position = int(entry.get('pos'))

                metaseqId = ':'.join((chrom, xstr(position), refAllele,
                                      altAllele))
                record = {"metaseq_id": metaseqId}  # mimic "record"

                # only SNVs get a direct CADD lookup; anything with a
                # longer allele goes through the INDEL path
                if len(refAllele) > 1 or len(altAllele) > 1:
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if lineCount % args.commitAfter == 0:
                    warning("Processed:", lineCount,
                            "- SNVs:", cupdater.get_update_count(SNV),
                            "- INDELS:", cupdater.get_update_count(INDEL),
                            "- Not Matched:", cupdater.get_not_matched_count(),
                            file=cupdater.lfh(), flush=True)
                    updateCursor.execute(cupdater.sql_buffer_str())
                    if args.commit:
                        database.commit()
                    else:
                        database.rollback()
                    cupdater.clear_update_sql()

            if cupdater.buffered_variant_count() > 0:  # trailing batch
                updateCursor.execute(cupdater.sql_buffer_str())
                if args.commit:
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:", cupdater.get_update_count(SNV),
                    "- Updated INDELS:", cupdater.get_update_count(INDEL),
                    "- Not Matched:", cupdater.get_not_matched_count(),
                    file=cupdater.lfh(), flush=True)
            cupdater.close_lfh()

        except Exception as e:
            warning(e, entry, file=cupdater.lfh(), flush=True)
            database.rollback()
            database.close()
            print("FAIL", file=sys.stdout)
            raise
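# ---------------------------------------------------------------------------
# CADDUpdater accumulates UPDATE statements and flushes them once per
# --commitAfter lines via sql_buffer_str(). A minimal, hypothetical sketch of
# that buffer-and-flush pattern (simplified illustration; not the class's
# actual code):
class UpdateBuffer:
    ''' illustrates batching many UPDATEs into one execute() round trip '''

    def __init__(self):
        self._statements = []

    def add(self, sql):
        self._statements.append(sql)

    def buffered_variant_count(self):
        return len(self._statements)

    def sql_buffer_str(self):
        # psycopg2 will send the ';'-joined statements in one round trip
        return ';'.join(self._statements)

    def clear_update_sql(self):
        self._statements = []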
def load_annotation(chromosome):
    ''' extract basic SNP information from VCF line; check against
    AnnotatedDB; if missing, load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
        else '00-All.' + xstr(chromosome) + '.vcf.gz'
    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)

    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)
            # memory-map the file so gzip can stream it without extra copies
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount += 1
                    # will just be the input string, so it matches
                    # the annotated variants
                    vepResult = {}

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    # sanity check: all variants missing from the DB lack
                    # alt alleles, so skip any entry that has one
                    if '.' not in altAllele:
                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                        continue

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom, refSnpId):
                            warning(refSnpId, "- MISSING -- LOADING",
                                    file=lfh, flush=True)
                            for alt in altAllele:
                                # checking again in case some are multiallelic
                                if alt == '.':
                                    alt = '?'
                                variantCount += 1
                                metaseqId = ':'.join((chrom, xstr(position),
                                                      refAllele, alt))
                                positionEnd = entry.infer_variant_end_location(alt)
                                binIndex = indexer.find_bin_index(chrom, position,
                                                                  positionEnd)
                                cursor.execute(INSERT_SQL,
                                               ('chr' + chrom, xstr(position),
                                                isMultiAllelic, binIndex,
                                                refSnpId, metaseqId,
                                                json.dumps(vepResult),
                                                algInvocId))
                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants",
                                    file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

                if not args.commit:
                    database.rollback()
                    warning("DONE -- rolling back")

            mappedFile.close()
            warning("DONE - Loaded", variantCount, "missing variants",
                    file=lfh, flush=True)

    database.close()
    indexer.close()
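# ---------------------------------------------------------------------------
# INSERT_SQL is referenced above but not shown. A plausible sketch matching
# the 8-value tuple bound by the dbSNP loader; column names are assumptions
# inferred from the bound values, not the confirmed schema. (The VCF loader
# earlier binds 7 values -- no ref_snp_id -- so each script presumably
# defines its own statement.)
INSERT_SQL = """
INSERT INTO variant
    (chromosome, location, is_multi_allelic, bin_index,
     ref_snp_id, metaseq_id, vep_output, row_algorithm_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
"""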
def load_annotation():
    ''' parse over a JSON file, extract position, frequencies, ids, and
    ADSP-ranked most severe consequence; bulk load using COPY '''

    fname = args.inputFile
    lineCount = 0
    variantCount = 0
    skipCount = 0

    warning("Parsing variants from:", fname)  # should print to plugin log

    # database, vepParser, indexer, and algInvocId are assumed to be
    # module-level in the plugin wrapper this excerpt comes from
    with database.cursor() as cursor, \
            database.cursor("RealDictCursor") as dcursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fh:
            with open(args.logFile, 'w') as lfh:
                warning("Parsing variants from:", fname, file=lfh, flush=True)
                for line in fh:
                    lineCount += 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there may be json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    if lineCount == 1 or variantCount % 5000 == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assumes no multiallelic variants

                    if refAllele == '0':  # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    try:
                        # metaseqId = ':'.join((chrom, xstr(position),
                        #                       truncatedRef, truncatedAlt))
                        metaseqId = ':'.join((chrom, xstr(position), refAllele,
                                              altAllele))

                        if duplicate(metaseqId, dcursor):
                            warning("SKIPPING:", metaseqId, "- already loaded.",
                                    file=lfh, flush=True)
                            continue

                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies()
                        vepParser.adsp_rank_and_sort_consequences()

                        variantCount += 1
                        positionEnd = entry.infer_variant_end_location(altAllele)
                        binIndex = indexer.find_bin_index(chrom, position,
                                                          positionEnd)

                        # NOTE: VEP uses normalized alleles to indicate
                        # the variant_allele
                        nRef, nAlt = entry.normalize_alleles(refAllele, altAllele,
                                                             snvDivMinus=True)
                        alleleFreq = None if frequencies is None \
                            else get_allele_frequencies(nAlt, frequencies)
                        msConseq = get_most_severe_consequence(nAlt)

                        valueStr = '#'.join((
                            'chr' + xstr(chrom),
                            xstr(position),
                            binIndex,
                            metaseqId,
                            json2str(alleleFreq),
                            json2str(msConseq),
                            json2str(get_adsp_ranked_allele_consequences(nAlt)),
                            json.dumps(vepResult),
                            algInvocId))

                        copyObj.write(valueStr + '\n')

                        if variantCount % 5000 == 0:
                            warning("FOUND", variantCount, "new variants",
                                    file=lfh, flush=True)
                            tendw = datetime.now()
                            warning('Copy object prepared in '
                                    + str(tendw - tstart) + '; '
                                    + str(copyObj.tell())
                                    + ' bytes; transferring to database',
                                    file=lfh, flush=True)
                            copyObj.seek(0)
                            cursor.copy_from(copyObj, 'variant', sep='#',
                                             null="NULL", columns=VARIANT_COLUMNS)

                            message = '{:,}'.format(variantCount)
                            if args.commit:
                                database.commit()
                                message = "COMMITTED " + message
                            else:
                                database.rollback()
                                message = "PARSED " + message + " -- rolling back"

                            warning(message, "; up to = ", metaseqId,
                                    file=lfh, flush=True)
                            tend = datetime.now()
                            warning('Database copy time: ' + str(tend - tendw),
                                    file=lfh, flush=True)
                            warning('        Total time: ' + str(tend - tstart),
                                    file=lfh, flush=True)

                            copyObj = io.StringIO()  # reset io string

                    except Exception:
                        warning("ERROR parsing variant on line", line, ' - ',
                                metaseqId, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                # final commit / leftovers
                copyObj.seek(0)
                cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                                 columns=VARIANT_COLUMNS)

                message = '{:,}'.format(variantCount)
                if args.commit:
                    database.commit()
                    message = "DONE - COMMITTED " + message
                else:
                    database.rollback()
                    message = "DONE - PARSED " + message + " -- rolling back"

                warning(message, file=lfh, flush=True)
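# ---------------------------------------------------------------------------
# duplicate() is called above but not shown. A minimal sketch, assuming it
# checks the metaseq id against the variant table through the RealDict
# cursor opened by the caller; names are illustrative, not confirmed.
def duplicate(metaseqId, cursor):
    ''' return True if a variant with this metaseq id was already loaded
    (hypothetical implementation) '''
    cursor.execute("SELECT metaseq_id FROM variant WHERE metaseq_id = %s",
                   (metaseqId,))
    return cursor.fetchone() is not None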