Ejemplo n.º 1
0
def load_annotation(chromosome):
    ''' parse over the VEP JSON result file for one chromosome, extract
    position, frequencies, ids, and ADSP-ranked most severe consequence;
    bulk load using COPY

    Run-time options (input directory, ranking file, GUS config, commit /
    test / resume flags, commit and log intervals) are read from the
    module-level `args`.  Progress and errors are written to
    '<chromosome>.log' in the current working directory.

    chromosome -- chromosome label; used to build both the log file name
                  and the input file name ('chr<chromosome>.json.gz')

    Any exception raised while parsing a variant is logged and re-raised.
    The log file, bin indexer and database connection are released whether
    or not the load succeeds.
    '''

    with open(xstr(chromosome) + '.log', 'w') as lfh:
        algInvocation = AlgorithmInvocation(
            'parallel_load_dbsnp_vep_result.py',
            json.dumps({'chromosome': chromosome}), args.commit)
        algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
        algInvocation.close()
        warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

        vepParser = VepJsonParser(args.rankingFile, verbose=True)
        indexer = BinIndex(args.gusConfigFile, verbose=False)
        try:
            database = Database(args.gusConfigFile)
            database.connect()
            try:
                _copy_vep_results(chromosome, lfh, algInvocId, vepParser,
                                  indexer, database)
            finally:
                database.close()
        finally:
            indexer.close()


def _copy_vep_results(chromosome, lfh, algInvocId, vepParser, indexer,
                      database):
    ''' stream chr<chromosome>.json.gz from args.dir and COPY one row per
    alternative allele into the variant table, committing (or rolling back)
    every args.commitAfter variants '''

    fname = path.join(args.dir, 'chr' + xstr(chromosome) + '.json.gz')
    lineCount = 0
    variantCount = 0
    skipCount = 0

    # with --resumeAfter, every line up to and including the one whose
    # refSNP id equals args.resumeAfter is skipped; loading starts on the
    # line that follows it
    resume = args.resumeAfter is None
    if not resume:
        warning("--resumeAfter flag specified; Finding skip until point",
                args.resumeAfter,
                file=lfh,
                flush=True)

    previousSnp = None
    # True while copyObj is fresh and its timer has not been started; an
    # explicit flag is used because a multi-allelic record can cross the
    # commit boundary mid-line, so variantCount alone cannot tell
    newBatch = True
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        # memory-map the compressed file read-only; the mapping is released
        # by the with statement even when parsing fails
        with open(fname, 'r') as fhandle, \
             mmap.mmap(fhandle.fileno(), 0,
                       prot=mmap.PROT_READ) as mappedFile:
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    # the parser gets its own copy; vepResult itself is
                    # stored alongside the variant further down
                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp,
                                    refSnpId,
                                    file=lfh,
                                    flush=True)
                            warning("Resuming after:",
                                    args.resumeAfter,
                                    "- SKIPPED",
                                    skipCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    if newBatch:
                        warning('Processing new copy object',
                                file=lfh,
                                flush=True)
                        tstart = datetime.now()
                        newBatch = False

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:

                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join(
                                (chrom, xstr(position), refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(
                                chrom, position, positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                              else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(
                                nAlt, vepParser)

                            # '#' is the field separator handed to copy_from
                            valueStr = '#'.join(
                                ('chr' + xstr(chrom), xstr(position),
                                 xstr(vepParser.get('is_multi_allelic')),
                                 binIndex, refSnpId, metaseqId,
                                 json2str(alleleFreq), json2str(msConseq),
                                 json2str(
                                     get_adsp_ranked_allele_consequences(
                                         nAlt, vepParser)),
                                 json.dumps(vepResult), algInvocId))

                            copyObj.write(valueStr + '\n')

                            # on commit boundaries the count is reported
                            # with the commit message instead
                            if variantCount % args.logAfter == 0 \
                              and variantCount % args.commitAfter != 0:
                                warning("PARSED",
                                        variantCount,
                                        file=lfh,
                                        flush=True)

                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in ' +
                                        str(tendw - tstart) + '; ' +
                                        str(copyObj.tell()) +
                                        ' bytes; transfering to database',
                                        file=lfh,
                                        flush=True)
                                copyObj.seek(0)
                                cursor.copy_from(copyObj,
                                                 'variant',
                                                 sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message

                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message,
                                            "; up to = ",
                                            refSnpId,
                                            file=lfh,
                                            flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' +
                                        str(tend - tendw),
                                        file=lfh,
                                        flush=True)
                                warning('        Total time: ' +
                                        str(tend - tstart),
                                        file=lfh,
                                        flush=True)

                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string
                                newBatch = True

                    except Exception as e:
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj,
                             'variant',
                             sep='#',
                             null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)
def load_annotation(chromosome):
    ''' extract basic SNP information from each line of the dbSNP VCF for
    one chromosome; check against AnnotatedDB; if missing, load

    Only records whose ALT field contains '.' are examined; all others are
    skipped.  For each such record that variant_is_missing() reports as
    absent, one row per ALT allele is inserted with INSERT_SQL ('.' alleles
    are stored as '?') and committed, or rolled back when args.commit is
    not set.

    chromosome -- chromosome label; 'M' selects '00-All.MT.vcf.gz',
                  anything else '00-All.<chromosome>.vcf.gz'

    Input directory, log directory and GUS config come from the
    module-level `args`.  INSERT_SQL and algInvocId are not defined here
    and are expected to exist at module level.  Exceptions raised while
    loading a variant are logged and re-raised; the indexer and the
    database connection are closed in either case.
    '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)
    try:
        database = Database(args.gusConfigFile)
        database.connect()
        try:
            _load_missing_refsnps(chromosome, indexer, database)
        finally:
            database.close()
    finally:
        indexer.close()


def _load_missing_refsnps(chromosome, indexer, database):
    ''' scan the gzipped dbSNP VCF for `chromosome` and insert the refSNPs
    that are not yet in the database, logging progress as it goes '''

    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
      else '00-All.' + xstr(chromosome) + '.vcf.gz'

    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)
    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)
            # memory-map the compressed file read-only; both the mapping
            # and the gzip reader are released even when loading fails
            with mmap.mmap(fhandle.fileno(), 0,
                           prot=mmap.PROT_READ) as mappedFile, \
                 gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for rawLine in gfh:
                    lineCount = lineCount + 1

                    # GzipFile yields bytes; decode before any str
                    # comparison or parsing
                    line = rawLine.decode('utf-8')

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # will just be the input string, so it matches the
                    # annotated variants.
                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult = {'input': entry.get_entry()}

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    if '.' not in altAllele:
                        if lineCount % 25000 == 0:
                            warning("Parsed",
                                    lineCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                        continue  # sanity check/all missing are lacking alt alleles

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom,
                                              refSnpId):
                            warning(refSnpId,
                                    "- MISSING -- LOADING",
                                    file=lfh,
                                    flush=True)
                            for alt in altAllele:
                                if alt == '.':  # checking again in case some are multiallelic
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join(
                                    (chrom, xstr(position), refAllele, alt))
                                positionEnd = entry.infer_variant_end_location(
                                    alt)

                                binIndex = indexer.find_bin_index(
                                    chrom, position, positionEnd)
                                cursor.execute(
                                    INSERT_SQL,
                                    ('chr' + chrom, xstr(position),
                                     isMultiAllelic,
                                     binIndex, refSnpId, metaseqId,
                                     json.dumps(vepResult), algInvocId))

                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed",
                                    lineCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            warning("Loaded",
                                    variantCount,
                                    "missing variants",
                                    file=lfh,
                                    flush=True)

                    except Exception:
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            # written while the log file is still open
            warning("DONE - Loaded",
                    variantCount,
                    "missing variants",
                    file=lfh,
                    flush=True)
def load_annotation():
    ''' walk the VCF named by args.vcfFile line by line; build a metaseq id
    (chrom:pos:ref:alt) for every non-header record and, when
    variant_is_missing() reports it absent from the database, insert it
    with INSERT_SQL.

    Each insert is committed immediately, or rolled back when args.commit
    is not set.  Progress is written to args.logFileName every 50 lines.
    Records are assumed to carry a single ALT allele.  A failure on any
    record is logged, "FAIL" is printed to stdout, and the exception is
    re-raised.
    '''

    binIndexer = BinIndex(args.gusConfigFile, verbose=False)

    db = Database(args.gusConfigFile)
    db.connect()

    nLines = 0
    nLoaded = 0
    with db.cursor() as cur, \
         open(args.vcfFile, 'r') as vcfHandle, \
         open(args.logFileName, 'w') as logHandle:
        warning("Parsing", args.vcfFile, file=logHandle, flush=True)
        for line in vcfHandle:
            nLines += 1

            # header / meta lines carry no variant
            if line.startswith('#'):
                continue

            entry = VcfEntryParser(line.rstrip())

            # only the parsed input is stored, so the record has the same
            # shape as the annotated variants
            annotation = {'input': entry.get_entry()}

            ref = entry.get('ref')
            alt = entry.get('alt')  # single ALT allele assumed

            # a literal '0' allele shows up occasionally; store it as '?'
            if ref == '0':
                ref = '?'
            if alt == '0':
                alt = '?'

            chrom = xstr(entry.get('chrom'))
            if chrom == 'MT':
                chrom = 'M'
            position = int(entry.get('pos'))

            metaseqId = ':'.join((chrom, xstr(position), ref, alt))

            try:
                if variant_is_missing(db, 'chr' + chrom, metaseqId):
                    warning(metaseqId, "- MISSING -- LOADING",
                            file=logHandle, flush=True)
                    nLoaded += 1

                    endPosition = entry.infer_variant_end_location(alt)
                    binIndex = binIndexer.find_bin_index(
                        chrom, position, endPosition)

                    row = ('chr' + chrom,
                           position,
                           False,  # never multi-allelic, given how the VCF files were generated
                           binIndex,
                           metaseqId,
                           json.dumps(annotation),
                           algInvocId)
                    cur.execute(INSERT_SQL, row)

                    if args.commit:
                        db.commit()
                    else:
                        db.rollback()

                if nLines % 50 == 0:
                    warning("Parsed", nLines, "lines.",
                            file=logHandle, flush=True)
                    warning("Loaded", nLoaded, "missing variants",
                            file=logHandle, flush=True)

            except Exception:
                warning("ERROR parsing variant", metaseqId,
                        file=logHandle, flush=True)
                warning(nLines, ":", line, file=logHandle, flush=True)
                print("FAIL", file=sys.stdout)
                raise

        warning("DONE - Loaded", nLoaded, "missing variants",
                file=logHandle, flush=True)

    db.close()
    binIndexer.close()
Ejemplo n.º 4
0
def load_annotation():
    ''' parse over a JSON file of VEP results (one per line), extract
    position, frequencies, ids, and ADSP-ranked most severe consequence;
    bulk load using COPY

    Variants whose metaseq id (chrom:pos:ref:alt) is reported by
    duplicate() as already loaded are skipped.  New variants are buffered
    and copied into the 'variant' table every 5000 variants, then committed
    (or rolled back when args.commit is not set); leftovers are flushed at
    the end.  Records are assumed to carry a single ALT allele.

    Input and log file names come from the module-level `args`.  The names
    database, vepParser, indexer and algInvocId are not defined here and
    are expected to exist at module level.  A failure on any record is
    logged, "FAIL" is printed to stdout, and the exception is re-raised.
    '''

    batchSize = 5000  # variants per COPY / commit
    fname = args.inputFile
    variantCount = 0

    # True while copyObj is fresh and its timer has not been started, so
    # the batch-start message is logged once per batch rather than on
    # every skipped duplicate
    newBatch = True

    warning("Parsing variants from:", fname) # should print to plugin log
    with database.cursor() as cursor, database.cursor("RealDictCursor") as dcursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fh, open(args.logFile, 'w') as lfh:
            warning("Parsing variants from:", fname, file=lfh, flush=True)
            for line in fh:
                vepResult = json.loads(line.rstrip())

                # the parser gets its own copy; vepResult itself is stored
                # alongside the variant further down
                vepParser.set_annotation(copy.deepcopy(vepResult))
                vepInputStr = vepParser.get('input')
                entry = VcfEntryParser(vepInputStr)

                # there may be json formatting issues w/the input str
                # so replace w/the parsed entry; which is now a dict
                vepResult['input'] = entry.get_entry()

                if newBatch:
                    warning('Processing new copy object', file=lfh, flush=True)
                    tstart = datetime.now()
                    newBatch = False

                refAllele = entry.get('ref')
                altAllele = entry.get('alt')  # assumes no multiallelic variants

                if refAllele == '0': # happens sometimes
                    refAllele = '?'
                if altAllele == '0':
                    altAllele = '?'

                chrom = xstr(entry.get('chrom'))
                if chrom == 'MT':
                    chrom = 'M'

                position = int(entry.get('pos'))

                # bound before the try so the error handler can neither hit
                # an unbound name nor report the previous line's id
                metaseqId = None
                try:
                    metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                    if duplicate(metaseqId, dcursor):
                        warning("SKIPPING:",metaseqId, "- already loaded.", file=lfh, flush=True)
                        continue

                    vepParser.set('ref_allele', refAllele)
                    vepParser.set('alt_allele', altAllele)

                    frequencies = get_frequencies()
                    vepParser.adsp_rank_and_sort_consequences()

                    variantCount = variantCount + 1

                    positionEnd = entry.infer_variant_end_location(altAllele)
                    binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                    # NOTE: VEP uses normalized alleles to indicate variant_allele
                    nRef, nAlt = entry.normalize_alleles(refAllele, altAllele, snvDivMinus=True)
                    alleleFreq = None if frequencies is None \
                      else get_allele_frequencies(nAlt, frequencies)

                    msConseq = get_most_severe_consequence(nAlt)

                    # '#' is the field separator handed to copy_from
                    valueStr = '#'.join((
                      'chr' + xstr(chrom),
                      xstr(position),
                      binIndex,
                      metaseqId,
                      json2str(alleleFreq),
                      json2str(msConseq),
                      json2str(get_adsp_ranked_allele_consequences(nAlt)),
                      json.dumps(vepResult),
                      algInvocId
                      ))

                    copyObj.write(valueStr + '\n')

                    if variantCount % batchSize == 0:
                        warning("FOUND", variantCount, " new variants", file=lfh, flush=True)
                        tendw = datetime.now()
                        warning('Copy object prepared in ' + str(tendw - tstart) + '; ' +
                                str(copyObj.tell()) + ' bytes; transfering to database',
                                file=lfh, flush=True)
                        copyObj.seek(0)
                        cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                                         columns=VARIANT_COLUMNS)

                        message = '{:,}'.format(variantCount)
                        if args.commit:
                            database.commit()
                            message = "COMMITTED " + message

                        else:
                            database.rollback()
                            message = "PARSED " + message + " -- rolling back"

                        warning(message, "; up to = ", metaseqId, file=lfh, flush=True)

                        tend = datetime.now()
                        warning('Database copy time: ' + str(tend - tendw), file=lfh, flush=True)
                        warning('        Total time: ' + str(tend - tstart), file=lfh, flush=True)

                        copyObj = io.StringIO() # reset io string
                        newBatch = True

                except Exception:
                    warning("ERROR parsing variant on line", line, ' - ', metaseqId,
                            file=lfh, flush=True)
                    print("FAIL", file=sys.stdout)
                    raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL", columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)