예제 #1
0
def evaluate_minor_allele(vepParser, lfh):
    ''' some dbSNP variants are reported in VCF w/single mutations
    but are actually multiallelic (have additional minor allele)
    and reported as such on the dbSNP website.  To catch these we
    need to evaluate the 1000Genomes minor allele frequency '''

    refSnpId = vepParser.get('ref_snp_id')
    refAllele = vepParser.get('ref_allele')
    altAlleles = vepParser.get('alt_allele')
    frequencies = vepParser.get_frequencies(refSnpId)

    if frequencies is None:
        return None

    if 'minor_allele_freq' not in frequencies:  # nothing to change
        return frequencies

    minorAlleleFreq = frequencies['minor_allele_freq']
    minorAllele = frequencies['minor_allele']

    match = False

    aFreq = get_allele_frequencies(minorAllele, frequencies)

    # minor allele already in frequency list; update
    if aFreq is not None:
        match = True
        if '1000Genomes' in aFreq:
            frequencies['values'][minorAllele]['1000Genomes'].append(
                {'gmaf': minorAlleleFreq})
        else:
            frequencies['values'][minorAllele]['1000Genomes'] = [{
                'gmaf':
                minorAlleleFreq
            }]

    # the minor allele is one of the alt alleles in the VCF, but not in frequency list; add
    elif minorAllele in altAlleles:
        match = True
        frequencies['values'][minorAllele]['1000Genomes'] = [{
            'gmaf':
            minorAlleleFreq
        }]

    # then check to see if it is a normalized allele (INDEL/DIV/INS/etc)
    else:
        for alt in altAlleles:
            nRef, nAlt = VcfEntryParser(None).normalize_alleles(
                refAllele, alt, snvDivMinus=True)
            if minorAllele == nAlt:
                match = True
                aFreq = get_allele_frequencies(nAlt, frequencies)
                if aFreq is not None:
                    # this should never happen, b/c it should have been caught above, but...
                    if '1000Genomes' in aFreq:
                        frequencies['values'][minorAllele][
                            '1000Genomes'].append({'gmaf': minorAlleleFreq})
                    else:
                        frequencies['values'][minorAllele]['1000Genomes'] = [{
                            'gmaf':
                            minorAlleleFreq
                        }]
                else:
                    frequencies['values'][minorAllele]['1000Genomes'] = [{
                        'gmaf':
                        minorAlleleFreq
                    }]
                break  # if match found, no need to iterate over the rest

            if minorAllele == nRef:
                # don't add it in; can/should be inferred from other allele frequencies
                match = True
                warning("Matched minor allele",
                        minorAllele,
                        "to normalized reference for variant -",
                        refSnpId,
                        refAllele + '/' + '/'.join(altAlleles),
                        "- IGNORED",
                        file=lfh,
                        flush=True)
                break

    if not match:
        warning("Unable to match minor allele",
                minorAllele,
                "to variant -",
                refSnpId,
                "- alleles:",
                refAllele + '/' + '/'.join(altAlleles),
                "Assuming VCF/dbSNP record mismatch.",
                file=lfh,
                flush=True)
        frequencies['values'][minorAllele]['1000Genomes'] = [{
            'gmaf':
            minorAlleleFreq
        }]

        # also add it to the alternative alleles, so it will be loaded as its own variant
        vepParser.set('alt_allele', altAlleles.append(minorAllele))

    return frequencies
예제 #2
0
def load_annotation(chromosome):
    ''' parse over a JSON file, extract position, frequencies,
    ids, and ADSP-ranked most severe consequence; bulk load using COPY '''

    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)
    lineCount = 0
    variantCount = 0
    skipCount = 0

    resume = True if args.resumeAfter is None else False
    if not resume:
        warning("--resumeAfter flag specified; Finding skip until point",
                args.resumeAfter,
                file=lfh,
                flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp,
                                    refSnpId,
                                    file=lfh,
                                    flush=True)
                            warning("Resuming after:",
                                    args.resumeAfter,
                                    "- SKIPPED",
                                    skipCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object',
                                file=lfh,
                                flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:

                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join(
                                (chrom, xstr(position), refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(
                                chrom, position, positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                              else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(
                                nAlt, vepParser)

                            valueStr = '#'.join(
                                ('chr' + xstr(chrom), xstr(position),
                                 xstr(vepParser.get('is_multi_allelic')),
                                 binIndex, refSnpId, metaseqId,
                                 json2str(alleleFreq), json2str(msConseq),
                                 json2str(
                                     get_adsp_ranked_allele_consequences(
                                         nAlt, vepParser)),
                                 json.dumps(vepResult), algInvocId))

                            copyObj.write(valueStr + '\n')

                            if variantCount % args.logAfter == 0 \
                              and variantCount % args.commitAfter != 0:
                                warning("PARSED",
                                        variantCount,
                                        file=lfh,
                                        flush=True)

                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in ' +
                                        str(tendw - tstart) + '; ' +
                                        str(copyObj.tell()) +
                                        ' bytes; transfering to database',
                                        file=lfh,
                                        flush=True)
                                copyObj.seek(0)
                                cursor.copy_from(copyObj,
                                                 'variant',
                                                 sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message

                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message,
                                            "; up to = ",
                                            refSnpId,
                                            file=lfh,
                                            flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' +
                                        str(tend - tendw),
                                        file=lfh,
                                        flush=True)
                                warning('        Total time: ' +
                                        str(tend - tstart),
                                        file=lfh,
                                        flush=True)

                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj,
                             'variant',
                             sep='#',
                             null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
def load_annotation():
    ''' extract basic SNP information from VCF line; check against AnnotatedDB; if missing
    load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    variantCount = 0
    with database.cursor() as cursor:
        with open(args.vcfFile, 'r') as fh:
            with open(args.logFileName, 'w') as lfh:
                warning("Parsing", args.vcfFile, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1
                    vepResult = {} # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt') # assuming not multiallelic

                    if refAllele == '0': # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))


                    # metaseqId = ':'.join((chrom, xstr(position), truncatedRef, truncatedAlt))
                    isMultiAllelic = False # assuming b/c of how the VCF files were generated
                    metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                    try:
                        if variant_is_missing(database, 'chr' + chrom, metaseqId):
                            warning(metaseqId, "- MISSING -- LOADING", file=lfh, flush=True)
                            variantCount = variantCount + 1

                            positionEnd = entry.infer_variant_end_location(altAllele)
                            binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                            cursor.execute(INSERT_SQL, 
                                               ('chr' + chrom,
                                                position,
                                                isMultiAllelic,
                                                binIndex,
                                                metaseqId,
                                                json.dumps(vepResult),
                                                algInvocId))

                            if args.commit:
                                database.commit()
                            else:
                                database.rollback()

                        if lineCount % 50 == 0:
                            warning("Parsed", lineCount, "lines.", file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants", file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", metaseqId, file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                warning("DONE - Loaded", variantCount, "missing variants", file=lfh, flush=True)
    database.close()
    indexer.close()
def load_annotation(chromosome):
    ''' extract basic SNP information from VCF line; check against AnnotatedDB; if missing
    load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()


    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
      else '00-All.' + xstr(chromosome) + '.vcf.gz'

    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)
    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = {
                    }  # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    if '.' not in altAllele:
                        if lineCount % 25000 == 0:
                            warning("Parsed",
                                    lineCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                        continue  # sanity check/all missing are lacking alt alleles

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom,
                                              refSnpId):
                            warning(refSnpId,
                                    "- MISSING -- LOADING",
                                    file=lfh,
                                    flush=True)
                            for alt in altAllele:
                                if alt == '.':  # checking again in case some are multiallelic
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join(
                                    (chrom, xstr(position), refAllele, alt))
                                positionEnd = entry.infer_variant_end_location(
                                    alt)

                                binIndex = indexer.find_bin_index(
                                    chrom, position, positionEnd)
                                cursor.execute(
                                    INSERT_SQL,
                                    ('chr' + chrom, xstr(position),
                                     isMultiAllelic,
                                     binIndex, refSnpId, metaseqId,
                                     json.dumps(vepResult), algInvocId))

                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed",
                                    lineCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            warning("Loaded",
                                    variantCount,
                                    "missing variants",
                                    file=lfh,
                                    flush=True)

                    except Exception:
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            mappedFile.close()

    database.close()
    indexer.close()
    warning("DONE - Loaded",
            variantCount,
            "missing variants",
            file=lfh,
            flush=True)
예제 #5
0
def update_variant_records_from_vcf():
    ''' lookup and update variant records from a VCF file 
    assuming the load by file was called by a plugin, so this variant has already been
    verified to be new to the resource; no need to check alternative metaseq IDs'''

    cupdater = CADDUpdater(args.logFile, args.databaseDir)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    with database.cursor() as updateCursor, \
      open(args.vcfFile, 'r') as fh:
        try:
            for line in fh:
                if line.startswith("#"):
                    continue

                lineCount = lineCount + 1
                entry = VcfEntryParser(line.rstrip())

                refAllele = entry.get('ref')
                altAllele = entry.get('alt')
                chrom = xstr(entry.get('chrom'))
                if chrom == 'MT':
                    chrom = 'M'
                position = int(entry.get('pos'))
                metaseqId = ':'.join(
                    (chrom, xstr(position), refAllele, altAllele))

                record = {"metaseq_id": metaseqId}  # mimic "record"

                if len(refAllele) > 1 or len(altAllele) > 1:  # only doing SNVs
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if lineCount % args.commitAfter == 0:
                    warning("Processed:",
                            lineCount,
                            "- SNVs:",
                            cupdater.get_update_count(SNV),
                            "- INDELS:",
                            cupdater.get_update_count(INDEL),
                            " - Not Matched:",
                            cupdater.get_not_matched_count(),
                            file=cupdater.lfh(),
                            flush=True)

                    updateCursor.execute(cupdater.sql_buffer_str())

                    if args.commit:
                        database.commit()
                    else:
                        database.rollback()
                    cupdater.clear_update_sql()

            if cupdater.buffered_variant_count() > 0:  # trailing
                updateCursor.execute(cupdater.sql_buffer_str())

                if args.commit:
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:",
                    cupdater.get_update_count(SNV),
                    "- Updated INDELS:",
                    cupdater.get_update_count(INDEL),
                    "- Not Matched:",
                    cupdater.get_not_matched_count(),
                    file=cupdater.lfh(),
                    flush=True)

            cupdater.close_lfh()

        except Exception as e:
            warning(e, entry, file=cupdater.lfh(), flush=True)
            database.rollback()
            database.close()
            print("FAIL", file=sys.stdout)
            raise
예제 #6
0
def load_annotation():
    ''' parse over a JSON file, extract position, frequencies,
    ids, and ADSP-ranked most severe consequence; bulk load using COPY '''

    fname = args.inputFile
    lineCount = 0
    variantCount = 0
    skipCount = 0

    warning("Parsing variants from:", fname) # should print to plugin log
    with database.cursor() as cursor, database.cursor("RealDictCursor") as dcursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fh:
            with open(args.logFile, 'w') as lfh:
                warning("Parsing variants from:", fname, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there may be json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    if lineCount == 1 or variantCount % 5000 == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assumes no multialleic variants

                    if refAllele == '0': # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)
                  
                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'

                    position = int(entry.get('pos'))

                    try:
                        # metaseqId = ':'.join((chrom, xstr(position), truncatedRef, truncatedAlt))
                        metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                        if duplicate(metaseqId, dcursor): 
                            warning("SKIPPING:",metaseqId, "- already loaded.", file=lfh, flush=True)
                            continue

                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)
                     
                        frequencies = get_frequencies()
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        variantCount = variantCount + 1

                        positionEnd = entry.infer_variant_end_location(altAllele)
                        binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                        # NOTE: VEP uses normalized alleles to indicate variant_allele
                        nRef, nAlt = entry.normalize_alleles(refAllele, altAllele, snvDivMinus=True)
                        alleleFreq = None if frequencies is None \
                          else get_allele_frequencies(nAlt, frequencies)

                        msConseq = get_most_severe_consequence(nAlt)
                        valueStr = '#'.join((
                          'chr' + xstr(chrom),
                          xstr(position),
                          binIndex,
                          metaseqId,
                          json2str(alleleFreq),
                          json2str(msConseq),
                          json2str(get_adsp_ranked_allele_consequences(nAlt)),
                          json.dumps(vepResult),
                          algInvocId
                          ))


                        copyObj.write(valueStr + '\n')

                        if variantCount % 5000 == 0:
                            warning("FOUND", variantCount, " new variants", file=lfh, flush=True)
                            tendw = datetime.now()
                            warning('Copy object prepared in ' + str(tendw - tstart) + '; ' +
                                    str(copyObj.tell()) + ' bytes; transfering to database',
                                    file=lfh, flush=True)
                            copyObj.seek(0)
                            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                                             columns=VARIANT_COLUMNS)

                            message = '{:,}'.format(variantCount)
                            if args.commit:
                                database.commit()
                                message = "COMMITTED " + message

                            else:
                                database.rollback()
                                message = "PARSED " + message + " -- rolling back"


                            warning(message, "; up to = ", metaseqId, file=lfh, flush=True)

                            tend = datetime.now()
                            warning('Database copy time: ' + str(tend - tendw), file=lfh, flush=True)
                            warning('        Total time: ' + str(tend - tstart), file=lfh, flush=True)

                            copyObj = io.StringIO() # reset io string

                    except Exception:
                        warning("ERROR parsing variant on line", line, ' - ', metaseqId,
                                file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                # final commit / leftovers
                copyObj.seek(0)
                cursor.copy_from(copyObj, 'variant', sep='#', null="NULL", columns=VARIANT_COLUMNS)
                message = '{:,}'.format(variantCount)

                if args.commit:
                    database.commit()
                    message = "DONE - COMMITTED " + message
                else:
                    database.rollback()
                    message = "DONE - PARSED " + message + " -- rolling back"
                    
                warning(message, file=lfh, flush=True)