def generate_bins(binRoot, locStart, locEnd, level, seqLength):
    ''' recursively tile [locStart, locEnd) with bins for `level`, then
    descend into every bin to tile it at the next level.

    Each bin is `increments[level]` wide and is clipped to both `seqLength`
    and `locEnd`. Every bin is passed to `insert_bin` together with the
    running module-level `binCount`. Level-0 bins are labelled with
    `binRoot` alone; deeper bins are labelled `<binRoot>.B<n>`, where n
    restarts at 1 for each call. '''
    global binCount
    if level >= numLevels:
        return

    binStart = locStart
    binEnd = locStart + increments[level]
    siblingIndex = 0

    # never tile beyond the end of the sequence
    locEnd = seqLength if locEnd > seqLength else locEnd

    while binStart < locEnd:
        binCount += 1
        siblingIndex += 1

        if level != 0:
            binLabel = binRoot + ".B" + xstr(siblingIndex)
        else:
            binLabel = xstr(binRoot)

        # clip the last bin of the run to the sequence and the parent range
        for limit in (seqLength, locEnd):
            if binEnd > limit:
                binEnd = limit

        insert_bin(level, binCount, binLabel, binStart, binEnd)

        childLevel = level + 1
        if childLevel <= numLevels:
            if binStart == 0:
                warning("New Level:", level)

            childRoot = binLabel + ".L" + xstr(childLevel)
            generate_bins(childRoot, binStart, binEnd, childLevel, seqLength)

        # slide the window to the next sibling
        binStart = binEnd
        binEnd = binEnd + increments[level]
Esempio n. 2
0
 def match(self, chrm, position, indels=True):
     ''' fetch the CADD records for a single position.

     The indel file is queried unless `indels` is false (indel lookups are
     the common case for this call). Chromosome 'M' is queried as 'MT'.
     The fetch window is [position - 1, position). Returns the result of
     the fetch, or an empty list if a ValueError is raised. '''
     if indels:
         tabixFile = self._indel
     else:
         tabixFile = self._snv

     if chrm == 'M':
         contig = 'MT'
     else:
         contig = xstr(chrm)

     try:
         site = int(position)
         return tabixFile.fetch(contig, site - 1, site, parser=pysam.asTuple())
     except ValueError as err:  # happens sometimes on chrm M/MT
         warning("WARNING", err)
         return []
Esempio n. 3
0
    def buffer_update_sql(self, metaseqId, evidence):
        ''' queue an UPDATE statement that stores `evidence` in cadd_scores.

        metaseqId -- variant id of the form chrom:pos:ref:alt (a ValueError
                     is raised if it does not split into four fields)
        evidence  -- JSON-serializable object written to cadd_scores

        The target table is Variant_<chrm>, where <chrm> is `self._chrm`
        when set and 'chr' + the chromosome from `metaseqId` otherwise.
        The statement is appended to `self._sql_buffer`; nothing is
        executed here. '''
        chrm, _pos, _ref, _alt = metaseqId.split(':')
        if self._chrm is not None:
            chrm = self._chrm
        else:
            chrm = 'chr' + xstr(chrm)

        # The statement is buffered as plain text, so values cannot be bound
        # as query parameters; double any embedded single quote so a value
        # containing an apostrophe cannot terminate the SQL literal early.
        quotedId = metaseqId.replace("'", "''")
        quotedEvidence = json.dumps(evidence).replace("'", "''")

        sql = "UPDATE Variant_" + chrm \
          + " v SET cadd_scores = '" + quotedEvidence + "'" \
          + " WHERE left(v.metaseq_id, 50) = left('" + quotedId + "',50)" \
          + " AND chromosome = '" + chrm + "'" \
          + " AND v.metaseq_id = '" + quotedId + "'"
        self._sql_buffer.append(sql)
Esempio n. 4
0
    def _update_current_bin_index(self, chrm, start, end):
        ''' run BIN_INDEX_SQL for chrm:start-end and keep the fetched row as
        the new current bin (per the original note, the query yields the
        minimum enclosing bin); always returns None '''
        if self._verbose:
            warning("Updating current bin")

        self._cursor.execute(BIN_INDEX_SQL, (chrm, start, end))
        try:
            self._currentBin = self._cursor.fetchone()
        except ProgrammingError:
            region = chrm + ':' + xstr(start) + '-' + xstr(end)
            raise ProgrammingError('Could not map ' + region + ' to a bin.')

        if self._verbose:
            warning(self._currentBin)
        return None
Esempio n. 5
0
    def find_bin_index(self, chrm, start, end=None):
        ''' return the global bin path for chrm:start-end.

        `end` defaults to `start` (i.e., an SNV). A 'chr' prefix is added
        to `chrm` when it does not already contain 'chr'. The cached bin is
        reused when it still encloses the range; otherwise the current bin
        is refreshed first. '''
        if end is None:
            end = start
        if 'chr' not in chrm:
            chrm = 'chr' + xstr(chrm)

        cached = self._currentBin
        # only trust a cached bin at level >= 27; otherwise it may be a broad
        # bin left behind by an indel, so a fresh lookup is needed
        if bool(cached) and cached['bin_level'] >= 27:
            span = cached['location']
            sameChromosome = cached['chromosome'] == chrm
            if sameChromosome and start in span and end in span:
                return cached['global_bin_path']

        # not covered by the cached bin: refresh it, then report the new one
        self._update_current_bin_index(chrm, start, end)
        return self._currentBin['global_bin_path']
Esempio n. 6
0
def load_annotation(chromosome):
    ''' bulk load the VEP JSON results for one chromosome using COPY.

    Reads <args.dir>/chr<chromosome>.json.gz one line at a time (each line
    is parsed as one JSON document), and for every alternative allele
    writes one '#'-delimited row (position, frequencies, ids, ADSP-ranked
    consequences, full VEP output) to an in-memory buffer. The buffer is
    copied into the 'variant' table every args.commitAfter variants and
    once more at the end; each copy is committed when args.commit is set
    and rolled back otherwise.

    chromosome -- chromosome label without the 'chr' prefix; also used to
                  name the log file <chromosome>.log

    When args.resumeAfter is set, lines are skipped until the line that
    follows the one whose refSNP id equals args.resumeAfter. When
    args.test is set, the run ends via die() after the first copied batch.
    Any exception raised while processing a line is logged and re-raised.

    NOTE(review): the log file, database and indexer are only closed on
    the success path at the bottom of the function. '''

    # per-chromosome log file, written to the current working directory
    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    # register this run; the invocation id is stamped on every loaded row
    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)
    lineCount = 0
    variantCount = 0
    skipCount = 0

    # resume == True means "process lines"; it starts False only when a
    # --resumeAfter id was given and has not been reached yet
    resume = True if args.resumeAfter is None else False
    if not resume:
        warning("--resumeAfter flag specified; Finding skip until point",
                args.resumeAfter,
                file=lfh,
                flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        # rows accumulate here between COPY calls
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            # only the file descriptor of fhandle is used: the compressed
            # bytes are memory-mapped and decompressed through GzipFile
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    # the parser gets its own copy; vepResult itself is
                    # modified below and stored verbatim in the row
                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        # previousSnp is the id of the line before this one,
                        # so processing restarts on the line AFTER the
                        # resumeAfter variant
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp,
                                    refSnpId,
                                    file=lfh,
                                    flush=True)
                            warning("Resuming after:",
                                    args.resumeAfter,
                                    "- SKIPPED",
                                    skipCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    # (re)start the batch timer; evaluated once per line, while
                    # variantCount advances once per allele
                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object',
                                file=lfh,
                                flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:

                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join(
                                (chrom, xstr(position), refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(
                                chrom, position, positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                              else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(
                                nAlt, vepParser)

                            # one '#'-delimited row per allele; the field
                            # order here is paired with VARIANT_COLUMNS in
                            # the copy_from calls below
                            valueStr = '#'.join(
                                ('chr' + xstr(chrom), xstr(position),
                                 xstr(vepParser.get('is_multi_allelic')),
                                 binIndex, refSnpId, metaseqId,
                                 json2str(alleleFreq), json2str(msConseq),
                                 json2str(
                                     get_adsp_ranked_allele_consequences(
                                         nAlt, vepParser)),
                                 json.dumps(vepResult), algInvocId))

                            copyObj.write(valueStr + '\n')

                            # progress line, unless the commit branch below
                            # is about to log for this same count
                            if variantCount % args.logAfter == 0 \
                              and variantCount % args.commitAfter != 0:
                                warning("PARSED",
                                        variantCount,
                                        file=lfh,
                                        flush=True)

                            # batch is full: push the buffer to the database
                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in ' +
                                        str(tendw - tstart) + '; ' +
                                        str(copyObj.tell()) +
                                        ' bytes; transfering to database',
                                        file=lfh,
                                        flush=True)
                                # rewind so copy_from reads from the start
                                copyObj.seek(0)
                                cursor.copy_from(copyObj,
                                                 'variant',
                                                 sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message

                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message,
                                            "; up to = ",
                                            refSnpId,
                                            file=lfh,
                                            flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' +
                                        str(tend - tendw),
                                        file=lfh,
                                        flush=True)
                                warning('        Total time: ' +
                                        str(tend - tstart),
                                        file=lfh,
                                        flush=True)

                                # test mode stops after the first batch
                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        # record which variant/line failed, then propagate
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj,
                             'variant',
                             sep='#',
                             null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
Esempio n. 7
0
                        required=True,
                        help="chromosome to load, e.g., 1, 2, M, X")
    parser.add_argument('--commitAfter',
                        type=int,
                        default=500,
                        help="commit after specified inserts")
    parser.add_argument('--maxWorkers', default=10, type=int)
    parser.add_argument(
        '--logAfter',
        type=int,
        help=
        "number of inserts to log after completion; will work best if factor/multiple of commitAfter"
    )
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    # default the logging interval to the commit interval
    if not args.logAfter:
        args.logAfter = args.commitAfter

    VARIANT_COLUMNS = qw(
        'chromosome location is_multi_allelic bin_index ref_snp_id metaseq_id allele_frequencies adsp_most_severe_consequence adsp_ranked_consequences vep_output row_algorithm_id',
        returnTuple=True)

    chrList = args.chr.split(',') if args.chr != 'all' \
      else [c.value for c in Human]

    # one worker process per chromosome, at most args.maxWorkers at a time
    with ProcessPoolExecutor(args.maxWorkers) as executor:
        submitted = []
        for c in chrList:
            warning("Create and start thread for chromosome:", xstr(c))
            submitted.append((c, executor.submit(load_annotation, c)))

        # An exception raised inside a worker is stored on its future and is
        # lost unless the future is inspected, so report every failed load.
        for c, future in submitted:
            error = future.exception()  # waits for that worker to finish
            if error is not None:
                warning("Load failed for chromosome:", xstr(c), "-", error)
def load_annotation():
    ''' load variants from args.vcfFile that are missing from AnnotatedDB.

    Every non-header VCF line is parsed into a single (non-multiallelic)
    variant; a '0' reference or alternative allele is replaced by '?'.
    When `variant_is_missing` reports the variant as absent, its bin index
    is looked up and one row is inserted with INSERT_SQL, then committed
    (args.commit) or rolled back straight away. Progress and errors are
    written to args.logFileName; on an error "FAIL" is also printed to
    stdout and the exception is re-raised.

    Relies on the module-level `args`, `INSERT_SQL` and `algInvocId`. '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    variantCount = 0
    with database.cursor() as cursor:
        with open(args.vcfFile, 'r') as fh:
            with open(args.logFileName, 'w') as lfh:
                warning("Parsing", args.vcfFile, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1
                    vepResult = {} # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt') # assuming not multiallelic

                    if refAllele == '0': # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = False # assuming b/c of how the VCF files were generated
                    metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                    try:
                        if variant_is_missing(database, 'chr' + chrom, metaseqId):
                            warning(metaseqId, "- MISSING -- LOADING", file=lfh, flush=True)
                            variantCount = variantCount + 1

                            positionEnd = entry.infer_variant_end_location(altAllele)
                            binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                            cursor.execute(INSERT_SQL,
                                           ('chr' + chrom,
                                            position,
                                            isMultiAllelic,
                                            binIndex,
                                            metaseqId,
                                            json.dumps(vepResult),
                                            algInvocId))

                            # each insert is its own transaction
                            if args.commit:
                                database.commit()
                            else:
                                database.rollback()

                        if lineCount % 50 == 0:
                            warning("Parsed", lineCount, "lines.", file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants", file=lfh, flush=True)

                    except Exception:
                        # log the offending line, flag the failure, propagate
                        warning("ERROR parsing variant", metaseqId, file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                warning("DONE - Loaded", variantCount, "missing variants", file=lfh, flush=True)

    # release each resource exactly once
    database.close()
    indexer.close()



# Script entry point: parse the command line, register the run as an
# algorithm invocation, load the missing variants, and finally print the
# invocation id to stdout.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='load variants missing from VEP output into AnnotatedDB')
    parser.add_argument('-v', '--vcfFile',
                        help="input file (vcf)", required=True)
    parser.add_argument('--commit', action='store_true', help="run in (auto)commit mode", required=False)
    parser.add_argument('--logFileName', help="log file name", required=True)
    # the description must be passed as help=; as a second positional string
    # it would be registered as an extra option flag instead
    parser.add_argument('--gusConfigFile',
                        help="full path to gus config file, else assumes $GUS_HOME/config/gus.config")

    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    INSERT_SQL = """INSERT INTO Variant 
(chromosome, location, is_multi_allelic, bin_index, metaseq_id, vep_output, row_algorithm_id)
VALUES (%s, %s, %s, %s, %s, %s, %s)"""

    # the invocation id is stamped on every inserted row (row_algorithm_id)
    algInvocation = AlgorithmInvocation('load_non_vep_annotated_variants_from_vcf.py', json.dumps(vars(args)), args.commit, args.gusConfigFile)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()

    warning("Algorithm Invocation ID", algInvocId)

    load_annotation()

    print(algInvocId, file=sys.stdout)
         
Esempio n. 10
0
 def _slice(self, chrm, start, end, indels=False):
     ''' fetch the CADD records between `start` and `end`.

     The SNV file is queried unless `indels` is true (SNV lookups are the
     common case for this call). Chromosome 'M' is queried as 'MT'.
     Returns the result of the fetch, requested with the tuple parser. '''
     if indels:
         tabixFile = self._indel
     else:
         tabixFile = self._snv

     if chrm == 'M':
         contig = 'MT'
     else:
         contig = xstr(chrm)

     rows = tabixFile.fetch(contig, start, end, parser=pysam.asTuple())
     return rows
def load_annotation(chromosome):
    ''' load dbSNP records for one chromosome that have no alternative
    allele and are missing from AnnotatedDB.

    Reads <args.dir>/00-All.<chromosome>.vcf.gz ('MT' is used in the file
    name for chromosome 'M'). Only lines whose ALT field contains '.' are
    considered. When `variant_is_missing` reports the refSNP id as absent,
    one row per listed allele is inserted with INSERT_SQL ('.' is stored as
    '?'), then committed (args.commit) or rolled back immediately.

    chromosome -- chromosome label without the 'chr' prefix

    Progress and errors are written to <args.logDir>/<file name>.log; an
    exception raised while handling a line is logged and re-raised.
    Relies on the module-level `args`, `INSERT_SQL` and `algInvocId`. '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
      else '00-All.' + xstr(chromosome) + '.vcf.gz'

    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)
    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)
            # only the descriptor of fhandle is used; the compressed bytes
            # are memory-mapped and decompressed through GzipFile
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    # GzipFile yields bytes; decode so the str-based header
                    # test and the VCF parsing below operate on text
                    line = line.decode('utf-8')
                    vepResult = {
                    }  # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    if '.' not in altAllele:
                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                        continue  # sanity check/all missing are lacking alt alleles

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom,
                                              refSnpId):
                            warning(refSnpId, "- MISSING -- LOADING",
                                    file=lfh, flush=True)
                            for alt in altAllele:
                                if alt == '.':  # checking again in case some are multiallelic
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join(
                                    (chrom, xstr(position), refAllele, alt))
                                positionEnd = entry.infer_variant_end_location(
                                    alt)

                                binIndex = indexer.find_bin_index(
                                    chrom, position, positionEnd)
                                cursor.execute(
                                    INSERT_SQL,
                                    ('chr' + chrom, xstr(position),
                                     isMultiAllelic,
                                     binIndex, refSnpId, metaseqId,
                                     json.dumps(vepResult), algInvocId))

                                # each insert is its own transaction
                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants",
                                    file=lfh, flush=True)

                    except Exception:
                        # log the offending line, then propagate
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            mappedFile.close()

            # final summary is written here, while the log file is still
            # open; the with-block closes lfh on exit
            warning("DONE - Loaded", variantCount, "missing variants",
                    file=lfh, flush=True)

    database.close()
    indexer.close()
                        action='store_true',
                        help="load 'commitAfter' rows as test")
    parser.add_argument('--maxWorkers', default=5, type=int)
    parser.add_argument(
        '-c',
        '--chr',
        required=True,
        help="comma separated list of chromosomes to load, or 'all'")

    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    INSERT_SQL = """INSERT INTO Variant 
(chromosome, location, is_multi_allelic, bin_index, ref_snp_id, metaseq_id, vep_output, row_algorithm_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""

    algInvocation = AlgorithmInvocation('load_missing_dbsnp_from_vcf.py',
                                        json.dumps(vars(args)), args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()

    warning("Algorithm Invocation ID", algInvocId)

    chrList = args.chr.split(',') if args.chr != 'all' \
      else [c.value for c in Human]

    with ProcessPoolExecutor(args.maxWorkers) as executor:
        for c in chrList:
            warning("Create and start thread for chromosome:", xstr(c))
            executor.submit(load_annotation, c)
Esempio n. 13
0
def update_variant_records_from_vcf():
    ''' lookup and update variant records from a VCF file
    assuming the load by file was called by a plugin, so this variant has already been
    verified to be new to the resource; no need to check alternative metaseq IDs

    Every non-header line of args.vcfFile is turned into a metaseq id
    (chrom:pos:ref:alt, with 'MT' mapped to 'M') and handed to
    `update_variant_record` as an INDEL when either allele is longer than
    one base and as an SNV otherwise. The SQL buffered by the CADDUpdater
    is executed every args.commitAfter lines and once more at the end;
    each flush is committed when args.commit is set and rolled back
    otherwise.

    On any exception the error is logged, the transaction is rolled back,
    the database is closed, "FAIL" is printed to stdout and the exception
    is re-raised. '''

    cupdater = CADDUpdater(args.logFile, args.databaseDir)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    entry = None  # keeps the error handler usable if the first line fails to parse
    with database.cursor() as updateCursor, \
      open(args.vcfFile, 'r') as fh:
        try:
            for line in fh:
                if line.startswith("#"):
                    continue

                lineCount = lineCount + 1
                entry = VcfEntryParser(line.rstrip())

                refAllele = entry.get('ref')
                altAllele = entry.get('alt')
                chrom = xstr(entry.get('chrom'))
                if chrom == 'MT':
                    chrom = 'M'
                position = int(entry.get('pos'))
                metaseqId = ':'.join(
                    (chrom, xstr(position), refAllele, altAllele))

                record = {"metaseq_id": metaseqId}  # mimic "record"

                # any allele longer than one base is handled as an indel
                if len(refAllele) > 1 or len(altAllele) > 1:
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if lineCount % args.commitAfter == 0:
                    warning("Processed:",
                            lineCount,
                            "- SNVs:",
                            cupdater.get_update_count(SNV),
                            "- INDELS:",
                            cupdater.get_update_count(INDEL),
                            " - Not Matched:",
                            cupdater.get_not_matched_count(),
                            file=cupdater.lfh(),
                            flush=True)

                    # nothing may have been buffered since the last flush;
                    # only execute when there is SQL to run (same guard as
                    # the trailing flush below)
                    if cupdater.buffered_variant_count() > 0:
                        updateCursor.execute(cupdater.sql_buffer_str())

                    if args.commit:
                        database.commit()
                    else:
                        database.rollback()
                    cupdater.clear_update_sql()

            if cupdater.buffered_variant_count() > 0:  # trailing
                updateCursor.execute(cupdater.sql_buffer_str())

                if args.commit:
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:",
                    cupdater.get_update_count(SNV),
                    "- Updated INDELS:",
                    cupdater.get_update_count(INDEL),
                    "- Not Matched:",
                    cupdater.get_not_matched_count(),
                    file=cupdater.lfh(),
                    flush=True)

            cupdater.close_lfh()

        except Exception as e:
            warning(e, entry, file=cupdater.lfh(), flush=True)
            database.rollback()
            database.close()
            print("FAIL", file=sys.stdout)
            raise

    # success path: release the connection (the error path closes it above)
    database.close()
Esempio n. 14
0
def update_variant_records(chromosome):
    ''' add CADD scores to the variants of one chromosome that lack them.

    Selects metaseq_id and cadd_scores from Variant_chr<chromosome>, skips
    every record whose cadd_scores is already set, and hands the rest to
    `update_variant_record` as an INDEL when either allele is longer than
    one base and as an SNV otherwise. Progress is logged to
    <args.logFilePath>/chr<chromosome>.log through the CADDUpdater.

    chromosome -- chromosome label without the 'chr' prefix

    On any exception the error is logged, the transaction is committed
    (args.commit) or rolled back, the database is closed and the exception
    is re-raised. '''
    chrLabel = 'chr' + xstr(chromosome)

    logFileName = path.join(args.logFilePath, chrLabel + '.log')
    cupdater = CADDUpdater(logFileName, args.databaseDir)
    cupdater.setChrm(chrLabel)

    selectSQL = "SELECT metaseq_id, cadd_scores FROM Variant_" + chrLabel

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    # NOTE(review): updateCount and updateIndelCount are never read below;
    # the counts reported in the log come from cupdater
    updateCount = 0
    updateIndelCount = 0
    skipCount = 0

    # separate cursors: one iterates the SELECT result, one runs the UPDATEs
    with database.cursor("RealDictCursor") as selectCursor, \
        database.cursor() as updateCursor:
        try:
            warning("Fetching",
                    chrLabel,
                    "variants",
                    file=cupdater.lfh(),
                    flush=True)
            selectCursor.execute(selectSQL)
            warning("DONE - Fetching", file=cupdater.lfh(), flush=True)

            for record in selectCursor:
                if args.debug and args.veryVerbose:
                    warning(record, file=cupdater.lfh(), flush=True)

                # already scored: count it and move on
                if record['cadd_scores'] is not None:
                    if args.debug and args.veryVerbose:
                        warning("Skipping",
                                record['metaseq_id'],
                                file=cupdater.lfh(),
                                flush=True)
                    skipCount = skipCount + 1
                    continue

                lineCount = lineCount + 1

                metaseqId = record['metaseq_id']

                # only the alleles are used below; chrm and position are unused
                chrm, position, refAllele, altAllele = metaseqId.split(':')

                if len(refAllele) > 1 or len(altAllele) > 1:  # any allele longer than one base is handled as an indel
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                # periodic flush: total update count is a multiple of
                # commitAfter and there is buffered SQL to run
                if cupdater.get_total_update_count(
                ) % args.commitAfter == 0 and cupdater.buffered_variant_count(
                ) > 0:
                    if args.commit:
                        if args.debug:
                            warning("Starting Update",
                                    file=cupdater.lfh(),
                                    flush=True)

                        updateCursor.execute(cupdater.sql_buffer_str())

                        if args.debug:
                            warning("Done", file=cupdater.lfh(), flush=True)

                        cupdater.clear_update_sql()
                        database.commit()
                    else:
                        # NOTE(review): without --commit the buffer is neither
                        # executed nor cleared here; it is only executed by
                        # the trailing flush after the loop
                        database.rollback()

                    warning(metaseqId,
                            "- Processed:",
                            lineCount,
                            "- SNVs:",
                            cupdater.get_update_count(SNV),
                            "- INDELS:",
                            cupdater.get_update_count(INDEL),
                            "- Skipped:",
                            skipCount,
                            " - Not Matched:",
                            cupdater.get_not_matched_count(),
                            file=cupdater.lfh(),
                            flush=True)

            # trailing flush for whatever is still buffered
            if cupdater.buffered_variant_count() > 0:
                updateCursor.execute(cupdater.sql_buffer_str())
                if args.commit:  # trailing
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:",
                    cupdater.get_update_count(SNV),
                    "- Updated INDELS:",
                    cupdater.get_update_count(INDEL),
                    "- Skipped",
                    skipCount,
                    "- Not Matched:",
                    cupdater.get_not_matched_count(),
                    file=cupdater.lfh(),
                    flush=True)
            cupdater.close_lfh()

        except Exception as e:
            warning(e, file=cupdater.lfh(), flush=True)
            # NOTE(review): with --commit, updates executed so far are
            # committed even though the run failed — confirm this is intended
            if args.commit:
                database.commit()
            else:
                database.rollback()
            database.close()
            raise

    database.close()
Esempio n. 15
0
def load_annotation():
    ''' parse over a JSON file, extract position, frequencies,
    ids, and ADSP-ranked most severe consequence; bulk load using COPY

    Reads args.inputFile line by line (each line is parsed as one JSON
    VEP result) and writes progress messages to args.logFile.  Variants
    for which duplicate() returns true are logged and skipped.  New
    variants are buffered as '#'-delimited rows and copied into the
    'variant' table every 5000 new variants, and once more after the
    last line.  Each copy is committed when args.commit is set and
    rolled back otherwise.

    Any exception raised while assembling a variant row is logged along
    with the offending input line, "FAIL" is printed to stdout, and the
    exception is re-raised.
    '''

    fname = args.inputFile
    lineCount = 0
    variantCount = 0

    # True while the start of the current batch has not been logged yet;
    # keeps runs of skipped lines from re-logging the batch start and
    # resetting the batch timer on every line
    startBatch = True

    warning("Parsing variants from:", fname) # should print to plugin log
    with database.cursor() as cursor, database.cursor("RealDictCursor") as dcursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fh:
            with open(args.logFile, 'w') as lfh:
                warning("Parsing variants from:", fname, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1

                    # reset per line so the error handler below can never
                    # hit an unbound name or report the previous line's id
                    metaseqId = None

                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there may be json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    if startBatch:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()
                        startBatch = False

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assumes no multiallelic variants

                    if refAllele == '0': # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'

                    position = int(entry.get('pos'))

                    try:
                        metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                        if duplicate(metaseqId, dcursor):
                            warning("SKIPPING:",metaseqId, "- already loaded.", file=lfh, flush=True)
                            continue

                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies()
                        vepParser.adsp_rank_and_sort_consequences()

                        # only variants that are not already loaded are counted
                        variantCount = variantCount + 1

                        positionEnd = entry.infer_variant_end_location(altAllele)
                        binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                        # NOTE: VEP uses normalized alleles to indicate variant_allele
                        nRef, nAlt = entry.normalize_alleles(refAllele, altAllele, snvDivMinus=True)
                        alleleFreq = None if frequencies is None \
                          else get_allele_frequencies(nAlt, frequencies)

                        msConseq = get_most_severe_consequence(nAlt)

                        # one row for COPY; field order must line up with
                        # VARIANT_COLUMNS, '#' is the field separator
                        valueStr = '#'.join((
                          'chr' + xstr(chrom),
                          xstr(position),
                          binIndex,
                          metaseqId,
                          json2str(alleleFreq),
                          json2str(msConseq),
                          json2str(get_adsp_ranked_allele_consequences(nAlt)),
                          json.dumps(vepResult),
                          algInvocId
                          ))

                        copyObj.write(valueStr + '\n')

                        if variantCount % 5000 == 0:
                            warning("FOUND", variantCount, " new variants", file=lfh, flush=True)
                            tendw = datetime.now()
                            warning('Copy object prepared in ' + str(tendw - tstart) + '; ' +
                                    str(copyObj.tell()) + ' bytes; transfering to database',
                                    file=lfh, flush=True)

                            # rewind so copy_from reads the buffer from the start
                            copyObj.seek(0)
                            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                                             columns=VARIANT_COLUMNS)

                            message = '{:,}'.format(variantCount)
                            if args.commit:
                                database.commit()
                                message = "COMMITTED " + message

                            else:
                                database.rollback()
                                message = "PARSED " + message + " -- rolling back"

                            warning(message, "; up to = ", metaseqId, file=lfh, flush=True)

                            tend = datetime.now()
                            warning('Database copy time: ' + str(tend - tendw), file=lfh, flush=True)
                            warning('        Total time: ' + str(tend - tstart), file=lfh, flush=True)

                            copyObj = io.StringIO() # reset io string
                            startBatch = True # next line read starts a new batch

                    except Exception:
                        # metaseqId is None here if building the id itself failed
                        warning("ERROR parsing variant on line", line, ' - ', metaseqId,
                                file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                # final commit / leftovers
                copyObj.seek(0)
                cursor.copy_from(copyObj, 'variant', sep='#', null="NULL", columns=VARIANT_COLUMNS)
                message = '{:,}'.format(variantCount)

                if args.commit:
                    database.commit()
                    message = "DONE - COMMITTED " + message
                else:
                    database.rollback()
                    message = "DONE - PARSED " + message + " -- rolling back"

                warning(message, file=lfh, flush=True)