Example #1
0
class BinIndex(object):
    ''' class to fetch and map locations to index bins
    creates its own DB handle

    b/c variants are usually incremental; stores current bin
    and checks if variant is in that; if not fetches bin from DB
    '''

    def __init__(self, gusConfigFile=None, verbose=True):
        ''' open a database connection and a "RealDictCursor" cursor

        gusConfigFile -- passed through to Database
        verbose -- when True, bin updates are reported through warning()
        '''
        self._gusConfigFile = gusConfigFile
        self._verbose = verbose
        self._currentBin = {}  # last bin row fetched; empty until the first lookup
        self.__get_db_handle()
        self._cursor = self._database.cursor("RealDictCursor")


    def __get_db_handle(self):
        ''' create database connection '''
        self._database = Database(self._gusConfigFile)
        self._database.connect()


    def close(self):
        ''' close cursor and db connection '''
        try:
            self._cursor.close()
        finally:
            # release the connection even if closing the cursor fails
            self._database.close()

    def _update_current_bin_index(self, chrm, start, end):
        ''' query against database to get minimum enclosing bin;
        set as new current bin

        raises ProgrammingError when the cursor has no result to fetch
        raises LookupError when the query returns no row; the current
        bin is reset to empty in that case
        '''
        if self._verbose: warning("Updating current bin")
        self._cursor.execute(BIN_INDEX_SQL, (chrm, start, end))
        try:
            row = self._cursor.fetchone()

        except ProgrammingError:
            raise ProgrammingError('Could not map ' + xstr(chrm) + ':' + xstr(start) + '-' + xstr(end) + ' to a bin.')

        if row is None:
            # no enclosing bin: fail here with a clear message instead of
            # leaving None behind for the caller to subscript
            self._currentBin = {}
            raise LookupError('Could not map ' + xstr(chrm) + ':' + xstr(start) + '-' + xstr(end) + ' to a bin.')

        self._currentBin = row
        if self._verbose: warning(self._currentBin)


    def find_bin_index(self, chrm, start, end=None):
        ''' finds the bin index for the position;
        if end == None assume SNV, and set to start

        chrm -- chromosome, with or without the 'chr' prefix
        returns the 'global_bin_path' value of the enclosing bin
        '''

        if end is None: end = start
        # stringify first so the prefix test also works for non-string chromosomes
        if not isinstance(chrm, str): chrm = xstr(chrm)
        if 'chr' not in chrm: chrm = 'chr' + chrm

        current = self._currentBin
        # reuse the cached bin only when it is at level >= 27; otherwise
        # may be a broad bin b/c of a indel; so need to do a lookup
        if current and current['bin_level'] >= 27:
            brange = current['location']
            if current['chromosome'] == chrm \
              and start in brange and end in brange:
                return current['global_bin_path']

        # otherwise, find & return the new bin
        self._update_current_bin_index(chrm, start, end)
        return self._currentBin['global_bin_path']
class AlgorithmInvocation(object):
    ''' transaction management for algorithm invocation '''

    def __init__(self, script=None, parameters=None, commit=False,
                 gusConfigFile=None):
        ''' connect to the database; when a script name is supplied,
        record the invocation right away '''
        self._database = None
        self._algorithm_invocation_id = None

        self.__get_db_handle(gusConfigFile)
        if script is None:
            return  # nothing to record yet
        self.insertAlgorithmInvocation(script, parameters, commit)

    def __get_db_handle(self, gusConfigFile):
        ''' open the database connection used by this object '''
        handle = Database(gusConfigFile)
        self._database = handle
        handle.connect()

    def insertAlgorithmInvocation(self, script, parameters, commit):
        ''' insert the algorithm invocation row, remember its id,
        and commit the insert '''
        insertSql = """INSERT INTO AlgorithmInvocation
(script_name, script_parameters, commit_mode) VALUES (%s, %s, %s)
RETURNING algorithm_invocation_id"""
        bindValues = (script, parameters, commit)
        with self._database.cursor() as cursor:
            cursor.execute(insertSql, bindValues)
            insertedRow = cursor.fetchone()
            self._algorithm_invocation_id = insertedRow[0]

        self._database.commit()

    def getAlgorithmInvocationId(self):
        ''' id of the recorded invocation (None if none was inserted) '''
        return self._algorithm_invocation_id

    def close(self):
        ''' release the db connection '''
        self._database.close()
Example #3
0
def load_annotation(chromosome):
    ''' parse over a JSON file, extract position, frequencies,
    ids, and ADSP-ranked most severe consequence; bulk load using COPY

    reads chr<chromosome>.json.gz from args.dir, one JSON document per line,
    and writes one '#'-delimited row per alt allele to an in-memory buffer;
    the buffer is copied into the `variant` table every args.commitAfter
    variants and once more at the end for the leftovers; each copy is
    committed when args.commit is set and rolled back otherwise

    progress and errors are logged to <chromosome>.log

    chromosome -- chromosome label without the 'chr' prefix
    '''

    # NOTE(review): lfh, database and indexer are only closed on the success
    # path at the bottom of this function; an exception leaves them open
    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    # record this run; only the id is needed afterwards, so the invocation's
    # own connection is closed straight away
    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)
    lineCount = 0
    variantCount = 0
    skipCount = 0

    # resume is False only while lines are still being skipped to reach the
    # args.resumeAfter refSNP id
    resume = True if args.resumeAfter is None else False
    if not resume:
        warning("--resumeAfter flag specified; Finding skip until point",
                args.resumeAfter,
                file=lfh,
                flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()  # buffer holding the rows of the current batch
        with open(fname, 'r') as fhandle:
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    # the parser gets its own copy; vepResult itself is
                    # modified below and stored with the row
                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry; which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        # processing restarts on the line that follows the
                        # one whose refSNP id equals args.resumeAfter
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp,
                                    refSnpId,
                                    file=lfh,
                                    flush=True)
                            warning("Resuming after:",
                                    args.resumeAfter,
                                    "- SKIPPED",
                                    skipCount,
                                    "lines.",
                                    file=lfh,
                                    flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    # start the batch timer on the first line and whenever
                    # the variant count sits on a batch boundary
                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object',
                                file=lfh,
                                flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:

                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join(
                                (chrom, xstr(position), refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(
                                chrom, position, positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                              else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(
                                nAlt, vepParser)

                            # one row for COPY; fields are joined with the
                            # same '#' separator that copy_from is given below
                            # NOTE(review): presumably no field value may
                            # contain '#' or a newline -- confirm
                            valueStr = '#'.join(
                                ('chr' + xstr(chrom), xstr(position),
                                 xstr(vepParser.get('is_multi_allelic')),
                                 binIndex, refSnpId, metaseqId,
                                 json2str(alleleFreq), json2str(msConseq),
                                 json2str(
                                     get_adsp_ranked_allele_consequences(
                                         nAlt, vepParser)),
                                 json.dumps(vepResult), algInvocId))

                            copyObj.write(valueStr + '\n')

                            # progress line; on a batch boundary the commit
                            # branch below does the reporting instead
                            if variantCount % args.logAfter == 0 \
                              and variantCount % args.commitAfter != 0:
                                warning("PARSED",
                                        variantCount,
                                        file=lfh,
                                        flush=True)

                            # batch boundary: copy the buffer to the database
                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in ' +
                                        str(tendw - tstart) + '; ' +
                                        str(copyObj.tell()) +
                                        ' bytes; transfering to database',
                                        file=lfh,
                                        flush=True)
                                # rewind so copy_from reads from the start
                                copyObj.seek(0)
                                cursor.copy_from(copyObj,
                                                 'variant',
                                                 sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message

                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message,
                                            "; up to = ",
                                            refSnpId,
                                            file=lfh,
                                            flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' +
                                        str(tend - tendw),
                                        file=lfh,
                                        flush=True)
                                warning('        Total time: ' +
                                        str(tend - tstart),
                                        file=lfh,
                                        flush=True)

                                # test mode stops after the first batch
                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        # log which variant and input line failed, then
                        # re-raise so the load stops
                        warning("ERROR parsing variant",
                                refSnpId,
                                file=lfh,
                                flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj,
                             'variant',
                             sep='#',
                             null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
def load_annotation():
    ''' extract basic SNP information from VCF line; check against AnnotatedDB; if missing
    load

    reads args.vcfFile line by line (header lines starting with '#' are
    skipped), builds the metaseq id chrom:pos:ref:alt and inserts the
    variant with INSERT_SQL when variant_is_missing() reports it absent;
    every insert is committed when args.commit is set and rolled back
    otherwise

    progress is logged to args.logFileName; relies on the module-level
    algInvocId for the row's algorithm invocation id

    the database and indexer connections are closed even when a line fails
    '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)
    try:
        database = Database(args.gusConfigFile)
        database.connect()
        try:
            lineCount = 0
            variantCount = 0
            with database.cursor() as cursor, \
              open(args.vcfFile, 'r') as fh, \
              open(args.logFileName, 'w') as lfh:
                warning("Parsing", args.vcfFile, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1  # counts header lines too

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so store the parsed entry (a dict) instead; it is the
                    # only content so it matches the annotated variants
                    vepResult = {'input': entry.get_entry()}

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assuming not multiallelic

                    if refAllele == '0':  # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = False  # assuming b/c of how the VCF files were generated
                    metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                    try:
                        if variant_is_missing(database, 'chr' + chrom, metaseqId):
                            warning(metaseqId, "- MISSING -- LOADING", file=lfh, flush=True)
                            variantCount = variantCount + 1

                            positionEnd = entry.infer_variant_end_location(altAllele)
                            binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                            cursor.execute(INSERT_SQL,
                                           ('chr' + chrom,
                                            position,
                                            isMultiAllelic,
                                            binIndex,
                                            metaseqId,
                                            json.dumps(vepResult),
                                            algInvocId))

                            if args.commit:
                                database.commit()
                            else:
                                database.rollback()

                        if lineCount % 50 == 0:
                            warning("Parsed", lineCount, "lines.", file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants", file=lfh, flush=True)

                    except Exception:
                        # log the failing variant and line, signal failure
                        # on stdout, then re-raise
                        warning("ERROR parsing variant", metaseqId, file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                warning("DONE - Loaded", variantCount, "missing variants", file=lfh, flush=True)
        finally:
            database.close()
    finally:
        indexer.close()
def load_annotation(chromosome):
    ''' extract basic SNP information from VCF line; check against AnnotatedDB; if missing
    load

    reads 00-All.<chromosome>.vcf.gz (00-All.MT.vcf.gz for 'M') from
    args.dir; only entries whose alt alleles include '.' are considered;
    when variant_is_missing() reports the refSNP id absent, one row per
    alt allele is inserted with INSERT_SQL ('.' is stored as '?'); every
    insert is committed when args.commit is set and rolled back otherwise

    progress is logged to <file name>.log in args.logDir; relies on the
    module-level algInvocId for the row's algorithm invocation id

    the memory map and the database and indexer connections are closed
    even when a line fails

    chromosome -- chromosome label without the 'chr' prefix
    '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)
    try:
        database = Database(args.gusConfigFile)
        database.connect()
        try:
            fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
              else '00-All.' + xstr(chromosome) + '.vcf.gz'

            logFname = path.join(args.logDir, fname + '.log')
            fname = path.join(args.dir, fname)

            lineCount = 0
            variantCount = 0

            warning("Parsing", fname)
            warning("Logging:", logFname)
            with database.cursor() as cursor, \
              open(fname, 'r') as fhandle, \
              open(logFname, 'w') as lfh:
                warning("Parsing", fname, file=lfh, flush=True)
                # put file in swap; the map is released when the block exits
                with mmap.mmap(fhandle.fileno(), 0,
                               prot=mmap.PROT_READ) as mappedFile, \
                  gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                    for rawLine in gfh:
                        lineCount = lineCount + 1

                        # GzipFile yields bytes; decode so the str
                        # operations below work
                        line = rawLine.decode('utf-8') \
                          if isinstance(rawLine, bytes) else rawLine

                        if line.startswith('#'):
                            continue

                        entry = VcfEntryParser(line.rstrip())

                        # there are json formatting issues w/the input str
                        # so store the parsed entry (a dict) instead; it is
                        # the only content so it matches the annotated variants
                        vepResult = {'input': entry.get_entry()}

                        refSnpId = entry.get_refsnp()

                        refAllele = entry.get('ref')
                        altAllele = entry.get('alt').split(',')

                        if '.' not in altAllele:
                            if lineCount % 25000 == 0:
                                warning("Parsed",
                                        lineCount,
                                        "lines.",
                                        file=lfh,
                                        flush=True)
                            continue  # sanity check/all missing are lacking alt alleles

                        chrom = xstr(entry.get('chrom'))
                        if chrom == 'MT':
                            # the MT file is the one read for chromosome 'M'
                            chrom = 'M'
                        position = int(entry.get('pos'))

                        isMultiAllelic = len(altAllele) > 1

                        try:
                            if variant_is_missing(database, 'chr' + chrom,
                                                  refSnpId):
                                warning(refSnpId,
                                        "- MISSING -- LOADING",
                                        file=lfh,
                                        flush=True)
                                for alt in altAllele:
                                    if alt == '.':  # checking again in case some are multiallelic
                                        alt = '?'
                                    variantCount = variantCount + 1
                                    metaseqId = ':'.join(
                                        (chrom, xstr(position), refAllele, alt))
                                    positionEnd = entry.infer_variant_end_location(
                                        alt)

                                    binIndex = indexer.find_bin_index(
                                        chrom, position, positionEnd)
                                    cursor.execute(
                                        INSERT_SQL,
                                        ('chr' + chrom, xstr(position),
                                         isMultiAllelic,
                                         binIndex, refSnpId, metaseqId,
                                         json.dumps(vepResult), algInvocId))

                                    if args.commit:
                                        database.commit()
                                    else:
                                        database.rollback()

                            if lineCount % 25000 == 0:
                                warning("Parsed",
                                        lineCount,
                                        "lines.",
                                        file=lfh,
                                        flush=True)
                                warning("Loaded",
                                        variantCount,
                                        "missing variants",
                                        file=lfh,
                                        flush=True)

                        except Exception:
                            # log the failing variant and line, then re-raise
                            warning("ERROR parsing variant",
                                    refSnpId,
                                    file=lfh,
                                    flush=True)
                            warning(lineCount, ":", line, file=lfh, flush=True)
                            raise

                if not args.commit:
                    database.rollback()
                    warning("DONE -- rolling back")

                # summary must be written while the log file is still open
                warning("DONE - Loaded",
                        variantCount,
                        "missing variants",
                        file=lfh,
                        flush=True)
        finally:
            database.close()
    finally:
        indexer.close()
Example #6
0
def update_variant_records_from_vcf():
    ''' lookup and update variant records from a VCF file
    assuming the load by file was called by a plugin, so this variant has already been
    verified to be new to the resource; no need to check alternative metaseq IDs

    reads args.vcfFile (header lines starting with '#' are skipped), builds
    the metaseq id chrom:pos:ref:alt for each line and hands it to
    update_variant_record() as an INDEL when ref or alt is longer than one
    base, otherwise as an SNV; every args.commitAfter lines the buffered
    update SQL is executed and committed (args.commit) or rolled back

    on failure the error is logged, the transaction rolled back and "FAIL"
    printed to stdout before the exception is re-raised; the database
    connection is closed on every path
    '''

    cupdater = CADDUpdater(args.logFile, args.databaseDir)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    entry = None  # bound up front so the error handler can always log it
    try:
        with database.cursor() as updateCursor, \
          open(args.vcfFile, 'r') as fh:
            try:
                for line in fh:
                    if line.startswith("#"):
                        continue

                    lineCount = lineCount + 1
                    entry = VcfEntryParser(line.rstrip())

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')
                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))
                    metaseqId = ':'.join(
                        (chrom, xstr(position), refAllele, altAllele))

                    record = {"metaseq_id": metaseqId}  # mimic "record"

                    # multi-base ref or alt is handled as an INDEL
                    if len(refAllele) > 1 or len(altAllele) > 1:
                        update_variant_record(record, cupdater, INDEL)
                    else:
                        update_variant_record(record, cupdater, SNV)

                    if lineCount % args.commitAfter == 0:
                        warning("Processed:",
                                lineCount,
                                "- SNVs:",
                                cupdater.get_update_count(SNV),
                                "- INDELS:",
                                cupdater.get_update_count(INDEL),
                                " - Not Matched:",
                                cupdater.get_not_matched_count(),
                                file=cupdater.lfh(),
                                flush=True)

                        # nothing may be buffered when no line in this batch
                        # matched; skip the execute in that case
                        if cupdater.buffered_variant_count() > 0:
                            updateCursor.execute(cupdater.sql_buffer_str())

                        if args.commit:
                            database.commit()
                        else:
                            database.rollback()
                        cupdater.clear_update_sql()

                if cupdater.buffered_variant_count() > 0:  # trailing
                    updateCursor.execute(cupdater.sql_buffer_str())

                    if args.commit:
                        database.commit()
                    else:
                        database.rollback()

                warning("DONE - Updated SNVs:",
                        cupdater.get_update_count(SNV),
                        "- Updated INDELS:",
                        cupdater.get_update_count(INDEL),
                        "- Not Matched:",
                        cupdater.get_not_matched_count(),
                        file=cupdater.lfh(),
                        flush=True)

                cupdater.close_lfh()

            except Exception as e:
                warning(e, entry, file=cupdater.lfh(), flush=True)
                database.rollback()
                print("FAIL", file=sys.stdout)
                raise
    finally:
        database.close()
Example #7
0
def update_variant_records(chromosome):
    ''' update the records of one chromosome's variant table

    selects metaseq_id and cadd_scores from Variant_chr<chromosome>, skips
    rows whose cadd_scores is already set, and hands every other row to
    update_variant_record() as an INDEL when ref or alt is longer than one
    base, otherwise as an SNV; whenever the total update count reaches a
    multiple of args.commitAfter the buffered update SQL is executed, then
    committed (args.commit) or rolled back, and the buffer is cleared

    progress is logged to chr<chromosome>.log in args.logFilePath; the
    database connection is closed on every path

    chromosome -- chromosome label without the 'chr' prefix
    '''
    chrLabel = 'chr' + xstr(chromosome)

    logFileName = path.join(args.logFilePath, chrLabel + '.log')
    cupdater = CADDUpdater(logFileName, args.databaseDir)
    cupdater.setChrm(chrLabel)

    selectSQL = "SELECT metaseq_id, cadd_scores FROM Variant_" + chrLabel

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    skipCount = 0

    try:
        with database.cursor("RealDictCursor") as selectCursor, \
            database.cursor() as updateCursor:
            try:
                warning("Fetching",
                        chrLabel,
                        "variants",
                        file=cupdater.lfh(),
                        flush=True)
                selectCursor.execute(selectSQL)
                warning("DONE - Fetching", file=cupdater.lfh(), flush=True)

                for record in selectCursor:
                    if args.debug and args.veryVerbose:
                        warning(record, file=cupdater.lfh(), flush=True)

                    if record['cadd_scores'] is not None:
                        if args.debug and args.veryVerbose:
                            warning("Skipping",
                                    record['metaseq_id'],
                                    file=cupdater.lfh(),
                                    flush=True)
                        skipCount = skipCount + 1
                        continue

                    lineCount = lineCount + 1

                    metaseqId = record['metaseq_id']

                    # the id must have exactly four ':'-separated parts;
                    # only the alleles are used here
                    _chrm, _position, refAllele, altAllele = metaseqId.split(':')

                    # multi-base ref or alt is handled as an INDEL
                    if len(refAllele) > 1 or len(altAllele) > 1:
                        update_variant_record(record, cupdater, INDEL)
                    else:
                        update_variant_record(record, cupdater, SNV)

                    if cupdater.get_total_update_count() % args.commitAfter == 0 \
                      and cupdater.buffered_variant_count() > 0:
                        if args.debug:
                            warning("Starting Update",
                                    file=cupdater.lfh(),
                                    flush=True)

                        # run the batch in both modes so a dry run exercises
                        # the SQL and the buffer stays bounded
                        updateCursor.execute(cupdater.sql_buffer_str())

                        if args.debug:
                            warning("Done", file=cupdater.lfh(), flush=True)

                        if args.commit:
                            database.commit()
                        else:
                            database.rollback()
                        cupdater.clear_update_sql()

                        warning(metaseqId,
                                "- Processed:",
                                lineCount,
                                "- SNVs:",
                                cupdater.get_update_count(SNV),
                                "- INDELS:",
                                cupdater.get_update_count(INDEL),
                                "- Skipped:",
                                skipCount,
                                " - Not Matched:",
                                cupdater.get_not_matched_count(),
                                file=cupdater.lfh(),
                                flush=True)

                if cupdater.buffered_variant_count() > 0:
                    updateCursor.execute(cupdater.sql_buffer_str())
                    if args.commit:  # trailing
                        database.commit()
                    else:
                        database.rollback()

                warning("DONE - Updated SNVs:",
                        cupdater.get_update_count(SNV),
                        "- Updated INDELS:",
                        cupdater.get_update_count(INDEL),
                        "- Skipped",
                        skipCount,
                        "- Not Matched:",
                        cupdater.get_not_matched_count(),
                        file=cupdater.lfh(),
                        flush=True)
                cupdater.close_lfh()

            except Exception as e:
                warning(e, file=cupdater.lfh(), flush=True)
                # NOTE(review): in commit mode the handler commits instead of
                # rolling back; presumably to keep finished work -- confirm
                if args.commit:
                    database.commit()
                else:
                    database.rollback()
                raise
    finally:
        database.close()
Example #8
0
    parser.add_argument('-r', '--rankingFile', required=True,
                        help="full path to ADSP VEP consequence ranking file")
    parser.add_argument('--commit', action='store_true', help="run in commit mode", required=False)
    parser.add_argument('--gusConfigFile',
                        help="full path to gus config file, else assumes $GUS_HOME/config/gus.config")
    parser.add_argument('--skipVerification', help="skip check against DB (for avoiding duplications")
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()


    VARIANT_COLUMNS = qw('chromosome location bin_index metaseq_id allele_frequencies adsp_most_severe_consequence adsp_ranked_consequences vep_output row_algorithm_id', returnTuple=True)

    algInvocation = AlgorithmInvocation('load_vep_result.py', json.dumps(vars(args)), args.commit, args.gusConfigFile)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()

    warning("Algorithm Invocation ID", algInvocId)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    load_annotation()

    database.close()
    indexer.close()
    print(algInvocId, file=sys.stdout)