Example #1
0
    def test_get_delta_score_acceptor(self):
        """Check get_delta_scores for the acceptor test variant.

        The same A>C substitution at position 94077 is submitted with both
        the '10' and 'chr10' contig spellings, against both ``self.ann`` and
        ``self.ann_without_prefix``, using distance 500 and mask 0. Every
        combination must produce the same single annotation string.
        """
        expected = ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']

        for contig in ('10', 'chr10'):
            record = Record(contig, 94077, 'A', ['C'])
            for annotator in (self.ann, self.ann_without_prefix):
                scores = get_delta_scores(record, annotator, 500, 0)
                self.assertEqual(scores, expected)
    def test_get_delta_score_donor(self):
        """Check get_delta_scores for the donor test variant.

        The same C>T substitution at position 94555 is submitted with both
        the '10' and 'chr10' contig spellings, against both ``self.ann`` and
        ``self.ann_without_prefix``, relying on the default distance and
        mask. Every combination must produce the same single annotation
        string.
        """
        expected = ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']

        for contig in ('10', 'chr10'):
            record = Record(contig, 94555, 'C', ['T'])
            for annotator in (self.ann, self.ann_without_prefix):
                scores = get_delta_scores(record, annotator)
                self.assertEqual(scores, expected)
Example #3
0
def run_serial(args):
    """
    Annotate the input VCF record by record (serial mode).

    Opens ``args.I`` for reading, adds the SpliceAI INFO definition to its
    header and writes every record to ``args.O``. Records for which
    ``get_delta_scores`` returns a non-empty result get that result stored
    under ``INFO/SpliceAI``. ``args.R`` and ``args.A`` are handed to
    ``Annotator``; ``args.D`` and ``args.M`` are handed to
    ``get_delta_scores``.

    If either file cannot be opened, the error is logged and the process
    exits with status 1.
    """
    # Local import: only part of the file is visible from this block.
    import sys

    try:
        vcf = pysam.VariantFile(args.I)
    except (IOError, ValueError) as e:
        logging.error('{}'.format(e))
        # Bare exit() reported success (status 0) on failure.
        sys.exit(1)

    try:
        header = vcf.header
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3 variant '
            'annotation. These include delta scores (DS) and delta positions (DP) for '
            'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
            'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )
        try:
            output = pysam.VariantFile(args.O, mode='w', header=header)
        except (IOError, ValueError) as e:
            logging.error('{}'.format(e))
            sys.exit(1)

        try:
            ann = Annotator(args.R, args.A)
            for record in vcf:
                scores = get_delta_scores(record, ann, args.D, args.M)
                if len(scores) > 0:
                    record.info['SpliceAI'] = scores
                output.write(record)
        finally:
            # Close the output even if scoring fails part-way through.
            output.close()
    finally:
        vcf.close()
Example #4
0
    def test_get_delta_score_donor(self):
        """Check get_delta_scores for a predicted donor.

        A minimal stand-in record class exposes the four attributes set
        below (chrom, pos, ref, alts) as class attributes.
        """
        class Record():
            chrom = '10'
            pos = 94555
            ref = 'C'
            alts = ['T']

        expected = ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']
        observed = get_delta_scores(Record(), self.ann)
        self.assertEqual(observed, expected)
Example #5
0
    def test_get_delta_score_acceptor(self):
        """Check get_delta_scores for a predicted acceptor.

        A minimal stand-in record class exposes the four attributes set
        below (chrom, pos, ref, alts) as class attributes.
        """
        class Record():
            chrom = '10'
            pos = 94077
            ref = 'A'
            alts = ['C']

        expected = ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']
        observed = get_delta_scores(Record(), self.ann)
        self.assertEqual(observed, expected)
Example #6
0
def main():
    """Command-line entry point: annotate a VCF, optionally using prescored files.

    Reads options via ``get_options()``. ``args.I``/``args.O`` are the input
    and output VCFs, ``args.R``/``args.A`` are passed to ``Annotator``,
    ``args.D``/``args.M`` are passed to ``get_delta_scores``, and ``args.P``
    is an iterable of prescored VCF paths whose open handles are also passed
    to ``get_delta_scores``.

    Logs an error and exits with status 1 when a required option is missing
    or the input/output file cannot be opened. A prescored file that cannot
    be opened is logged and scoring continues with the files opened so far.
    """
    # Local import: only part of the file is visible from this block.
    import sys

    args = get_options()

    if None in [args.I, args.O, args.D, args.M]:
        logging.error(
            'Usage: spliceai [-h] [-I [input]] [-O [output]] -R reference -A annotation '
            '[-D [distance]] [-M [mask]]')
        # Bare exit() reported success (status 0) on failure.
        sys.exit(1)

    try:
        vcf = pysam.VariantFile(args.I)
    except (IOError, ValueError) as e:
        logging.error('{}'.format(e))
        sys.exit(1)

    prescored_files = []
    try:
        header = vcf.header
        # A space was missing after "splice site.", which fused two sentences
        # in the emitted header description.
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3 variant '
            'annotation. These include delta scores (DS) and delta positions (DP) for '
            'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
            'This version also includes the distance (DIST) to the nearest splice site. '
            'Format: ALLELE|SYMBOL|DIST|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )

        try:
            output = pysam.VariantFile(args.O, mode='w', header=header)
        except (IOError, ValueError) as e:
            logging.error('{}'.format(e))
            sys.exit(1)

        try:
            # Load prescored files (best effort). ``args.P or []`` tolerates
            # the option being absent (None) instead of raising TypeError.
            try:
                for filename in args.P or []:
                    prescored_files.append(pysam.VariantFile(filename))
            except (IOError, ValueError) as e:
                logging.error('{}'.format(e))

            ann = Annotator(args.R, args.A)

            for record in vcf:
                scores = get_delta_scores(record, ann, args.D, args.M, prescored_files)
                if len(scores) > 0:
                    record.info['SpliceAI'] = scores
                output.write(record)
        finally:
            # Close the output even if scoring fails part-way through.
            output.close()
    finally:
        for prescored in prescored_files:
            prescored.close()
        vcf.close()
Example #7
0
def main():
    """Command-line entry point: annotate every record of a VCF with SpliceAI scores.

    Reads options via ``get_options()``, opens ``args.I``, adds the SpliceAI
    INFO definition to its header and writes each record to ``args.O``.
    Records for which ``get_delta_scores`` returns a non-empty result get it
    stored under ``INFO/SpliceAI``. ``args.R`` and ``args.A`` are passed to
    ``annotator``.

    Errors opening either file propagate to the caller unchanged.
    """
    args = get_options()

    vcf = pysam.VariantFile(args.I)
    try:
        header = vcf.header
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAI variant '
            'annotation. These include delta scores (DS) and delta positions (DP) for '
            'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
            'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )
        output = pysam.VariantFile(args.O, mode='w', header=header)
        try:
            ann = annotator(args.R, args.A)

            for record in vcf:
                scores = get_delta_scores(record, ann)
                if len(scores) > 0:
                    record.info['SpliceAI'] = scores
                output.write(record)
        finally:
            # Neither file was closed before; closing the output makes sure
            # buffered records are flushed to disk.
            output.close()
    finally:
        vcf.close()
Example #8
0
def main():
    """Command-line entry point: annotate a VCF and add extra header lines.

    Reads options via ``get_options()``. ``args.I``/``args.O`` are the input
    and output VCFs, ``args.R``/``args.A`` are passed to ``Annotator`` and
    ``args.D``/``args.M`` to ``get_delta_scores``. Besides the SpliceAI INFO
    definition, fixed ``fileDate``, ``reference`` and dummy ``NS`` header
    lines are added before the output is opened.

    Logs an error and exits with status 1 when a required option is missing
    or the input/output file cannot be opened.
    """
    # Local import: only part of the file is visible from this block.
    import sys

    args = get_options()

    if None in [args.I, args.O, args.D, args.M]:
        logging.error(
            'Usage: spliceai [-h] [-I [input]] [-O [output]] -R reference -A annotation '
            '[-D [distance]] [-M [mask]]')
        # Bare exit() reported success (status 0) on failure.
        sys.exit(1)

    try:
        vcf = pysam.VariantFile(args.I)
    except (IOError, ValueError) as e:
        logging.error('{}'.format(e))
        sys.exit(1)

    try:
        header = vcf.header
        # Add header lines required to satisfy the VCF format for output.write().
        # VCF meta-information lines start with exactly two '#'; the previous
        # '###fileDate' spelling produced a malformed key.
        header.add_line('##fileDate=20191004')
        header.add_line('##reference=GRCh37/hg19')
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant '
            'annotation. These include delta scores (DS) and delta positions (DP) for '
            'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
            'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )
        header.add_line(
            '##INFO=<ID=NS,Number=20000,Type=Integer,Description="Dummy NS">')

        try:
            output = pysam.VariantFile(args.O, mode='w', header=header)
        except (IOError, ValueError) as e:
            logging.error('{}'.format(e))
            sys.exit(1)

        try:
            ann = Annotator(args.R, args.A)

            for record in vcf:
                scores = get_delta_scores(record, ann, args.D, args.M)
                if len(scores) > 0:
                    record.info['SpliceAI'] = scores
                output.write(record)
        finally:
            # Close the output even if scoring fails part-way through.
            output.close()
    finally:
        vcf.close()
Example #9
0
def process_record(records, results, ref_fasta, annotations, dist_var, mask):
    """Worker loop: score variants from ``records`` until the 'END' marker.

    Args:
        records: Queue of variants to score; the value 'END' marks the end
            of the work.
        results: Queue receiving one ``(record.id, scores)`` tuple per
            scored variant.
        ref_fasta: Passed to ``Annotator`` as its first argument.
        annotations: Passed to ``Annotator`` as its second argument.
        dist_var: Passed to ``get_delta_scores`` as the distance argument.
        mask: Passed to ``get_delta_scores`` as the mask argument.
    """
    # Create the annotation helper used for every variant scored here.
    ann = Annotator(ref_fasta, annotations)
    # Listen on the queue.
    while True:
        # Block until a variant (or the end marker) is available. The former
        # get_nowait()/continue polling spun the CPU while the queue was empty.
        record = records.get()
        # Check whether the queue has finished.
        if record != 'END':
            # Score the variant and put the result on the results queue.
            scores = get_delta_scores(record, ann, dist_var, mask)
            results.put((record.id, scores))
        else:
            # End of queue: put the end marker back so the other worker
            # processes terminate as well.
            records.put('END')
            break
Example #10
0
def process_record(ann, distance, mask, record):
    """Score ``record`` and attach the result to it when there is one.

    Calls ``get_delta_scores(record, ann, distance, mask)``. A non-empty
    result is stored in ``record.info['SpliceAI']``. The same record object
    is returned either way.
    """
    delta_scores = get_delta_scores(record, ann, distance, mask)
    if len(delta_scores) == 0:
        # Nothing to annotate; hand the record back untouched.
        return record
    record.info['SpliceAI'] = delta_scores
    return record
Example #11
0
def process_variant(variant, genome_version, spliceai_distance, spliceai_mask, use_precomputed_scores):
    """Return SpliceAI scores for one variant, or a dict describing the error.

    Precomputed scores are looked up in ``SPLICEAI_CACHE_FILES`` when
    ``use_precomputed_scores`` is "1", the requested distance equals
    ``SPLICEAI_DEFAULT_DISTANCE`` and the allele-length condition holds. If
    no lookup is attempted or it yields nothing, the scores are computed with
    ``get_delta_scores``, subject to the per-user rate limit.

    Args:
        variant: Variant description accepted by ``parse_variant``.
        genome_version: "37" or "38"; selects the annotation interval trees
            and the annotator.
        spliceai_distance: Distance argument passed to ``get_delta_scores``.
        spliceai_mask: Mask argument passed to ``get_delta_scores``; "1"
            selects the masked cache file and "0" the raw one.
        use_precomputed_scores: "1" (compared as a string) enables the lookup.

    Returns:
        dict: On success, the keys ``variant``, ``genome_version``, ``chrom``,
        ``pos``, ``ref``, ``alt``, ``scores`` (allele field removed) and
        ``source`` ("lookup" or "computed"). On failure, the keys ``variant``
        and ``error``.
    """
    try:
        chrom, pos, ref, alt = parse_variant(variant)
    except ValueError as e:
        return {
            "variant": variant,
            "error": f"ERROR: {e}",
        }

    if len(ref) > 1 and len(alt) > 1:
        return {
            "variant": variant,
            "error": f"ERROR: SpliceAI does not currently support complex InDels like {chrom}-{pos}-{ref}-{alt}",
        }

    # Generate an error message if the variant falls outside annotated exons
    # or introns.
    #
    # NOTE: The reason SpliceAI currently works only for variants within
    # annotated exons or introns is that, although the SpliceAI neural net
    # takes any arbitrary nucleotide sequence as input, SpliceAI needs 1) the
    # transcript strand to determine whether to reverse-complement the
    # reference genome sequence before passing it to the neural net, and
    # 2) transcript start and end positions to determine where to truncate
    # the reference genome sequence.
    OTHER_GENOME_VERSION = {"37": "38", "38": "37"}
    chrom_without_chr = chrom.replace("chr", "")
    if not ANNOTATION_INTERVAL_TREES[genome_version][chrom_without_chr].at(pos):
        other_genome_version = OTHER_GENOME_VERSION[genome_version]
        other_genome_overlapping_intervals = ANNOTATION_INTERVAL_TREES[other_genome_version][chrom_without_chr].at(pos)
        if other_genome_overlapping_intervals:
            # The position is annotated in the other genome build, so hint
            # that the wrong build may have been selected.
            other_genome_genes = " and ".join(
                sorted({str(i.data).split("---")[0] for i in other_genome_overlapping_intervals}))
            return {
                "variant": variant,
                # A space was missing after "introns.", which fused two
                # sentences in the message shown to the user.
                "error": f"ERROR: In GRCh{genome_version}, {chrom}-{pos}-{ref}-{alt} falls outside all gencode exons and introns. "
                         f"SpliceAI only works for variants within known exons or introns. However, in GRCh{other_genome_version}, "
                         f"{chrom}:{pos} falls within {other_genome_genes}, so perhaps GRCh{genome_version} is not the correct genome version?"
            }

        return {
            "variant": variant,
            "error": f"ERROR: {chrom}-{pos}-{ref}-{alt} falls outside all Gencode exons and introns on "
                     f"GRCh{genome_version}. SpliceAI only works for variants that are within known exons or introns.",
        }

    source = None
    scores = []
    # NOTE(review): the allele-length test uses `or`, so long alleles still
    # trigger a lookup that will presumably find nothing - confirm whether
    # `and` was intended.
    if (len(ref) <= 5 or len(alt) <= 2) and str(spliceai_distance) == str(SPLICEAI_DEFAULT_DISTANCE) and str(use_precomputed_scores) == "1":
        # examples: ("masked", "snv", "hg19")  ("raw", "indel", "hg38")
        key = (
            "masked" if str(spliceai_mask) == "1" else ("raw" if str(spliceai_mask) == "0" else None),
            "snv" if len(ref) == 1 and len(alt) == 1 else "indel",
            "hg19" if genome_version == "37" else ("hg38" if genome_version == "38" else None),
        )
        try:
            # NOTE(review): `chrom` is used as-is here, while the interval-tree
            # check strips a "chr" prefix - verify against the contig naming
            # of the cache files.
            results = SPLICEAI_CACHE_FILES[key].fetch(chrom, pos-1, pos+1)
            for line in results:
                # ['1', '739023', '.', 'C', 'CT', '.', '.', 'SpliceAI=CT|AL669831.1|0.00|0.00|0.00|0.00|-1|-37|-48|-37']
                fields = line.split("\t")
                if fields[0] == chrom and int(fields[1]) == pos and fields[3] == ref and fields[4] == alt:
                    scores.append(fields[7])
            if scores:
                source = "lookup"

        except Exception as e:
            # Best effort: a failed lookup falls through to computing the scores.
            print(f"ERROR: couldn't retrieve scores using tabix: {type(e)}: {e}", flush=True)

    if not scores:
        if exceeds_rate_limit(request.remote_addr, request_type="SpliceAI: computed"):
            return {
                "variant": variant,
                "error": f"ERROR: Rate limit reached. To prevent a user from overwhelming the server and making it "
                         f"unavailable to other users, this tool allows no more than "
                         f"{RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE['SpliceAI: computed']} computed requests per minute per user.",
            }

        record = VariantRecord(chrom, pos, ref, alt)
        try:
            scores = get_delta_scores(
                record,
                SPLICEAI_ANNOTATOR[genome_version],
                spliceai_distance,
                spliceai_mask)
            source = "computed"
        except Exception as e:
            return {
                "variant": variant,
                "error": f"ERROR: {type(e)}: {e}",
            }

    if not scores:
        return {
            "variant": variant,
            "error": f"ERROR: The SpliceAI model did not return any scores for {variant}. This is typically due to the "
                     f"variant falling outside of all Gencode exons and introns.",
        }

    scores = [s[s.index("|")+1:] for s in scores]  # drop allele field

    return {
        "variant": variant,
        "genome_version": genome_version,
        "chrom": chrom,
        "pos": pos,
        "ref": ref,
        "alt": alt,
        "scores": scores,
        "source": source,
    }