Example #1
0
def test_add_flag():
    """Round-trip a Flag-typed INFO field through a Writer and a fresh reader.

    A flag set to True must be readable from the written file; a flag set to
    False must be absent, so looking it up raises KeyError.
    """
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({'ID': 'myflag', 'Description': 'myflag',
        'Type':'Flag', 'Number': '0'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    # The next() builtin works on Python 2 and 3; the .next() method is
    # Python 2 only.
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["myflag"] is None, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close the writer before reading the file back, exactly as the first
    # half of this test does.
    w.close()
    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
Example #2
0
def test_ibd():
    """ibd() on three selected samples yields three results keyed by sample pair."""
    wanted = ['101976-101976', '100920-100920', '100231-100231']
    reader = VCF(VCF_PATH, gts012=True, samples=wanted)
    pair_results = reader.ibd()
    n_results = len(pair_results)
    assert n_results == 3, (n_results)
    # Keys are tuples of byte-string sample names.
    first_pair = (b'101976-101976', b'100920-100920')
    assert len(pair_results[first_pair]) > 0
Example #3
0
def test_relatedness():
    """The first item produced by relatedness() exposes the IBS keys."""
    reader = VCF(VCF_PATH, gts012=True)
    first = next(iter(reader.relatedness(gap=0, linkage_max=2)))
    for expected_key in ("ibs0", "ibs2", "ibs2*"):
        assert expected_key in first
Example #4
0
    def __init__(self, filename, reference=None):
        """Open ``filename`` and cache header-derived lookups on the instance.

        ``filename`` must be an existing file or "-"; otherwise the process
        exits with an error message. When ``reference`` is truthy it is
        stored together with the result of ``resolve_reference_genome``.
        """
        # Check that the file exists ("-" is accepted without a file check).
        is_missing = filename != "-" and not os.path.isfile(filename)
        if is_missing:
            exit(message("Error: " + filename + " does not exist"))
        self.filename = filename
        if reference:
            self.reference = reference
            self.reference_file = resolve_reference_genome(reference)

        cyvcf2.__init__(self, self.filename)
        self.n = len(self.samples)  # Number of Samples

        header_text = self.raw_header

        # Meta data: simple ``##key=value`` header lines (structured ``<...>``
        # lines are excluded by the character classes).
        meta_pattern = re.compile(r'''^##(?P<key>[^<#]+?)=(?P<val>[^<#]+$)''', re.M)
        self.metadata = OrderedDict(meta_pattern.findall(header_text))

        # Contigs: map contig ID -> length, paired by order of appearance.
        contig_ids = re.findall("##contig=<ID=(.*?),", header_text)
        contig_lengths = re.findall("##contig.*length=([^,>]*?)>", header_text)
        self.contigs = OrderedDict(zip(contig_ids, map(int, contig_lengths)))

        def _header_records(kind):
            # Header records whose ``type`` attribute equals ``kind``.
            return [rec for rec in self.header_iter() if rec.type == kind]

        self.info_set = _header_records("INFO")
        self.filter_set = _header_records("FILTER")
        self.format_set = _header_records("FORMAT")
        self.header = copy(self.raw_header)
Example #5
0
def get_variant_type(variant_source):
    """Try to find out what type of variants that exists in a variant source

    The leading records of the source are scanned (the scan stops once the
    record index exceeds 1000). As soon as one SNV is seen the answer is
    'snv'; if none is seen the answer stays 'sv'.

        Args:
            variant_source (str): Path to variant source

        Returns:
            variant_type (str): 'sv' or 'snv'
    """
    file_type = get_file_type(variant_source)
    variant_type = 'sv'
    if file_type == 'vcf':
        variants = VCF(variant_source)
    elif file_type == 'gemini':
        variants = GeminiQuery(variant_source)
        gemini_query = "SELECT * from variants"
        variants.run(gemini_query)

    for i, variant in enumerate(variants):
        if file_type == 'vcf':
            found_snv = variant.is_snp
        else:
            # Only 'vcf' and 'gemini' define `variants`, so this is gemini.
            found_snv = variant['type'] == 'snp'

        if found_snv:
            # One SNV decides the result; no need to read further records.
            variant_type = 'snv'
            break

        if i > 1000:
            break

    return variant_type
Example #6
0
def _af_filter(data, in_file, out_file):
    """Soft-filter variants with AF below min_allele_fraction (appends "MinAF" to FILTER)

    Every record of ``in_file`` is written out; records whose FORMAT/AF values
    for the tumor sample are all below the threshold additionally get the
    MinAF filter. Returns the result of bgzipping and indexing the output.
    """
    af_key = ("algorithm", "min_allele_fraction")
    min_freq = float(utils.get_in(data["config"], af_key, 10)) / 100.0
    logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    already_done = utils.file_exists(ungz_out_file) or utils.file_exists(ungz_out_file + ".gz")
    if not already_done:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            if utils.get_in(data["config"], af_key):
                threshold_origin = '(configured in bcbio as min_allele_fraction)'
            else:
                threshold_origin = ('(default threshold in bcbio; override with '
                                    'min_allele_fraction in the algorithm section)')
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + threshold_origin})
            w = Writer(tx_out_file, vcf)
            # GATK 3.x can produce VCFs without sample names for empty VCFs
            try:
                tumor_index = vcf.samples.index(dd.get_sample_name(data))
            except ValueError:
                tumor_index = None
            for rec in vcf:
                below_threshold = (tumor_index is not None
                                   and np.all(rec.format('AF')[tumor_index] < min_freq))
                if below_threshold:
                    vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Example #7
0
def test_hrec():
    """Every header record that is not GENERIC must carry an 'ID' entry."""
    reader = VCF(VCF_PATH)
    for record in reader.header_iter():
        fields = record.info()
        if fields['HeaderType'] == 'GENERIC':
            continue
        assert 'ID' in fields
Example #8
0
def test_set_samples():
    """set_samples() restricts both the sample list and per-variant genotypes."""
    reader = VCF(VCF_PATH)
    n_initial = len(reader.samples)
    assert n_initial == 189, n_initial
    kept = [reader.samples[2]]
    reader.set_samples(kept)
    assert len(reader.samples) == 1
    variant = next(reader)
    assert len(variant.gt_types) == 1
Example #9
0
def test_header_stuff():
    """Count FORMAT and INFO header records reported by header_iter()."""
    vcf = VCF('{}/test.vcf.gz'.format(HERE))
    seen_formats, seen_infos = 0, 0
    for h in vcf.header_iter():
        fields = h.info(extra=True)
        assert isinstance(fields, dict)
        # A bool adds 1 when the record is of the given type, 0 otherwise.
        seen_formats += fields['HeaderType'] == 'FORMAT'
        seen_infos += fields['HeaderType'] == 'INFO'
    assert seen_formats == 9, seen_formats
    assert seen_infos == 73, seen_infos
Example #10
0
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and soft-filter variants with AF<min_allele_fraction

    All records are written to the output; somatic records whose tumor AF is
    below the threshold additionally get "MinAF" added to FILTER.

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)           AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)           AD = ref_count,alt_counts

    Returns the result of bgzipping and indexing the uncompressed output.
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:,0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format('TIR')[:,0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:,0]
                else:
                    ad = rec.format('AD')  # AD=REF,ALT1,ALT2,...
                    if ad is not None:  # germline?
                        alt_counts = ad[:,1:]
                        # keepdims gives dp one column per sample so it divides
                        # each sample's row of alt counts; a flat (n,) vector
                        # would broadcast along the allele axis instead.
                        dp = np.sum(ad, axis=1, keepdims=True)
                    else:  # germline gVCF record
                        alt_counts, dp = (None, None)
                if dp is not None:
                    with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Example #11
0
def test_set_format_int3():
    "test that we can handle multiple (in this case 3) values per sample"
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="P3", Number=3, Type="Integer", Description="Int example")) == 0
    v = next(vcf)
    # `np.int` was only an alias for the builtin int and has been removed from
    # NumPy; the builtin selects the same dtype.
    exp = np.array([[1, 11, 111], [2, 22, 222]], dtype=int)
    v.set_format("P3", exp)
    res = get_gt_str(v, "P3")
    assert res == ["1,11,111", "2,22,222"], (res, str(v))

    assert np.allclose(v.format("P3"), exp)
Example #12
0
def test_set_format_int():
    """set_format() accepts integer arrays of several integer dtypes."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PI", Number=1, Type="Integer", Description="Int example")) == 0
    v = next(vcf)
    # `np.int` was only an alias for the builtin int and has been removed from
    # NumPy; the builtin selects the same dtype.
    v.set_format("PI", np.array([5, 1], dtype=int))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [5, 1])

    v.set_format("PI", np.array([855, 11], dtype=np.int64))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [855, 11])

    v.set_format("PI", np.array([9998, 99911], dtype=np.int32))
    obs = fmap(float, get_gt_str(v, "PI"))
    assert allclose(obs, [9998, 99911]), obs
Example #13
0
def test_set_format_float():
    """set_format() accepts float arrays of several float dtypes."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PS", Number=1, Type="Float", Description="PS example")) == 0
    v = next(vcf)
    # `np.float` was only an alias for the builtin float and has been removed
    # from NumPy; the builtin selects the same dtype.
    v.set_format("PS", np.array([0.555, 1.111], dtype=float))
    assert allclose(fmap(float, get_gt_str(v, "PS")), np.array([0.555, 1.111]))

    v.set_format("PS", np.array([8.555, 11.111], dtype=np.float64))
    assert allclose(fmap(float, get_gt_str(v, "PS")), [8.555, 11.111])

    v.set_format("PS", np.array([9998.555, 99911.111], dtype=np.float32))
    obs = fmap(float, get_gt_str(v, "PS"))
    assert allclose(obs, [9998.555, 99911.111]), obs
Example #14
0
def test_add_filter_to_header():
    """A FILTER added to the reader header survives a write/read round trip."""
    v = VCF(VCF_PATH)
    # NOTE that we have to add the filter to the header of the reader,
    # not the writer because the record will be associated with the reader
    v.add_filter_to_header({'ID': 'abcdefg', 'Description': 'abcdefg'})

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    # The next() builtin works on Python 2 and 3; the .next() method is
    # Python 2 only.
    rec = next(v)

    rec.FILTER = ["abcdefg"]
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.FILTER == "abcdefg", v.FILTER
Example #15
0
    def validate(self, vcf_path, plot=False, king=False):
        """Compare pedigree relations with relatedness computed from a VCF.

        With ``king`` set, the work is delegated to ``run_king``. Otherwise
        pairwise relatedness is computed with cyvcf2 and a tab-separated line
        is printed for every pair whose pedigree relation (``self.relation``)
        does not fit the band its ``rel`` value falls into. Pairs for which
        ``self.relation`` returns None are skipped. With ``plot`` set, the
        relatedness figure is shown and saved to 't.png'.
        """
        if king:
            from .king import run_king
            run_king(vcf_path, self)

        else:
            from cyvcf2 import VCF
            vcf = VCF(vcf_path, gts012=True, lazy=True)
            rels = list(vcf.relatedness(min_af=0.02, n_variants=39000, gap=10000, linkage_max=1.5))
            if plot:
                fig = vcf.plot_relatedness(rels[:])
                fig.show()
                fig.savefig('t.png')

            print("sample_1\tsample_2\tped_relation\tvcf_relation\trel\tIBS")
            for rel in rels:
                sample_a, sample_b = rel['pair']
                ped_rel = self.relation(sample_a, sample_b)
                if ped_rel is None: continue
                out_line = "%s\t%s\t%s\t%s\t%.2f\t%.3f" % (sample_a, sample_b,
                        ped_rel, "|".join(rel['tags']), rel['rel'], rel['ibs'])
                # The bands are tested in order; the first matching band
                # decides whether the pair is reported.
                if rel['rel'] < 0.04:  # likely unrelated
                    if ped_rel not in ('related level 2', 'unrelated'):
                        print(out_line)
                    continue

                if rel['rel'] < 0.15:
                    if ped_rel not in ('unrelated', 'related level 2', 'distant relations'):
                        print(out_line)
                    continue

                if 0.26 < rel['rel'] < 0.78:
                    if ped_rel not in ('parent-child', 'full siblings'):
                        print(out_line)
                    continue

                # Values above 0.26 were handled by the previous band.
                if 0.15 < rel['rel'] < 0.3:
                    if ped_rel not in ('related level 2', 'unrelated'):
                        print(out_line)
                    continue

                # Compare the numeric relatedness, not the pedigree label
                # (a string), against the threshold.
                if rel['rel'] > 0.78:
                    if ped_rel not in ('identical twins', 'self'):
                        print(out_line)
                    continue
Example #16
0
def test_add_info_to_header():
    """An INFO field added to the reader header survives a write/read round trip."""
    v = VCF(VCF_PATH)
    v.add_info_to_header({'ID': 'abcdefg', 'Description': 'abcdefg',
        'Type':'Character', 'Number': '1'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    # The next() builtin works on Python 2 and 3; the .next() method is
    # Python 2 only.
    rec = next(v)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["abcdefg"] == "XXX", dict(v.INFO)
Example #17
0
def test_writer():
    """Write one record three times with different FILTER values and read them back."""
    v = VCF(VCF_PATH)
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    o = Writer(f, v)
    # The next() builtin works on Python 2 and 3; the .next() method is
    # Python 2 only.
    rec = next(v)
    rec.INFO["AC"] = "3"
    rec.FILTER = ["LowQual"]
    o.write_record(rec)

    rec.FILTER = ["LowQual", "VQSRTrancheSNP99.90to100.00"]
    o.write_record(rec)

    rec.FILTER = "PASS"
    o.write_record(rec)

    o.close()

    # Multiple filters read back joined by ';'; PASS reads back as None.
    expected = ["LowQual", "LowQual;VQSRTrancheSNP99.90to100.00", None]

    for i, variant in enumerate(VCF(f)):
        assert variant.FILTER == expected[i], (variant.FILTER, expected[i])
Example #18
0
def gvcf2coverage(threshold, merge, distance):
    """Print covered intervals of a single-sample gVCF read from stdin.

    Each record whose FORMAT/DP (first sample, first value; 0 when DP is
    absent) is at least ``threshold`` contributes its interval. Output lines
    are tab-separated: chrom, start, end, ploidy.

    Args:
        threshold: minimum depth for a record to be considered.
        merge: when false, every qualifying record is printed as is; when
            true, consecutive records are merged into windows.
        distance: maximum gap between the end of the current window and the
            start of the next record for the two to be merged.
    """
    vcf = VCF(fname='-', gts012=False, lazy=False, strict_gt=False)

    assert len(vcf.samples) == 1
    assert len(vcf.seqnames) > 0

    # True until the first qualifying record has opened a window.
    first = True

    for entry in vcf:
        dp = entry.format('DP')
        depth = 0 if dp is None else dp[0][0]

        # Records below the depth threshold contribute nothing.
        if depth < threshold:
            continue

        chrom = entry.CHROM
        start = entry.start
        end = entry.end
        ploidy = entry.ploidy

        # Without merging, each qualifying record is its own interval.
        if not merge:
            print(chrom, start, end, ploidy, sep="\t")
            continue

        if first:
            window_start = start
            window_end = end
            window_chrom = chrom
            window_ploidy = ploidy
            first = False
            continue

        # A new window starts when the chromosome or ploidy changes, or when
        # the record lies more than `distance` past the current window.
        jump = (window_chrom != chrom
                or window_ploidy != ploidy
                or window_end + distance < start)

        if jump:
            print(window_chrom,
                  window_start,
                  window_end,
                  window_ploidy,
                  sep="\t")

            window_start = start
            window_end = end
            window_chrom = chrom
            window_ploidy = ploidy
        else:
            window_start = min(window_start, start)
            window_end = max(window_end, end)

    # Once a window has been opened there is always one still pending after
    # the loop: either the window being extended, or the one opened by the
    # last jump. With no qualifying record there is nothing to print.
    if merge and not first:
        print(window_chrom, window_start, window_end, window_ploidy, sep="\t")
Example #19
0
def query_bed_region(region, vcf_path, fasta, kmer_size, singleton_path, af_path, an_path, ac_path, model_dir):
    """
    Compute observed and expected allele statistics for one region, print a
    padded summary line and return the same values as a tab-separated line.

    If fetching the sequence or computing the values raises KeyError or
    FetchError, all nine statistics are reported as 0.

    @param region: region object; provides chrom, start, stop, strand, str_name()
    @param vcf_path: path handed to VCF()
    @param fasta: path handed to Fasta()
    @param kmer_size: k-mer length; the region is padded by kmer_size // 2 on each side
    @param singleton_path: forwarded to QueryWindow
    @param af_path: forwarded to QueryWindow
    @param an_path: forwarded to QueryWindow
    @param ac_path: forwarded to QueryWindow
    @param model_dir: forwarded to QueryWindow
    @return: region name followed by NumSNVs, Singletons, AC, AN, AF,
             ExpectedSingletons, ExpectedAC, ExpectedAN, ExpectedAF,
             tab-separated and newline-terminated
    """
    # TODO: Add binning somehow (either keep equal size or equal number of bins
    vcf = VCF(vcf_path)
    fasta = Fasta(fasta)
    window = QueryWindow(kmer_size, singleton_path=singleton_path, af_path=af_path, an_path=an_path, ac_path=ac_path, model_dir=model_dir)
    # The first kmer actually begins centered around first nucleotide in sequence so
    # start position is shifted upstream by half the kmer length
    # end position is shifted downstream by the same
    shift = kmer_size // 2
    try:
        use_complement = region.strand is not None and is_dash(region.strand)
        fetched = fasta.get_seq(region.chrom, region.start - shift, region.stop + shift)
        if use_complement:
            fetched = fetched.complement
        sequence = fetched.seq.upper()
        exp = window.calculate_expected(sequence)  # this does account for strandedness
        AF, AC, AN, singletons, count = count_regional_alleles(vcf(str(region)))
        fields = (
            count,  # 'NumSNVs'
            singletons,  # 'Singletons'
            AC,  # 'AC'
            AN,  # 'AN'
            AF,  # 'AF'
            exp.get('singleton'),  # 'ExpectedSingletons'
            exp.get('AC'),  # 'ExpectedAC'
            exp.get('AN'),  # 'ExpectedAN'
            exp.get('AF'),  # 'ExpectedAF'
        )
    except (KeyError, FetchError):
        fields = (0,) * 9

    field_strs = [str(value) for value in fields]
    region_name = region.str_name()
    regname = region_name.split('\t')
    # The padded line uses the first five tab-separated parts of the name.
    columns = [str(regname[i]) for i in range(5)] + field_strs
    print(
        '{: <8} {: <12} {: <12} {: <20} {: <8} {: <10} {: <12} {: <10} {: <10} {: <24} {: <22} {: <20} {: <20} {: <20}'.format(
            *columns),
        flush=True)
    return '\t'.join([str(region_name)] + field_strs) + '\n'
Example #20
0
def test_relatedness():
    """The relatedness() result contains the 'ibs0' and 'rel' entries."""
    reader = VCF(VCF_PATH, gts012=True)
    result = reader.relatedness(gap=0, linkage_max=2)
    assert "ibs0" in result, result
    assert "rel" in result
Example #21
0
def test_raw_header():
    """raw_header starts with the fileformat line and has 185 lines."""
    reader = VCF(VCF_PATH)
    header_lines = reader.raw_header.strip().split("\n")
    first_line = header_lines[0]
    assert first_line == "##fileformat=VCFv4.1", first_line
    n_lines = len(header_lines)
    assert n_lines == 185, n_lines
Example #22
0
def test_read_flag():
    """INFO.get() for a flag agrees with the flag's presence in the record text."""
    reader = VCF(VCF_PATH)
    for variant in reader:
        present_in_text = "in_exac_flag" in str(variant)
        value_from_info = variant.INFO.get('in_exac_flag', False)
        assert present_in_text == value_from_info
Example #23
0
def test_init():
    """Constructing a VCF reader from the test file yields a truthy object."""
    reader = VCF(VCF_PATH)
    assert reader
Example #24
0
    """Filter variants with QUAL above or below cutoff"""
    ret = variant.QUAL >= cutoff
    return nonrev and ret

# Replace each configured filter value with a callable taking a variant.
# Iterate over a snapshot because the dict values are reassigned in the loop.
for name, filt in list(filters.items()):
    if name in BUILTIN_FILTERS:
        if not isinstance(filt, tuple):
            filt = (filt, )
        # Bind the builtin and its arguments as default values: a plain
        # closure would look up `name` and `filt` when called, by which time
        # later loops have rebound them.
        filters[name] = lambda variant, _func=BUILTIN_FILTERS[name], _args=filt: _func(variant, *_args)
        filters[name].__doc__ = BUILTIN_FILTERS[name].__doc__
    else:
        # NOTE(security): eval() executes the filter expression as Python
        # code; only use filter definitions from a trusted source.
        filters[name] = eval(filt)
        filters[name].__doc__ = filter_descs.get(name, filt)


# Open the input and declare one FILTER header line per configured filter,
# described by the filter callable's docstring.
invcf = VCF(infile)
for name, filt in filters.items():
    header_entry = dict(ID=name, Description=filt.__doc__)
    invcf.add_filter_to_header(header_entry)

# A ".gz" output name selects the "wz" writer mode; otherwise the writer's
# default mode is used.
writer_args = (outfile, invcf, "wz") if outfile.endswith(".gz") else (outfile, invcf)
outvcf = Writer(*writer_args)

for variant in invcf:
    for name, filt in filters.items():
        if not filt(variant):
            if not variant.FILTER:
Example #25
0

########################################################################################################################
# Run
parser = argparse.ArgumentParser(description="Convert a vcf to mixqtl formats")
parser.add_argument("-vcf")
parser.add_argument("-output_prefix")
args = parser.parse_args()

# Make sure the directory part of the output prefix exists. A prefix without
# a directory part yields "", which os.makedirs cannot create, so it is
# skipped.
folder = os.path.split(args.output_prefix)[0]
if folder and not os.path.exists(folder):
    os.makedirs(folder)

# Open the initial outputs before any variant has been read (no sample list
# is available yet, hence None).
variant_annotation, hap_1, hap_2 = activate_output(args.output_prefix,
                                                   current_chromosome, None)
vcf = VCF(args.vcf)

# Stream the VCF, switching output handles whenever the chromosome changes.
print("processing")
for i, variant in enumerate(vcf):

    chromosome = variant.CHROM
    if chromosome != current_chromosome:
        # New chromosome: close the current outputs and open a fresh set,
        # this time passing the VCF's sample names.
        deactivate_output(variant_annotation, hap_1, hap_2)
        current_chromosome = chromosome
        variant_annotation, hap_1, hap_2 = activate_output(
            args.output_prefix, current_chromosome, vcf.samples)

    # One annotation row per variant; only the first ALT allele is written.
    _write(variant_annotation, [
        variant.ID, variant.CHROM,
        str(variant.POS), variant.REF, variant.ALT[0]
    ])
def transcript_for_alt(transcripts, alt):
    """Return the transcript for ``alt`` whose 'PICK' is '1', else the first one."""
    candidates = transcripts[alt]
    picked = next((entry for entry in candidates if entry['PICK'] == '1'), None)
    if picked is None:
        return candidates[0]
    return picked


def decode_hex(string):
    """Decode the percent-escaped text of a regex match to UTF-8 text.

    ``string`` is a match object; the '%' signs are removed from its full
    match and the remaining hex digits are decoded as UTF-8 bytes.
    """
    matched_text = string.group(0)
    hex_digits = matched_text.replace('%', '')
    raw_bytes = binascii.unhexlify(hex_digits)
    return raw_bytes.decode('utf-8')


# Command line: <tsv> <vcf> <comma-separated VEP fields> <output dir>
script, tsv_filename, vcf_filename, vep_fields, output_dir = sys.argv
vep_fields_list = vep_fields.split(',')

vcf_file = VCF(vcf_filename)
csq_fields = parse_csq_header(vcf_file)

# vep[chromosome][position] -> dict, created on first sight of each key.
vep = {}
for variant in vcf_file:
    chr = str(variant.CHROM)
    pos = str(variant.POS)
    ref = str(variant.REF)
    alts = variant.ALT

    vep.setdefault(chr, {}).setdefault(pos, {})
Example #27
0
    def process_vcf(self, inputfile):
        """
        Main function for parsing VCF

        Reads ``inputfile`` record by record and accumulates, per sample (or
        per sample group when both ``args.samplefile`` and ``args.groupvar``
        are set), genotype counts for each mutation subtype listed in
        ``self.subtypes_dict``.

        Returns a namedtuple ``Out(M, samples)`` holding the count matrix and
        its row labels, or just the matrix ``M`` when ``self.par`` is truthy.
        """
        # initialize reference genome
        fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

        # initialize vcf reader
        if self.args.samplefile:
            keep_samples = parseSampleFile(self.args.samplefile)

            vcf_reader = VCF(inputfile,
                             mode='rb',
                             gts012=True,
                             lazy=True,
                             samples=keep_samples)
        else:
            vcf_reader = VCF(inputfile, mode='rb', gts012=True, lazy=True)

        # number of flanking bases taken on each side of the variant position
        nbp = (self.args.length - 1) // 2

        # index samples
        if (self.args.samplefile and self.args.groupvar):
            all_samples = vcf_reader.samples

            sg_dict = indexGroups(self.args.samplefile, self.args.groupvar)
            # rows of the matrix are the distinct group labels, sorted
            samples = sorted(list(set(sg_dict.values())))

            # get boolean vector of samples that are in sample file
            samples_keep_match = np.isin(all_samples, list(sg_dict.keys()))

            # get indices of matching samples
            samples_keep_idx = np.where(samples_keep_match)

            # get list of individual sample ids to keep
            samples_keep = sorted(list(set(sg_dict.keys())))

            util_log.debug("%s samples will be pooled into %s groups: %s",
                           len(all_samples), len(samples), ",".join(samples))
        else:
            samples = vcf_reader.samples

        # map each row label to its row index
        samples_dict = {}
        for i, sample in enumerate(samples):
            samples_dict[sample] = i

        # Query records in VCF and build matrix
        M = np.zeros((len(samples), len(self.subtypes_dict)))
        numsites_keep = 0
        numsites_skip = 0
        # name of the chromosome whose sequence is currently loaded
        chrseq = '0'
        # "add" / "delete" / "none": how to reconcile the 'chr' prefix
        chr_check = "none"

        for record in vcf_reader:

            # Filter by SNP status, # alt alleles, and FILTER column
            if (not record.is_snp or len(record.ALT) != 1
                    or record.FILTER is not None):
                numsites_skip += 1
                continue

            # Filter by allele count
            # (chained comparison: applies only when maxac is positive)
            if record.INFO['AC'] > self.args.maxac > 0:
                numsites_skip += 1
                continue

            row_chr = record.CHROM

            # check chromosome formatting matches between MAF and fasta files
            # (decided while no site has been kept yet)
            if numsites_keep == 0:
                if "chr1" in fasta_reader and "chr" not in row_chr:
                    chr_check = "add"
                    util_log.debug(
                        "formatting mismatch: 'chr' only in fasta file")
                elif "chr1" not in fasta_reader and "chr" in row_chr:
                    chr_check = "delete"
                    util_log.debug(
                        "formatting mismatch: 'chr' only in MAF file")
                else:
                    util_log.debug("chromosome formatting matches")

            if chr_check == "add":
                row_chr = "chr" + row_chr
            elif chr_check == "delete":
                row_chr = row_chr.replace('chr', '')

            # load the chromosome sequence only when the chromosome changes
            if row_chr != chrseq:
                sequence = fasta_reader[row_chr]
                chrseq = row_chr

            # check and update chromosome sequence
            # if record.CHROM != chrseq:
            #     sequence = fasta_reader[record.CHROM]
            #     chrseq = record.CHROM

            # local sequence: nbp bases on each side of the variant position
            lseq = sequence[record.POS - (nbp + 1):record.POS + nbp].seq

            mu_type = record.REF + str(record.ALT[0])
            category = getCategory(mu_type)
            motif_a = getMotif(lseq)
            subtype = str(category + "." + motif_a)

            if subtype not in self.subtypes_dict:
                numsites_skip += 1
                continue

            # column index of this subtype in M
            st = self.subtypes_dict[subtype]

            # currently only works with singletons--
            if (self.args.samplefile and self.args.groupvar):

                gt_new = record.gt_types

                # with gts012=True the value 3 presumably marks an unknown
                # genotype -- TODO confirm against cyvcf2 docs
                if (self.args.impute and 3 in gt_new):
                    gt_complete = gt_new[gt_new != 3]
                    freq = sum(gt_complete) / len(gt_complete)
                    # NOTE(review): if gt_types is an integer array, a
                    # fractional freq is truncated on assignment -- confirm
                    gt_new[gt_new == 3] = freq

                else:
                    gt_new[gt_new == 3] = 0

                # if not any("/" in b for b in record.gt_bases):
                if self.args.haploid:
                    gt_new = np.divide(gt_new, 2.)

                # get array of genotypes only for samples in samplefile
                gt_sub = gt_new[samples_keep_idx]

                if gt_sub.sum() == 0:
                    numsites_skip += 1
                    continue

                # initialize dict of group allele counts = 0
                sg_counts = {k: 0 for k in sorted(list(set(sg_dict.values())))}

                # initialize dict of allele counts per sample
                # NOTE(review): samples_keep is sorted, while gt_sub follows
                # the VCF sample order selected by samples_keep_idx -- confirm
                # that the two orders line up
                d2 = dict(zip(samples_keep, gt_sub))

                # iterate per-sample counts and update per-group counts
                for key, value in d2.items():
                    sg_counts[sg_dict[key]] += value

                # add to matrix
                M[:, st] = M[:, st] + list(sg_counts.values())
                numsites_keep += 1

            else:
                gt_new = record.gt_types
                # same missing-genotype handling as in the grouped branch
                if (self.args.impute and 3 in gt_new):
                    gt_complete = gt_new[gt_new != 3]
                    freq = sum(gt_complete) / len(gt_complete)
                    gt_new[gt_new == 3] = freq

                else:
                    gt_new[gt_new == 3] = 0

                # if not any("/" in b for b in record.gt_bases):
                if self.args.haploid:
                    gt_new = np.divide(gt_new, 2.)

                M[:, st] = M[:, st] + gt_new
                numsites_keep += 1
                # util_log.debug(gt_new)

            # progress message every 100000 kept sites
            if numsites_keep % 100000 != 0:
                continue
            util_log.debug("%s : %s sites counted", inputfile, numsites_keep)

        util_log.debug("%s : %s sites counted", inputfile, numsites_keep)
        util_log.debug("%s : %s sites skipped", inputfile, numsites_skip)

        out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
        # when self.par is set, only the matrix is returned
        if self.par:
            out = M

        return out
Example #28
0
    action="store_true",
    help=
    "this adds special weighting to the X chromosome if you want to run the full model",
    default=False)

# Parse the command line and expose each option as a module-level name.
args = parser.parse_args()
cpg, synonymous, nosingletons = args.cpg, args.synonymous, args.nosingletons
rfile, varflag = args.file, args.varflag
chromosomes, exclude, xweighted = args.chromosomes, args.exclude, args.xweighted

gnomad = VCF('data/gnomad-vep-vt.vcf.gz')
# CSQ field names: the part of the header description after the first ':',
# stripped of spaces and double quotes, split on '|'.
csq_description = gnomad["CSQ"]["Description"]
csq_format = csq_description.split(":")[1].strip(' "')
kcsq = csq_format.split("|")

ys = []
genes = []


def syn_density(pairs, d, gnomad, kcsq, nosingletons, varflag):
    syn = 0
    prevvar = None
    if varflag:
        if 'VARTRUE' in d[
                'varflag']:  # don't need syn for a 0 bp region, i.e., variant, so give it the lowest possible, 0
            return syn
    for pair in pairs:
        if varflag:
            r0 = str(int(pair[0]) + 1)
Example #29
0
# Options taken from the parsed command line.
functional, variants = args.functional, args.variants


def isfunctional(csqs):
    """Return True if any comma-separated consequence entry is functional.

    For each entry, the text before the first '|' (after stripping leading
    and trailing '|') is the effect. An entry is functional when its effect
    contains one of the listed terms, or contains a splice donor/acceptor
    term together with 'coding_sequence'.
    """
    functional_terms = ('stop_gained', 'stop_lost', 'start_lost',
                        'initiator_codon', 'rare_amino_acid', 'missense',
                        'protein_altering', 'frameshift',
                        'inframe_insertion', 'inframe_deletion')
    for entry in csqs.split(","):
        effect = entry.strip("|").split("|", 2)[0]
        if any(term in effect for term in functional_terms):
            return True
        touches_splice_site = 'splice_donor' in effect or 'splice_acceptor' in effect
        if touches_splice_site and 'coding_sequence' in effect:
            return True
    return False


import sys

vcf = VCF(variants)
# Emit the header without adding a newline. sys.stdout.write replaces the
# Python 2 only `print x,` statement and behaves the same on Python 2 and 3.
sys.stdout.write(vcf.raw_header)

for v in vcf:
    # NOTE(review): records are only printed when `functional` is set; the
    # `_exclude` check and the print may have been meant to sit outside this
    # `if` -- confirm the intended indentation.
    if functional:
        # Prefer BCSQ; fall back to CSQ when BCSQ is missing or empty.
        csq = v.INFO.get("BCSQ") or v.INFO.get("CSQ")
        if csq is None or not isfunctional(csq):
            continue
        if v.INFO.get("_exclude"):
            continue
        print(str(v).strip())
Example #30
0
import sys
from cyvcf2 import VCF

# Path of the VCF to scan, taken from the first command-line argument.
kg_vcf = sys.argv[1]

# Populations whose "<POP>_AF" INFO value must each exceed 0.04.
pops = ["EUR", "AFR", "AMR", "SAS"]

good = 0  # running count of records that pass every filter below
for v in VCF(kg_vcf):
    info = v.INFO
    if 'OLD_MULTIALLELIC' in info:
        continue
    if info['VT'] != 'SNP':
        continue
    if info['NS'] < 2500:
        continue
    # Keep only sites whose overall AF lies within [0.04, 0.95].
    if info['AF'] < 0.04:
        continue
    if info['AF'] > 0.95:
        continue
    # Require the EX_TARGET INFO key to be present; its value is not used.
    try:
        info['EX_TARGET']
    except KeyError:
        continue

    if not all(info[p + "_AF"] > 0.04 for p in pops):
        continue

    good += 1
    # Parenthesised single-argument form: prints the same text under
    # Python 2 and, unlike the bare print statement, is valid Python 3.
    print("%s:%d" % (v.CHROM, v.POS))
def get_header(vcf, vep_field, vep_separator):
    """Map each sub-field name of an annotation INFO header to its position.

    Iterates ``vcf.header_iter()``, and for every header record whose
    ``info()['ID']`` equals *vep_field*, splits that record's
    ``'Description'`` on *vep_separator*.

    Args:
        vcf: object exposing ``header_iter()``; each yielded record must
            expose ``info()`` returning a mapping.
        vep_field: ID of the INFO header to look for.  A falsy value
            short-circuits and yields an empty mapping.
        vep_separator: delimiter used inside the header's Description.

    Returns:
        dict mapping each piece of the split Description to the index of its
        first occurrence.  Empty when no matching header is found.
    """
    index_dict = dict()
    if not vep_field:
        return index_dict

    for h in vcf.header_iter():
        try:
            header_info = h.info()
            if header_info['ID'] != vep_field:
                continue
            csq_header = header_info['Description'].split(vep_separator)
        except Exception:
            # Best effort: header records lacking ID/Description (or whose
            # info() fails) are ignored.  Unlike a bare ``except:``, this
            # lets KeyboardInterrupt and SystemExit propagate.
            continue

        # Single pass instead of list.index() per element; setdefault keeps
        # the first position when a name is repeated, as list.index() did.
        positions = {}
        for position, elem in enumerate(csq_header):
            positions.setdefault(elem, position)
        index_dict.update(positions)
    return index_dict


# Positional CLI arguments used below:
#   argv[1] input VCF, argv[2] output path,
#   argv[3] annotation INFO field ID, argv[4] separator used in that field's
#   header Description.
vcf = VCF(sys.argv[1])

# Register the new INFO field on the reader before the Writer is created
# from it below.
vcf.add_info_to_header({
    'ID': 'True_Label',
    'Description': 'True_Label of the variation',
    'Type': 'String',
    'Number': '1',
})

output = sys.argv[2]
# The Writer uses the (already modified) reader as its template.
o = Writer(output, vcf)
vep_field = sys.argv[3]
vep_separator = sys.argv[4]

# Map each annotation sub-field name to its position within the field.
index_dict = get_header(vcf, vep_field, vep_separator)
for record in tqdm(vcf):
Example #32
0
    joined_freq.to_csv("final_file.csv")
    return joined_freq


def impose_distance_requirement(sorted_vars, dist_bw_variants):
    """Placeholder for a minimum-distance filter between variants.

    Not implemented yet: both arguments are ignored, a notice is printed to
    stdout, and None is returned.
    """
    # TODO
    notice = 'No distance requirment imposed. Onwards!'
    print(notice)
    return None


if __name__ == "__main__":
    filename = sys.argv[1]
    variant_positions = defaultdict(Variant)
    kmer_len = 3
    if is_vcf(filename):  # import vcf file
        for variant in VCF(filename):
            if is_quality_variant(variant):
                # join is required because 'ALT' is returned as a list
                variant_positions[variant.POS] = Variant(variant.REF, "".join(variant.ALT), variant.POS)
        saved_csv_name = "chr22_variant_singletons.csv"
        generate_csv_from_variants(variant_positions, outfile=saved_csv_name)
        variant_singletons = import_variants(saved_csv_name)
    else:
        variant_singletons = import_variants(filename)
    print("Variants imported and saved.")
    if len(sys.argv) > 1:  # impose user supplied minimum distance between variants
        try:
            kmer_len = int(sys.argv[2])
            # dist_bw_variants = int(sys.argv[2])
            # if dist_bw_variants < 1:
            #     raise ValueError("Minimum distance must be positive integer!")
Example #33
0
    print("Cannot find input file ", args.inf)
    sys.exit(1)

# Open the input VCF; this reader supplies the Writer template and the
# sample names below.
vcf = cyvcf2.VCF(args.inf)

# Create a new VCF Writer using the input VCF as a template.
# NOTE(review): `f` is not defined in the visible part of this script --
# confirm it is set earlier.
w = Writer(f, vcf)

# Path for the secondary (stats) output, derived from the input path.
output = args.inf + ".stats"

df = pd.DataFrame(columns=vcf.samples)  # new, empty DataFrame with one column per sample

# Zero-based record counter; starts at -1 so the first record becomes 0.
v = -1

# NOTE(review): the input is opened a second time here instead of iterating
# the `vcf` reader created above -- confirm this is intentional.
for variant in VCF(args.inf):  # or VCF('some.bcf')
    v = v + 1
    # ALT alleles re-encoded as UTF-8 bytes.
    alt = [item.encode('utf-8') for item in variant.ALT]
    #print(variant.REF, alt) # e.g. REF='A', ALT=['C', 'T']

    # Branch on the number of alternative alleles (all branches are
    # currently no-ops).
    if (len(alt) == 1):
        pass
    # Multiple alternative alleles
    elif (len(alt) > 1):
        #print("Long",str(variant))
        pass
    # No alternative allele - possibly a single-nucleotide deletion
    elif (len(alt) < 1):
        #print("Null",str(variant))
        pass
Example #34
0
def test_format_field():
    """Every record read from VCF_PATH must expose FORMAT as a list."""
    records = VCF(VCF_PATH)
    assert all(isinstance(record.FORMAT, list) for record in records)
Example #35
0
def vcf2tsv(query_vcf, out_tsv, skip_info_data, skip_genotype_data,
            keep_rejected_calls, compress, print_data_type_header):
    """Convert the VCF at *query_vcf* into a tab-separated file at *out_tsv*.

    The output starts with a '#'-prefixed version line, optionally a
    '#'-prefixed line of column data types, then a column header line and
    one line per record -- or one line per record and sample when genotype
    data is written.

    Args:
        query_vcf: path of the VCF opened with cyvcf2 (``gts012=True``).
        out_tsv: path of the TSV file to write.
        skip_info_data: the flags below are compared with ``is False`` /
            ``is True``, so they must be real booleans.  When False, one
            column per INFO header ID is written (sorted by ID).
        skip_genotype_data: when False and the VCF has samples, a
            VCF_SAMPLE_ID column, one column per non-GT FORMAT ID (sorted)
            and a final GT column are written.
        keep_rejected_calls: when falsy, records whose FILTER string does
            not contain 'PASS' are skipped, and per-sample lines whose GT
            is './.' or '.' are not written.
        compress: when True, ``gzip -f`` is run on *out_tsv* through
            ``check_subprocess`` after the file is closed.
        print_data_type_header: when True, the data-type line is written.

    Relies on module-level names not defined here: ``version``, ``np``,
    ``re`` and ``check_subprocess``.
    """

    vcf = VCF(query_vcf, gts012=True)
    out = open(out_tsv, 'w')

    fixed_columns_header = [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'
    ]
    fixed_columns_header_type = [
        'String', 'Integer', 'String', 'String', 'String', 'Float', 'String'
    ]
    samples = vcf.samples
    info_columns_header = []
    format_columns_header = []
    sample_columns_header = []
    # Maps INFO/FORMAT header ID -> declared 'Type' from the VCF header.
    column_types = {}
    # Set to 1 when the header declares a FORMAT field with ID 'GT'.
    gt_present_header = 0

    if len(samples) > 0:
        sample_columns_header.append('VCF_SAMPLE_ID')

    # Collect INFO and FORMAT IDs (and their types) from the VCF header.
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys(
        ) and 'HeaderType' in header_element.keys():
            if header_element['HeaderType'] == 'INFO' or header_element[
                    'HeaderType'] == 'FORMAT':
                column_types[header_element['ID']] = header_element['Type']
            if header_element['HeaderType'] == 'INFO':
                if skip_info_data is False:
                    info_columns_header.append(header_element['ID'])
            if header_element['HeaderType'] == 'FORMAT':
                if len(sample_columns_header
                       ) > 0 and skip_genotype_data is False:
                    # GT is tracked separately and always emitted as the
                    # last column.
                    if header_element['ID'] != 'GT':
                        format_columns_header.append(header_element['ID'])
                    else:
                        gt_present_header = 1

    # Assemble the column header for the chosen combination of
    # INFO / genotype output.
    header_tags = fixed_columns_header
    if skip_info_data is False:
        header_tags = fixed_columns_header + sorted(info_columns_header)
        if len(sample_columns_header) > 0:
            if skip_genotype_data is False:
                header_tags = fixed_columns_header + sorted(
                    info_columns_header) + sample_columns_header + sorted(
                        format_columns_header) + ['GT']
            else:
                header_tags = fixed_columns_header + sorted(
                    info_columns_header)
    else:
        if len(sample_columns_header) > 0:
            if skip_genotype_data is False:
                header_tags = fixed_columns_header + sample_columns_header + sorted(
                    format_columns_header) + ['GT']
            else:
                header_tags = fixed_columns_header
    header_line = '\t'.join(header_tags)

    out.write('#https://github.com/sigven/vcf2tsv version=' + str(version) +
              '\n')
    if print_data_type_header is True:
        header_types = []
        # NOTE(review): only tags present in column_types contribute a type,
        # so a tag without a header-declared type (e.g. 'VCF_SAMPLE_ID')
        # appears to leave the type line shorter than the header line --
        # confirm this is acceptable.
        for h in header_tags:
            if h in column_types:
                header_types.append(str(column_types[h]))
        header_line_type = '\t'.join(fixed_columns_header_type + header_types)
        out.write('#' + str(header_line_type) + '\n')
        out.write(str(header_line) + '\n')
    else:
        out.write(str(header_line) + '\n')

    for rec in vcf:
        # --- fixed columns -------------------------------------------------
        rec_id = '.'
        rec_qual = '.'
        rec_filter = '.'
        alt = ",".join(str(n) for n in rec.ALT)
        if not rec.ID is None:
            rec_id = str(rec.ID)
        if not rec.QUAL is None:
            rec_qual = str("{0:.2f}".format(rec.QUAL))
        rec_filter = str(rec.FILTER)
        # A FILTER of None is reported as 'PASS'.
        if rec.FILTER is None:
            rec_filter = 'PASS'

        # POS column is rec.start + 1.
        pos = int(rec.start) + 1
        fixed_fields_string = str(
            rec.CHROM) + '\t' + str(pos) + '\t' + str(rec_id) + '\t' + str(
                rec.REF) + '\t' + str(alt) + '\t' + str(rec_qual) + '\t' + str(
                    rec_filter)

        # Skip non-PASS records unless rejected calls were requested.
        if not 'PASS' in rec_filter and not keep_rejected_calls:
            continue

        # --- INFO columns --------------------------------------------------
        # One value per INFO ID, in sorted ID order (matching the header).
        variant_info = rec.INFO
        vcf_info_data = []
        if skip_info_data is False:
            for info_field in sorted(info_columns_header):
                if column_types[info_field] == 'Flag':
                    # Flags are rendered as 'True'/'False' depending on
                    # whether the lookup returns a value.
                    if variant_info.get(info_field) is None:
                        vcf_info_data.append('False')
                    else:
                        vcf_info_data.append('True')
                elif column_types[info_field] == 'Float' or column_types[
                        info_field] == 'Integer' or column_types[
                            info_field] == 'String' or column_types[
                                info_field] == 'Character':
                    # Multi-valued entries are comma-joined as-is.
                    if type(variant_info.get(info_field)) is list or type(
                            variant_info.get(info_field)) is tuple:
                        vcf_info_data.append(",".join(
                            str(n) for n in variant_info.get(info_field)))
                    else:
                        # Missing values are written as '.'.
                        if variant_info.get(info_field) is None:
                            vcf_info_data.append('.')
                        else:
                            if column_types[info_field] == 'Float':
                                # Declared Float but parsed as something
                                # else: warn and write '.'.
                                if not isinstance(variant_info.get(info_field),
                                                  float):
                                    print(
                                        'vcf2tsv.py WARNING:\tINFO tag ' +
                                        str(info_field) +
                                        ' is defined in the VCF header as type \'Float\', yet parsed as other type:'
                                        +
                                        str(type(variant_info.get(info_field)))
                                    )
                                    if not ',' in str(alt):
                                        print(
                                            'Warning: Multiple values in INFO tag for single ALT allele (VCF multiallelic sites not decomposed properly?):'
                                            + str(fixed_fields_string) + '\t' +
                                            str(info_field) + '=' +
                                            str(variant_info.get(info_field)))
                                    vcf_info_data.append('.')
                                else:
                                    # Floats are written with 7 decimals.
                                    val = str("{0:.7f}".format(
                                        variant_info.get(info_field)))
                                    vcf_info_data.append(val)
                            else:
                                if column_types[
                                        info_field] == 'String' or column_types[
                                            info_field] == 'Character':
                                    if isinstance(variant_info.get(info_field),
                                                  str):
                                        #print(str(info_field) + '\t' + variant_info.get(info_field).encode('ascii','ignore').rstrip().decode('ascii'))
                                        # Non-ASCII characters are dropped.
                                        vcf_info_data.append(
                                            variant_info.get(
                                                info_field).encode(
                                                    'ascii',
                                                    'ignore').decode('ascii'))
                                    else:
                                        # Declared String/Character but not
                                        # parsed as str: write '.' and warn.
                                        vcf_info_data.append('.')
                                        if column_types[
                                                info_field] == 'String':
                                            print(
                                                'vcf2tsv.py WARNING:\tINFO tag '
                                                + str(info_field) +
                                                ' is defined in the VCF header as type \'String\', yet parsed as other type:'
                                                + str(
                                                    type(
                                                        variant_info.get(
                                                            info_field))))
                                        if column_types[
                                                info_field] == 'Character':
                                            print(
                                                'vcf2tsv.py WARNING:\tINFO tag '
                                                + str(info_field) +
                                                ' is defined in the VCF header as type \'Character\', yet parsed as other type:'
                                                + str(
                                                    type(
                                                        variant_info.get(
                                                            info_field))))
                                else:
                                    # Remaining declared type here is Integer.
                                    if isinstance(variant_info.get(info_field),
                                                  int):
                                        vcf_info_data.append(
                                            str(variant_info.get(info_field)))
                                    else:
                                        print(
                                            'vcf2tsv.py WARNING:\tINFO tag ' +
                                            str(info_field) +
                                            ' is defined in the VCF header as type \'Integer\', yet parsed as other type:'
                                            + str(
                                                type(
                                                    variant_info.get(
                                                        info_field))))
                                        # NOTE(review): this calls .encode()
                                        # on the value, which presumably
                                        # only works when it is a str --
                                        # confirm other types cannot reach
                                        # this branch.
                                        vcf_info_data.append(
                                            re.sub(
                                                '\(|\)', '',
                                                variant_info.
                                                get(info_field).encode(
                                                    'ascii',
                                                    'ignore').decode('ascii')))

        # --- per-sample genotype data ---------------------------------------
        #print(str(vcf_info_data))
        #dictionary, with sample names as keys, values being genotype data (dictionary with format tags as keys)
        vcf_sample_genotype_data = {}
        if len(samples) > 0 and skip_genotype_data is False:
            gt_cyvcf = rec.gt_types
            i = 0
            while i < len(samples):
                vcf_sample_genotype_data[samples[i]] = {}
                # Default GT; gt_types codes 0/1/2 are rendered as
                # 0/0, 0/1 and 1/1, any other code keeps './.'.
                gt = './.'
                if gt_present_header == 1:
                    if gt_cyvcf[i] == 0:
                        gt = '0/0'
                    if gt_cyvcf[i] == 1:
                        gt = '0/1'
                    if gt_cyvcf[i] == 2:
                        gt = '1/1'
                vcf_sample_genotype_data[samples[i]]['GT'] = gt
                i = i + 1

        # Fill in every non-GT FORMAT tag for every sample.
        for format_tag in sorted(format_columns_header):
            if len(samples) > 0 and skip_genotype_data is False:
                sample_dat = rec.format(format_tag)
                # Tag absent from this record: '.' for every sample.
                if sample_dat is None:
                    k = 0
                    while k < len(samples):
                        if samples[k] in vcf_sample_genotype_data:
                            vcf_sample_genotype_data[
                                samples[k]][format_tag] = '.'
                        k = k + 1
                    continue
                dim = sample_dat.shape
                j = 0
                ## sample-wise
                while j < dim[0]:
                    # Multi-valued entries are comma-joined.
                    if sample_dat[j].size > 1:
                        d = ','.join(
                            str(e) for e in np.ndarray.tolist(sample_dat[j]))
                        if samples[j] in vcf_sample_genotype_data:
                            vcf_sample_genotype_data[
                                samples[j]][format_tag] = d
                    else:
                        # Single values: only String and Integer types are
                        # converted; any other declared type is left as '.'.
                        d = '.'
                        if column_types[format_tag] == 'String':
                            d = str(sample_dat[j])
                        if column_types[format_tag] == 'Integer':
                            d = str(sample_dat[j][0])
                        if samples[j] in vcf_sample_genotype_data:
                            vcf_sample_genotype_data[
                                samples[j]][format_tag] = d
                    j = j + 1

        # --- write the output line(s) for this record -----------------------
        #print(str(vcf_sample_genotype_data))
        tsv_elements = []
        tsv_elements.append(fixed_fields_string)
        if skip_info_data is False:
            if skip_genotype_data is False:
                if len(sample_columns_header) > 0:
                    tsv_elements.append("\t".join(
                        str(n) for n in vcf_info_data))
                    ## one line per sample variant
                    for s in sorted(vcf_sample_genotype_data.keys()):
                        sample = s
                        line_elements = []
                        line_elements.extend(tsv_elements)
                        line_elements.append(sample)
                        gt_tag = '.'
                        # Non-GT tags in sorted order, GT appended last.
                        for tag in sorted(
                                vcf_sample_genotype_data[sample].keys()):
                            if tag != 'GT':
                                line_elements.append(
                                    vcf_sample_genotype_data[sample]
                                    [tag].encode('ascii',
                                                 'ignore').decode('ascii'))
                            else:
                                gt_tag = vcf_sample_genotype_data[sample][
                                    tag].encode('ascii',
                                                'ignore').decode('ascii')
                        line_elements.append(gt_tag)
                        # Uncalled genotypes are only written when rejected
                        # calls are kept.
                        if gt_tag == './.' or gt_tag == '.':
                            if keep_rejected_calls:
                                out.write('\t'.join(line_elements) + '\n')
                        else:
                            out.write("\t".join(str(n)
                                                for n in line_elements) + '\n')

                else:
                    # No samples: fixed + INFO columns only.
                    tsv_elements.append("\t".join(
                        str(n) for n in vcf_info_data))
                    line_elements = []
                    line_elements.extend(tsv_elements)
                    out.write('\t'.join(line_elements) + '\n')
            else:
                # Genotype data skipped: fixed + INFO columns only.
                tsv_elements.append("\t".join(str(n) for n in vcf_info_data))
                line_elements = []
                line_elements.extend(tsv_elements)
                out.write('\t'.join(line_elements) + '\n')
        else:
            if skip_genotype_data is False:
                # NOTE(review): when there are no samples this branch writes
                # nothing for the record -- confirm that is intended.
                if len(sample_columns_header) > 0:
                    ## one line per sample variant
                    for s in sorted(vcf_sample_genotype_data.keys()):
                        sample = s
                        line_elements = []
                        line_elements.extend(tsv_elements)
                        line_elements.append(sample)
                        gt_tag = '.'
                        for tag in sorted(
                                vcf_sample_genotype_data[sample].keys()):
                            if tag != 'GT':
                                line_elements.append(
                                    vcf_sample_genotype_data[sample][tag])
                            else:
                                gt_tag = vcf_sample_genotype_data[sample][tag]
                        line_elements.append(gt_tag)
                        if gt_tag == './.' or gt_tag == '.':
                            if keep_rejected_calls:
                                out.write('\t'.join(line_elements) + '\n')
                        else:
                            out.write('\t'.join(line_elements) + '\n')
            else:
                # Neither INFO nor genotype data: fixed columns only.
                # (The first two statements are superseded by the
                # reassignment on the third.)
                line_elements = []
                line_elements.extend(tsv_elements)
                line_elements = tsv_elements
                out.write('\t'.join(line_elements) + '\n')

    out.close()

    # Optionally gzip the finished file in place.
    if compress is True:
        command = 'gzip -f ' + str(out_tsv)
        check_subprocess(command)
Example #36
0
def test_snpeff_header():
    """Looking up 'SnpEffVersion' on the reader yields a non-empty entry."""
    reader = VCF(VCF_PATH2)
    header_entry = reader['SnpEffVersion']

    assert header_entry != {}, header_entry
    assert 'SnpEffVersion' in header_entry
Example #37
0
def verify_pcgr_input(pcgr_directory, input_vcf, input_cna, tumor_dp_tag,
                      tumor_af_tag, normal_dp_tag, normal_af_tag,
                      call_conf_tag):
    """
    Validate the PCGR input files (query VCF and tab-separated copy number
    segment file) and prepare the VCF for downstream processing.

    Steps performed:
    1. Check that the VCF file is properly formatted (EBIvariation/vcf-validator, VCF v4.2)
    2. Check that no INFO annotation tags in the query VCF coincide with those generated by PCGR
    3. Check that the provided tags for tumor/normal coverage and allelic depths are found in the VCF
    4. If the VCF has variants with multiple alternative alleles (e.g. 'A,T'), run 'vt decompose'
    5. Check that the copy number segment file has the required columns and correct data types (and range)
    6. Strip any genotype data from the VCF (only columns 1-8 are kept); the resulting
       VCF is sorted, bgzip-compressed and tabix-indexed

    The string 'None' (not the None object) for input_vcf / input_cna means
    that the corresponding file was not provided. All intermediate and
    output files are written under /workdir/output/.

    Returns:
        -1 if VCF validation fails or the INFO tag check reports -1;
        otherwise the result of is_valid_cna_segment_file() when input_cna
        is provided, else 0.

    NOTE(review): all external commands are built by string concatenation
    and run through os.system, so file paths are passed to the shell
    unquoted -- paths containing spaces or shell metacharacters will
    misbehave. Exit statuses of these commands are not checked.
    """
    logger = pcgrutils.getlogger('pcgr-check-input')
    # Intermediate (pre-decomposition) and final output VCF paths, derived
    # from the input file's base name.
    input_vcf_pcgr_ready = '/workdir/output/' + re.sub(
        r'(\.vcf$|\.vcf\.gz$)', '.pcgr_ready.tmp.vcf',
        os.path.basename(input_vcf))
    input_vcf_pcgr_ready_decomposed = '/workdir/output/' + re.sub(
        r'(\.vcf$|\.vcf\.gz$)', '.pcgr_ready.vcf', os.path.basename(input_vcf))

    if not input_vcf == 'None':
        # --- Step 1: format validation --------------------------------------
        logger.info('Validating VCF file with EBIvariation/vcf-validator')
        vcf_validation_output_file = '/workdir/output/' + re.sub(
            r'(\.vcf$|\.vcf\.gz$)', '.vcf_validator_output',
            os.path.basename(input_vcf))
        command_v42 = 'vcf_validator --input ' + str(
            input_vcf) + ' --version v4.2 > ' + str(vcf_validation_output_file)
        # Compressed input is decompressed on the fly and piped in.
        if input_vcf.endswith('.gz'):
            command_v42 = 'bgzip -dc ' + str(
                input_vcf) + ' | vcf_validator --version v4.2 > ' + str(
                    vcf_validation_output_file)

        os.system(command_v42)
        validation_results = is_valid_vcf(vcf_validation_output_file)

        if not validation_results['validation_status']:
            error_string_42 = '\n'.join(validation_results['error_messages'])
            validation_status = 'VCF file is NOT valid according to v4.2 specification'
            logger.error(validation_status + ':\n' + str(error_string_42))
            return -1
        else:
            validation_status = 'VCF file ' + str(
                input_vcf) + ' is valid according to v4.2 specification'
            logger.info(validation_status)

        # --- Step 2: INFO tag collision check -------------------------------
        tag_check = check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                                 logger)
        if tag_check == -1:
            return -1
        else:
            logger.info('No query VCF INFO tags coincide with PCGR INFO tags')

        # Always true at this point: the invalid case returned above.
        if validation_results['validation_status']:
            multiallelic_alt = 0
            vcf = VCF(input_vcf)
            # --- Step 3: depth / allele-fraction tag check ------------------
            # The return value of this check is not used here.
            check_ad_dp_tags(vcf, tumor_dp_tag, tumor_af_tag, normal_dp_tag,
                             normal_af_tag, call_conf_tag, logger)
            # Scan all records for multiallelic sites (more than one ALT).
            for rec in vcf:
                POS = rec.start + 1
                alt = ",".join(str(n) for n in rec.ALT)
                if len(rec.ALT) > 1:
                    logger.warning("Multiallelic site detected:" +
                                   str(rec.CHROM) + '\t' + str(POS) + '\t' +
                                   str(rec.REF) + '\t' + str(alt))
                    multiallelic_alt = 1
            # --- Step 6 (part 1): build a sample-free, sorted VCF -----------
            # Commands 1-2 write the '##' meta lines and the '#CHROM' line
            # (columns 1-8). Commands 3-5 append the records with any
            # leading 'chr' removed, in three groups: chromosomes starting
            # with a digit (numeric sort), then those starting with X/Y/M,
            # then everything else.
            command_vcf_sample_free1 = 'egrep \'^##\' ' + str(
                input_vcf) + ' > ' + str(input_vcf_pcgr_ready)
            command_vcf_sample_free2 = 'egrep \'^#CHROM\' ' + str(
                input_vcf) + ' | cut -f1-8 >> ' + str(input_vcf_pcgr_ready)
            command_vcf_sample_free3 = 'egrep -v \'^#\' ' + str(
                input_vcf
            ) + ' | sed \'s/^chr//\' | cut -f1-8 | egrep \'^[0-9]\' | sort -k1,1n -k2,2n -k3,3 -k4,4 >> ' + str(
                input_vcf_pcgr_ready)
            command_vcf_sample_free4 = 'egrep -v \'^#\' ' + str(
                input_vcf
            ) + ' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                input_vcf_pcgr_ready)
            command_vcf_sample_free5 = 'egrep -v \'^#\' ' + str(
                input_vcf
            ) + ' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep -v \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                input_vcf_pcgr_ready)
            # Same five commands for compressed input, reading through
            # 'bgzip -dc'.
            if input_vcf.endswith('.gz'):
                command_vcf_sample_free1 = 'bgzip -dc ' + str(
                    input_vcf) + ' | egrep \'^##\' > ' + str(
                        input_vcf_pcgr_ready)
                command_vcf_sample_free2 = 'bgzip -dc ' + str(
                    input_vcf) + ' | egrep \'^#CHROM\' | cut -f1-8 >> ' + str(
                        input_vcf_pcgr_ready)
                command_vcf_sample_free3 = 'bgzip -dc ' + str(
                    input_vcf
                ) + ' | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep \'^[0-9]\' | sort -k1,1n -k2,2n -k3,3 -k4,4 >> ' + str(
                    input_vcf_pcgr_ready)
                command_vcf_sample_free4 = 'bgzip -dc ' + str(
                    input_vcf
                ) + ' | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                    input_vcf_pcgr_ready)
                command_vcf_sample_free5 = 'bgzip -dc ' + str(
                    input_vcf
                ) + ' | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep -v \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                    input_vcf_pcgr_ready)

            # Order matters: command 1 truncates the file, 2-5 append.
            os.system(command_vcf_sample_free1)
            os.system(command_vcf_sample_free2)
            os.system(command_vcf_sample_free3)
            os.system(command_vcf_sample_free4)
            os.system(command_vcf_sample_free5)

            # --- Step 4: decompose multiallelic sites if any were seen ------
            if multiallelic_alt == 1:
                logger.info(
                    'Decomposing multi-allelic sites in input VCF file using \'vt decompose\''
                )
                command_decompose = 'vt decompose -s ' + str(
                    input_vcf_pcgr_ready) + ' > ' + str(
                        input_vcf_pcgr_ready_decomposed
                    ) + ' 2> /workdir/output/decompose.log'
                os.system(command_decompose)
            else:
                # Nothing to decompose: the intermediate file is the result.
                command_copy = 'cp ' + str(input_vcf_pcgr_ready) + ' ' + str(
                    input_vcf_pcgr_ready_decomposed)
                os.system(command_copy)
            # --- Step 6 (part 2): compress, index, clean up -----------------
            os.system('bgzip -f ' + str(input_vcf_pcgr_ready_decomposed))
            os.system('tabix -p vcf ' + str(input_vcf_pcgr_ready_decomposed) +
                      '.gz')
            os.system('rm -f ' + str(input_vcf_pcgr_ready) +
                      ' /workdir/output/decompose.log')

    # --- Step 5: copy number segment file ------------------------------------
    if not input_cna == 'None':
        ret = is_valid_cna_segment_file(input_cna, logger)
        return ret

    return 0
Example #38
0
def test_iterate():
    """Iterating the test VCF must yield exactly 115 records.

    The records are counted explicitly rather than read from a leftover
    loop variable, so an empty reader fails the assertion (reporting 0)
    instead of raising NameError on an unbound name.
    """
    n_records = sum(1 for _ in VCF(VCF_PATH))
    assert n_records == 115, n_records
Example #39
0
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(alt_counts)   AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(alt_counts)   AD = ref_count,alt_counts

    Args:
        paired: tumor/normal pairing object (its ``tumor_data`` is used), or
            a falsy value for germline calling, in which case ``items[0]``
            supplies the sample data.
        items: list of sample data dictionaries.
        in_file: input VCF to annotate.
        out_file: target output path; the uncompressed ".vcf" name is
            derived from it.

    Returns:
        The result of ``vcfutils.bgzip_and_index`` on the uncompressed
        output path.  Annotation is skipped when that file (or its ".gz")
        already exists.

    In paired mode, records in which every AF value of the tumor sample is
    below the threshold get the MinAF filter added; no record is removed.
    """
    data = paired.tumor_data if paired else items[0]
    # min_allele_fraction is configured as a percentage (default 10).
    min_freq = float(
        utils.get_in(data["config"],
                     ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug(
        "Filtering Strelka2 calls with allele fraction threshold of %s" %
        min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(
            ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            # Header additions go on the reader: the Writer below is created
            # from it and records stay associated with the reader.
            vcf.add_format_to_header({
                'ID':
                'AF',
                'Description':
                'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                'TIR/DPI (somatic indels)',
                'Type':
                'Float',
                'Number':
                '.'
            })
            vcf.add_filter_to_header({
                'ID':
                'MinAF',
                'Description':
                'Allele frequency is lower than %s%% ' % (min_freq * 100) +
                ('(configured in bcbio as min_allele_fraction)'
                 if utils.get_in(data["config"],
                                 ("algorithm", "min_allele_fraction")) else
                 '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
                 )
            })
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                # Stay None for germline gVCF records that carry no AD.
                alt_counts, dp = (None, None)
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(
                            rec.ALT[0] +
                            'U')[:, 0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format(
                            'TIR')[:, 0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:, 0]
                else:
                    ad = rec.format('AD')  # AD=REF,ALT1,ALT2,...
                    if ad is not None:  # germline?
                        alt_counts = ad[:, 1:]
                        # keepdims gives dp the shape (n_samples, 1), so the
                        # division below is row-wise for any number of
                        # samples.  A 1-D per-sample sum would broadcast
                        # against the ALT axis instead and is only correct
                        # for a single sample.
                        dp = np.sum(ad, axis=1, keepdims=True)
                if dp is not None:
                    with np.errstate(
                            divide='ignore', invalid='ignore'
                    ):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
from cyvcf2 import VCF
import sys
import numpy as np
import gzip
import csv
import glob
import os
import fnmatch
from collections import defaultdict, Counter

# Open the VCF named on the command line. With gts012=True the genotype
# codes in gt_types are 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN.
vcf = VCF(sys.argv[1], gts012=True)

samps = list(vcf.samples)

# One slot per sample, in VCF sample order (presumably per-sample SNP and
# indel tallies, going by the names -- nothing visible here fills them yet).
s_counts_snp = np.zeros(len(samps))
s_counts_indel = np.zeros(len(samps))

# Two-way lookup between a sample's position in the VCF and its name.
idx2samp = dict(enumerate(samps))
samp2idx = {name: idx for idx, name in enumerate(samps)}

ii = np.arange(len(samps))
### TODO: make sure there are intervals for the ends of chromosomes!
for i, v in enumerate(vcf):
    # Progress report every 10000 records.
    if i % 10000 == 0:
        print('done with {} variants, on {}:{}'.format(i, v.CHROM, v.POS))
        print(np.sum(s_counts_snp))
    # Skip every site whose call rate is not exactly 1.
    if v.call_rate != 1.:
        continue
    gts = v.gt_types
    # Positions of genotype code 3 (UNKNOWN under gts012=True).
    unk_gts = np.where(gts == 3)

    # Distinct genotype codes at this site and how often each occurs.
    unique, counts = np.unique(gts, return_counts=True)
vars['novelsv'] = []

# read genelist
knowngenelist = set()

# Removes every "_exon_<digits>" run from an entry so that exon-level names
# collapse onto one name. Compiled once, outside the loop, and written as a
# raw string so that "\d" is the regex digit class rather than an invalid
# string escape.
p = re.compile(r"_exon_\d+")

with open(genelist, 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        # csv.reader yields [] for a blank line; skip it instead of failing
        # on row[0].
        if not row:
            continue
        # Only the first tab-separated column is used.
        l = p.sub("", row[0])
        knowngenelist.add(l)

# get gene-level variants

genevcf = VCF(genevcf_file)
# Field layout of the CSQ annotation, as returned by parse_csq_header().
genecsq_fields = parse_csq_header(genevcf)

for variant in genevcf:
    csq = variant.INFO.get('CSQ')
    # A record without a CSQ entry aborts the whole run.
    if csq is None:
        sys.exit("No VEP fields")

    # CSQ holds comma-separated entries; parse them and keep only the value
    # belonging to the first allele in the parsed result.
    csq_by_allele = parse_csq_entries(csq.split(','), genecsq_fields)
    transcripts = list(csq_by_allele.items())[0][1]
    genes = get_csq_entries_bygene(transcripts)

    vartype = ''
Example #42
0
if __name__ == '__main__':
    args = docopt(__doc__, version='1.0')

    # With --format, call ShowFormat() and stop with exit status -1.
    if args['--format']:
        ShowFormat()
        sys.exit(-1)

    MISS_THRESHOLD = float(args['-m'])

    # cyvcf2 API and examples:
    # https://brentp.github.io/cyvcf2/
    # https://brentp.github.io/cyvcf2/docstrings.html#api
    #
    # Read the VCF from standard input. With gts012=True, gt_types is encoded
    # as 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN.
    invcf = VCF('/dev/stdin', lazy=True, gts012=True)

    # Register the FILTER tag in the header before any record uses it.
    # A FILTER header entry is given an 'ID' and a 'Description'.
    site_filter = {
        'ID': 'VCFSiteMissingFilter.py',
        'Description': 'Exclude the site with missing rate higher than > ' +
                       str(MISS_THRESHOLD),
    }
    invcf.add_filter_to_header(site_filter)

    # create a new vcf Writer using the input vcf as a template.
Example #43
0
def test_empty_info():
    """Converting INFO to a dict must not raise for any record of the phased VCF."""
    reader = VCF(VCF_PHASE_PATH)
    for record in reader:
        dict(record.INFO)
Example #44
0
def main():
    """Command-line entry point: flatten a VCF file into a tab-separated table.

    Parses the command-line options, reads the VCF named by ``--vcf`` with
    cyvcf2 and writes to ``--out``: the raw VCF header (only with
    ``--keep_header yes``) and one table header line. Each variant is then
    handed, together with the already assembled leading columns and the
    output handle, to ``process_format_wide`` or ``process_format_long``
    (presumably those write the finished row -- confirm in the helpers).

    Side effects: sets the module-level globals ``args`` (the parsed
    arguments) and ``gtbase`` ('yes' or 'no'), and prints progress messages
    to stdout.

    Raises:
        ValueError: if ``--pre_header`` names a column other than CHROM, POS,
            ID, REF, ALT, QUAL or FILTER. This is checked before the output
            file is opened, so a bad option does not clobber existing output.
    """
    global args, gtbase

    # Step 01: define argument variables
    parser = argparse.ArgumentParser()
    parser.add_argument("--vcf",
                        help="Sorted VCF file as input",
                        required=True)
    parser.add_argument(
        "--out",
        help="Name of the output file that contains simplified VCF as table.",
        required=True)
    parser.add_argument(
        "--samples",
        help="SAMPLE of interest; write as comma separated names, "
        "for e.g: 'sampleA,sampleB' or 'all'.",
        default='all')
    # The example uses 'CHROM' because that is the only spelling the column
    # lookup below accepts.
    parser.add_argument(
        "--pre_header",
        help=
        "Comma separated pre-header fields before the 'INFO' field in the input VCF file. "
        "Write as comma separated fields, for e.g: 'CHROM,POS,ID' or 'all'. "
        "Default: 'all'. ",
        default='all')
    parser.add_argument(
        "--infos",
        help="INFO tags that are of interest; write as comma separated tags; "
        "for e.g: 'AC,AF,AN' or 'all'.",
        default='all')
    parser.add_argument(
        "--formats",
        help="FORMAT tags that are of interest; for e.g: 'GT,PG,PI' or 'all'.",
        default='all')
    parser.add_argument("--keep_header",
                        default='no',
                        help="Keep the HEADER data in the output file."
                        "Options: 'yes' or 'no' ")
    parser.add_argument("--mode",
                        help="Structure of the output table."
                        "Options: wide(0), long(1). Default: 0 .",
                        required=False,
                        default=0)
    parser.add_argument("--gtbase",
                        help="write the GT field as IUPAC base code."
                        "Options: no(0), yes(1). Default: 0 .",
                        required=False,
                        default=0)

    args = parser.parse_args()

    # Step 02: pipe the "input arguments" to local variables
    pre_header = args.pre_header
    info_of_interest = args.infos
    sample_of_interest = args.samples
    format_of_interest = args.formats
    keep_header = args.keep_header

    # Anything other than '1'/'long' (including the integer default 0)
    # selects the wide layout; likewise for gtbase.
    mode = 'long' if args.mode in ('1', 'long') else 'wide'
    gtbase = 'yes' if args.gtbase in ('1', 'yes') else 'no'

    # Resolve the requested leading columns once, up front. `pre_header_idx`
    # stays None when every fixed column is wanted.
    all_header = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
    if pre_header == 'all':
        # Copy: output_header is extended below and must not alter all_header.
        output_header = list(all_header)
        pre_header_idx = None
    else:
        output_header = pre_header.split(',')
        unknown = [name for name in output_header if name not in all_header]
        if unknown:
            raise ValueError(
                "Unknown --pre_header field(s): %s. Valid fields are: %s" %
                (','.join(unknown), ','.join(all_header)))
        pre_header_idx = [all_header.index(name) for name in output_header]

    # Step 03: Read vcf file using cyvcf2 and start mining the data
    start_time01 = time.time()
    with open(args.out, 'w') as write_block:
        vcf_file = VCF(args.vcf)
        sample_ids = vcf_file.samples
        print('- %i samples found.' % len(sample_ids))
        print()

        # mining header
        header = vcf_file.raw_header.split('\n')

        if keep_header == 'yes':
            write_block.write(vcf_file.raw_header)
        print()

        # Step 03-B: mine fields that are of interest - for rear part of the
        # header and for computing values from variants.
        # The output data are: sample names, infos and formats of interest.
        my_samples, my_infos, my_formats = process_fields_of_interest(
            header, sample_of_interest, info_of_interest, format_of_interest,
            sample_ids)

        # Ordered mapping FORMAT tag -> None; the same object is passed to the
        # per-variant helpers for every record.
        format_ids_values = collections.OrderedDict.fromkeys(my_formats)

        my_sample_idx = [sample_ids.index(x) for x in my_samples]

        # Step 03-C: add tags from the "INFO" of interest to the header
        output_header.extend(my_infos)

        # Step 03-D: "long" puts each sample on its own line (a SAMPLE column
        # followed by the FORMAT tags); "wide" adds one <sample>_<tag> column
        # per sample/tag pair.
        if mode == 'long':
            output_header.append('SAMPLE')
            output_header.extend(my_formats)
        else:
            output_header.extend(name + '_' + tags
                                 for name in my_samples
                                 for tags in my_formats)

        # write the final header to the output file
        print('\t'.join(output_header), file=write_block)

        print()
        print('- Output header of the simplified VCF file:')
        print(output_header)
        print()

        chr_on_process = ''
        # Step 04: parse the VCF file and add the data for each header field
        print('Reading the input vcf file ... ')
        print()
        for variant in vcf_file:
            contig = str(variant.CHROM)
            # report each contig once, when it is first encountered
            if chr_on_process != contig:
                print('Contig %s is being processed ... ' % str(contig))
                print()
                chr_on_process = contig

            ##########################  **************************
            # these methods are deprecated for now, but keeping for future use.
            #pos = str(variant.POS)
            #id_ = variant.ID
            #ref_allele = variant.REF
            #alt_alleles = variant.ALT
            #all_alleles = [ref_allele] + alt_alleles
            #alt_freq = variant.INFO.get('AF')

            # pass the "alt_freq" values to a function to compute "all_freq" as string
            #all_freq = compute_allele_freq(alt_freq)
            ##########################  **************************

            # Step 04-A: values for the pre-fields of the header (CHROM up to
            # FILTER), taken from the record's text representation.
            chr_to_filter = str(variant).split('\t')[0:7]
            if pre_header_idx is None:
                leading_fields = chr_to_filter
            else:
                leading_fields = [chr_to_filter[i] for i in pre_header_idx]
            line_to_write = '\t'.join(leading_fields)

            # Step 04-B: compute values for the INFO tags of interest
            infos_to_write = process_info(my_infos, variant)
            line_to_write += '\t' + infos_to_write

            # Step 04-C: compute values for the FORMAT fields of interest for
            # each SAMPLE of interest.
            if mode == 'wide':
                process_format_wide(variant, my_sample_idx, format_ids_values,
                                    write_block, line_to_write)
            else:
                process_format_long(variant, my_sample_idx, format_ids_values,
                                    write_block, line_to_write, sample_ids)

        print('Elapsed time: ', time.time() - start_time01)
        print()
Example #45
0
def test_samples():
    """The test VCF is expected to expose exactly 189 samples."""
    reader = VCF(VCF_PATH)
    n_samples = len(reader.samples)
    assert n_samples == 189
Example #46
0
def vcf2pd(vcf_in):
    '''
    Read a VCF file and return its records as a pandas DataFrame.

    Each record becomes one row holding the seven fixed columns (CHROM, POS,
    ID, REF, ALT, QUAL, FILTER), one column per INFO ID declared in the
    header, and one "<sample>_<FORMAT ID>" column per sample and declared
    FORMAT ID. If the header carries "##tumor_sample=" / "##normal_sample="
    lines (Mutect2 output includes them), the named samples are relabelled
    'TUMOR' / 'NORMAL' in the column names.

    :param vcf_in: VCF file (.vcf/.vcf.gz/.bcf)
    :return: pandas DataFrame; when the file has no records it contains only
        the seven fixed columns
    :raises ValueError: if a sample named by a tumor/normal header line is
        not among the VCF's samples
    '''

    vcf = VCF(vcf_in, gts012=True)

    samples = vcf.samples
    n_samples = len(samples)

    # Collect the INFO and FORMAT IDs declared in the header.
    info_ids = []
    format_ids = []
    for h in vcf.header_iter():
        header_type = h['HeaderType']
        if header_type == 'INFO':
            info_ids.append(h['ID'])
        if header_type == 'FORMAT':
            format_ids.append(h['ID'])

    # Relabel the samples named in the header (tumor first, then normal).
    raw_header = vcf.raw_header
    for header_key, label in (('tumor_sample', 'TUMOR'),
                              ('normal_sample', 'NORMAL')):
        match = re.search('##' + header_key + '=.*', raw_header)
        if match is not None:
            samples[samples.index(match.group().split('=')[1])] = label

    # Building a list of row dicts and converting once at the end is the
    # fastest way to fill the DataFrame.
    rows = []
    for v in vcf:

        # The fixed fields. cyvcf2 reports FILTER as None for PASS, so map
        # any falsy FILTER back to 'PASS'.
        row = {
            'CHROM': v.CHROM,
            'POS': v.POS,
            'ID': v.ID,
            'REF': v.REF,
            'ALT': ','.join(v.ALT),
            'QUAL': v.QUAL,
            'FILTER': v.FILTER or 'PASS'
        }

        # INFO fields
        for info_id in info_ids:
            row[info_id] = v.INFO.get(info_id)

        # FORMAT fields. v.format() and v.gt_types are fetched once per
        # record instead of once (or more) per sample.
        if n_samples:
            for f in format_ids:
                if f == 'GT':
                    # v.format('GT') is not usable here; use gt_types:
                    # 0 = hom_ref, 1 = het, 2 = hom_alt, 3 = unknown
                    gt_types = v.gt_types
                    for i, sample in enumerate(samples):
                        row[sample + '_' + f] = gt_types[i]
                    continue

                values = v.format(f)
                if values is None:  # field absent from this record
                    continue

                for i, sample in enumerate(samples):
                    cell = values[i]
                    key = sample + '_' + f
                    if isinstance(cell, str):
                        # keep strings whole rather than comma-joining
                        # their individual characters
                        row[key] = str(cell)
                    elif np.isnan(cell).any():  # any missing value -> None
                        row[key] = None
                    else:
                        row[key] = ','.join(list(map(str, cell)))

        rows.append(row)

    cols = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
    cols.extend(info_ids)
    cols.extend([s + '_' + f for s in samples for f in format_ids])

    if rows:
        df = pd.DataFrame(rows, columns=cols)
    else:
        # NOTE(review): an empty VCF yields only the fixed columns, not the
        # INFO/FORMAT ones; kept as-is because callers may rely on it.
        df = pd.DataFrame(
            columns=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'])

    return df
Example #47
0
def test_next():
    """Calling next() on a VCF reader yields a Variant."""
    reader = VCF(VCF_PATH)
    first_record = next(reader)
    assert isinstance(first_record, Variant)
Example #48
0
def run(inheritance_model, ped, vcf, min_depth, min_gq, min_kindreds, severity):
    """Write the variants that match an inheritance model, grouped by gene.

    Consecutive variants that share the same gene (see ``get_gene``) form one
    group. For each group every family is evaluated against the model on every
    variant. If at least one and at least ``min_kindreds`` distinct families
    matched somewhere in the group, each matching variant is written to
    stdout with ``INFO/inheritance`` set to "<gene>:<family ids>".

    Args:
        inheritance_model: name of the ``EvalFamily`` method to call (looked
            up with ``getattr``), e.g. 'auto_rec', 'auto_dom', 'de_novo'.
        ped: pedigree input handed to ``Family.from_ped``.
        vcf: VCF path handed to ``cyvcf2.VCF``.
        min_depth: forwarded to the model method as ``min_depth``.
        min_gq: forwarded to the model method as ``min_gq``.
        min_kindreds: minimum number of distinct matching families per gene.
        severity: container of accepted ``impact_severity`` values, or None
            to accept every effect.

    Raises:
        KeyError: if a record lacks one of the annotation INFO fields
            (ANN/EFF/CSQ) that the header declares.
    """
    from cyvcf2 import VCF, Writer
    vcf = VCF(vcf, samples="-")

    # Annotation key -> list of sub-field names parsed from the header
    # description. ANN is filled first, then EFF, then CSQ; get_gene() tries
    # them in that order.
    annos = {}
    if "ANN" in vcf:
        desc = vcf["ANN"]["Description"]
        # Store the parsed field names, as the EFF and CSQ branches do
        # (previously the raw description string was stored here).
        annos["ANN"] = [x.strip("\"'") for x in
                        re.split(r"\s*\|\s*", desc.split(":", 1)[1].strip('" '))]
    for key in ("EFF", "CSQ"):
        if key in vcf:
            desc = vcf[key]["Description"]
            annos[key] = [x.strip(" [])'(\"") for x in
                          re.split(r"\||\(", desc.split(":", 1)[1].strip())]

    vcf.update(id="inheritance", type="String", number="1", description="inheritance stuffs")
    out = Writer("-", vcf)

    # Sample name -> column index in the VCF, so pedigree subjects can be
    # mapped onto genotype arrays.
    vcf_order = {name: idx for idx, name in enumerate(vcf.samples)}
    fams = Family.from_ped(ped, order=vcf_order)
    for fam_id in fams:
        family = fams[fam_id]
        fams[fam_id] = (EvalFamily(family), [s._i for s in family.subjects])

    def get_gene(variant):
        """Return the gene of the top-sorted effect that has one, else None."""
        for anno in annos:
            consequences = variant.INFO[anno].split(",")
            effs = (Effect.new(anno, c, annos[anno]) for c in consequences)
            # limit to requested severity
            if severity is not None:
                effs = [e for e in effs if e.impact_severity in severity]
            for eff in sorted(effs, reverse=True):
                if eff.gene:
                    return eff.gene
        return None

    try:
        # TODO: more flexible groupby
        for gene, variants in it.groupby(vcf, get_gene):

            matching_fams = defaultdict(list)  # variant index -> family ids
            saved_vars = []
            uniq_fams = set()

            for i, variant in enumerate(variants):
                saved_vars.append(variant)

                for family_id, (fam, idxs) in fams.items():
                    fam.gt_types = variant.gt_types[idxs]
                    fam.gt_depths = variant.gt_depths[idxs]
                    fam.gt_quals = variant.gt_quals[idxs]
                    # this dispatches to fam.auto_rec/auto_dom/de_novo/, etc.
                    # by the string in inheritance_model
                    res = getattr(fam, inheritance_model)(min_depth=min_depth, min_gq=min_gq)

                    # matched the inheritance model.
                    if res:  # can add custom logic here, e.g. and v.call_rate > 0.9:
                        matching_fams[i].append(family_id)
                        uniq_fams.add(family_id)

            # Require at least one matching family, and at least min_kindreds.
            n_fams = len(uniq_fams)
            if n_fams > 0 and n_fams >= min_kindreds:

                if inheritance_model == 'comp_het':
                    # TODO: idxs = matching_fams.keys()
                    # run idxs[1:] vs idxs[:-1] for variants
                    pass
                for i, family_ids in sorted(matching_fams.items()):
                    variant = saved_vars[i]
                    # sorted() keeps the family order stable between runs.
                    variant.INFO["inheritance"] = "%s:%s" % (
                        gene, ",".join(sorted(set(family_ids))))

                    out.write_record(variant)
    finally:
        # Always close the writer, even when a record fails part-way through.
        out.close()
Example #49
0
    def get_region_vcf(
        self,
        case_obj,
        chrom=None,
        start=None,
        end=None,
        gene_obj=None,
        variant_type="clinical",
        category="snv",
        rank_threshold=None,
    ):
        """Produce a reduced vcf with variants from the specified coordinates
           This is used for the alignment viewer.

        The region is taken from ``gene_obj`` when one is given, otherwise
        from ``chrom``/``start``/``end``. With a chromosome but without both
        start and end the whole chromosome is used; without a chromosome every
        variant in the file is written.

        Args:
            case_obj(dict): A case from the scout database; its "vcf_files"
                            entry maps file keys to VCF paths
            variant_type(str): 'clinical' or 'research'. Default: 'clinical'
            category(str): 'snv', 'sv', 'str' or 'cancer' for clinical files;
                           'snv' or 'sv' for research files. Default: 'snv'
            rank_threshold(float): Currently unused; accepted so existing
                                   callers keep working
            chrom(str): Load variants from a certain chromosome
            start(int): Specify the start position
            end(int): Specify the end position
            gene_obj(dict): A gene object from the database; its "chromosome",
                            "start" and "end" override chrom/start/end

        Returns:
            file_name(str): Path to the temporary file. The file is not
                            deleted automatically; the caller owns it.

        Raises:
            FileNotFoundError: if no VCF is registered for the requested
                variant type and category, if the VCF cannot be opened, or if
                reading the region fails (in which case the partially written
                temporary file is removed).
        """
        # (variant_type, category) -> key in case_obj["vcf_files"]
        vcf_file_keys = {
            ("clinical", "snv"): "vcf_snv",
            ("clinical", "sv"): "vcf_sv",
            ("clinical", "str"): "vcf_str",
            ("clinical", "cancer"): "vcf_cancer",
            ("research", "snv"): "vcf_snv_research",
            ("research", "sv"): "vcf_sv_research",
        }
        file_key = vcf_file_keys.get((variant_type, category))
        variant_file = case_obj["vcf_files"].get(file_key) if file_key else None

        if not variant_file:
            raise FileNotFoundError("VCF file does not seem to exist")

        try:
            vcf_obj = VCF(variant_file)
        except Exception as err:
            raise FileNotFoundError(
                "Could not access {}. The file is missing or malformed".format(
                    variant_file)) from err

        # A gene object takes precedence over explicit coordinates.
        if gene_obj:
            chrom = gene_obj["chromosome"]
            start = gene_obj["start"]
            end = gene_obj["end"]

        region = ""
        if chrom:
            if start and end:
                region = "{0}:{1}-{2}".format(chrom, start, end)
            else:
                region = "{0}".format(chrom)

        region_error = None
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp:
            file_name = str(pathlib.Path(temp.name))
            # Copy the header, dropping empty/near-empty lines.
            for header_line in vcf_obj.raw_header.split("\n"):
                if len(header_line) > 3:
                    temp.write(header_line + "\n")
            try:
                for variant in vcf_obj(region):
                    temp.write(str(variant))
            except Exception as err:
                # Remember the failure and handle it once the file is closed.
                region_error = err

        if region_error is not None:
            # The caller never learns the temp file's name on failure, so
            # remove it here rather than leaking it (best effort).
            try:
                pathlib.Path(file_name).unlink()
            except OSError:
                pass
            raise FileNotFoundError(
                "Could not find index for {}".format(variant_file)) from region_error

        return file_name