def test_by_id_vcf_in_regions():
    """Check ID-based matching of VCF records to model-input regions.

    One interval of +/-20 around every record of the example VCF is built and
    labelled with the record's default ID. The lookup is expected to return
    one record per region, with every sequence field reported for every
    region. A second lookup in which one metadata field carries a blanked ID
    must raise.
    """
    from kipoi.postprocessing.variant_effects.utils.generic import default_vcf_id_gen
    from kipoi.postprocessing.variant_effects.snv_predict import get_variants_in_regions_sequential_vcf

    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        "examples/rbp/example_files/variants.vcf")

    # Collect the region columns while walking the VCF once.
    chroms, starts, ends, strands, ids = [], [], [], [], []
    reader = cyvcf2.VCF(vcf_path, "r")
    for record in reader:
        chroms.append(record.CHROM)
        starts.append(record.POS - 20)
        ends.append(record.POS + 20)
        strands.append("*")
        ids.append(default_vcf_id_gen(record))
    reader.close()
    regions = {
        "chr": chroms,
        "start": starts,
        "end": ends,
        "strand": strands,
        "id": ids,
    }

    # The first handle has been consumed and closed; open a new one.
    reader = cyvcf2.VCF(vcf_path, "r")
    seq_to_meta = {"seq_a": "gr_a", "seq_a2": "gr_a", "seq_b": "gr_b"}
    model_input = {"metadata": {"gr_a": regions, "gr_b": regions}}
    vcf_records, process_lines, process_seq_fields, _ = get_variants_in_regions_sequential_vcf(
        model_input, seq_to_meta, reader, default_vcf_id_gen)

    num_entries = len(model_input["metadata"]["gr_a"]["chr"])
    expected_fields = set(seq_to_meta.keys())
    assert len(vcf_records) == num_entries
    assert process_lines == list(range(num_entries))
    assert all(set(fields) == expected_fields for fields in process_seq_fields)

    # Now imitate a bad id in one range: "gr_b" no longer agrees with "gr_a".
    broken_regions = copy.deepcopy(regions)
    broken_regions["id"][2] = ""
    model_input = {"metadata": {"gr_a": regions, "gr_b": broken_regions}}
    seq_to_meta = {"seq_a": "gr_a", "seq_a2": "gr_a", "seq_b": "gr_b"}
    with pytest.raises(Exception):
        get_variants_in_regions_sequential_vcf(model_input, seq_to_meta,
                                               reader, default_vcf_id_gen)
def test__overlap_vcf_region():
    """Exercise ``sp._overlap_vcf_region`` with dict and GenomicRanges input.

    First a single region is queried and the returned variants are compared
    against every record of the example VCF. Then three regions are queried,
    with and without indel exclusion, and both the returned variants and the
    returned region indices are compared against hand-built expectations.
    """
    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        "examples/rbp/example_files/variants.vcf")
    vcf_obj = cyvcf2.VCF(vcf_path)
    # Reference list of all records, read with a handle that is then closed.
    all_records = [rec for rec in vcf_obj]
    vcf_obj.close()
    # Fresh handle used for all region queries below.
    vcf_obj = cyvcf2.VCF(vcf_path)
    #
    # Case 1: a single region with id 0.
    regions_dict = {
        "chr": ["chr22"],
        "start": [21541589],
        "end": [36702137],
        "id": [0]
    }
    regions_gr = GenomicRanges(regions_dict["chr"], regions_dict["start"],
                               regions_dict["end"], regions_dict["id"])
    for regions in [regions_dict, regions_gr]:
        found_vars, overlapping_region = sp._overlap_vcf_region(
            vcf_obj, regions, exclude_indels=False)
        # NOTE(review): zip stops at the shorter sequence, so this comparison
        # does not by itself check that both lists have the same length.
        assert all([
            str(el1) == str(el2) for el1, el2 in zip(all_records, found_vars)
        ])
        assert len(overlapping_region) == len(found_vars)
        assert all([el == 0 for el in overlapping_region])

    # Case 2: three regions with ids 0, 1 and 2.
    regions_dict = {
        "chr": ["chr22", "chr22", "chr22"],
        "start": [21541589, 21541589, 30630220],
        "end": [36702137, 21541590, 30630222],
        "id": [0, 1, 2]
    }
    regions_gr = GenomicRanges(regions_dict["chr"], regions_dict["start"],
                               regions_dict["end"], regions_dict["id"])
    #
    # Expected variants: every record for region 0, then the first record for
    # region 1 and the fourth record for region 2.
    plus_indel_results = all_records + all_records[:1] + all_records[3:4]
    snv_results = [el for el in plus_indel_results if not el.is_indel]
    #
    # Expected region index for each expected variant, in the same order.
    ref_lines_indel = [0] * len(all_records) + [1] + [2]
    snv_ref_lines = [
        el for el, el1 in zip(ref_lines_indel, plus_indel_results)
        if not el1.is_indel
    ]
    #
    for regions in [regions_dict, regions_gr]:
        for exclude_indels, ref_res, ref_lines in zip(
            [False, True], [plus_indel_results, snv_results],
            [ref_lines_indel, snv_ref_lines]):
            found_vars, overlapping_region = sp._overlap_vcf_region(
                vcf_obj, regions, exclude_indels)
            # Only pairs whose expected record is not an indel are compared.
            assert all([
                str(el1) == str(el2) for el1, el2 in zip(ref_res, found_vars)
                if not el1.is_indel
            ])
            assert overlapping_region == ref_lines
Exemple #3
0
def get_snp_genotypes(chromosome, position, samples=None):
    '''Returns a pandas DataFrame of genotypes, along with phasing status, for a specific SNP.

    The per-chromosome VCF is opened, queried at exactly ``position`` and
    closed again before returning, also when an error is raised.

    Parameters:
        chromosome: chromosome label, substituted into the VCF file name and
            into the region query.
        position: position of the SNP; the query is ``position-position``.
        samples: optional list of sample names to restrict the output to.
            When omitted, all samples of the VCF are used.

    Returns:
        DataFrame indexed by sample name (in the order given by the VCF
        reader) with columns ``chrA``, ``chrB`` and ``phased``.

    Raises:
        ValueError: ``samples`` contains duplicates, or the position holds
            no SNP, or more than one SNP.
        KeyError: a requested sample is not present in the VCF.

    >>> samples = ['HPSI0516i-pebf_2', 'HPSI0516i-zujs_5', 'HPSI1116pf-peru']
    >>> df = get_snp_genotypes(1,714439,samples)
    >>> df
                      chrA  chrB  phased
    HPSI0516i-pebf_2     0     0    True
    HPSI0516i-zujs_5     0     0    True
    HPSI1116pf-peru      0     0    True
    '''

    vcf_file = '/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Filtered/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20180102.genotypes.chr.{chromosome}.norm.renamed.recode.vcf.gz'.format(
        chromosome=chromosome)

    # Validate the request before touching the file.
    if samples is not None and len(samples) != len(set(samples)):
        raise ValueError('Duplicated samples in input list')

    if samples is None:
        vcf = cyvcf2.VCF(vcf_file)
    else:
        vcf = cyvcf2.VCF(vcf_file, samples=samples)

    try:
        if samples is not None:
            for sample in samples:
                if sample not in vcf.samples:
                    raise KeyError('{} not in vcf'.format(sample))
        # Use the reader's sample order so the index lines up with the
        # genotype rows it returns.
        samples = vcf.samples

        query_string = '{chromosome}:{position}-{position}'.format(
            chromosome=chromosome, position=position)

        # only keep SNPs
        variants = [x for x in vcf(query_string) if x.is_snp]

        if len(variants) > 1:
            error_message = '''Input vcf file contains more than one SNP at position {chromosome}:{position}'''.format(
                chromosome=chromosome, position=position)
            raise ValueError(error_message)
        if not variants:
            error_message = '''Input vcf file has no SNP at position {chromosome}:{position}'''.format(
                chromosome=chromosome, position=position)
            raise ValueError(error_message)

        var = variants[0]

        # Build the frame while the reader is still open: the record refers
        # back to the reader it came from.
        genotype_df = pd.DataFrame(index=samples,
                                   columns=['chrA', 'chrB', 'phased'],
                                   data=var.genotypes)
    finally:
        vcf.close()

    return genotype_df
Exemple #4
0
def compare_vcfs(fpath1, fpath2):
    """Assert that two VCFs carry numerically matching INFO values.

    Records are paired in file order. For every INFO key of the first record
    (keys containing ``':rID'`` are skipped) the values of both records are
    rounded to a common number of decimals and compared. Both readers are
    closed even when an assertion fails.

    Parameters:
        fpath1: path of the first VCF; its INFO keys drive the comparison.
        fpath2: path of the second VCF.

    Raises:
        AssertionError: a pair of rounded values differs.

    NOTE(review): values are assumed to be strings containing a "." (the code
    calls ``.index(".")`` on them) -- confirm against the VCF headers.
    NOTE(review): ``zip`` stops at the shorter file, so surplus records in
    either file are not detected.
    """
    fh1 = cyvcf2.VCF(fpath1)
    try:
        fh2 = cyvcf2.VCF(fpath2)
        try:
            for rec1, rec2 in zip(fh1, fh2):
                i1 = dict(rec1.INFO)
                i2 = dict(rec2.INFO)
                for k in i1:
                    if ':rID' in k:
                        continue
                    # Number of decimals written in each value.
                    decimals1 = len(i1[k]) - i1[k].index(".") - 1
                    decimals2 = len(i2[k]) - i2[k].index(".") - 1
                    min_round = min(decimals1, decimals2) - 2  # -2 for more tolerance
                    assert np.round(float(i1[k]), min_round) == np.round(float(i2[k]), min_round)
        finally:
            fh2.close()
    finally:
        fh1.close()
Exemple #5
0
def main(tumour, filter_germline_het, pass_only, just_best, info_af):
    """Summarise tumour allele frequencies of a VCF read from stdin.

    The allele frequency of every retained variant is collected and the
    percentiles given by the module-level ``PERCENTILES`` are written to
    stdout.

    Parameters:
        tumour: name of the tumour sample; the other of the first two samples
            is treated as the germline sample.
        filter_germline_het: when true (and ``info_af`` is false), skip
            variants whose germline AF lies strictly inside the module-level
            ``GL_HET`` interval.
        pass_only: when true, skip variants whose FILTER is set.
        just_best: when true, write only the middle value; otherwise write a
            header line and all three values.
        info_af: when true, take the AF from INFO; otherwise from the
            per-sample ``AF`` format field.

    Raises:
        ValueError: ``tumour`` is not a sample of the VCF.
    """
    logging.info('reading from stdin...')

    # we just want to get all the AFs
    afs = []
    vcf = cyvcf2.VCF('-')
    sample_id = vcf.samples.index(tumour)
    germline_id = 1 if sample_id == 0 else 0
    logging.debug('sample_id %i germline_id %i', sample_id, germline_id)
    skipped = 0
    gaf_range = (1.0, 0.0)

    # Iterate the reader that already consumed the header; stdin cannot be
    # opened a second time to start over.
    for v in vcf:
        if pass_only and v.FILTER is not None:
            logging.debug('skipping non-pass at %s:%s', v.CHROM, v.POS)
            skipped += 1
            continue
        # gl af
        if not info_af:
            gaf = v.format('AF')[germline_id][0]  # mutect2 af
            gaf_range = (min(gaf_range[0], gaf), max(gaf_range[1], gaf))
            logging.debug('gaf %s range %s', gaf, gaf_range)
            if filter_germline_het and GL_HET[0] < gaf < GL_HET[1]:
                logging.debug('skipping germline het at %s:%s', v.CHROM, v.POS)
                skipped += 1
                continue

        # tumour af
        if info_af:
            af = v.INFO['AF']  # calculated af
        else:
            af = v.format('AF')[sample_id][0]  # mutect2 af
        logging.debug('appending %s to afs', af)
        afs.append(af)

    if not afs:
        logging.warning('No afs')
        # Fallback triple used when nothing was collected.
        answer = [0.0, 0.5, 1.0]
    else:
        logging.debug('%i afs: %s', len(afs), afs)
        # NOTE(review): answer[0..2] is indexed below, so PERCENTILES is
        # presumably a sequence of three percentiles -- confirm.
        answer = numpy.percentile(afs, PERCENTILES)

    if just_best:
        sys.stdout.write('{:.2f}'.format(answer[1]))
    else:
        sys.stdout.write('Lower\tBest\tUpper\n')
        sys.stdout.write('{:.2f}\t{:.2f}\t{:.2f}\n'.format(
            answer[0], answer[1], answer[2]))

    logging.info('done. skipped %i included %i. gaf range %s', skipped,
                 len(afs), gaf_range)
Exemple #6
0
def main():
    """Write the variants whose INFO ``PON_VAF`` is below a threshold.

    Input path, threshold and output directory come from
    ``argument_parser()``. The output file is named after the input with a
    trailing ``.vcf`` replaced by ``.filtered.vcf`` and is written with the
    input's header as template. Reader and writer are closed even when a
    record cannot be processed.
    """
    input_path, vaf_threshold, output_dir = argument_parser()
    # The dot is escaped so only a literal ".vcf" suffix is rewritten.
    outputfile = os.path.join(
        output_dir,
        re.sub(r'\.vcf$', '.filtered.vcf', os.path.basename(input_path)))

    vcf_handle = cyvcf2.VCF(input_path)
    print(vcf_handle)
    writer = cyvcf2.Writer(outputfile, vcf_handle)

    try:
        # Reuse the reader that serves as the writer's template instead of
        # opening the file a second time.
        for variant in vcf_handle:
            if variant.INFO['PON_VAF'] < vaf_threshold:
                writer.write_record(variant)
    finally:
        writer.close()
        vcf_handle.close()
Exemple #7
0
def main():
    """Write the variants matching a fixed trinucleotide/alt-base pattern.

    Input VCF, reference and output directory come from
    ``argument_parser()``. A variant is kept when ``get_trinucleotide``
    reports a trinucleotide of ``TTT``, ``TTA`` or ``CTT`` together with alt
    base ``G``. The output is named after the input with a trailing ``.vcf``
    replaced by ``.sig9.vcf``. Reader and writer are closed at the end so
    the output is flushed.
    """
    input_vcf, reference, output_dir = argument_parser()

    # The dot is escaped so only a literal ".vcf" suffix is rewritten.
    output_vcf = os.path.join(
        output_dir,
        re.sub(r'\.vcf$', '.sig9.vcf', os.path.basename(input_vcf)))
    vcf_handle = cyvcf2.VCF(input_vcf)
    output_vcf_handle = cyvcf2.Writer(output_vcf, vcf_handle)

    try:
        # Reuse the template reader instead of opening the file again.
        for variant in vcf_handle:
            var_position = Position(variant.CHROM, variant.POS, variant.POS)
            refbase, altbase, var_trinucleotide = get_trinucleotide(
                var_position, variant.REF, variant.ALT[0], reference)

            if var_trinucleotide in ('TTT', 'TTA', 'CTT') and altbase == 'G':
                output_vcf_handle.write_record(variant)
    finally:
        output_vcf_handle.close()
        vcf_handle.close()
Exemple #8
0
def unphase(inVcf, outVcf):
    """Randomly swap the two alleles of each genotype and write a new VCF.

    The input is first read with scikit-allel to count its records and to
    assert that every site has exactly one ALT allele. It is then streamed
    with cyvcf2; for every record a random subset of samples gets the first
    two genotype entries swapped, and the record is written to ``outVcf``
    using the input header as template.

    Parameters:
        inVcf: path of the VCF to read.
        outVcf: path of the VCF to write.

    Raises:
        AssertionError: a site has no ALT allele or more than one.
    """

    # read the vcf with scikit-allel, just to get number of snps
    print("[GET_NR_SNPS]")
    print(f"Reading: {inVcf}")
    startTime = time.perf_counter()
    callset = allel.read_vcf(inVcf)
    print(f"Took {(time.perf_counter() - startTime):.2f} seconds.")

    # no tri-allelic?
    assert (sum(callset["variants/ALT"][:, 2] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 1] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 0] == '') == 0)

    snpsInFile = callset["calldata/GT"].shape[0]
    print(snpsInFile)

    print("[DONE]")

    print("[UNPHASE]")
    print(f"File to unphase: {inVcf}")
    print(f"Unphased output written to: {outVcf}")

    # go through the vcf
    vcfIFS = cyvcf2.VCF(inVcf)

    # One random 0/1 decision per (sample, record): 1 means "swap alleles".
    numIndividuals = len(vcfIFS.samples)
    randomness = numpy.random.randint(2, size=(numIndividuals, snpsInFile))

    # create a new vcf Writer using the input vcf as a template.
    vcfOFS = cyvcf2.Writer(outVcf, vcfIFS)

    count = 0
    allIdxs = numpy.arange(numIndividuals)

    for v in vcfIFS:

        # Work on one local copy of the genotypes: the swap has to be applied
        # to the very list that is assigned back, otherwise it would be made
        # on a temporary returned by the property and then discarded.
        genotypes = v.genotypes
        for idx in allIdxs[randomness[:, count] == 1]:
            call = genotypes[idx]
            call[0], call[1] = call[1], call[0]

        # store the modified genotypes on the record
        v.genotypes = genotypes
        # and write it
        vcfOFS.write_record(v)

        # increase count
        count += 1
        if count % 100000 == 0:
            print(count)

    vcfOFS.close()
    vcfIFS.close()

    print("[DONE]")
Exemple #9
0
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low quality calls filtered but also labeled with REJECT.

    The result is written next to ``in_file`` (or into ``out_dir`` when
    given) with a ``-prfilter.vcf.gz`` suffix, and is only regenerated when
    it is not up to date relative to ``in_file``. Returns the output path.
    """
    from bcbio.heterogeneity import bubbletree
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            max_depth = bubbletree.max_normal_germline_depth(in_file, bubbletree.PARAMS, paired)
            # Records are written uncompressed first; compression and
            # indexing happen after the writer is closed.
            tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
            with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
                # NOTE(review): the name is rebound to the helper's return
                # value; contextlib.closing still closes the object it was
                # given -- presumably the same reader, confirm.
                reader = _add_db_to_header(reader)
                with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
                    for rec in reader:
                        filters = rec.FILTER.split(";") if rec.FILTER else []
                        # Any filter other than PASS, "." or REJECT.
                        other_filters = [x for x in filters if x not in ["PASS", ".", "REJECT"]]
                        if len(other_filters) == 0 or bubbletree.is_info_germline(rec):
                            # Germline, check if we should include based on frequencies
                            if "REJECT" in filters or bubbletree.is_info_germline(rec):
                                stats = bubbletree._is_possible_loh(rec, reader, bubbletree.PARAMS, paired,
                                                                    use_status=True, max_normal_depth=max_depth)
                                # Kept germline records are relabelled PASS
                                # and flagged with the DB INFO key.
                                if stats:
                                    rec.FILTER = "PASS"
                                    rec.INFO["DB"] = True
                                    writer.write_record(rec)
                            # Somatic, always include
                            else:
                                writer.write_record(rec)
            vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
Exemple #10
0
def main(in_file):
    """Write a CSV summary of prioritised structural variants to stdout.

    For every record the ``SIMPLE_ANN`` INFO annotations are split into
    ``|``-separated fields; annotations whose detail is ``NOT_PRIORITISED``
    are ignored. Records with at least one remaining annotation produce one
    row holding position, combined SV types, size, called samples, genes,
    annotations and details.

    The header names now follow the order in which the row values are
    written (size before samples); the positions of the data columns are
    unchanged.

    Parameters:
        in_file: path of the VCF to summarise.
    """
    in_cyvcf = cyvcf2.VCF(in_file)
    writer = csv.writer(sys.stdout)
    writer.writerow(["chrom", "start", "end", "svtype", "size", "samples", "gene", "annotation", "detail"])
    for rec in in_cyvcf:
        calls = [parse_name(s) for s, gt in zip(in_cyvcf.samples, rec.gt_bases) if _has_call(gt)]
        anns = [x.split("|") for x in rec.INFO.get("SIMPLE_ANN", "").split(",")]
        svtypes = set()
        all_genes = set()
        annotations = set()
        details = set()
        # Entries with a single field (e.g. from an empty annotation) are skipped.
        for svtype, annotation, genes, _, detail, _ in (x for x in anns if x and len(x) > 1):
            if detail != "NOT_PRIORITISED":
                # Strip list/quote decoration and spaces from the SV type.
                for c in "'[]' ":
                    svtype = svtype.replace(c, "")
                svtypes.add(svtype)
                all_genes.add(genes)
                annotations.add(annotation)
                details.add(detail)
        if svtypes:
            start = int(rec.POS)
            end = rec.INFO.get("END")
            # Fall back to end - start only when SVLEN is absent, so a record
            # with SVLEN but without END does not fail on the subtraction.
            svlen = rec.INFO.get("SVLEN")
            if svlen is None:
                svlen = end - start
            size = abs(svlen)
            writer.writerow([rec.CHROM, start, end, _combine(svtypes), size, ";".join(calls), _combine(all_genes),
                             _combine(annotations), _combine(details)])
Exemple #11
0
def main():
    """Build a tsinfer sample file from the VCF named on the command line.

    Arguments are parsed from ``sys.argv``; the metadata table is read as
    tab-separated text indexed by ``sampleID``. Metadata and diploid sites
    are added to a ``tsinfer.SampleData`` stored at ``<outfile>.samples``,
    and a one-line summary is printed afterwards.
    """
    args = parse_args(sys.argv[1:])
    vcf_path, outfile, threads, label_by = (args.vcf, args.outfile,
                                            args.threads, args.pops_header)
    meta = pd.read_csv(args.meta, sep="\t", index_col="sampleID", dtype=object)

    vcf = cyvcf2.VCF(vcf_path)
    sample_data = tsinfer.SampleData(path=f"{outfile}.samples",
                                     sequence_length=chrom_len(vcf),
                                     num_flush_threads=threads,
                                     max_file_size=2**37)
    with sample_data as samples:
        add_metadata(vcf, samples, meta, label_by)
        add_diploid_sites(vcf, samples)

    summary = f"Sample file created for {samples.num_samples} samples ({samples.num_individuals}) with {samples.num_sites} variable sites."
    print(summary, flush=True)
Exemple #12
0
def load_vcf(input_vcf, threads=1, aaf_thresh=0.0):
    """Load genotypes, positions and allele frequencies from a VCF.

    Only variants whose alternative allele frequency is strictly above
    ``aaf_thresh`` are kept.

    Parameters:
        input_vcf: path of the VCF, passed to ``cyvcf2.VCF`` together with
            ``gts012=True``.
        threads: number of threads handed to the reader.
        aaf_thresh: exclusive lower bound on the alternative allele
            frequency.

    Returns:
        ``(aaf, [chr_pos, samples, gt_array])`` where ``aaf`` and ``chr_pos``
        are lists with one entry per kept variant, ``samples`` is the
        reader's sample list and ``gt_array`` is a NumPy array built from
        the per-variant genotype arrays (one row per kept variant).
    """
    vcf = cyvcf2.VCF(input_vcf, gts012=True, threads=threads)

    # One (genotypes, position, frequency) triple per variant that passes.
    kept = [
        (variant.gt_types.astype(int), variant.POS, variant.aaf)
        for variant in vcf
        if variant.aaf > aaf_thresh
    ]
    gts = [genotypes for genotypes, _, _ in kept]
    chr_pos = [position for _, position, _ in kept]
    aaf = [frequency for _, _, frequency in kept]

    gt_array = np.array(gts)
    samples = vcf.samples

    return aaf, [chr_pos, samples, gt_array]
Exemple #13
0
def findEmptyRegions(vcfFN, bedFN, filteredFN):
    """Copy the BED lines whose region contains at least one VCF record.

    Each line of ``bedFN`` is split on tabs; its first three fields are used
    as chromosome, start and end of a region query against ``vcfFN``. Lines
    with a hit are written unchanged to ``filteredFN``. The number of
    regions with and without a hit is printed at the end. All files are
    closed even when a line cannot be processed.

    Parameters:
        vcfFN: path of the VCF to query.
        bedFN: path of the tab-separated region file; columns beyond the
            third are ignored for the query but kept in the output.
        filteredFN: path of the output file.

    NOTE(review): the BED coordinates are pasted into the region string as
    they are -- confirm that no coordinate-system shift is intended.
    """
    print(vcfFN)
    print(bedFN)
    vcf = cyvcf2.VCF(vcfFN)

    found = 0
    empty = 0
    try:
        with open(bedFN, 'r') as fp, open(filteredFN, 'w+') as fpo:
            for line in fp:
                chrom, start, end = line.rstrip().split('\t')[:3]

                # Stop at the first record: only presence matters.
                has_variant = any(
                    True for _ in vcf(f'{chrom}:{start}-{end}'))
                if has_variant:
                    found += 1
                    fpo.write(line)
                else:
                    empty += 1
    finally:
        vcf.close()

    print(found, empty, sep='\t')
    def handle(self, *args, **options):
        """Insert the VCF records read from stdin into a variant collection.

        The collection is looked up by ``options['variant_collection_id']``.
        Every record is passed to a ``BulkVCFCountInserter``; afterwards the
        number of processed rows is stored on the collection.

        Any exception raised while inserting is logged and not re-raised.
        The traceback is additionally recorded on the node reached through
        the collection's intersection cache; if that fails too, an event is
        created instead.
        """

        variant_collection_id = options['variant_collection_id']
        variant_collection = VariantCollection.objects.get(
            pk=variant_collection_id)

        logging.debug("Inserting variant_collection_id = %d",
                      variant_collection_id)
        try:
            vcf_reader = cyvcf2.VCF("/dev/stdin")  # Must take a filename..
            bulk_inserter = BulkVCFCountInserter(variant_collection)

            for v in vcf_reader:
                bulk_inserter.process_entry(v)

            bulk_inserter.finish()  # Any leftovers
            variant_collection.count = bulk_inserter.rows_processed
            variant_collection.save()
        except Exception:
            details = get_traceback()
            logging.error(details)

            # Best effort: mark the owning node as failed and attach the
            # traceback to it.
            try:
                node = variant_collection.intersectioncache.node_version.node
                node.status = NodeStatus.ERROR
                errors = "Error inserting variants after bed intersection:\n"
                errors += details
                logging.error(errors)
                node.errors = errors
                node.save()
            except Exception as e:
                # The node could not be updated; record an event instead.
                logging.error(e)
                create_event(None,
                             name="stdin_to_variant_collection",
                             details=details)
Exemple #15
0
def vcf2SVPosition(vcf_file):
    '''Collect the break-point pairs of a structural-variant VCF by SV type.

    Added Manta/Lumpy support Dec 21 2017.
    March 3 2018 edit: a dictionary is returned so each SV type can be
    handled separately.

    Parameters:
        vcf_file: path of the VCF to read.

    Returns:
        dict mapping the ``SVTYPE`` INFO value to a list of ``(bp1, bp2)``
        Position tuples. The keys BND, DUP, INS, DEL and INV are always
        present; any other type found in the file (for instance TRA) is
        added when first seen.
    '''
    bp_dict = {'BND': [], 'DUP': [], 'INS': [], 'DEL': [], 'INV': []}

    for variant in cyvcf2.VCF(vcf_file):
        # if variant.FILTER == None:
        variant_type = variant.INFO.get('SVTYPE')
        if variant_type == "BND":
            # this one can be used for any SV VCF with BND type:
            # the mate location is embedded in the ALT string.
            bnd_pos = re.search(string=variant.ALT[0], pattern=r'[a-zA-Z]*[0-9]*:[0-9]+').group(0)
            bnd_chrom, bnd_pos = bnd_pos.split(':')
            bp2String = f'{bnd_chrom}:{bnd_pos}-{int(bnd_pos) + 1}'
        elif variant_type == "TRA":
            # this one is specific for Delly v0.7.6 annotation
            end = variant.INFO.get('END')
            bp2String = f"{variant.INFO.get('CHR2')}:{end}-{end + 1}"
        else:
            end = variant.INFO.get("END")
            bp2String = f'{variant.CHROM}:{end}-{end + 1}'

        bp1 = Position.fromstring(f'{variant.CHROM}:{variant.POS}-{variant.POS + 1}')
        bp2 = Position.fromstring(bp2String)

        # setdefault keeps the pre-seeded keys and accepts types that were
        # not pre-seeded instead of failing with KeyError.
        bp_dict.setdefault(variant_type, []).append((bp1, bp2))

    return bp_dict
Exemple #16
0
def main(out):
    """Tabulate net length changes of the variants read from stdin.

    The net change is the length of the first ALT allele minus the length of
    REF. Counts are kept for all variants, for those with an ``msi_exon``
    INFO entry, for those with an ``msi_oncogene`` entry, and for those with
    both. One tab-separated row per observed net change is written to
    ``out``, sorted by change.
    """
    logging.info('reading from stdin...')
    stats_all = collections.defaultdict(int)
    stats_exon = collections.defaultdict(int)
    stats_onco = collections.defaultdict(int)
    stats_exon_onco = collections.defaultdict(int)

    for variant in cyvcf2.VCF('-'):
        net = len(variant.ALT[0]) - len(variant.REF)
        in_exon = variant.INFO.get('msi_exon') is not None
        in_onco = variant.INFO.get('msi_oncogene') is not None

        stats_all[net] += 1
        if in_exon:
            stats_exon[net] += 1
        if in_exon and in_onco:
            stats_exon_onco[net] += 1
        if in_onco:
            stats_onco[net] += 1

    row = '{change}\t{total}\t{exon}\t{onco}\t{exon_onco}\n'
    out.write('Change\tAll\tExon\tOnco\tExonOnco\n')
    for change in sorted(stats_all.keys()):
        out.write(row.format(change=change,
                             total=stats_all[change],
                             exon=stats_exon[change],
                             onco=stats_onco[change],
                             exon_onco=stats_exon_onco[change]))

    logging.info('done')
Exemple #17
0
    def __init__(self,
                 vcf_path,
                 db_path,
                 ped_path=None,
                 blobber=pack_blob,
                 black_list=None,
                 expand=None):
        """Open the VCF, set up the database and load everything into it.

        Construction does all the work: after the attributes are set, the
        columns and samples are created, the data is loaded and the index
        step is run, in that order.

        Parameters:
            vcf_path: path of the VCF, opened with ``cyvcf2.VCF``.
            db_path: database location, converted with ``get_dburl``.
            ped_path: optional path stored as ``self.ped_path``.
            blobber: callable stored as ``self.blobber``.
            black_list: optional extra entries appended to the class-level
                ``_black_list`` and ``effect_list``.
            expand: optional list stored as ``self.expand`` (empty list when
                omitted).
        """
        self.vcf_path = vcf_path
        self.db_path = get_dburl(db_path)
        self.engine = sql.create_engine(self.db_path,
                                        poolclass=sql.pool.NullPool)
        self.impacts_headers = {}
        self.metadata = sql.MetaData(bind=self.engine)
        self.expand = expand or []
        self.stringers = []
        self.af_cols = []  # track these to set to -1
        self.extra_columns = []

        self.blobber = blobber
        self.ped_path = ped_path
        # Class-level lists are copied so the instance list can be extended
        # without touching them.
        self.black_list = list(VCFDB._black_list) + list(
            VCFDB.effect_list) + (black_list or [])

        self.vcf = cyvcf2.VCF(vcf_path)
        # we use the cache to infer the lengths of string fields.
        # islice is lazy: at most the first 10000 records are taken from the
        # reader when the cache is consumed.
        self.cache = it.islice(self.vcf, 10000)
        self.create_columns()
        self.samples = self.create_samples()
        self.load()
        self.index()
Exemple #18
0
def main(threshold, common_in, out, position_only):
    """Drop variants at common positions from a VCF read from stdin.

    ``common_in`` is a tab-separated file with one header line; a position
    (columns 1 and 2) counts as common when the value in column 4 is at
    least ``threshold``. The VCF header and every variant not at a common
    position are written to ``out``.

    Parameters:
        threshold: minimum proportion for a position to count as common.
        common_in: path of the tab-separated file of common variants.
        out: writable text stream receiving the filtered VCF.
        position_only: currently unused; matching is always by position.
    """
    # TODO position_only = true only supported
    common = set()
    logging.info('reading common variants from %s', common_in)
    with open(common_in, 'r') as handle:
        next(handle, None)  # skip the header line
        for line in handle:
            fields = line.strip('\n').split('\t')
            if float(fields[3]) >= threshold:
                common.add('{}\t{}'.format(fields[0], fields[1]))

    logging.info('reading vcf from stdin')
    vcf = cyvcf2.VCF('-')
    filtered = total = 0
    out.write(vcf.raw_header)
    # Counting from 1 makes `total` the number of variants seen, and leaves
    # it at 0 for an empty VCF.
    for total, variant in enumerate(vcf, start=1):
        if '{}\t{}'.format(variant.CHROM, variant.POS) in common:
            filtered += 1
        else:
            out.write(str(variant))

    logging.info('filtered %i of %i', filtered, total)
Exemple #19
0
    def readVCF(self):
        """Open the VCF stored in ``self.fvcf`` and return the cyvcf2 reader.

        No records are read here; the caller iterates the returned reader.
        """
        reader = cyvcf2.VCF(self.fvcf)
        return reader
def main(sample, chrom, pos, nofilter):
    """Report whether a sample carries a variant at one position.

    The VCF is read from stdin. If a record at ``chrom``:``pos`` has a
    genotype type of 1 or 3 for ``sample``, one tab-separated line with the
    allele depths, the genotype string and ``1`` is written to stdout and
    the process exits with status 0. Otherwise a line with ``NA`` values and
    ``0`` is written. Unless ``nofilter`` is set, records with a FILTER
    value are ignored.
    """
    logging.info('reading from stdin...')

    row = '{}\t{}\t{}\t{}\t{}\t{}\n'
    # gt_types 0,1,2,3 == HOM_REF, HET, UNKNOWN, HOM_ALT
    genotype_strings = ['0/0', '0/1', './.', '1/1']

    vcf_in = cyvcf2.VCF('-')
    sample_id = vcf_in.samples.index(sample)

    for variant in vcf_in:
        if not nofilter and variant.FILTER is not None:
            continue
        if variant.POS != pos or variant.CHROM != chrom:
            continue

        gt = variant.gt_types[sample_id]
        if gt not in (1, 3):
            continue

        ad = variant.format('AD')[sample_id]
        depths = ', '.join(str(x) for x in ad)
        sys.stdout.write(
            row.format(sample, chrom, pos, depths, genotype_strings[gt], '1'))
        logging.info('done')
        sys.exit(0)

    # not found
    sys.stdout.write(row.format(sample, chrom, pos, 'NA', 'NA', '0'))
    logging.info('done')
Exemple #21
0
def get_vep_scores(vcf_name,
                   vep_vcf_key="CSQ",
                   sel_vep_keys=("phyloP46way_placental",
                                 "phyloP46way_primate",
                                 "CADD_phred",
                                 "CADD_raw")):
    """Extract selected VEP annotation scores from a VCF as a DataFrame.

    The field names of the VEP annotation are read from the description of
    the header entry whose ID is ``vep_vcf_key``. For every record carrying
    that INFO key, the first comma-separated annotation is split on ``|``
    and the selected fields are kept.

    Parameters:
        vcf_name: path of the VCF.
        vep_vcf_key: INFO key holding the VEP annotation.
        sel_vep_keys: names of the annotation fields to extract.

    Returns:
        DataFrame of floats with one row per variant (index
        ``CHROM:POS:REF:ALT``, first occurrence kept when duplicated) and
        one column per selected field; empty fields become NaN.

    Raises:
        ValueError: the header has no entry for ``vep_vcf_key``, or a
            selected field is not part of the annotation.
    """
    sel_vep_keys = list(sel_vep_keys)
    vcf_fh = cyvcf2.VCF(vcf_name)
    try:
        # Find the annotation layout in the header.
        vep_keys = None
        for hdr in vcf_fh.header_iter():
            hdr_info = hdr.info()
            if 'ID' in hdr_info and hdr_info['ID'] == vep_vcf_key:
                vep_keys = hdr_info['Description'].split(": ")[-1].rstrip('"').split("|")
                break
        if vep_keys is None:
            raise ValueError(
                "No header entry with ID '{}' found in {}".format(
                    vep_vcf_key, vcf_name))
        sel_vep_elms = [vep_keys.index(k) for k in sel_vep_keys]

        entries = []
        for rec in vcf_fh:
            info_dict = dict(rec.INFO)
            if vep_vcf_key in info_dict:
                # Only the first annotation of a record is used.
                vep_entries = info_dict[vep_vcf_key].split(",")[0].split("|")
                variant_uid = ":".join([rec.CHROM, str(rec.POS), rec.REF, rec.ALT[0]])
                entries.append(pd.Series([vep_entries[i] for i in sel_vep_elms],
                                         name=variant_uid,
                                         index=sel_vep_keys))
    finally:
        vcf_fh.close()

    # Turn into a data frame
    df = pd.DataFrame(entries)
    df = df.replace("", "nan").astype(float)
    # dedup
    df = df.loc[~df.index.duplicated(), :]
    return df
Exemple #22
0
def ksfs(args):
    """subroutine for ksfs subcommand

    Counts, per ``mutation_type`` INFO value, how many sites have each
    allele count ``AC``, and prints the table for AC = 1 .. AN-1 as
    tab-separated text with the index labelled ``sample_frequency``.

    Parameters:
        args: parsed arguments; ``args.vcf`` is the path of the VCF.

    Raises:
        ValueError: sites disagree on ``AN``, or the VCF has no variants.
    """
    vcf = cyvcf2.VCF(args.vcf)

    ksfs_data = defaultdict(Counter)
    AN = None
    for variant in vcf:
        site_an = variant.INFO['AN']
        # AN must be the same for all sites (no missing genotypes)
        if AN is not None and site_an != AN:
            raise ValueError(f'different AN {site_an} and {AN}'
                             ' indicates missing genotypes')
        AN = site_an
        ksfs_data[variant.INFO['mutation_type']][variant.INFO['AC']] += 1

    if AN is None:
        # Without any site there is no AN to size the spectrum with.
        raise ValueError('no variants found in VCF, cannot build ksfs')

    # exclude fixed sites AC=0, AC=AN
    index = range(1, AN)
    mutation_types = sorted(ksfs_data)
    spectra = {
        mutation_type: [ksfs_data[mutation_type][ac] for ac in index]
        for mutation_type in mutation_types
    }
    ksfs = pd.DataFrame(spectra, index).reindex(mutation_types,
                                                axis='columns')
    try:
        print(ksfs.to_csv(sep='\t', index=True,
                          index_label='sample_frequency'))
    except BrokenPipeError:
        # Output was piped into a command that stopped reading.
        pass
Exemple #23
0
def variants(vcf_path, show_progress=False):
    """Yield one ``Variant`` per usable biallelic SNP of an indexed VCF.

    A site is used when every sample is called, an ancestral allele is
    available and the record is a SNP with a single ALT allele. The
    ancestral allele comes from the ``AA`` INFO field
    (``AA|REF|ALT|IndelType``) and must be a single base out of A, C, T, G.
    Genotypes are encoded per haplotype as 1 where the base differs from the
    ancestral state and 0 otherwise.

    The VCF and the progress bar are closed when the generator finishes, is
    closed early or fails.

    Parameters:
        vcf_path: path of the VCF; ``bcftools index --nrecords`` is run on
            it to size the progress bar.
        show_progress: show a tqdm progress bar when true.

    Yields:
        ``Variant(position=..., genotypes=...)`` with a ``uint8`` array of
        length twice the number of samples.
    """

    output = subprocess.check_output(
        ["bcftools", "index", "--nrecords", vcf_path])
    num_rows = int(output)
    progress = tqdm.tqdm(total=num_rows, disable=not show_progress)

    vcf = cyvcf2.VCF(vcf_path)
    try:
        num_diploids = len(vcf.samples)
        num_samples = 2 * num_diploids
        for row in filter_duplicates(vcf):
            progress.update()
            ancestral_state = None
            try:
                aa = row.INFO["AA"]
                # Format = AA|REF|ALT|IndelType
                splits = aa.split("|")
                if len(splits) == 4 and len(splits[0]) == 1:
                    base = splits[0].upper()
                    if base in "ACTG":
                        ancestral_state = base
            except KeyError:
                pass
            if row.num_called != num_diploids or ancestral_state is None:
                continue
            if not (row.is_snp and len(row.ALT) == 1):
                continue

            # Fill in a with genotypes: characters 0 and 2 of each genotype
            # string are the two alleles.
            a = np.zeros(num_samples, dtype=np.uint8)
            bases = row.gt_bases
            for j in range(num_diploids):
                a[2 * j] = bases[j][0] != ancestral_state
                a[2 * j + 1] = bases[j][2] != ancestral_state
            yield Variant(position=row.POS, genotypes=a)
    finally:
        vcf.close()
        progress.close()
def read_vcf(fn, pass_only, dp_threshold, info_af):
    """Collect one allele-frequency value per retained variant of a VCF.

    Parameters:
        fn: path handed to ``cyvcf2.VCF``.
        pass_only: when true, skip variants whose FILTER is set to anything
            other than ``alleleBias``.
        dp_threshold: variants with INFO ``DP`` below this value are skipped.
        info_af: when true, use INFO ``AF``; otherwise compute
            alt / (ref + alt) from the ``AD`` format field (0 when both
            depths are 0).

    Returns:
        list of the collected values, in file order.

    NOTE(review): ``sample_id`` is not defined in this function; the AD
    branch relies on a module-level name of that kind and raises NameError
    without it -- confirm where it is meant to come from.
    """
    logging.info('reading vcf from stdin...')
    skipped_dp = skipped_pass = 0

    vcf_in = cyvcf2.VCF(fn)
    values = []

    for variant_count, variant in enumerate(vcf_in):
        # calculate vaf
        if len(variant.ALT) > 1:
            # Only the first alt allele is used below.
            logging.warning('variant %i is multi-allelic', variant_count + 1)

        is_pass = variant.FILTER is None or variant.FILTER == 'alleleBias'
        if pass_only and not is_pass:
            skipped_pass += 1
            continue

        if variant.INFO["DP"] < dp_threshold:  # somatic + germline
            skipped_dp += 1
            continue

        if info_af:
            value = variant.INFO["AF"]
        else:
            ad = variant.format("AD")[sample_id]
            ref = ad[0]
            alt = ad[1]
            if ref + alt > 0:
                value = alt / (ref + alt)
            else:
                value = 0

        values.append(value)

    return values
def main(qual, af, dp):
    """Filter a VCF from stdin to stdout by QUAL, INFO AF and INFO DP.

    The header is copied unchanged. A variant is written when its QUAL is
    missing or at least ``qual``, its ``AF`` is at least ``af`` and its
    ``DP`` is at least ``dp``. Progress is logged every 100000 records and a
    summary at the end.
    """
    logging.info(
        'reading vcf from stdin. qual filter %i af filter %f dp filter %i',
        qual, af, dp)

    vcf_in = cyvcf2.VCF('-')
    sys.stdout.write(vcf_in.raw_header)

    allowed = denied = 0
    for variant in vcf_in:
        quality_ok = variant.QUAL is None or variant.QUAL >= qual
        if (quality_ok and variant.INFO["AF"] >= af
                and variant.INFO["DP"] >= dp):
            sys.stdout.write(str(variant))
            allowed += 1
        else:
            denied += 1

        processed = allowed + denied
        if processed % 100000 == 0:
            logging.debug('%i processed. %i allowed.', processed, allowed)

    summary = 'done. wrote {}. skipped {}. total {}'.format(
        allowed, denied, allowed + denied)
    logging.info(summary)
Exemple #26
0
def _remove_prioritization(in_file, data, out_dir=None):
    """Remove tumor-only prioritization and return non-filtered calls.

    Every record of ``in_file`` is passed through
    ``_update_prioritization_filters`` and written to a ``-germline.vcf``
    file next to the input (or in ``out_dir`` when given). Nothing is
    written when that file, or its ``.gz`` variant, is already up to date.
    Reader and writer are both closed when writing ends.

    Returns the path of the plain ``-germline.vcf`` file.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(
            out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with contextlib.closing(cyvcf2.VCF(str(in_file))) as reader:
                # The filter must be declared before the writer copies the
                # header from the reader.
                reader.add_filter_to_header({
                    'ID':
                    'Somatic',
                    'Description':
                    'Variant called as Somatic'
                })
                with contextlib.closing(cyvcf2.Writer(tx_out_file,
                                                      reader)) as writer:
                    for rec in reader:
                        rec = _update_prioritization_filters(rec)
                        writer.write_record(rec)
    return out_file
 def process_metadata(self, metadata_file, show_progress=False):
     """
     Adds the Max Planck metadata.

     The second line of ``metadata_file`` is split on single spaces into
     name, population and age; the age is divided by ``GENERATION_TIME``.
     One population and one diploid individual are added to
     ``self.samples``, and ``self.num_samples`` is set to twice the number
     of samples in the VCF at ``self.data_file``. ``show_progress`` is not
     used.
     """
     with open(metadata_file, "r") as handle:
         # Only the line after the header is used.
         fields = handle.read().splitlines()[1].split(" ")
         metadata = {
             "name": fields[0],
             "age": int(fields[2]) / GENERATION_TIME,
         }
         population = fields[1]

     vcf = cyvcf2.VCF(self.data_file)
     num_individuals = len(list(vcf.samples))
     vcf.close()
     self.num_samples = num_individuals * 2

     population_record = {
         "name": population,
         "super_population": "Max Planck"
     }
     pop_id = self.samples.add_population(population_record)
     self.samples.add_individual(time=metadata["age"],
                                 metadata=metadata,
                                 population=pop_id,
                                 ploidy=2)
Exemple #28
0
def main(gene_filter):
    """Count CLNSIG values of a ClinVar-style VCF read from stdin.

    The gene name is the part of INFO ``GENEINFO`` before the first ``:``
    (``notspecified`` when the key is missing). When ``gene_filter`` is
    given, only variants of that gene are counted. A table of CLNSIG value,
    count and fraction of the total is written to stdout.
    """
    logging.info('starting...')

    # Example record:
    # 1  45794974  479982  G  C  .  .  ...;CLNSIG=Uncertain_significance;...;GENEINFO=MUTYH:4595;...
    stat = collections.defaultdict(int)
    genes = collections.defaultdict(int)

    for count, variant in enumerate(cyvcf2.VCF('-')):
        try:
            gene = variant.INFO['GENEINFO']
        except KeyError:
            gene = 'notspecified'
        genes[gene] += 1

        gene_name = gene.split(':')[0]
        wanted = gene_filter is None or gene_name == gene_filter
        if not wanted:
            continue

        stat[variant.INFO['CLNSIG']] += 1
        if count % 10000 == 0:
            logging.debug('%s processed...', count)

    sys.stdout.write('CLNSIG\tCount\tPct\n')
    total = sum(stat.values())
    for significance, occurrences in stat.items():
        sys.stdout.write('{}\t{}\t{:.3f}\n'.format(
            significance, occurrences, occurrences / total))

    #logging.info(genes)
    logging.info('done')
def main(manta_vcf, truth_bed):
    """Compare Manta calls against a truth BED and print per-type recall.

    Unfiltered records whose first sample has genotype type 1 or 3 are
    written as BED intervals padded by ``BUFFER`` on both sides. ``bedtools
    intersect -c -wa`` then counts, for every truth interval, how many of
    these intervals overlap it. For each SV type in the truth file the
    number and percentage of intervals with at least one overlap is printed.

    Parameters:
        manta_vcf: path of the Manta VCF; the BED files are named after it.
        truth_bed: path of the truth BED. Each output line of the
            intersection is unpacked into six fields, so five columns are
            expected here.

    Raises:
        subprocess.CalledProcessError: bedtools exits with an error.
    """
    BUFFER = 1000
    called_gt_types = {1, 3}
    manta_bed = manta_vcf.replace(".vcf", ".bed")
    with open(manta_bed, "w") as out_handle:
        for rec in cyvcf2.VCF(manta_vcf):
            if not rec.FILTER and rec.gt_types[0] in called_gt_types:
                out_handle.write(
                    "%s\t%s\t%s\n" %
                    (rec.CHROM, max(0, rec.start - BUFFER), rec.end + BUFFER))
    manta_compare = "%s-giab.bed" % (os.path.splitext(manta_bed)[0])
    # Argument list plus explicit redirection: paths with spaces or shell
    # metacharacters are passed through untouched.
    cmd = ["bedtools", "intersect", "-c", "-wa",
           "-a", truth_bed, "-b", manta_bed]
    with open(manta_compare, "w") as compare_handle:
        subprocess.check_call(cmd, stdout=compare_handle)

    counts = collections.defaultdict(int)
    totals = collections.defaultdict(int)
    with open(manta_compare) as in_handle:
        for chrom, start, end, svtype, info, matches in (l.strip().split("\t")
                                                         for l in in_handle):
            totals[svtype] += 1
            if int(matches) > 0:
                counts[svtype] += 1

    for svtype, total in totals.items():
        print("| %s | %s (%.1f%%) |" %
              (svtype, counts[svtype], float(counts[svtype]) / total * 100.0))
Exemple #30
0
 def add_to_database(self):
     """
     Loads the VCF at ``self.f`` into the database.

     The reader is opened with ``gts012=True`` and kept on
     ``self.vcf_reader`` while the variant set, its metadata, the call sets
     and finally the variants and calls are created. The reader is closed
     afterwards, also when one of the steps raises.
     """
     self.vcf_reader = cyvcf2.VCF(self.f, gts012=True)
     try:
         self._create_new_variant_set()
         self._create_variant_set_meta_data()
         self._create_call_sets()
         self._create_variants_and_calls()
     finally:
         self.vcf_reader.close()