Example #1
0
def read_vep(vep_result_path, max_per_var=False):
    """Load MMSplice scores from a VEP-annotated VCF into a DataFrame.

    Only VCF-formatted VEP output is supported.

    Args:
        vep_result_path: path to the VCF returned by the VEP plugin.
        max_per_var: when true, the table is passed through ``max_varEff``
            and indexed by ``ID`` before being returned.

    Returns:
        A pandas DataFrame with one row per CSQ entry whose trailing
        MMSplice fields are not all empty.
    """
    from cyvcf2 import VCF

    keys = [
        'mmsplice_alt_acceptor', 'mmsplice_alt_acceptorIntron',
        'mmsplice_alt_donor', 'mmsplice_alt_donorIntron', 'mmsplice_alt_exon',
        'mmsplice_delta_logit_psi', 'mmsplice_pathogenicity',
        'mmsplice_ref_acceptor', 'mmsplice_ref_acceptorIntron',
        'mmsplice_ref_donor', 'mmsplice_ref_donorIntron', 'mmsplice_ref_exon'
    ]
    n_keys = len(keys)
    all_blank = ('', ) * n_keys

    rows = []
    for record in VCF(vep_result_path):
        for entry in record.INFO['CSQ'].split(','):
            # The MMSplice values are the last len(keys) pipe-separated fields.
            fields = tuple(entry.split('|')[-n_keys:])
            if fields == all_blank:
                continue
            # Individual empty fields are scored as 0.
            row = dict(
                zip(keys, (float(f if f != '' else 0) for f in fields)))
            row['ID'] = "%s:%d:%s:%s" % (record.CHROM, int(record.start) + 1,
                                         record.REF, record.ALT)
            rows.append(row)

    df_plugin = pd.DataFrame(rows)
    if not max_per_var:
        return df_plugin
    return max_varEff(df_plugin).set_index('ID')
Example #2
0
def validate_cpsr_input(pcgr_directory, input_vcf, custom_list_fname, preserved_info_tags, vcf_validation, genome_assembly, sample_id, virtual_panel_id, diagnostic_grade_only, output_dir, debug):
   """
   Check the CPSR input VCF and hand it over to `simplify_vcf`.

   Steps performed here:
   1. If a custom gene list is given, build its BED track via `get_valid_custom_genelist`
   2. Log that VCF validation is skipped (either branch of `vcf_validation` only logs)
   3. Check that no INFO tag in the query VCF coincides with those generated by CPSR
   4. Check that INFO tags the user asked to preserve are present in the query VCF
   5. Check that the VCF contains a single sample column
   6. Call `simplify_vcf` on the query VCF

   Returns 0 on success, -1 when a tag check fails, or the value of
   `error_message` when more than one sample column is found.
   """
   logger = utils.getlogger('cpsr-validate-input-arguments')

   custom_list_bed_fname = 'None'
   if custom_list_fname != 'None':
      logger.info('Establishing BED track with custom list of genes from panel 0')
      custom_list_bed_fname = os.path.join(output_dir, sample_id + '.cpsr.' + genome_assembly + '.custom_list.bed')
      get_valid_custom_genelist(custom_list_fname, custom_list_bed_fname, pcgr_directory, genome_assembly, logger, debug)

   # Nothing else to do without a query VCF.
   if input_vcf == 'None':
      return 0

   if vcf_validation == 1:
      skip_msg = 'Skipping validation of VCF file (deprecated as of Dec 2021)'
   else:
      skip_msg = 'Skipping validation of VCF file as provided by option --no_vcf_validate'
   logger.info(skip_msg)

   if check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger) == -1:
      return -1

   if preserved_info_tags != "None" and check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, logger) == -1:
      return -1

   vcf = VCF(input_vcf)
   samples = vcf.samples
   if len(samples) > 1:
      err_msg = "Query VCF contains more than one sample column (" + ', '.join(samples) + ") - CPSR expects a germline VCF with a single sample column - exiting"
      return error_message(err_msg, logger)

   simplify_vcf(input_vcf, vcf, custom_list_bed_fname, pcgr_directory, genome_assembly, virtual_panel_id, sample_id, diagnostic_grade_only, output_dir, logger, debug)
   return 0
Example #3
0
def bar_chart(vcf, outname="stacked_bar.png"):
    """
    Make a stacked bar chart for length of the SV split by validation status
    This ignores zygosity.

    Records with SVTYPE 'TRA', without an SVLEN, or with |SVLEN| < 50 are
    skipped. The remaining records are binned by the pair of per-sample
    calls: [True, True] -> "True", [False, True] -> "False",
    [True, False] -> "Missed". Two panels are drawn (0-2000 in steps of 10,
    and 0-20000 in steps of 100 with a log count axis) and saved to
    `outname`.
    """
    from cyvcf2 import VCF
    from surpyvor import utils

    len_dict = {"True": [], "False": [], "Missed": []}
    for v in VCF(vcf):
        if v.INFO.get('SVTYPE') == 'TRA':
            continue
        svlen = v.INFO.get('SVLEN')
        # A record without SVLEN cannot be placed on the length axis.
        if svlen is None or abs(svlen) < 50:
            continue
        calls = [utils.is_variant(call) for call in v.gt_types]
        if calls == [True, True]:
            len_dict['True'].append(svlen)
        elif calls == [False, True]:
            len_dict['False'].append(svlen)
        elif calls == [True, False]:
            len_dict['Missed'].append(svlen)

    # Pass the datasets as a plain list of lists: the three groups usually
    # differ in length, and wrapping them in np.array either fails (ragged
    # input) or, when the lengths happen to match, yields a 2D array that
    # hist() interprets column-wise.
    datasets = list(len_dict.values())
    labels = list(len_dict.keys())

    panels = (
        (1, list(range(0, 2000, 10)), False),
        (2, list(range(0, 20000, 100)), True),
    )
    for position, bins, log_scale in panels:
        plt.subplot(2, 1, position)
        plt.hist(x=datasets,
                 bins=bins,
                 stacked=True,
                 histtype='bar',
                 label=labels,
                 log=log_scale)
        plt.xlabel('Length of structural variant')
        plt.ylabel('Number of variants')
        plt.legend(frameon=False, fontsize="small")
    plt.tight_layout()
    plt.savefig(outname)
    plt.close()
Example #4
0
def process_files(options):
    '''
    Write a tab-separated table for every VCF in options.vcf_files to stdout.

    Arguments:
       options: the command line options of the program; this function
           reads options.vcf_files, options.qual and options.ispass
    Result:
       None
    '''
    out = csv.writer(sys.stdout, delimiter="\t")
    out.writerow([
        "chr1", "pos1", "chr2", "pos2", "sense1", "sense2", "insertlen",
        "qual", "sample"
    ])
    for vcf_filename in options.vcf_files:
        logging.info("Processing VCF file from %s", vcf_filename)
        reader = VCF(vcf_filename)
        # process_variants yields the rows to emit for this file.
        out.writerows(
            process_variants(options.qual, options.ispass, reader.samples,
                             reader))
Example #5
0
def get_file_handle(file_path):
    """Open a VCF file with cyvcf2 after basic path checks.

    Args:
        file_path(str)

    Returns:
        vcf_obj(cyvcf2.VCF)

    Raises:
        IOError: if the path does not exist or its extension is not listed
            in VALID_ENDINGS.
    """
    logger.debug("Check if file end is correct")

    if not os.path.exists(file_path):
        raise IOError("No such file:{0}".format(file_path))

    _, extension = os.path.splitext(file_path)
    if extension not in VALID_ENDINGS:
        raise IOError("Not a valid vcf file name: {}".format(file_path))

    return VCF(file_path)
Example #6
0
def test_id_field_updates():
    """Assigning to Variant.ID stringifies values; '.' and None read back as None."""
    # first record: 1 10172   .       CCCTAA  C       92.0    PASS
    reader = VCF(VCF_PATH)
    variant = next(reader)
    assert variant.ID is None, variant.ID

    # (value assigned, value expected when read back)
    for assigned, expected in (('foo', 'foo'), (100, '100'), (100.1, '100.1')):
        variant.ID = assigned
        assert variant.ID == expected, variant.ID

    # Both spellings of "missing" read back as None.
    for assigned in ('.', None):
        variant.ID = assigned
        assert variant.ID is None, variant.ID
Example #7
0
def parse_segments(vcffile):
    """ Extract all copy number segments from a CANVAS file

    VCF line looks like:
    chr1    788879  Canvas:GAIN:chr1:788880-821005  N       <CNV>   2       q10
    SVTYPE=CNV;END=821005;CNVLEN=32126      RC:BC:CN:MCC    157:4:3:2

    Each record becomes one tab-separated line "chrom, start, END - 1, CN",
    where CN is the single value of the CN FORMAT field for the first
    sample. The lines are handed to BedTool as a string.

    Returns:
        The BedTool built from those lines.
    """
    from cyvcf2 import VCF

    # Lines are collected and joined instead of printed into a
    # cStringIO buffer: that module does not exist on Python 3.
    lines = []
    for v in VCF(vcffile):
        chrom = v.CHROM
        start = v.start
        end = v.INFO.get("END") - 1
        (cn, ) = v.format("CN")[0]
        lines.append("\t".join(str(x) for x in (chrom, start, end, cn)))

    # Every record line is newline-terminated, as print() would have done.
    bed_text = "".join(line + "\n" for line in lines)
    return BedTool(bed_text, from_string=True)
Example #8
0
    def read_vcf(cls: Type[V], path: Path) -> Generator[V, None, None]:
        """
        Lazily read the VCF at `path`, one record at a time.

        Each record produced by :class:`cyvcf2.VCF <cyvcf2.cyvcf2.VCF>` is
        converted with :meth:`from_cyvcf2` and yielded. The reader is closed
        when iteration finishes or the generator is closed.

        See also :meth:`read_and_parse_vcf` to read and parse the VCF.

        Args:
            path: Path to the VCF.

        Returns:
            A generator yielding one converted object per VCF record.

        """
        reader = VCF(str(path))
        with closing(reader):
            for record in reader:
                yield cls.from_cyvcf2(record)
Example #9
0
def get_call_rate_Ychr(sample_list):
    """Return the call rate of every chrY variant, restricted to `sample_list`.

    The VCF is read from the module-level `path_to_vcf`.

    Returns:
        DataFrame with columns "SV" (variant ID) and "call_rate".
    """
    reader = VCF(path_to_vcf)
    reader.set_samples(sample_list)

    # Keep (ID, call rate) for Y-chromosome records only.
    y_records = [(rec.ID, rec.call_rate) for rec in reader
                 if rec.CHROM == "chrY"]

    return pd.DataFrame({
        "SV": [sv_id for sv_id, _ in y_records],
        "call_rate": [rate for _, rate in y_records],
    })
Example #10
0
def generate_sample_vcf(vcf_path, outfile='samp_build38.vcf'):
    """Build a small test VCF by sampling one 10 kb window per autosome.

    The header of `vcf_path` is written to `outfile` through a cyvcf2
    Writer. For each contig named by `get_autosome_names_grch38()`, a random
    start at least 100000 bases from either contig end is drawn, and the
    command string returned by `append_variants_to_vcf` is run through
    `os.system`.
    """
    source = VCF(vcf_path)
    sink = Writer(outfile, source)
    sink.write_header()

    contigs = zip(source.seqnames, source.seqlens)
    autosomes = get_autosome_names_grch38()
    # contig name -> contig length, autosomes only
    autosome_lengths = {
        name: length
        for name, length in contigs if name in autosomes
    }

    for chrom_name, chrom_len in autosome_lengths.items():
        window_start = random.randint(100000, chrom_len - 100000)
        command = append_variants_to_vcf(vcf_path,
                                         chrom_name,
                                         window_start,
                                         window_start + 10_000,
                                         outfile=outfile)
        os.system(command)
    sink.close()
Example #11
0
def compute_referencepanel(ref_file, samples_to_use, variants_to_use):
    """Build one haplotype string per chromosome copy of every selected sample.

    Args:
        ref_file: path of the reference VCF; it is opened here but the
            records actually used come from `variants_to_use`.
        samples_to_use: sample indices used to index `variant.genotypes`,
            `variant.gt_phases` and `variant.gt_types`.
        variants_to_use: iterable of variant records; only the first record
            seen for each POS contributes a column.

    Returns:
        A list of 2 * len(samples_to_use) strings. Rows 2k and 2k+1 belong
        to the k-th entry of `samples_to_use`. Each column holds the allele
        index of that haplotype, or '-' when either allele is missing (-1)
        or the genotype is unphased with gt_type 1.
    """
    # Two rows per sample, as each sample offers two haplotypes.
    height = 2 * len(samples_to_use)

    refmatrix = ['' for i in range(height)]
    # NOTE(review): this handle is never read; kept so a bad `ref_file`
    # still fails here as before.
    vcf_ref = VCF(ref_file, lazy=True)
    doubleset = set()
    index = -1
    # Number of used positions with at least one genotype whose two alleles
    # are equal. Bookkeeping only; not part of the return value.
    homo = 0
    homozyg = False
    for variant in variants_to_use:
        # Skip repeated positions so every POS yields exactly one column.
        if variant.POS in doubleset:
            continue
        index += 1
        doubleset.add(variant.POS)
        counter = 0
        homozyg = False
        for baseindex in samples_to_use:
            genotype = variant.genotypes[baseindex]
            missing = genotype[0] == -1 or genotype[1] == -1
            # Phase information is only consulted for fully called genotypes.
            if missing or (not variant.gt_phases[baseindex]
                           and variant.gt_types[baseindex] == 1):
                refmatrix[counter] += '-'
                refmatrix[counter + 1] += '-'
            else:
                refmatrix[counter] += str(genotype[0])
                refmatrix[counter + 1] += str(genotype[1])
            if genotype[0] == genotype[1]:
                homozyg = True
            counter += 2
        if homozyg:
            homo += 1
    return (refmatrix)
def main(vcffile, querylist, outfile):
    """
    Convert a VCF file into TreeMix input without merging populations;
    every individual is kept separate.
    !!!!!! This script is not finished yet !!!!!!

    As written, the function builds a genotype matrix for the query
    samples, filters it by minor allele frequency and saves a heatmap to
    `outfile`.

    Args:
        vcffile: path of the VCF to read.
        querylist: text file with one sample ID per line.
        outfile: path of the image to write.

    NOTE(review): `region`, `groupfile`, `maf`, `figsize`, `ticklabelsize`
    and `dpi` are not parameters; they are presumably module-level
    settings. Confirm against the rest of the file.
    """
    # Read the sample list and close the file promptly.
    with open(querylist) as handle:
        querysamples = [x.strip() for x in handle]
    vcf_query = VCF(vcffile, gts012=True, samples=querysamples)
    if len(querysamples) > len(vcf_query.samples):
        miss = set(querysamples) - set(vcf_query.samples)
        print(f'query sample miss: {miss}')
        for ind in miss:
            querysamples.remove(ind)
    df = []
    index = []
    for variant_query in vcf_query(region):
        arr = variant_query.gt_types  # 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN
        df.append(arr.tolist())
        index.append(variant_query.POS)

    df = pd.DataFrame(df, columns=vcf_query.samples, index=index)
    df = df[querysamples]  # reorder columns to follow the query list
    df = df.replace(3, np.nan)  # unknown -> NaN (int columns become float)
    print(f'{os.path.basename(vcffile)} {region}:\n{df.shape}')

    # Rename columns to "<group>_<sample>"
    id2groups = loadgroup(groupfile)
    df.columns = [f'{id2groups[x]}_{x}' for x in df.columns]

    # MAF filter: alt-allele frequency over the non-missing genotypes
    freqs = df.sum(axis=1).values / (df.count(axis=1).values * 2)
    df = df.loc[((1-maf)>=freqs)&(freqs>=maf), :]
    print(f'filter maf({maf}):\n{df.shape}')

    # Plot
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    ax.set_facecolor("grey")
    sns.heatmap(df.T, yticklabels=1, cmap='OrRd', ax=ax)
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontsize(ticklabelsize)
    plt.savefig(outfile, dpi=dpi)
    plt.close()
Example #13
0
 def processVariants(self):
     """Annotate every printable variant and write it to self.outputVCF.

     Two INFO fields are added to the header ("Evidence_Codes" and
     "Posterior_Pathogenic_Probability"), the header is written, and then
     each record whose wrapped Variant has `printVariant` set is written
     with both fields filled in.
     """
     cyVCF = VCF(self.vcfFilePath)
     self.families.setSampleIdxs(cyVCF.samples)
     getCSQList = self.getCSQList(cyVCF.raw_header)
     cyVCF.add_info_to_header({
         "ID":
         "Evidence_Codes",
         "Number":
         "1",
         "Type":
         "String",
         "Description":
         "All ACMG evidence codes that apply to this variant"
     })
     cyVCF.add_info_to_header({
         "ID":
         "Posterior_Pathogenic_Probability",
         "Number":
         "1",
         "Type":
         "String",
         "Description":
         "Posterior Pathogenic Probability"
     })
     self.outputVCF.write(cyVCF.raw_header)
     for v in cyVCF:
         # ClinVar entries are keyed by "CHROM:POS:REF:ALT", one per allele.
         matchingClinVarVariants = []
         for alt in v.ALT:
             key = "%s:%s:%s:%s" % (v.CHROM, v.POS, v.REF, alt)
             if key in self.clinVarData:
                 matchingClinVarVariants.append(self.clinVarData[key])
         var = variant.Variant(v, self.families, self.gnomAD_AF_Threshold,
                               self.REVEL_Threshold, getCSQList,
                               matchingClinVarVariants)
         if not var.printVariant:
             continue
         # Compute the posterior once and reuse it for the INFO field.
         posterior = self.getPosterior(var)
         v.INFO["Evidence_Codes"] = var.getEvidenceCodesString()
         v.INFO["Posterior_Pathogenic_Probability"] = format(posterior, '.3f')
         self.outputVCF.write(str(v))
Example #14
0
def tables(vcfFile, dataset, prefix, chunksize, verbose):
    """
    Read a VCF and write its variants, annotations and genotypes out as tables.

    The callset and sample tables are written first. Records are then
    accumulated into in-memory tables that are flushed through
    `update_files` every `chunksize` records, and once more at the end if
    any records remain.

    Args:
        vcfFile: VCF path or handle passed to cyvcf2.VCF.
        dataset: forwarded to write_callset_table / write_sample_table.
        prefix: forwarded to the table writers and to update_files.
        chunksize: number of records per flush.
        verbose: when true, print the running record count at each flush.
    """
    vcf_reader = VCF(vcfFile)
    samples = vcf_reader.samples

    sample_map = sample_to_sampleId(samples)
    write_callset_table(samples, dataset, prefix)
    write_sample_table(samples, dataset, prefix)

    data_tables = empty_tables()
    writers = {"variants": None, "annotations": None, "gts": None}

    for vid, record in enumerate(vcf_reader):
        variants = data_tables["variants"]
        variants['vId'].append(vid)
        variants['chrom'].append(record.CHROM)
        variants['pos'].append(record.POS)
        variants['ref'].append(record.REF)
        variants['alt'].append(str(record.ALT[0]))

        # .get() so that a record without the tag is skipped instead of
        # raising KeyError.
        gene_symbol = record.INFO.get('geneSymbol')
        if gene_symbol:
            data_tables["annotations"]['vId'].append(vid)
            data_tables["annotations"]['geneSymbol'].append(gene_symbol)

        # gt_types is array of 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT,
        # so the odd codes are the samples that carry the variant.
        gt_types = record.gt_types
        has_calls = np.where(gt_types % 2 == 1)[0]
        for idx in has_calls:
            sid, gt = sample_map[samples[idx]], gt_types[idx]
            data_tables["gts"]['vId'].append(vid)
            data_tables["gts"]['callsetId'].append(sid)
            data_tables["gts"]['genotype'].append(gt)

        if (vid+1) % chunksize == 0:
            update_files(data_tables, writers, prefix)
            data_tables = empty_tables()
            if verbose:
                print(vid+1)

    # Flush whatever is left after the last full chunk.
    if len(data_tables["variants"]["vId"]):
        update_files(data_tables, writers, prefix)
Example #15
0
def validate_panel_normal_vcf(vcf, logger):
    """
    Check that the panel-of-normals VCF header declares the logical (Flag)
    INFO tag 'PANEL_OF_NORMALS'.

    Returns 1 when the flag is declared; otherwise returns whatever
    `error_message` returns for the "missing flag" message.
    """

    vcf = VCF(vcf)
    flag_found = False
    for e in vcf.header_iter():
        header_element = e.info()
        declared = header_element.keys()
        if 'ID' not in declared or 'HeaderType' not in declared:
            continue
        # Only INFO lines of type Flag are of interest.
        if header_element['HeaderType'] != 'INFO' or header_element['Type'] != 'Flag':
            continue
        if header_element['ID'] == 'PANEL_OF_NORMALS':
            logger.info("Found 'PANEL_OF_NORMALS' INFO flag in the VCF header section of the of panel of normals VCF file")
            flag_found = True

    if not flag_found:
        err_msg = "INFO flag 'PANEL_OF_NORMALS' is missing from the panel of normal VCF header"
        return error_message(err_msg, logger)

    return 1
Example #16
0
def vcf2df(vcf_fname, dfsamples):
    """Turn a VCF of aisnps (1kg samples) into a per-sample genotype table.

    :param vcf_fname: path to the vcf file with aisnps for every 1kg sample
    :type vcf_fname: str
    :param dfsamples: DataFrame with sample-level info on each 1kg sample.
    :type dfsamples: pandas DataFrame
    :return: DataFrame with one row per sample and one column per variant
        ID, inner-joined with ``dfsamples`` on the index.
    :rtype: pandas DataFrame
    """

    def unphased(gt):
        # Drop the phase separator, then sort the allele characters so that
        # e.g. "A|G" and "G|A" compare equal.
        # TODO: ensure un-phasing variants is the desired behavior
        return "".join(sorted(gt.replace("|", "")))

    reader = VCF(vcf_fname)
    genotypes = pd.DataFrame(index=reader.samples)
    for record in reader():
        genotypes[record.ID] = [unphased(gt) for gt in record.gt_bases]

    return genotypes.join(dfsamples, how="inner")
Example #17
0
    def write_R_qtl_csv_header(self, out_file):
        """Write the header rows of an R/qtl CSV file to `out_file`.

        The first row lists one marker per VCF record, named
        "CHROM_start_end", each preceded by a comma. When an LG file is
        configured, the two lines from `__make_chr_line` follow; otherwise
        the marker row is written a second time (without a trailing
        newline).
        """
        reader = VCF(self.__my_vcf_file)
        # One "CHROM_start_end" label per record.
        labels = [
            "_".join([rec.CHROM, str(rec.start), str(rec.end)])
            for rec in reader
        ]
        marker_line = "".join("," + label for label in labels)

        out_file.write(marker_line + "\n")
        if len(self.__LG_file) > 0:
            # __make_chr_line is deliberately invoked once per output line,
            # as in the original implementation.
            out_file.write(self.__make_chr_line()[0] + "\n")
            out_file.write(self.__make_chr_line()[1] + "\n")
        else:
            out_file.write(marker_line)
Example #18
0
def perchrom(vcf_chrom):
    """Count functional and total variants on one chromosome.

    Args:
        vcf_chrom: a 2-item sequence; only the second item (the chromosome
            or region) is used. The VCF itself is opened from VCF_PATH.

    Returns:
        [functional, total]: `total` counts every record in the region;
        `functional` counts unfiltered records having at least one
        protein-coding CSQ entry with non-empty Feature and EXON for which
        `u.isfunctional` is true.
    """
    _, chrom = vcf_chrom
    functional = 0
    total = 0
    for v in VCF(VCF_PATH)(chrom):
        total += 1
        info = v.INFO
        if v.FILTER:
            continue
        try:
            csqs = [dict(zip(kcsq, c.split("|"))) for c in info['CSQ'].split(",")]
        except KeyError:
            # no CSQ annotation on this record
            continue
        for csq in csqs:
            if csq['BIOTYPE'] != 'protein_coding':
                continue
            # skipping intronic
            if csq['Feature'] == '' or csq['EXON'] == '':
                continue
            if u.isfunctional(csq):
                # one functional consequence is enough for this record
                functional += 1
                break
    return [functional, total]
Example #19
0
def count_variants(vcf):
    """Count the number of variants in a vcf file

    Args:
        vcf(iterable|str): An iterable VCF with variants, or a path that is
            opened with cyvcf2.VCF

    Returns:
        nr_variants(int): Number of variants in file

    Exits the process with status 1 (after logging a critical message) when
    a path is given and it cannot be opened.
    """
    if isinstance(vcf, str):
        try:
            vcf = VCF(vcf)
        except Exception:
            # Exception rather than a bare except, so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            LOG.critical('Please provide a valid path to a VCF file!')
            sys.exit(1)

    # Consumes the iterable; a one-shot iterator is exhausted afterwards.
    return sum(1 for _ in vcf)
def circularity_build_list(dir_query):
    """Collect "CHROM_POS_REF_ALT" keys from the VCFs found in `dir_query`.

    Only files ending in '.gz' whose name does not contain 'CADD' are read,
    and at most the first 10000 records of each file are used. The
    directory and the current working directory are printed first.

    Returns:
        A set of key strings built from CHROM, POS, REF and the first ALT.
    """
    print(dir_query)
    print(os.getcwd())
    pbar = tqdm(os.listdir(dir_query))
    variant_keys = set()
    for fq in pbar:
        if not fq.endswith('.gz') or 'CADD' in fq:
            continue
        pbar.set_description(
            'Building list of variants - Processing file : {}'.format(fq))
        for counter, record in enumerate(VCF(dir_query + '/' + fq)):
            # cap the number of records taken from each file
            if counter == 10000:
                break
            parts = (record.CHROM, record.POS, record.REF, record.ALT[0])
            variant_keys.add('_'.join(str(part) for part in parts))
    return variant_keys
Example #21
0
    def spliceSiteGenerator(vcf_file, exonTree, variant_filter=True):
        """Yield (exon interval, variant) pairs for variants overlapping exons.

        Args:
            vcf_file: path of the VCF to read with cyvcf2.
            exonTree: object whose ``intersect(iv, ignore_strand=True)``
                returns items exposing an ``interval`` attribute.
            variant_filter: when true, records with a non-empty FILTER
                field are skipped.

        Yields:
            (match, var): the matching interval and the result of
            ``iv.to_Variant(match.strand, side)``.
        """
        variants = VCF(vcf_file)
        for var in variants:
            if variant_filter and var.FILTER:
                # Skip filtered records. The original bare `next` was a
                # no-op expression, so nothing was ever skipped.
                continue
            iv = VariantInterval.from_Variant(var)

            matches = map(lambda x: x.interval,
                          exonTree.intersect(iv, ignore_strand=True))

            for match in matches:
                side = get_var_side((
                    var.POS,
                    var.REF,
                    var.ALT,
                    match.Exon_Start,
                    match.Exon_End,
                    match.strand
                ))
                # NOTE(review): `var` is rebound to the converted Variant,
                # so a second match for the same record reads POS/REF/ALT
                # from the converted object. Confirm this is intended.
                var = iv.to_Variant(match.strand, side)  # to my Variant class
                yield match, var
Example #22
0
def test_access_gts():
    # Checks Variant.genotypes for the four records of the fixture below.
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    """
7	55086956	.	C	G	0	.	.	GT:ADP_ALL:RULE	0/0:6728,1:F	1|1:22,1:G
7	55086957	.	T	A,C,G	0	.	.	GT:ADP_ALL:RULE	1/2:6768,2,2,1:F2,F3,F4	2|3:1,2,3,4:G2,G3,G4
7	55086958	.	T	G	0	.	.	GT:ADP_ALL:RULE	0/1/.:6768,2,2,1:F2,F3,F4	0:1,2,3,4:G2,G3,G4
7	55086959	.	T	G,T	0	.	.	GT:ADP_ALL:RULE	.	0|2:1,2,3,4:G2,G3,G4
    """

    # Expected genotypes, one entry per record, in file order.
    expected_per_record = (
        [[0, 0, False], [1, 1, True]],
        [[1, 2, False], [2, 3, True]],
        [[0, 1, -1, False], [0, True]],
        [[-1, True], [0, 2, True]],
    )
    for expected in expected_per_record:
        record = next(vcf)
        observed = record.genotypes
        assert observed == expected, observed
Example #23
0
def vep_dbnsfp_meta_vcf(query_vcf, info_tags_wanted):
   """
   Read the CSQ and DBNSFP header lines of `query_vcf` and describe their sub-fields.

   For CSQ, each pipe-separated sub-field named after 'Format: ' in the
   description is mapped to its position, after renaming allele-frequency
   fields through `vep_to_pcgr_af`; only names present in
   `info_tags_wanted` are kept. For DBNSFP, sub-fields from index 7 onward
   are listed with a trailing '_score' / '_pred' (and any trailing quotes)
   removed.

   Returns a dict with keys 'vep_csq_fieldmap' (holding 'field2index' and
   'index2field') and 'dbnsfp_prediction_algorithms'.
   """
   vep_to_pcgr_af = {'gnomAD_AMR_AF':'AMR_AF_GNOMAD','gnomAD_AFR_AF':'AFR_AF_GNOMAD','gnomAD_EAS_AF':'EAS_AF_GNOMAD','gnomAD_NFE_AF':'NFE_AF_GNOMAD','gnomAD_AF':'GLOBAL_AF_GNOMAD',
                     'gnomAD_SAS_AF':'SAS_AF_GNOMAD','gnomAD_OTH_AF':'OTH_AF_GNOMAD','gnomAD_ASJ_AF':'ASJ_AF_GNOMAD','gnomAD_FIN_AF':'FIN_AF_GNOMAD','AFR_AF':'AFR_AF_1KG',
                     'AMR_AF':'AMR_AF_1KG','SAS_AF':'SAS_AF_1KG','EUR_AF':'EUR_AF_1KG','EAS_AF':'EAS_AF_1KG', 'AF':'GLOBAL_AF_1KG'}

   vcf = VCF(query_vcf)
   index2field = {}
   field2index = {}
   algorithms = []
   for e in vcf.header_iter():
      header_element = e.info()
      if 'ID' not in header_element.keys():
         continue
      identifier = str(header_element['ID'])
      if identifier not in ('CSQ', 'DBNSFP'):
         continue
      description = str(header_element['Description'])
      if 'Format: ' not in description:
         continue
      subtags = description.split('Format: ')[1].split('|')

      if identifier == 'CSQ':
         for position, tag in enumerate(subtags):
            # Allele-frequency fields are renamed before matching.
            field = str(vep_to_pcgr_af[tag]) if tag in vep_to_pcgr_af else tag
            if field in info_tags_wanted:
               index2field[position] = field
               field2index[field] = position
      else:
         # DBNSFP: sub-fields from index 7 onward name the algorithms.
         for tag in subtags[7:]:
            algorithms.append(str(re.sub(r'((_score)|(_pred))"*$','',tag)))

   return {
      'vep_csq_fieldmap': {
         'field2index': field2index,
         'index2field': index2field,
      },
      'dbnsfp_prediction_algorithms': algorithms,
   }
Example #24
0
def get_split_vcf_regions(vcf_path, nprocs):
    """Split the contigs of a VCF into groups of regions of roughly equal size.

    The target size of a group is sum(vcf.seqlens) / nprocs (rounded down,
    plus one). Contigs are walked in header order and cut into GRegion
    pieces until each group is filled.

    Args:
        vcf_path: path of the VCF whose header contig list is used.
        nprocs: number of groups aimed for; converted with int().

    Returns:
        A list of groups, each a list of GRegion objects.

    NOTE(review): `num_chunk` is never incremented, so the outer loop is
    in practice bounded only by `current_chromosome < len(names)`.
    NOTE(review): `names` (autosome names) is used only for its length,
    while contigs are taken from vcf.seqnames / vcf.seqlens by index. This
    presumably assumes the autosomes come first in the header; confirm.
    """
    vcf = VCF(vcf_path)
    nprocs = int(nprocs)
    # Total length over all contigs declared in the header.
    num_entries = np.sum(vcf.seqlens)
    chunk_size = int(num_entries / nprocs) + 1
    names = get_autosome_names_grch38()
    num_chunk = 0
    regions = []
    # Running total of assigned bases; accumulated but never read.
    gen_pos = 0
    current_chromosome = 0
    # Offset within the current contig at which the next region starts.
    chrom_pos = 0
    while num_chunk < nprocs and current_chromosome < len(names):
        current_chunk = 0
        region = []
        while current_chunk < chunk_size:
            if current_chromosome >= len(names):
                # Out of contigs: force the inner loop to end.
                current_chunk = chunk_size
                continue
            remaining_chunk = chunk_size - current_chunk
            if remaining_chunk <= (vcf.seqlens[current_chromosome] -
                                   chrom_pos):
                # The rest of this group fits inside the current contig.
                new_region = GRegion(vcf.seqnames[current_chromosome],
                                     chrom_pos, chrom_pos + remaining_chunk)
                region.append(new_region)
                chrom_pos += remaining_chunk
                current_chunk += new_region.size()
                gen_pos += new_region.size()
                continue
            else:  # remaining chunk can fit remainder of chromosome and then some
                new_region = GRegion(vcf.seqnames[current_chromosome],
                                     chrom_pos,
                                     vcf.seqlens[current_chromosome])
                region.append(new_region)
                current_chunk += new_region.size()
                gen_pos += new_region.size()
                # Move on to the start of the next contig.
                chrom_pos = 0
                current_chromosome += 1
        regions.append(region)
    # print(regions)
    return regions
Example #25
0
def test_haploid():
    # Haploid calls must be reported without a "/" separator, and gt_types
    # must use the codes that belong to the chosen gts012 setting.

    # (gts012 flag, [HOM_REF, HOM_ALT, UNKNOWN] codes under that flag)
    encodings = ((False, [0, 3, 2]), (True, [0, 2, 3]))
    for (gts012, (HOM_REF, HOM_ALT, UNKNOWN)) in encodings:
        reader = VCF("%s/test-haploidX.vcf" % HERE, gts012=gts012)
        for idx, rec in enumerate(reader):
            has_slash = any("/" in b for b in rec.gt_bases)
            assert not has_slash, (rec.start + 1, rec.gt_bases)

            if idx == 0:
                expected = [HOM_ALT, HOM_ALT, HOM_ALT]
                assert (rec.gt_types == expected).all(), rec.gt_types
            elif rec.start == 2800676:
                expected = [UNKNOWN, HOM_ALT, UNKNOWN]
                assert (rec.gt_types == expected).all(), rec.gt_types
            elif rec.start == 2832771:
                expected = [HOM_REF, HOM_ALT, HOM_ALT]
                assert (rec.gt_types == expected).all(), rec.gt_types
                break

            if rec.start == 2700156:
                assert (rec.gt_bases == ['A', 'A', 'A']).all(), rec.gt_bases
        # As in the original, the loop stops after the first encoding.
        break
def test_load_sv_case_variants(mongo_adapter, sv_case_obj):
    """Loading an SV case fills the structural_variant collection only."""
    database = mongo_adapter.db
    ## GIVEN a mongo adapter and the SV variant file of the case
    sv_vcf = VCF(sv_case_obj["vcf_sv_path"])
    ## WHEN loading the variants
    load_variants(
        adapter=mongo_adapter,
        vcf_obj=sv_vcf,
        case_obj=sv_case_obj,
        variant_type="sv",
    )

    # Count the documents in each collection by walking the cursors.
    nr_loaded_svs = sum(1 for _ in database.structural_variant.find())
    nr_loaded_snvs = sum(1 for _ in database.variant.find())

    ## THEN assert that the correct number of variants was loaded
    assert nr_loaded_svs > 0
    assert nr_loaded_snvs == 0
    assert nr_loaded_svs == sv_case_obj["nr_sv_variants"]
Example #27
0
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger):

   """
   Compare the INFO tags declared in the query VCF header with the tags
   listed in the CPSR infotag file (data/<genome_assembly>/cpsr_infotags.tsv).

   Returns 1 when no tag coincides; on the first coinciding tag, returns
   the result of annoutils.error_message for that tag.
   """

   infotag_path = os.path.join(pcgr_directory,'data',genome_assembly, 'cpsr_infotags.tsv')
   pcgr_infotags_desc = annoutils.read_infotag_file(infotag_path)

   vcf = VCF(input_vcf)
   logger.info('Checking if existing INFO tags of query VCF file coincide with CPSR INFO tags')
   for e in vcf.header_iter():
      header_element = e.info()
      declared = header_element.keys()
      if 'ID' not in declared or 'HeaderType' not in declared:
         continue
      if header_element['HeaderType'] != 'INFO':
         continue
      if header_element['ID'] in pcgr_infotags_desc.keys():
         # Stop at the first clash and report it.
         err_msg = 'INFO tag ' + str(header_element['ID']) + ' in the query VCF coincides with a VCF annotation tag produced by CPSR - please remove or rename this tag in your query VCF'
         return annoutils.error_message(err_msg, logger)

   logger.info('No query VCF INFO tags coincide with CPSR INFO tags')
   return 1
Example #28
0
    def __init__(self, vcf_file):
        """Open a vcf file and prepare the INFO column bookkeeping.

        Args:
          vcf_file: .vcf file path (can be also .vcf.gz, .bcf, .bcf.gz)

        Iterator returns:
          a nested dictionary with the schema:
             - variant:
               - id
               - chr
               - pos
               - ref
               - alt
             - other:
               - f1
               - f2
             - kipoi:
               - model:
                 - type:
                   - feature1...
                   - feature2...
        """
        from cyvcf2 import VCF

        self.vcf_file = vcf_file
        self.vcf = VCF(vcf_file)

        info_tags = get_info_tags(self.vcf)
        self.info_tags = info_tags
        info_ids = get_info_ids(info_tags)
        self.info_ids = info_ids
        kipoi_colnames = get_kipoi_colnames(info_tags)
        self.kipoi_colnames = kipoi_colnames

        # Split the INFO ids into kipoi columns and everything else,
        # keeping the order of info_ids.
        kipoi_columns = [tag for tag in info_ids if tag in kipoi_colnames]
        self.kipoi_columns = kipoi_columns
        self.other_columns = [
            tag for tag in info_ids if tag not in kipoi_columns
        ]

        self.kipoi_parsed_colnames = {
            name: parse_kipoi_colname(name)
            for name in kipoi_colnames
        }
Example #29
0
def load_clinvar(cpath):
    """Map (chromosome, gene symbol) to the ClinVar disease names seen for it.

    For every record with a GENEINFO and a CLNDBN tag, the pipe-separated
    disease names (minus ".", "not_specified" and "not_provided", with
    non-ASCII characters dropped) are collected for each gene symbol in
    GENEINFO.

    Args:
        cpath: path of the ClinVar VCF.

    Returns:
        dict mapping (CHROM, gene) to a lower-cased, sorted, de-duplicated,
        "|"-joined string of disease names.
    """
    from cyvcf2 import VCF
    from collections import defaultdict

    skipped = (".", "not_specified", "not_provided")

    def _ascii_only(text):
        # Accept both bytes and text, return text without non-ASCII chars.
        if isinstance(text, bytes):
            text = text.decode('utf8', 'ignore')
        return text.encode('ascii', 'ignore').decode('ascii')

    # Each key owns its own list. Storing one shared `diseases` list under
    # several gene keys would let a later extend() leak into other genes.
    lookup = defaultdict(list)
    for v in VCF(cpath):
        info = v.INFO
        geneinfo = info.get('GENEINFO')
        if geneinfo is None:
            continue
        names = info.get('CLNDBN')
        if names is None:
            continue
        diseases = [_ascii_only(x) for x in names.split("|")
                    if x not in skipped]
        if not diseases:
            continue

        # GENEINFO entries look like "SYMBOL:id"; keep the symbol only.
        for entry in geneinfo.split("|"):
            symbol = entry.split(":")[0]
            lookup[(v.CHROM, symbol)].extend(diseases)

    return {
        key: "|".join(sorted(set(values))).lower()
        for key, values in lookup.items()
    }
Example #30
0
def match_sites(args):
    """Return the VCF records that match the given sites.

    Args:
        args: a pair (vcf_path, sites). Each site is indexed as
            site[0]=chromosome, site[1]=1-based position, site[2] and
            site[3]=the two alleles.

    A record matches when it has a single ALT, starts at site[1] - 1, and
    its (REF, ALT) equals the site's alleles in either order. A summary
    line is printed to stderr.

    Returns:
        list of matching cyvcf2 records.
    """
    vcf_path, sites = args
    reader = VCF(vcf_path)
    matches = []
    started = time.time()
    for site in sites:
        region = "%s:%d-%d" % (site[0], site[1] - 1, site[1])
        for rec in reader(region):
            # multi-allelic records are ignored
            if len(rec.ALT) > 1:
                continue
            alleles = (rec.REF, rec.ALT[0])
            if alleles not in ((site[2], site[3]), (site[3], site[2])):
                continue
            # the region query may return records starting elsewhere
            if rec.start != site[1] - 1:
                continue
            matches.append(rec)

    elapsed = time.time() - started
    print(
        "found %d out of %d sites in %.1f seconds" %
        (len(matches), len(sites), elapsed),
        file=sys.stderr,
    )
    return matches