Esempio n. 1
0
 def get_total_vcf_sample(self):
     """Return the set of sample names of the VCF at ``self.vcf_path``.

     The names are read from the VCF header rather than from the first
     variant record, so they are still reported for a file that contains
     a header but no records. The file handle is closed before returning.

     Returns:
         set: the sample names declared in the VCF header.
     """
     with pysam.VariantFile(self.vcf_path) as vcf:
         return set(vcf.header.samples)
Esempio n. 2
0
def map_samples(trees, vcf_path):
    """Map each region to the samples carrying a variant that overlaps it.

    Args:
        trees: interval trees indexed by chromosome: index ``N - 1`` for a
            numeric chromosome ``N``, 22 for "X" and 23 for "Y". Each tree
            must provide ``find(start, end)`` returning intervals whose
            ``value`` is a mapping with a ``'regionName'`` key.
        vcf_path: path of the VCF file, opened with ``pysam.VariantFile``.

    Returns:
        dict: ``{regionName: {sampleName: 1, ...}}``. A region only appears
        if at least one sample carries a variant overlapping it.

    Raises:
        ValueError: if a record on a recognised chromosome has an SVTYPE
            other than "INS" or "DEL".
    """
    # key: regionName
    # value: dict whose keys are the names of the samples with variants
    RS_dict = {}

    with pysam.VariantFile(vcf_path) as vcf:
        for record in vcf.fetch():

            # Determine which tree to query
            chrom = record.chrom
            if chrom == "X":
                chrom_index = 22
            elif chrom == "Y":
                chrom_index = 23
            else:
                try:
                    chrom_index = int(chrom) - 1
                except ValueError:
                    # No tree exists for this contig name. Skip the record
                    # instead of reusing the index left over from the
                    # previous record, which would query the wrong tree.
                    continue

            # Determine query interval
            start = int(record.pos)
            sv_type = record.info.get("SVTYPE")
            if sv_type is None:  # SNP: region = SNP base
                end = start + 1
            elif sv_type == "INS":  # INS: region = flanking bases
                end = start + 2
            elif sv_type == "DEL":  # DEL: region = deleted bases
                end = start + abs(int(record.info.get("SVLEN")))
            else:
                raise ValueError("Undefined SVTYPE: %s" % sv_type)

            # Find overlapping regions
            regions = set()
            for interval in trees[chrom_index].find(start, end):
                regions.add(interval.value['regionName'])
            if not regions:
                continue

            # Samples carrying this variant: genotype is neither missing
            # (None, None) nor homozygous reference (0, 0).
            names = [
                sample.name
                for sample in record.samples.values()
                if sample.get("GT") not in ((None, None), (0, 0))
            ]

            # Record every carrier under every overlapped region. The
            # region key is created lazily so that regions without any
            # carrier stay absent from the result.
            for regionName in regions:
                for name in names:
                    RS_dict.setdefault(regionName, {})[name] = 1
    print("map_samples --done")
    return RS_dict
Esempio n. 3
0
    def file_info(self):
        """Collect size statistics of the VCF, expression and covariate files.

        Reads ``self.vcf_path``, ``self.expression_matrix_path`` and
        ``self.covariate_file_path``.

        Returns:
            tuple: ``(vcf_sample, vcf_var, expression_sample,
            expression_gene, cov_sample, cov_num, cov_used_num)`` where

            * ``vcf_sample`` - number of samples in the VCF,
            * ``vcf_var`` - number of records in the VCF,
            * ``expression_sample`` - number of tab-separated columns after
              the first one in the first line of the expression matrix,
            * ``expression_gene`` - number of lines of the expression
              matrix after its first two lines,
            * ``cov_sample`` - number of lines of the covariate file after
              its first line,
            * ``cov_num`` - number of tab-separated columns after the first
              one in the first line of the covariate file,
            * ``cov_used_num`` - ``len(self.covariate_numerical) +
              len(self.covariate_categorical)``.
        """
        with pysam.VariantFile(self.vcf_path) as vcf:
            # Taken from the header so the count is defined even when the
            # file holds no variant records.
            vcf_sample = len(vcf.header.samples)
            vcf_var = sum(1 for _ in vcf.fetch())

        with open(self.expression_matrix_path, 'r') as f:
            samples = f.readline().strip("\n").split("\t")[1:]
            expression_sample = len(samples)
            # NOTE(review): the second line is skipped and not counted as a
            # gene - presumably a second header row; confirm against the
            # expression matrix format.
            f.readline()
            expression_gene = sum(1 for _ in f)

        cov_used_num = len(self.covariate_numerical) + len(
            self.covariate_categorical)
        with open(self.covariate_file_path, 'r') as f:
            cov_num = len(f.readline().strip("\n").split("\t")) - 1
            cov_sample = sum(1 for _ in f)

        return (vcf_sample, vcf_var, expression_sample, expression_gene,
                cov_sample, cov_num, cov_used_num)
Esempio n. 4
0
# Load the RS-annotated (GERP) HGDP genomes VCF.
# NOTE: this rebinds the name `vcf` from the module to the Reader instance.
vcf = vcf.Reader(open('all_combined.snps.autos_and_PAR.vqsr99.BEAGLE.snpEff.RefAnc.GERP.callableMask.vcf.gz','r'))

# Load the BED file of windows as a numpy array. Only the first three
# columns are used below: chromosome, start and stop of each window.
with open("hg19_intergenic.bed", "r") as winfile:
    # genfromtxt returns a 0-d array for a file with a single row of mixed
    # column types; atleast_1d makes that case iterable like any other.
    wins = np.atleast_1d(np.genfromtxt(winfile, dtype=None))

# For every window, average the RS score of the VCF records it contains.
for window in wins:
    chrom, start, stop = window[0], window[1], window[2]

    count = 0  # number of VCF records in this window
    gerpsum = 0.0  # float so that the average is output in float format

    # All the records of the VCF file that fall in this window.
    for record in vcf.fetch(chrom, start, stop):
        count += 1
        gerpsum += float(record.INFO['RS'][0])

    # A window without any record has no average: report "NA".
    gerpavg = gerpsum / count if count > 0 else "NA"

    # Output: chromosome, window start, window stop, average RS score of the
    # records in the window, number of records in the window.
    output = "%s\t%s\t%s\t%s\t%s\n" % (str(chrom), str(start), str(stop), str(gerpavg), str(count))
    outFile.write(output)
outFile.close()
    hap_index[header.strip("\n").split("\t").index(i)] = i
vcf = vcf.Reader(open(args.GZVCF))

# Walk the combined file line by line and extend every entry of full_haps:
# each line contributes the value of the matching column, preceded by the
# REF bases fetched from the VCF when there is a gap since the previous line.
# hap_index maps a column index of the line to the key used in full_haps.
with open(args.combined) as FILE:
    index = 0
    previous_pos = start
    for line in FILE:
        # Skip the header line.
        if line == header:
            continue
        items = line.strip("\n").split("\t")
        # The first column holds the position of the variant.
        var_pos = int(items[0])
        print var_pos
        ### The water gets a little choppy here.
        ## What this section does is to grab the chunk of the genome before
        ## the current variant from the VCF and add its REF bases in front of
        ## the variant's own value, for every haplotype column.
        # NOTE(review): the fetch starts at `start`, not at `previous_pos`,
        # so every gap re-fetches from the beginning of the region - confirm
        # that this is intended.
        # NOTE(review): when var_pos - previous_pos is 0 or negative, neither
        # branch runs and nothing is appended for this line.
        if var_pos - previous_pos > 1:
            chunk = vcf.fetch(chrom, start, var_pos - 1)
            ref_chunk = "".join([j.REF for j in chunk])
            for key in hap_index.keys():
                full_haps[hap_index[key]] += ref_chunk + items[key]
        elif var_pos - previous_pos == 1:
            # Adjacent to the previous variant: no reference gap to fill.
            for key in hap_index.keys():
                full_haps[hap_index[key]] += items[key]
        previous_pos = var_pos
        index += 1

    ####
### Var_pos should now represent the last line in the file...
if end - var_pos > 1:
    chunk = vcf.fetch(chrom, var_pos + 1, end)
    ref_chunk = "".join([j.REF for j in chunk])
    for key in hap_index.keys():