def get_total_vcf_sample(self):
    """Return the set of all sample names in the VCF at ``self.vcf_path``.

    The sample list lives in the VCF header and is identical for every
    record, so it is read from the header directly.  This fixes two issues
    in the previous implementation: it no longer returns an empty set for a
    VCF that contains zero variant records (the old code fetched the first
    record to read its samples), and it avoids fetching any records at all.
    The file handle is also closed explicitly instead of being leaked.

    Returns:
        set[str]: sample names declared in the VCF header.
    """
    vcf = pysam.VariantFile(self.vcf_path)
    try:
        return set(vcf.header.samples)
    finally:
        vcf.close()
def map_samples(trees, vcf_path):
    """Map each region name to the samples carrying a variant in that region.

    Args:
        trees: sequence of per-chromosome interval trees — indices 0-21 for
            chromosomes 1-22, 22 for "X", 23 for "Y".  Each tree's
            ``find(start, end)`` must yield intervals whose
            ``value['regionName']`` names a region.
        vcf_path: path to an indexed VCF file readable by pysam.

    Returns:
        dict: ``{regionName: {sample_name: 1, ...}, ...}`` containing every
        sample whose genotype at an overlapping variant is called and
        non-reference (GT is neither ``(None, None)`` nor ``(0, 0)``).

    Raises:
        ValueError: if a record carries an SVTYPE other than INS or DEL.
    """
    RS_dict = {}
    # X and Y follow the 22 autosomes in the tree list.
    _SPECIAL_CHROMS = {"X": 22, "Y": 23}
    vcf = pysam.VariantFile(vcf_path)
    for record in vcf.fetch():
        # Determine which tree to query.
        chrom = record.chrom
        if chrom in _SPECIAL_CHROMS:
            chrom_index = _SPECIAL_CHROMS[chrom]
        else:
            try:
                chrom_index = int(chrom) - 1
            except ValueError:
                # Unknown contig (e.g. "MT" or an alt contig): skip this
                # record.  The previous `pass` silently reused the index of
                # the PREVIOUS record's chromosome (or raised UnboundLocalError
                # on the first record), querying the wrong tree.
                continue

        # Determine the query interval for this variant.
        start = int(record.pos)
        svtype = record.info.get("SVTYPE")
        if svtype is None:
            # SNP: region = the SNP base itself.
            end = start + 1
        elif svtype == "INS":
            # INS: region = the flanking bases.
            end = start + 2
        elif svtype == "DEL":
            # DEL: region = the deleted bases.
            end = start + abs(int(record.info.get("SVLEN")))
        else:
            raise ValueError("Undefined SVTYPE: %s" % svtype)

        # Collect the names of all regions overlapping this variant.
        regions = {iv.value['regionName']
                   for iv in trees[chrom_index].find(start, end)}
        if not regions:
            continue

        # Samples with a called, non-reference genotype at this variant.
        names = [name for name, sample in record.samples.items()
                 if sample.get("GT") not in ((None, None), (0, 0))]

        # Record each carrier sample under every overlapping region.
        for region_name in regions:
            for name in names:
                RS_dict.setdefault(region_name, {})[name] = 1
    print("map_samples --done")
    return RS_dict
def file_info(self):
    """Summarize the three input files (VCF, expression matrix, covariates).

    Returns:
        tuple: ``(vcf_sample, vcf_var, expression_sample, expression_gene,
        cov_sample, cov_num, cov_used_num)`` where
        - ``vcf_sample``: number of samples in the VCF header,
        - ``vcf_var``: number of variant records in the VCF,
        - ``expression_sample`` / ``expression_gene``: columns (minus the
          gene-id column) and data rows of the expression matrix,
        - ``cov_sample`` / ``cov_num``: data rows and columns (minus the
          id column) of the covariate file,
        - ``cov_used_num``: covariates actually selected for use.
    """
    # VCF: sample count comes from the header, which is defined even when
    # the VCF has zero records.  (Previously vcf_sample was assigned inside
    # the record loop and was unbound for an empty VCF.)
    vcf = pysam.VariantFile(self.vcf_path)
    vcf_sample = len(vcf.header.samples)
    vcf_var = sum(1 for _ in vcf.fetch())

    # Expression matrix: line 1 is the header (column 0 is the gene id),
    # remaining lines (after one skipped line) are one gene each.
    expression_gene = 0
    with open(self.expression_matrix_path, 'r') as f:
        samples = f.readline().strip("\n").split("\t")[1:]
        expression_sample = len(samples)
        # The second line is skipped before counting genes — presumably a
        # secondary header row; TODO confirm against the file format.
        f.readline()
        for _ in f:
            expression_gene += 1

    # Covariates: header gives the column count, remaining lines are samples.
    cov_used_num = len(self.covariate_numerical) + len(self.covariate_categorical)
    with open(self.covariate_file_path, 'r') as f:
        cov_num = len(f.readline().strip("\n").split("\t")) - 1
        cov_sample = sum(1 for _ in f)

    return (vcf_sample, vcf_var, expression_sample, expression_gene,
            cov_sample, cov_num, cov_used_num)
# Load the HGDP genomes VCF annotated with GERP scores (the 'RS' INFO field).
vcf = vcf.Reader(open('all_combined.snps.autos_and_PAR.vqsr99.BEAGLE.snpEff.RefAnc.GERP.callableMask.vcf.gz', 'r'))

# Load the BED file of windows (one row per window: chrom, start, stop) as a
# numpy structured array.  The `with` block closes the handle once parsed —
# the previous version opened it and never closed it.
with open("hg19_intergenic.bed", "r") as winfile:
    wins = np.genfromtxt(winfile, dtype=None)

# For each window, average the GERP ('RS') score over the SNPs it contains.
for row in wins:
    chrom, start, stop = row[0], row[1], row[2]
    count = 0        # number of SNPs found in this window
    gerpsum = 0.0    # running score sum; float so the mean is emitted as float
    for record in vcf.fetch(chrom, start, stop):
        count += 1
        gerpsum += float(record.INFO['RS'][0])
    # "NA" marks windows with no SNPs (also avoids division by zero).
    gerpavg = gerpsum / count if count > 0 else "NA"
    # Output: chrom, window start/stop, mean GERP score, SNP count.
    output = "%s\t%s\t%s\t%s\t%s\n" % (str(chrom), str(start), str(stop), str(gerpavg), str(count))
    outFile.write(output)  # NOTE(review): outFile is opened elsewhere in this script — confirm
outFile.close()
hap_index[header.strip("\n").split("\t").index(i)] = i vcf = vcf.Reader(open(args.GZVCF)) with open(args.combined) as FILE: index = 0 previous_pos = start for line in FILE: if line == header: continue items = line.strip("\n").split("\t") var_pos = int(items[0]) print var_pos ### The water gets a little choppy here. ## what this section does is to grab the chunk of the genome before the if var_pos - previous_pos > 1: chunk = vcf.fetch(chrom, start, var_pos - 1) ref_chunk = "".join([j.REF for j in chunk]) for key in hap_index.keys(): full_haps[hap_index[key]] += ref_chunk + items[key] elif var_pos - previous_pos == 1: for key in hap_index.keys(): full_haps[hap_index[key]] += items[key] previous_pos = var_pos index += 1 #### ### Var_pos should now represent the last line in the file... if end - var_pos > 1: chunk = vcf.fetch(chrom, var_pos + 1, end) ref_chunk = "".join([j.REF for j in chunk]) for key in hap_index.keys():