def do_work(fd_vcf, csv):
    """Load the vcf plus the group definitions and run the snp computation.

    Thresholds are taken from the command line (argv[2], argv[3]).
    """
    vcf = Vcf(fd_vcf)
    vcf.load_meta_header()
    groups = Group(csv)
    # sys.stderr.write("# Of groups loaded: %s\n" % groups.num())
    threshold = float(sys.argv[2])
    num_all = float(sys.argv[3])
    ComputeSnps(vcf, groups, threshold, num_all).run()
def __load_vcf(self):
    """Open the vcf and read its meta/header lines.

    Dropping species snps (without an external coordinate file) requires a
    population-level vcf, i.e. at least two samples; otherwise we bail out.
    """
    self.vcf = Vcf(self.fd_vcf)
    self.vcf.load_meta_header()
    single_sample = (not self.coordinates_in_file
                     and self.vcf.num_of_samples < 2)
    if self.drop and single_sample:
        drdcommon.error(
            "I need a population level vcf in order to drop species snps.")
def do_work(fd_vcf, tsv_pheno, tsv_haplo):
    """Build the genotype matrix for the pheno/haplo groupings and plot it
    as a heatmap (sites vs. groups, colored by genotype category)."""
    vcf = Vcf(fd_vcf)
    vcf.load_meta_header()
    grps_pheno = Group(tsv_pheno)
    grps_haplo = Group(tsv_haplo)
    matrix, a_sites, a_groups = prepare(vcf, grps_pheno, grps_haplo)
    # Debug output: matrix dimensions and the group label per column.
    print(matrix.shape)
    print(a_groups)
    cb_labels = ['HETE', 'HOMO_VAR', 'OTHER', 'NO_COVERAGE', 'HOMO_REF']
    drdplots.Heatmap(matrix, cb_labels, a_sites, a_groups).plot()
def doHeader(fd):
    """Load the vcf meta/header and register the OT (on target) INFO flag.

    Bails out via error() if an OT INFO entry already exists. Prints the
    updated header to stdout and returns the Vcf object.
    """
    vcf = Vcf(fd)
    vcf.load_meta_header()
    # Bug fix: the original compared str(vcf.check_info('OT')) == True,
    # which is always False (a str never equals True), so the duplicate
    # guard never fired. Test the result's truth value directly.
    if vcf.check_info('OT'):
        error("This vcf seems to have an OT INFO already. Bailing out.")
    vcf.add_info('OT', '0', 'Flag', 'The site is on target.')
    print(vcf.get_meta())
    return vcf
def doHeader(fd):
    """Load the vcf meta/header and register the RDP (raw depth) INFO field.

    Bails out via error() if an RDP INFO entry already exists. Prints the
    updated header to stdout and returns the Vcf object.
    """
    vcf = Vcf(fd)
    vcf.load_meta_header()
    # Bug fix: the original compared str(vcf.check_info('RDP')) == True,
    # which is always False (a str never equals True), so the duplicate
    # guard never fired. Test the result's truth value directly.
    if vcf.check_info('RDP'):
        error("This vcf seems to have an RDP INFO already. Bailing out.")
    vcf.add_info('RDP', '1', 'Integer', 'Raw read coverage at locus.')
    print(vcf.get_meta())
    return vcf
def do_work(fd_vcf, min_num_samples):
    """Stream snps from the vcf and report the processing results."""
    vcf = Vcf(fd_vcf)
    vcf.load_meta_header()
    results = process_snps(vcf, min_num_samples)
    report(results)
class SnpFreq(object):
    """Compute the snp frequency (snps per kbp) from a vcf.

    Optionally drops species-level snps (either detected from the
    population-level vcf itself or looked up in an external coordinate
    file), or just lists the species snps instead of computing the rate.
    """

    # Approximate genome sizes used as denominators, per experiment type.
    GENOME_SIZE = {'wgs': 3000000000, 'wes': 34000000}
    # Minimum snp quality for a site to be counted.
    MIN_QUAL = 20

    def __init__(self, fd_vcf, exp_type, options):
        self.fd_vcf = fd_vcf
        self.exp_type = exp_type
        self.drop = options.drop
        self.list_s_snps = options.list_s_snps
        self.__validate_type()
        self.coordinates_in_file = options.coordinates_in_file
        self.__load_vcf()
        if options.coor_fn:
            self.coor_fn = options.coor_fn
            self.__load_species_snp_coordinates()

    def __load_species_snp_coordinates(self):
        """Load chrm -> {coordinate: 1} of known species snps from coor_fn."""
        fd = drdcommon.xopen(self.coor_fn)
        d = {}
        self.d_species_coor = d
        n = 0
        for l in fd:
            n += 1
            chrm, coor = l.split()
            # `in` replaces the deprecated dict.has_key() (removed in py3).
            if chrm not in d:
                d[chrm] = {}
            d[chrm][int(coor)] = 1
        fd.close()
        logging.info("# of coordinates loaded: %d" % n)
        logging.info("current memory usage in %dkb" % drdcommon.memory_usage())

    def __load_vcf(self):
        """Open the vcf; dropping species snps without an external
        coordinate file requires a population-level (multi-sample) vcf."""
        self.vcf = Vcf(self.fd_vcf)
        self.vcf.load_meta_header()
        if self.drop and (not self.coordinates_in_file
                          and self.vcf.num_of_samples < 2):
            drdcommon.error(
                "I need a population level vcf in order to drop species snps.")

    def __validate_type(self):
        # 'null' is accepted internally but not advertised in the message.
        if self.exp_type not in ('wgs', 'wes', 'null'):
            # Fix: this message was a string literal broken across lines
            # (syntax error) in the original source.
            raise_it('Invalid experiment type. Valid types: wgs or wes')

    def __list_species_snps(self):
        """Print coordinates of high-quality species substitutions."""
        for l in self.vcf.each_snp():
            snp = VcfSnp(l)
            if snp.is_a_substitution() and \
               snp.has_high_quality(self.MIN_QUAL) and \
               snp.species_snp():
                print(snp.coordinate(' '))

    def __is_a_species_snp(self, snp):
        """True if snp is a species snp (file lookup or vcf-based test)."""
        if self.coordinates_in_file:
            ch, co = snp.coordinate(' ').split()
            return ch in self.d_species_coor and \
                int(co) in self.d_species_coor[ch]
        else:
            return snp.species_snp()

    def __calculate_snp_freq(self):
        """
        Compute the snp frequency (# of snps per kbp).

        Drop snps that are indels or have low quality.
        If wes, also drop non coding regions.
        If drop is True, we have to drop species snps.
        """
        num_snps = 0
        total = 0
        for l in self.vcf.each_snp():
            snp = VcfSnp(l)
            total += 1
            if snp.is_a_substitution() and snp.has_high_quality(self.MIN_QUAL):
                # `not drop or (drop and not X)` simplifies to
                # `not drop or not X` (second disjunct only reached
                # when drop is true).
                if self.exp_type == 'wgs':
                    if not self.drop or not self.__is_a_species_snp(snp):
                        num_snps += 1
                if self.exp_type == 'wes' and snp.in_coding_region():
                    if not self.drop or not self.__is_a_species_snp(snp):
                        num_snps += 1
        logging.info("Total/counted: %d/%d" % (total, num_snps))
        return (float(num_snps) / self.GENOME_SIZE[self.exp_type]) * 1000

    def run(self):
        """List species snps, or return the computed snp frequency."""
        if self.list_s_snps:
            self.__list_species_snps()
        else:
            return self.__calculate_snp_freq()
def do_work(fd_vcf):
    """Process the snps in the vcf stream and report the results.

    NOTE(review): unlike the sibling do_work helpers this one never calls
    load_meta_header() — confirm process_snps handles the header itself.
    """
    report(process_snps(Vcf(fd_vcf)))
def __load_vcf(self):
    """Read the vcf meta/header from fd_vcf.

    When species snps are to be dropped and no external coordinate file is
    used, a multi-sample (population level) vcf is mandatory; error out
    otherwise.
    """
    self.vcf = Vcf(self.fd_vcf)
    self.vcf.load_meta_header()
    needs_population = not self.coordinates_in_file
    too_few_samples = self.vcf.num_of_samples < 2
    if self.drop and needs_population and too_few_samples:
        drdcommon.error(
            "I need a population level vcf in order to drop species snps.")
class SnpFreq(object):
    """Snp frequency (snps per kbp) calculator over a vcf stream.

    Can alternatively list species snps, and can exclude species snps
    from the count (via the vcf samples or an external coordinate file).
    """

    # Denominator genome sizes per experiment type.
    GENOME_SIZE = {'wgs': 3000000000, 'wes': 34000000}
    # Quality cutoff below which snps are ignored.
    MIN_QUAL = 20

    def __init__(self, fd_vcf, exp_type, options):
        self.fd_vcf = fd_vcf
        self.exp_type = exp_type
        self.drop = options.drop
        self.list_s_snps = options.list_s_snps
        self.__validate_type()
        self.coordinates_in_file = options.coordinates_in_file
        self.__load_vcf()
        if options.coor_fn:
            self.coor_fn = options.coor_fn
            self.__load_species_snp_coordinates()

    def __load_species_snp_coordinates(self):
        """Populate d_species_coor: chrm -> {coordinate: 1}."""
        fd = drdcommon.xopen(self.coor_fn)
        d = {}
        self.d_species_coor = d
        n = 0
        for l in fd:
            n += 1
            chrm, coor = l.split()
            # dict.has_key() is deprecated/removed; use `in`.
            if chrm not in d:
                d[chrm] = {}
            d[chrm][int(coor)] = 1
        fd.close()
        logging.info("# of coordinates loaded: %d" % n)
        logging.info("current memory usage in %dkb" % drdcommon.memory_usage())

    def __load_vcf(self):
        """Load the vcf header; enforce multi-sample input when dropping
        species snps without an external coordinate file."""
        self.vcf = Vcf(self.fd_vcf)
        self.vcf.load_meta_header()
        if self.drop and (not self.coordinates_in_file
                          and self.vcf.num_of_samples < 2):
            drdcommon.error(
                "I need a population level vcf in order to drop species snps.")

    def __validate_type(self):
        # 'null' is also tolerated, though the message only names wgs/wes.
        if self.exp_type not in ('wgs', 'wes', 'null'):
            # Fix: the message was a string literal broken across source
            # lines (a syntax error) in the original.
            raise_it('Invalid experiment type. Valid types: wgs or wes')

    def __list_species_snps(self):
        """Dump coordinates of high-quality species substitutions."""
        for l in self.vcf.each_snp():
            snp = VcfSnp(l)
            if snp.is_a_substitution() and \
               snp.has_high_quality(self.MIN_QUAL) and \
               snp.species_snp():
                print(snp.coordinate(' '))

    def __is_a_species_snp(self, snp):
        """Check the coordinate table, or fall back to the vcf-based test."""
        if self.coordinates_in_file:
            ch, co = snp.coordinate(' ').split()
            return ch in self.d_species_coor and \
                int(co) in self.d_species_coor[ch]
        else:
            return snp.species_snp()

    def __calculate_snp_freq(self):
        """
        Compute the snp frequency (# of snps per kbp).

        Drop snps that are indels or have low quality.
        If wes, also drop non coding regions.
        If drop is True, we have to drop species snps.
        """
        num_snps = 0
        total = 0
        for l in self.vcf.each_snp():
            snp = VcfSnp(l)
            total += 1
            if snp.is_a_substitution() and snp.has_high_quality(self.MIN_QUAL):
                # `not drop or (drop and not X)` == `not drop or not X`.
                if self.exp_type == 'wgs':
                    if not self.drop or not self.__is_a_species_snp(snp):
                        num_snps += 1
                if self.exp_type == 'wes' and snp.in_coding_region():
                    if not self.drop or not self.__is_a_species_snp(snp):
                        num_snps += 1
        logging.info("Total/counted: %d/%d" % (total, num_snps))
        return (float(num_snps) / self.GENOME_SIZE[self.exp_type]) * 1000

    def run(self):
        """Entry point: list species snps or return the frequency."""
        if self.list_s_snps:
            self.__list_species_snps()
        else:
            return self.__calculate_snp_freq()