def discoverFromVCFWithoutFilter(self, inputFname=None, outputFname=None, **keywords):
    """
    Parse a VCF file and hand its genotype call matrix to self.outputCallMatrix().

    The file is parsed with VCFFile, using self.minDepth as minDepth. VCFFile keys
    its locus_id2row_index by a (chr, pos) tuple, so the keys are converted to
    'chr_pos' strings before being passed on.

    Arguments:
        inputFname: passed to VCFFile as its inputFname.
        outputFname: passed through to self.outputCallMatrix().
        **keywords: accepted for call compatibility; not used here.

    History:
        2012.9.11 read minDepth from self.minDepth
        2012.9.5 add minDepth=0 to VCFFile
        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos.
            need a conversion in between
        2012.5.8
    """
    vcfFile = VCFFile(inputFname=inputFname, minDepth=self.minDepth)
    vcfFile.parseFile()

    # 2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos.
    # items() (rather than the Python-2-only iteritems()) keeps this runnable on
    # both Python 2 and Python 3.
    locus_id2row_index = {}
    for locus_id, row_index in vcfFile.locus_id2row_index.items():
        new_locus_id = '%s_%s' % (locus_id[0], locus_id[1])
        locus_id2row_index[new_locus_id] = row_index

    self.outputCallMatrix(
        vcfFile.genotype_call_matrix,
        refFastaFname=None,
        outputFname=outputFname,
        refNameSet=None,
        read_group2col_index=vcfFile.sample_id2index,
        locus_id2row_index=locus_id2row_index,
        outputDelimiter=self.outputDelimiter,
    )
def run(self):
    """
    Compare the genotype calls of two VCF files and output low-mismatch loci.

    Both self.inputFname and self.jnputFname are parsed with VCFFile. For every
    locus present in both files, the calls of the samples present in both files
    are compared; pairs in which either call is 'NA' are skipped. The mismatch
    rate of a locus is (number of differing pairs) / (number of non-NA pairs),
    or -1 if the locus has no non-NA pair at all.

    Loci whose mismatch rate is <= self.maxMismatchRate are written, sorted by
    locus id, to self.outputFname as tab-delimited rows of
    (chromosome, position, mismatchRate) below a header line.
    NOTE: loci with the -1 sentinel satisfy that test whenever
    self.maxMismatchRate >= -1, so they are written out with rate -1.

    A warning goes to stderr if the two files differ in sample count, and a
    summary of how many loci passed is written to stderr at the end.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    vcfFile1 = VCFFile(inputFname=self.inputFname)
    vcfFile1.parseFile()
    vcfFile2 = VCFFile(inputFname=self.jnputFname)
    vcfFile2.parseFile()

    overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(vcfFile2.locus_id_ls)

    sample_id2index1 = vcfFile1.sample_id2index
    sample_id2index2 = vcfFile2.sample_id2index
    no_of_samples = len(sample_id2index1)
    no_of_samples_in_vcf2 = len(sample_id2index2)
    if no_of_samples != no_of_samples_in_vcf2:
        sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n" %
                         (self.inputFname, no_of_samples, self.jnputFname, no_of_samples_in_vcf2))

    # The column of each shared sample in either matrix does not depend on the
    # locus, so look the pairs up once instead of once per locus.
    overlapping_sample_id_list = sorted(set(sample_id2index1.keys()) & set(sample_id2index2.keys()))
    col_index_pair_ls = [(sample_id2index1[sample_id], sample_id2index2[sample_id])
                         for sample_id in overlapping_sample_id_list]

    locus_id2mismatchRate = {}
    for locus_id in overlapping_sites_set:
        call_row1 = vcfFile1.genotype_call_matrix[vcfFile1.locus_id2row_index[locus_id]]
        call_row2 = vcfFile2.genotype_call_matrix[vcfFile2.locus_id2row_index[locus_id]]
        no_of_mismatches = 0
        no_of_non_NA_pairs = 0
        for col_index1, col_index2 in col_index_pair_ls:
            call1 = call_row1[col_index1]
            call2 = call_row2[col_index2]
            # only pairs in which both calls are present are comparable
            if call1 != 'NA' and call2 != 'NA':
                no_of_non_NA_pairs += 1
                if call1 != call2:
                    no_of_mismatches += 1
        if no_of_non_NA_pairs > 0:
            mismatchRate = no_of_mismatches / float(no_of_non_NA_pairs)
        else:
            # sentinel: nothing to compare at this locus
            mismatchRate = -1
        locus_id2mismatchRate[locus_id] = mismatchRate

    counter = 0
    # "with" guarantees the output is flushed and closed, even on error.
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(['#chromosome', 'position', 'mismatchRate'])
        for locus_id in sorted(locus_id2mismatchRate):
            mismatchRate = locus_id2mismatchRate[locus_id]
            if mismatchRate <= self.maxMismatchRate:
                counter += 1
                chromosome, position = locus_id[:2]
                writer.writerow([chromosome, position, mismatchRate])

    sys.stderr.write("%s loci passed the maxMismatchRate out of %s overlapped loci.\n" %
                     (counter, len(overlapping_sites_set)))