def run(self):
    """
    2008-5-18
        Parse self.input_fname with dataParsers.parseCSVData (array ids are
        read when self.withArrayIds is set), convert the result into one
        SNPData with nucleotides mapped to numbers, transpose it and write
        the transposed data to self.output_fname.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    raw_snps_data_ls = dataParsers.parseCSVData(
        self.input_fname, withArrayIds=self.withArrayIds)
    untransposed = RawSnpsData_ls2SNPData(raw_snps_data_ls, use_nt2number=1)
    # Each intermediate is dropped as soon as the next stage has been built.
    del raw_snps_data_ls

    transposed = transposeSNPData(untransposed)
    del untransposed

    transposed.tofile(self.output_fname, transform_to_numpy=0)
def run(self):
    """
    Load one or two SNP datasets and write NA/mismatch rates between them.

    Attributes read from self:
        debug: when true, drop into pdb before doing anything.
        input_fname1, input_fname1_format: 1st dataset and its format.
            1 = loaded through read_data() (the strain x snp layout),
            2 = parseCSVData() without array ids,
            3 = parseCSVData() with array ids.
        input_fname2, input_fname2_format: 2nd dataset (formats 1 and 2
            only). Not loaded when run_type is 2.
        run_type:
            1 = row-wise and column-wise comparison of the two datasets,
                both written to output_fname.
            2 = column-wise comparison of the 1st dataset against itself,
                between rows sharing an ecotype_id but differing in
                array_id; one output file per ordered row pair.
            3 = column-wise comparison restricted to the rows of the 1st
                dataset whose ecotype_id is in ecotype_id_ls; one output
                file per row.
        ecotype_id_ls: required for run_type 3.
        output_fname: output file (run_type 1) or output prefix (2 and 3).
        report: forwarded to the conversion/transpose helpers.

    Exits with status 2 on an unsupported input format and with status 3
    when run_type is 3 but ecotype_id_ls is empty.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    def _is_compound(row_id):
        # A row id is either a plain string/scalar or a tuple-like
        # sequence such as (ecotype_id, array_id).
        return not isinstance(row_id, str) and hasattr(row_id, '__len__')

    def _ecotype_id_of(row_id):
        # The ecotype_id is the first element of a compound id, otherwise
        # the id itself.
        if _is_compound(row_id):
            return row_id[0]
        return row_id

    def _label_of(row_id):
        # File-name fragment for a row id. Every part goes through str()
        # so that numeric ids do not make '_'.join() raise TypeError.
        if _is_compound(row_id):
            return '_'.join([str(part) for part in row_id])
        return row_id

    # The two inputs are in different orientations when exactly one of them
    # is in format 1 (strain x snp); the 2nd dataset is then transposed.
    file_format_ls = [self.input_fname1_format, self.input_fname2_format]
    needs_transpose = file_format_ls.count(1) == 1

    # 2008-05-15 TwoSNPData can handle a character matrix/2D-list, but
    # transposeSNPData needs a numeric matrix unless numpy is installed,
    # so nucleotides are converted to integers whenever a transpose is due.
    if needs_transpose:
        use_nt2number = 1
    else:
        use_nt2number = 0

    # ---- 1st dataset ----
    if self.input_fname1_format == 1:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname1)
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list,
                           category_list=category_list, data_matrix=data_matrix)
    elif self.input_fname1_format in (2, 3):
        # format 3 differs from format 2 only in carrying array ids
        snpsd_ls = dataParsers.parseCSVData(self.input_fname1,
                                            withArrayIds=(self.input_fname1_format == 3),
                                            use_nt2number=use_nt2number)
        # use_nt2number=0: the parser has already done the conversion
        snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
        del snpsd_ls
    else:
        sys.stderr.write('Error: unsupported input_fname1 format, %s\n' % self.input_fname1_format)
        sys.exit(2)

    # ---- 2nd dataset and matching rules ----
    if self.run_type != 2:
        if self.input_fname2_format == 1:
            header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname2)
            # category_list is not passed for the 2nd file; its row_id is
            # the ecotype_id alone (see the matching rules below).
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
                               data_matrix=data_matrix)
        elif self.input_fname2_format == 2:
            snpsd_ls = dataParsers.parseCSVData(self.input_fname2, withArrayIds=False,
                                                use_nt2number=use_nt2number)
            snpData2 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
            del snpsd_ls
        else:
            sys.stderr.write('Error: unsupported input_fname2 format, %s\n' % self.input_fname2_format)
            sys.exit(2)

        if needs_transpose:
            snpData2 = transposeSNPData(snpData2, report=self.report)

        if self.input_fname1_format == 1:
            # row_id of the 1st file = (ecotype_id, duplicate);
            # row_id of the 2nd file = ecotype_id.
            row_matching_by_which_value = 0
            col_matching_by_which_value = None
        elif self.input_fname1_format == 2:
            # col_id = accession in both files.
            row_matching_by_which_value = None
            col_matching_by_which_value = None
        elif self.input_fname1_format == 3:
            # col_id of the 1st file = (array_id, accession);
            # col_id of the 2nd file = accession.
            row_matching_by_which_value = None
            col_matching_by_which_value = 1
    else:
        # 2008-10-12 pairwise mismatch within the same data
        snpData2 = snpData1
        row_matching_by_which_value = None
        col_matching_by_which_value = None

    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2,
                            row_matching_by_which_value=row_matching_by_which_value,
                            col_matching_by_which_value=col_matching_by_which_value,
                            debug=self.debug)

    if self.run_type == 3:
        # 2008-10-12 compare snpData1 and snpData2 only for designated
        # entries from snpData1
        if not self.ecotype_id_ls:
            sys.stderr.write("Run_type %s: ecotype_id_ls (%s) is not specified.\n" % (self.run_type, self.ecotype_id_ls))
            sys.exit(3)
        ecotype_id_set = set(self.ecotype_id_ls)
        row_id_ls = [row_id for row_id in snpData1.row_id_ls
                     if _ecotype_id_of(row_id) in ecotype_id_set]
        print('%s arrays' % len(row_id_ls))
        for row_id in row_id_ls:
            col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id)
            if col_id2NA_mismatch_rate:
                output_fname = '%s_%s' % (self.output_fname, _label_of(row_id))
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
    elif self.run_type == 2:
        # 2008-10-12 column-wise mismatch of snpData1 vs snpData1 between
        # rows with the same ecotype_id but a different array_id.
        # snpData2 is snpData1 here, so one candidate list serves both
        # sides. Only ids that really carry (ecotype_id, array_id) can be
        # paired; indexing anything else would raise or compare single
        # characters of a string.
        candidate_ls = [row_id for row_id in snpData1.row_id_ls
                        if _is_compound(row_id) and len(row_id) >= 2]
        no_of_skipped = len(snpData1.row_id_ls) - len(candidate_ls)
        if no_of_skipped:
            sys.stderr.write("Warning: %s row ids are not (ecotype_id, array_id) pairs; skipped.\n" % no_of_skipped)
        row_id_pair_set = set()
        for row_id in candidate_ls:
            for row_id2 in candidate_ls:
                if row_id2[0] == row_id[0] and row_id2[1] != row_id[1]:
                    # Both (a, b) and (b, a) are kept on purpose: every
                    # ordered pair gets its own output file.
                    row_id_pair_set.add((row_id, row_id2))
        print('%s arrays' % len(row_id_pair_set))
        for row_id1, row_id2 in row_id_pair_set:
            row_id12row_id2 = {row_id1: row_id2}
            col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id1, row_id12row_id2=row_id12row_id2)
            if col_id2NA_mismatch_rate:
                output_fname = '%s_%s_vs_%s' % (self.output_fname, _label_of(row_id1), _label_of(row_id2))
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
    elif self.run_type == 1:
        row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
        if row_id2NA_mismatch_rate:
            QC_250k.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname, file_1st_open=1)
        if col_id2NA_mismatch_rate:
            # NOTE(review): file_1st_open=0 is passed even when the row-wise
            # result above was empty and nothing has been written yet;
            # confirm that QC_250k handles that case as intended.
            QC_250k.output_row_id2NA_mismatch_rate(col_id2NA_mismatch_rate, self.output_fname, file_1st_open=0)