Esempio n. 1
0
	def run(self):
		"""
		2008-5-18
			Read the CSV file self.input_fname, turn it into an SNPData object
			(use_nt2number=1, presumably nucleotides encoded as numbers),
			transpose it and write the transposed data to self.output_fname.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		# The parsed list is only a temporary argument here, so nothing keeps it
		# alive once the conversion has returned.
		original_data = RawSnpsData_ls2SNPData(
			dataParsers.parseCSVData(self.input_fname, withArrayIds=self.withArrayIds),
			use_nt2number=1)
		transposed_data = transposeSNPData(original_data)
		# drop the untransposed copy before the output is written
		del original_data
		transposed_data.tofile(self.output_fname, transform_to_numpy=0)
Esempio n. 2
0
	def run(self):
		"""
		Compare two SNP datasets (or one dataset against itself) and output NA/mismatch rates.

		Steps:
			1. Load self.input_fname1 according to self.input_fname1_format:
				1: read_data() + SNPData (with category_list)
				2: dataParsers.parseCSVData(withArrayIds=False)
				3: dataParsers.parseCSVData(withArrayIds=True)
			2. Unless self.run_type is 2, load self.input_fname2 (only formats 1 and 2 are
				accepted) and transpose it when exactly one of the two formats is 1.
				For run_type 2 the first dataset is compared against itself.
			3. Build a TwoSNPData and dispatch on self.run_type:
				1: row-wise and column-wise comparison, both written via
					QC_250k.output_row_id2NA_mismatch_rate to self.output_fname.
				2: column-wise comparison for every pair of rows sharing row_id[0]
					(ecotype_id) but differing in row_id[1] (array_id); one output file per pair.
				3: column-wise comparison for each row whose ecotype_id is in
					self.ecotype_id_ls; one output file per row.
				anything else: a warning is written to stderr and nothing is output.

		Exits with status 2 on an unsupported input format and with status 3 when
		run_type is 3 but self.ecotype_id_ls is empty.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		# NOTE(review): this name is not referenced below; the import is kept in case
		# it has side effects -- TODO confirm and remove if not.
		from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix

		def is_composite(row_id):
			"""Return True if row_id is a sized, non-str object (e.g. a tuple of id parts)."""
			return not isinstance(row_id, str) and hasattr(row_id, '__len__')

		def get_ecotype_id(row_id):
			"""Return the first component of a composite row_id, or row_id itself."""
			if is_composite(row_id):
				return row_id[0]
			return row_id

		def row_id2name(row_id):
			"""
			Return the filename suffix for row_id.
			Components are formatted with %s first, so non-string parts (e.g. integer ids)
			no longer make str.join() raise TypeError.
			"""
			if is_composite(row_id):
				return '_'.join(['%s' % (part,) for part in row_id])
			return row_id

		#to check whether two input file are in different orientation
		file_format_ls = [self.input_fname1_format, self.input_fname2_format]
		#there's one and only one strain x snp format (format 1) among the two files
		only_one_strain_x_snp = file_format_ls.count(1) == 1

		#2008-05-15 TwoSNPData can handle character matrix/2D-list. but transposeSNPData needs numeric matrix to transpose except when numpy is installed.
		if only_one_strain_x_snp:
			#it needs transpose matrix. only numpy works on character matrix. not sure Numeric or numarray is imported. so transform the input matrix to integer.
			use_nt2number = 1
		else:
			use_nt2number = 0

		if self.input_fname1_format==1:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname1)
			snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
							data_matrix=data_matrix)
		elif self.input_fname1_format==2:
			snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=False, use_nt2number=use_nt2number)
			snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)	#already nt in number
			del snpsd_ls
		elif self.input_fname1_format==3:
			snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=True, use_nt2number=use_nt2number)
			snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
			del snpsd_ls
		else:
			sys.stderr.write('Error: unsupported input_fname1 format, %s\n' % self.input_fname1_format)
			sys.exit(2)

		if self.run_type!=2:
			if self.input_fname2_format==1:
				#no category_list here: for the 2nd file, row_id=ecotype_id
				header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname2)
				snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
								data_matrix=data_matrix)
			elif self.input_fname2_format==2:
				snpsd_ls = dataParsers.parseCSVData(self.input_fname2, withArrayIds=False, use_nt2number=use_nt2number)
				snpData2 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
				del snpsd_ls
			else:
				sys.stderr.write('Error: unsupported input_fname2 format, %s\n' % self.input_fname2_format)
				sys.exit(2)

			if only_one_strain_x_snp:	#transpose the 2nd snpData so both have the same orientation
				snpData2 = transposeSNPData(snpData2, report=self.report)

			#input_fname1_format is guaranteed to be 1, 2 or 3 at this point (otherwise exited above)
			if self.input_fname1_format == 1:	#row_id for the 1st file = (ecotype_id, duplicate). for 2nd file, row_id=ecotype_id.
				row_matching_by_which_value = 0
				col_matching_by_which_value = None
			elif self.input_fname1_format == 2:	#col_id for the 1st file = accession. for 2nd file, col_id=accession.
				row_matching_by_which_value = None
				col_matching_by_which_value = None
			elif self.input_fname1_format == 3:	#col_id for the 1st file = (array_id, accession). for 2nd file, col_id=accession.
				row_matching_by_which_value = None
				col_matching_by_which_value = 1
		else:
			#2008-10-12 pairwise mismatch between same data
			snpData2 = snpData1
			row_matching_by_which_value = None
			col_matching_by_which_value = None

		twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, row_matching_by_which_value=row_matching_by_which_value,
							col_matching_by_which_value=col_matching_by_which_value, debug=self.debug)

		if self.run_type==3:
			#2008-10-12 compare snpData1 and snpData2 only for designated entries from snpData1
			if not self.ecotype_id_ls:
				sys.stderr.write("Run_type %s: ecotype_id_ls (%s) is not specified.\n"%(self.run_type, self.ecotype_id_ls))
				sys.exit(3)
			ecotype_id_set = set(self.ecotype_id_ls)
			#rows of snpData1 whose ecotype_id was requested
			row_id_ls = [row_id for row_id in snpData1.row_id_ls if get_ecotype_id(row_id) in ecotype_id_set]
			print('%s arrays'%(len(row_id_ls)))
			for row_id in row_id_ls:
				col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id)
				if col_id2NA_mismatch_rate:
					output_fname = '%s_%s'%(self.output_fname, row_id2name(row_id))
					twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==2:
			#2008-10-12	column-wise mismatch of snpData1 vs snpData1 between rows with same ecotype_id but different array_id
			#only row ids with at least two components carry an array_id to tell rows apart.
			candidate_row_id_ls = [row_id for row_id in snpData2.row_id_ls if is_composite(row_id) and len(row_id)>1]
			row_id_pair_set = set()
			for row_id in snpData1.row_id_ls:
				if not (is_composite(row_id) and len(row_id)>1):
					#a plain row id has no array_id, so it can't form a pair
					continue
				ecotype_id = row_id[0]
				for row_id2 in candidate_row_id_ls:
					if row_id2[0]==ecotype_id and row_id2[1]!=row_id[1]:	#same ecotype_id but different array_id
						#both (a, b) and (b, a) end up in the set, i.e. each pair is compared in both directions
						row_id_pair_set.add((row_id, row_id2))

			print('%s arrays'%(len(row_id_pair_set)))
			for row_id1, row_id2 in row_id_pair_set:
				row_id12row_id2 = {row_id1:row_id2}
				col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id1, row_id12row_id2=row_id12row_id2)
				if col_id2NA_mismatch_rate:
					output_fname = '%s_%s_vs_%s'%(self.output_fname, row_id2name(row_id1), row_id2name(row_id2))
					twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==1:
			#sys.exit(2)	#2008-10-12 skip all original functions
			row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
			col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
			if row_id2NA_mismatch_rate:
				QC_250k.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname, file_1st_open=1)
			if col_id2NA_mismatch_rate:
				#same writer as for the rows, called with file_1st_open=0 for the second batch
				QC_250k.output_row_id2NA_mismatch_rate(col_id2NA_mismatch_rate, self.output_fname, file_1st_open=0)
		else:
			#previously an unknown run_type was silently ignored after all the loading work
			sys.stderr.write('Warning: run_type %s is not supported. Nothing is output.\n' % self.run_type)