예제 #1
0
	def run(self):
		"""
		Build a call data matrix from call-info files and write it out.

		Connects to the Stock_250k database, looks up the call-info files
		belonging to self.call_method_id, optionally restricts the SNPs to
		those returned by get_snps_name_set_given_criteria(), reads the calls
		into one matrix and writes that matrix to self.output_fname.

		2008-05-20 read_call_matrix returns PassingData object
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		connection_args = dict(drivername=self.drivername, username=self.user,
			password=self.passwd, hostname=self.hostname, database=self.dbname)
		db = Stock_250kDB.Stock_250kDB(**connection_args)
		db.setup(create_tables=False)
		session = db.session	# not used below; the attribute read is kept as in the original

		# QC_method_id is only a placeholder argument for QC_250k.get_call_info_id2fname()
		QC_method_id = 0
		call_info = QC_250k.get_call_info_id2fname(
			db, QC_method_id, self.call_method_id, filter_calls_QCed=0,
			max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
			input_dir=self.input_dir,
			take_unique_ecotype=self.take_unique_ecotype)

		# 2008-05-18 the SNP set is only looked up when one of the two thresholds is below 1
		snp_filter_needed = self.max_snp_mismatch_rate < 1 or self.max_snp_NA_rate < 1
		allowed_snp_names = None
		if snp_filter_needed:
			allowed_snp_names = self.get_snps_name_set_given_criteria(
				db, self.call_method_id, self.max_snp_mismatch_rate, self.max_snp_NA_rate)

		# 2008-05-20 read_call_matrix returns PassingData object
		matrix_data = QC_250k.read_call_matrix(
			call_info.call_info_id2fname, self.min_probability, allowed_snp_names)
		ecotype_ids = matrix_data.ecotype_id_ls
		array_ids = matrix_data.array_id_ls
		write_data_matrix(matrix_data.data_matrix, self.output_fname, matrix_data.header,
			ecotype_ids, array_ids)
예제 #2
0
파일: QC.py 프로젝트: bopopescu/gwasmodules
    def run(self):
        """
        Compare two SNP data sets and write out their NA/mismatch rates.

        The first data set is read from self.input_fname1 (format 1, 2 or 3).
        Unless self.run_type is 2, the second one is read from
        self.input_fname2 (format 1 or 2); with run_type 2 the first data set
        is compared against itself.

        run_type:
            1 -- row-wise and column-wise comparison, both written to
                 self.output_fname.
            2 -- column-wise comparison between rows of the first data set
                 that share an ecotype id but differ in array id; one output
                 file per row pair.
            3 -- column-wise comparison restricted to rows whose ecotype id
                 is in self.ecotype_id_ls; one output file per row.
        For any other run_type no comparison is run.

        Exits with status 2 on an unsupported input format and with status 3
        when run_type is 3 but self.ecotype_id_ls is empty.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        def _ecotype_id_of(row_id):
            # A non-string row_id that has a length is treated as a composite
            # id whose first item is the ecotype id; anything else is the
            # ecotype id itself.
            if isinstance(row_id, str) or not hasattr(row_id, '__len__'):
                return row_id
            return row_id[0]

        fmt1 = self.input_fname1_format
        fmt2 = self.input_fname2_format

        # True when one and only one of the two inputs is in strain x snp
        # layout (format 1), i.e. the two files are in different orientation.
        one_strain_by_snp_input = [fmt1, fmt2].count(1) == 1

        # 2008-05-15 TwoSNPData can handle a character matrix/2D-list, but
        # transposeSNPData needs a numeric matrix unless numpy is installed,
        # so nucleotides are converted to numbers whenever a transpose is due.
        use_nt2number = 1 if one_strain_by_snp_input else 0

        # ---- first data set ----
        if fmt1 == 1:
            snpData1 = SNPData(input_fname=self.input_fname1,
                               turn_into_array=1,
                               ignore_2nd_column=1)
        elif fmt1 in (2, 3):
            # format 3 differs from format 2 only in carrying array ids
            raw_snps_ls = dataParsers.parseCSVData(
                self.input_fname1,
                withArrayIds=(fmt1 == 3),
                use_nt2number=use_nt2number)
            # nucleotides are already numbers (if requested) at this point
            snpData1 = RawSnpsData_ls2SNPData(raw_snps_ls,
                                              report=self.report,
                                              use_nt2number=0)
            del raw_snps_ls
        else:
            sys.stderr.write('Error: unsupported input_fname1 format, %s\n' %
                             self.input_fname1_format)
            sys.exit(2)

        # ---- second data set and the way ids are matched ----
        if self.run_type == 2:
            # 2008-10-12 pairwise mismatch between same data
            snpData2 = snpData1
            row_matching_by_which_value = None
            col_matching_by_which_value = None
        else:
            if fmt2 == 1:
                snpData2 = SNPData(input_fname=self.input_fname2,
                                   turn_into_array=1,
                                   ignore_2nd_column=1)
            elif fmt2 == 2:
                raw_snps_ls = dataParsers.parseCSVData(
                    self.input_fname2,
                    withArrayIds=False,
                    use_nt2number=use_nt2number)
                snpData2 = RawSnpsData_ls2SNPData(raw_snps_ls,
                                                  report=self.report,
                                                  use_nt2number=0)
                del raw_snps_ls
            else:
                sys.stderr.write(
                    'Error: unsupported input_fname2 format, %s\n' %
                    self.input_fname2_format)
                sys.exit(2)

            if one_strain_by_snp_input:
                # different orientation: transpose the 2nd snpData
                snpData2 = transposeSNPData(snpData2, report=self.report)

            # (row_matching_by_which_value, col_matching_by_which_value),
            # keyed by the format of the 1st file:
            #  1: row_id of the 1st file = (ecotype_id, duplicate); 2nd file row_id = ecotype_id
            #  2: col_id of both files = accession
            #  3: col_id of the 1st file = (array_id, accession); 2nd file col_id = accession
            matching_by_format = {
                1: (0, None),
                2: (None, None),
                3: (None, 1),
            }
            row_matching_by_which_value, col_matching_by_which_value = \
                matching_by_format[fmt1]

        twoSNPData = TwoSNPData(
            SNPData1=snpData1,
            SNPData2=snpData2,
            row_matching_by_which_value=row_matching_by_which_value,
            col_matching_by_which_value=col_matching_by_which_value,
            debug=self.debug)

        if self.run_type == 3:
            # 2008-10-12 compare snpData1 and snpData2 only for designated entries from snpData1
            if not self.ecotype_id_ls:
                sys.stderr.write(
                    "Run_type %s: ecotype_id_ls (%s) is not specified.\n" %
                    (self.run_type, self.ecotype_id_ls))
                sys.exit(3)
            wanted_ecotype_ids = set(self.ecotype_id_ls)
            selected_row_ids = [
                candidate for candidate in snpData1.row_id_ls
                if _ecotype_id_of(candidate) in wanted_ecotype_ids
            ]
            print('%s arrays' % len(selected_row_ids))
            for row_id in selected_row_ids:
                col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(
                    row_id=row_id)
                if not col_id2NA_mismatch_rate:
                    continue
                if isinstance(row_id, str) or not hasattr(row_id, '__len__'):
                    row_id_name = row_id
                else:
                    row_id_name = '_'.join(row_id)
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(
                    col_id2NA_mismatch_rate,
                    '%s_%s' % (self.output_fname, row_id_name))
        elif self.run_type == 2:
            # 2008-10-12 column-wise mismatch of snpData1 vs snpData1 between
            # rows with same ecotype_id but different array_id
            row_id_pair_set = set()
            for left_row_id in snpData1.row_id_ls:
                ecotype_id = _ecotype_id_of(left_row_id)
                for right_row_id in snpData2.row_id_ls:
                    if right_row_id[0] != ecotype_id:
                        continue
                    if right_row_id[1] != left_row_id[1]:
                        row_id_pair_set.add((left_row_id, right_row_id))

            print('%s arrays' % len(row_id_pair_set))
            for row_id1, row_id2 in row_id_pair_set:
                col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(
                    row_id=row_id1, row_id12row_id2={row_id1: row_id2})
                if not col_id2NA_mismatch_rate:
                    continue
                pair_output_fname = '%s_%s_vs_%s' % (self.output_fname,
                                                     '_'.join(row_id1),
                                                     '_'.join(row_id2))
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(
                    col_id2NA_mismatch_rate, pair_output_fname)
        elif self.run_type == 1:
            # Both comparisons are computed before anything is written.
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
            col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
            # Row-wise results are written with file_1st_open=1 and
            # column-wise results with file_1st_open=0, both to
            # self.output_fname.
            for mismatch_rate, file_1st_open in (
                    (row_id2NA_mismatch_rate, 1),
                    (col_id2NA_mismatch_rate, 0)):
                if mismatch_rate:
                    QC_250k.output_row_id2NA_mismatch_rate(
                        mismatch_rate,
                        self.output_fname,
                        file_1st_open=file_1st_open)
예제 #3
0
	def run(self):
		"""
		Compare two SNP data sets and write out their NA/mismatch rates.

		The first data set is read from self.input_fname1 (format 1, 2 or 3).
		Unless self.run_type is 2, the second one is read from
		self.input_fname2 (format 1 or 2); with run_type 2 the first data set
		is compared against itself.

		run_type:
			1 -- row-wise and column-wise comparison, both written to
				self.output_fname.
			2 -- column-wise comparison between rows of the first data set
				that share an ecotype id but differ in array id; one output
				file per row pair (pairs are visited in unspecified order).
			3 -- column-wise comparison restricted to rows whose ecotype id
				is in self.ecotype_id_ls; one output file per row.
		For any other run_type no comparison is run.

		Exits with status 2 on an unsupported input format and with status 3
		when run_type is 3 but self.ecotype_id_ls is empty.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		# Kept from the original; the imported name is not referenced in this method.
		from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix

		# to check whether the two input files are in different orientation:
		# true when one and only one of them is in strain x snp layout (format 1)
		file_format_ls = [self.input_fname1_format, self.input_fname2_format]
		one_strain_by_snp_input = file_format_ls.count(1) == 1

		# 2008-05-15 TwoSNPData can handle character matrix/2D-list, but
		# transposeSNPData needs a numeric matrix unless numpy is installed.
		# Not sure whether Numeric or numarray is imported, so the input matrix
		# is turned into integers whenever a transpose will be needed.
		if one_strain_by_snp_input:
			use_nt2number = 1
		else:
			use_nt2number = 0

		# ---- first data set ----
		if self.input_fname1_format==1:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname1)
			snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
							data_matrix=data_matrix)
		elif self.input_fname1_format==2:
			snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=False, use_nt2number=use_nt2number)
			snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)	#already nt in number
			del snpsd_ls
		elif self.input_fname1_format==3:
			snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=True, use_nt2number=use_nt2number)
			snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
			del snpsd_ls
		else:
			sys.stderr.write('Error: unsupported input_fname1 format, %s\n' % self.input_fname1_format)
			sys.exit(2)

		# ---- second data set and the way ids are matched ----
		if self.run_type!=2:
			if self.input_fname2_format==1:
				# note: category_list is read but not handed to SNPData for the 2nd file
				header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname2)
				snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
								data_matrix=data_matrix)
			elif self.input_fname2_format==2:
				snpsd_ls = dataParsers.parseCSVData(self.input_fname2, withArrayIds=False, use_nt2number=use_nt2number)
				snpData2 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
				del snpsd_ls
			else:
				sys.stderr.write('Error: unsupported input_fname2 format, %s\n' % self.input_fname2_format)
				sys.exit(2)

			if one_strain_by_snp_input:	#different orientation. transpose the 2nd snpData
				snpData2 = transposeSNPData(snpData2, report=self.report)

			if self.input_fname1_format == 1:	#row_id for the 1st file = (ecotype_id, duplicate). for 2nd file, row_id=ecotype_id.
				row_matching_by_which_value = 0
				col_matching_by_which_value = None
			elif self.input_fname1_format == 2:	#col_id for the 1st file = accession. for 2nd file, col_id=accession.
				row_matching_by_which_value = None
				col_matching_by_which_value = None
			elif self.input_fname1_format == 3:	#col_id for the 1st file = (array_id, accession). for 2nd file, col_id=accession.
				row_matching_by_which_value = None
				col_matching_by_which_value = 1
		else:
			#2008-10-12 pairwise mismatch between same data
			snpData2 = snpData1
			row_matching_by_which_value = None
			col_matching_by_which_value = None

		twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, row_matching_by_which_value=row_matching_by_which_value,
							col_matching_by_which_value=col_matching_by_which_value, debug=self.debug)

		if self.run_type==3:
			#2008-10-12 compare snpData1 and snpData2 only for designated entries from snpData1
			if not self.ecotype_id_ls:
				sys.stderr.write("Run_type %s: ecotype_id_ls (%s) is not specified.\n"%(self.run_type, self.ecotype_id_ls))
				sys.exit(3)
			# built-in set instead of the deprecated sets.Set
			ecotype_id_set = set(self.ecotype_id_ls)
			row_id_ls = []	#rows of snpData1 to test against snpData2
			for row_id in snpData1.row_id_ls:
				# a non-string row_id with a length is a composite id; its first item is the ecotype id
				if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
					ecotype_id = row_id[0]
				else:
					ecotype_id = row_id
				if ecotype_id in ecotype_id_set:
					row_id_ls.append(row_id)
			print('%s arrays' % len(row_id_ls))
			# ecotype_id_ls is known to be non-empty here (checked above), so no second check is needed
			for row_id in row_id_ls:
				col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id)
				if col_id2NA_mismatch_rate:
					if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
						row_id_name = '_'.join(row_id)
					else:
						row_id_name = row_id
					output_fname = '%s_%s'%(self.output_fname, row_id_name)
					twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==2:
			#2008-10-12	column-wise mismatch of snpData1 vs snpData1 between rows with same ecotype_id but different array_id
			# built-in set instead of the deprecated sets.Set
			row_id_pair_set = set()
			for row_id in snpData1.row_id_ls:
				if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
					ecotype_id = row_id[0]
				else:
					ecotype_id = row_id
				for row_id2 in snpData2.row_id_ls:
					if row_id2[0]==ecotype_id and row_id2[1]!=row_id[1]:	#same ecotype_id but different array_id
						row_id_pair_set.add((row_id, row_id2))

			print('%s arrays' % len(row_id_pair_set))
			for row_id1, row_id2 in row_id_pair_set:
				# restrict the comparison to this one pair of rows
				row_id12row_id2 = {row_id1:row_id2}
				col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id1, row_id12row_id2=row_id12row_id2)
				if col_id2NA_mismatch_rate:
					output_fname = '%s_%s_vs_%s'%(self.output_fname, '_'.join(row_id1), '_'.join(row_id2))
					twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==1:
			# both comparisons are computed before anything is written
			row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
			col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
			# row-wise results are written with file_1st_open=1, column-wise results with file_1st_open=0
			if row_id2NA_mismatch_rate:
				QC_250k.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname, file_1st_open=1)
			if col_id2NA_mismatch_rate:
				QC_250k.output_row_id2NA_mismatch_rate(col_id2NA_mismatch_rate, self.output_fname, file_1st_open=0)