def run(self):
		"""
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		session.begin()
		
		isq_id2data ={}
		no_of_total_lines = 0
		no_of_isqf_lines = 0
		no_of_isqf_in_db = 0
		for inputFname in self.inputFnameLs:
			reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
			header = reader.next()
			colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
			isq_id_index = colName2Index.get('isq_id')
			isqf_id_index = colName2Index.get('isqf_id')
			read_count_index = colName2Index.get("read_count")
			base_count_index = colName2Index.get("base_count")
			for row in reader:
				isq_id = int(row[isq_id_index])
				isqf_id = row[isqf_id_index]
				read_count = int(row[read_count_index])
				base_count = int(row[base_count_index])
				if isq_id not in isq_id2data:
					isq_id2data[isq_id] = PassingData(read_count=0, base_count=0)
				isq_id2data[isq_id].read_count += read_count
				isq_id2data[isq_id].base_count += base_count
				if isqf_id and isqf_id!='0':
					isqf_id = int(isqf_id)
					no_of_isqf_lines += 1
					no_of_isqf_in_db += self.updateIndividualSequenceFileReadBaseCount(self.db_vervet, isqf_id=isqf_id, \
											read_count=read_count, base_count=base_count)
				no_of_total_lines += 1
			del reader
		logMsg1="%s isqf out of %s were put into db. %s lines in total.\n"%(no_of_isqf_in_db, no_of_isqf_lines, no_of_total_lines)
		sys.stderr.write(logMsg1)
		
		counter = 0
		real_counter = 0
		for isq_id, data in isq_id2data.iteritems():
			real_counter += self.updateIndividualSequenceReadBaseCount(self.db_vervet, isq_id=isq_id, \
										read_count=data.read_count, base_count=data.base_count, genomeSize=self.genomeSize)
			counter += 1
		logMsg2="%s isq out of %s were put into db.\n"%(real_counter, counter)
		sys.stderr.write(logMsg2)
		
		if self.logFilename:
			logF = open(self.logFilename, 'w')
			logF.write(logMsg1)
			logF.write(logMsg2)
			del logF
			
		
		if self.commit:
			self.db_vervet.session.flush()
			self.db_vervet.session.commit()
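A minimal sketch of the per-sequence aggregation the loop above performs, using a couple of made-up rows with the isq_id/isqf_id/read_count/base_count columns it expects (values are purely illustrative):

#hypothetical rows, keyed by the same column names the header provides
rows = [
	{'isq_id': 1, 'isqf_id': 10, 'read_count': 1000, 'base_count': 100000},
	{'isq_id': 1, 'isqf_id': 11, 'read_count': 2000, 'base_count': 200000},
]
isq_id2data = {}
for row in rows:
	data = isq_id2data.setdefault(row['isq_id'], {'read_count': 0, 'base_count': 0})
	data['read_count'] += row['read_count']
	data['base_count'] += row['base_count']
#isq_id2data[1] is now {'read_count': 3000, 'base_count': 300000}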
Example #2
	def run(self):
		if self.debug:
			import pdb
			pdb.set_trace()
		
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname,host=self.hostname, user=self.user, passwd = self.passwd)
		curs = conn.cursor()
		
		if self.ecotype_duplicate2tg_ecotypeid_table:
			ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(curs, self.ecotype_duplicate2tg_ecotypeid_table)
		else:
			ecotype_duplicate2tg_ecotypeid = None
		from pymodule import figureOutDelimiter
		delimiter = figureOutDelimiter(self.input_fname)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
		
		tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)
		
		ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
		tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix, \
															ecotypeid2nativename, self.stat_output_fname)
		
		tg_nativename_ls = []
		for ecotypeid in tg_ecotypeid_ls:
			tg_nativename_ls.append(ecotypeid2nativename[ecotypeid])
		header[1] = 'nativename'
		write_data_matrix(merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls, delimiter=delimiter)
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		inconsistent_rate_ls = []
		for inputFname in self.inputFnameLs:
			if os.path.isfile(inputFname):
				try:
					reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
					header = reader.next()
					col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
					inconsistent_rate_index = col_name2index.get("inconsistency")
					for row in reader:
						inconsistency = float(row[inconsistent_rate_index])
						inconsistent_rate_ls.append(inconsistency)
					del reader
				except:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
		
		if self.title is None:
			title = "histogram of inconsistent rate from %s refs"%(len(inconsistent_rate_ls))
		else:
			title = self.title
		if len(inconsistent_rate_ls)>10:
			medianInconsistentRate = numpy.median(inconsistent_rate_ls)
			title += " median %.4f"%(medianInconsistentRate)
		yh_matplotlib.drawHist(inconsistent_rate_ls, title=title, \
									xlabel_1D="Inconsistent Rate", xticks=None, outputFname=self.outputFname, min_no_of_data_points=20, needLog=False, \
									dpi=200)
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		writer.writerow(['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
		for inputFname in self.inputFnameLs:
			inputFile = utils.openGzipFile(inputFname)
			delimiter = figureOutDelimiter(inputFile)
			reader = csv.reader(inputFile, delimiter=delimiter)
			header = reader.next()
			col_name2index = getColName2IndexFromHeader(header)
			
			intervalIDIndex = col_name2index.get("Target")
			#only the first read group in the output is used, so don't run DepthOfCoverageWalker over multi-read-group bam files
			avgCoverageIndex = 4
			sampleID = header[avgCoverageIndex][:-9]	#this column header looks like $sampleID_mean_cvg, so strip the trailing _mean_cvg (9 characters)
			medianCoverageIndex = 6
			
			for row in reader:
				intervalID = row[intervalIDIndex]
				writer.writerow([sampleID, intervalID, row[avgCoverageIndex], row[medianCoverageIndex]])
		del writer
		sys.stderr.write("Done.\n")
	def readDataMatrix(self, inputFname, minExprSumPerGene=180):
		"""
		2012.5.8
		"""
		sys.stderr.write("Reading the gene expression matrix from %s ..."%(inputFname))
		
		suffix = os.path.splitext(inputFname)[1]
		if suffix=='.gz':
			import gzip
			inf = gzip.open(inputFname, 'r')
		else:
			inf = open(inputFname, 'r')
		
		reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
		header = reader.next()	#first line is taken as header
		colName2Index = getColName2IndexFromHeader(header)
		data_matrix = []
		row_id_ls = []
		counter = 0
		real_counter = 0
		for row in reader:
			data_row = row[1:]
			data_row = map(float, data_row)
			exprSumPerGene = sum(data_row)
			counter += 1
			if exprSumPerGene>=minExprSumPerGene:
				real_counter += 1
				row_id_ls.append(row[0])
				data_matrix.append(data_row)
		data_matrix = numpy.array(data_matrix)
		sys.stderr.write("%s rows out of %s selected. %s rows, %s columns.\n"%(real_counter, counter, \
																	len(row_id_ls), len(header)-1))
		return PassingData(row_id_ls=row_id_ls, header=header, data_matrix=data_matrix)
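A hedged usage sketch for readDataMatrix(); the instance name `matrixReader` and the file name are hypothetical, and the returned PassingData exposes the row_id_ls, header and data_matrix fields set above:

pdata = matrixReader.readDataMatrix('geneExpression.tsv', minExprSumPerGene=180)
print pdata.data_matrix.shape	#(no_of_selected_genes, no_of_samples)
print pdata.row_id_ls[:5]	#IDs of the genes that passed the expression-sum filter
print len(pdata.header) - 1	#number of sample columns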
Example #6
	def putQCIntoDB(self, session, input_fname, no_of_lines_to_skip, data_source_obj, cnv_type_obj, cnv_method_obj=None, \
				run_type=1, original_id=None):
		"""
		2009-10-28
		"""
		sys.stderr.write("Putting QC data into database ... \n")
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		for i in range(no_of_lines_to_skip):
			reader.next()
		
		counter = 0
		for row in reader:
			if run_type ==1:
				cnv_qc_call = self.generateCNVQCCallObjType1(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
			elif run_type ==2:
				cnv_qc_call = self.generateCNVQCCallObjType2(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
			elif run_type==3:
				cnv_qc_call = self.generateCNVQCCallObjFromBobSchmitzData(session, row, data_source_obj, cnv_type_obj,\
																		cnv_method_obj, original_id=original_id)
			else:
				sys.stderr.write("Run type %s not supported.\n"%run_type)
				continue	#skip the row: cnv_qc_call is undefined for an unsupported run_type
			session.save(cnv_qc_call)
			counter += 1
			if counter%5000==0:
				sys.stderr.write("%s%s"%('\x08'*40, counter))
		session.flush()
		sys.stderr.write("%s records. Done.\n"%counter)
	def getMonkeyIDPair2Correlation(self, smartpcaCorrelationFname=None):
		"""
		2012.3.1
			smartpcaCorrelationFname is output from  PCAOnVCFWorkflow.py (with modified smartpca). tab-delimited.
				553_2_VRC_ref_GA_vs_524	555_15_1987079_GA_vs_524        Case    Case    0.025
				553_2_VRC_ref_GA_vs_524	556_16_1985088_GA_vs_524        Case    Case    -0.020
				553_2_VRC_ref_GA_vs_524	557_17_1986014_GA_vs_524        Case    Case    -0.106
				553_2_VRC_ref_GA_vs_524	558_18_1988009_GA_vs_524        Case    Case    -0.059
		
		"""
		sys.stderr.write("Reading correlation from %s ... "%(smartpcaCorrelationFname))
		monkey_id_pair2genotype_correlation = {}
		import csv
		reader = csv.reader(open(smartpcaCorrelationFname), delimiter=figureOutDelimiter(smartpcaCorrelationFname))
		monkey_id_extract = lambda x: x.split('_')[2]
		for row in reader:
			monkey1 = row[0]
			monkey2 = row[1]
			cor = float(row[4])
			pair_in_ls = [monkey_id_extract(monkey1), monkey_id_extract(monkey2)]
			pair_in_ls.sort()
			pair_key = tuple(pair_in_ls)
			monkey_id_pair2genotype_correlation[pair_key] = cor
		sys.stderr.write("%s pairs.\n"%(len(monkey_id_pair2genotype_correlation)))
		return monkey_id_pair2genotype_correlation
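A usage sketch, assuming the enclosing instance is available as `checker` (hypothetical name); the keys are sorted tuples of extracted monkey IDs, so a lookup has to sort the pair the same way:

pair2cor = checker.getMonkeyIDPair2Correlation(smartpcaCorrelationFname='smartpca_cor.tsv')	#hypothetical file name
pair_key = tuple(sorted(['1987079', '1985088']))
print pair2cor.get(pair_key)	#genotype correlation, or None if the pair is absent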
	def readInput(self, inputFnameLs, ):
		sys.stderr.write("Reading distance data from %s files ..."%(len(inputFnameLs)))
		sampleId2index = {}
		samplePair2data = {}	#value is [no_of_mismatches, no_of_total_non_NA]
		for inputFname in self.inputFnameLs:
			reader = csv.reader(open(inputFname, ), delimiter=figureOutDelimiter(inputFname))
			matrixStart = False
			for row in reader:
				if row[0]=='':
					matrixStart = True
					break
				
				sample1Id = row[0]
				if sample1Id not in sampleId2index:
					sampleId2index[sample1Id] = len(sampleId2index)
				sample2Id = row[1]
				if sample2Id not in sampleId2index:
					sampleId2index[sample2Id] = len(sampleId2index)
				no_of_mismatches = float(row[-2])
				no_of_total_non_NA = float(row[-1])
				samplePair = (sample1Id, sample2Id)
				if samplePair not in samplePair2data:
					samplePair2data[samplePair] = [0, 0]
				samplePair2data[samplePair][0] += no_of_mismatches
				samplePair2data[samplePair][1] += no_of_total_non_NA
			
			del reader
		sys.stderr.write("Done.\n")
		return sampleId2index, samplePair2data
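Because samplePair2data accumulates [no_of_mismatches, no_of_total_non_NA] across all input files, a per-pair mismatch rate can be derived from the return values; a minimal sketch of hypothetical calling code:

sampleId2index, samplePair2data = self.readInput(self.inputFnameLs)
samplePair2mismatchRate = {}
for samplePair, (no_of_mismatches, no_of_total_non_NA) in samplePair2data.iteritems():
	if no_of_total_non_NA > 0:
		samplePair2mismatchRate[samplePair] = no_of_mismatches/no_of_total_non_NA	#both counts are stored as floats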
	def trioInconsistentRateFileWalker(self, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):
		"""
		2011-11-2
			remove the maxDepth filter. apply afterwards through filterDataByDepth().
		2011-9-30
		
		"""
		reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
		header = reader.next()
		col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
		isInconsistent_index = col_name2index.get("isInconsistent")
		index_of_fa_depth = col_name2index.get("depthOfFather")
		index_of_mo_depth = col_name2index.get('depthOfMother')
		index_of_child_depth = col_name2index.get('depthOfChild')
		for row in reader:
			fa_depth = int(float(row[index_of_fa_depth]))
			mo_depth = int(float(row[index_of_mo_depth]))
			child_depth = int(float(row[index_of_child_depth]))
			isInconsistent = float(row[isInconsistent_index])
			#if fa_depth<=self.maxDepth and mo_depth <=self.maxDepth and child_depth<=self.maxDepth:
			self.fa_depth_ls.append(fa_depth)
			self.mo_depth_ls.append(mo_depth)
			self.child_depth_ls.append(child_depth)
			self.inconsistent_ls.append(isInconsistent)
		del reader
	def run(self):
		"""
		2012.4.3
			each input has this as its header:
			
			['alignmentID', 'total_no_of_reads', 'perc_reads_mapped', 'perc_duplicates', 'perc_paired', 'perc_properly_paired', \
				'perc_both_mates_mapped', 'perc_singletons',\
				'perc_mapped_to_diff_chrs']
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		session.begin()
		
		no_of_total_lines = 0
		for inputFname in self.inputFnameLs:
			reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
			header = reader.next()
			colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
			alignment_id_index = colName2Index.get('alignmentID')
			total_no_of_reads_index = colName2Index.get('total_no_of_reads')
			perc_reads_mapped_index = colName2Index.get("perc_reads_mapped")
			perc_duplicates_index = colName2Index.get("perc_duplicates")
			perc_paired_index = colName2Index.get("perc_paired")
			perc_properly_paired_index = colName2Index.get("perc_properly_paired")
			perc_both_mates_mapped_index = colName2Index.get("perc_both_mates_mapped")
			perc_singletons_index = colName2Index.get("perc_singletons")
			perc_mapped_to_diff_chrs_index = colName2Index.get("perc_mapped_to_diff_chrs")
			perc_mapq5_mapped_to_diff_chrs_index = colName2Index.get("perc_mapq5_mapped_to_diff_chrs")
			for row in reader:
				alignmentID = int(row[alignment_id_index])
				alignment = VervetDB.IndividualAlignment.get(alignmentID)
				alignment.perc_reads_mapped = float(row[perc_reads_mapped_index])
				alignment.perc_duplicates = float(row[perc_duplicates_index])
				alignment.perc_paired = float(row[perc_paired_index])
				alignment.perc_properly_paired = float(row[perc_properly_paired_index])
				alignment.perc_both_mates_mapped = float(row[perc_both_mates_mapped_index])
				alignment.perc_singletons = float(row[perc_singletons_index])
				alignment.perc_mapped_to_diff_chrs = float(row[perc_mapped_to_diff_chrs_index])
				alignment.perc_mapq5_mapped_to_diff_chrs = float(row[perc_mapq5_mapped_to_diff_chrs_index])
				alignment.total_no_of_reads = int(float(row[total_no_of_reads_index]))
				session.add(alignment)
				no_of_total_lines += 1
			del reader
		sys.stderr.write("%s alignments in total.\n"%(no_of_total_lines))
		
		if self.logFilename:
			logF = open(self.logFilename, 'w')
			logF.write("%s alignments in total.\n"%(no_of_total_lines))
			del logF
		
		if self.commit:
			self.db_vervet.session.flush()
			self.db_vervet.session.commit()
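A minimal illustration of the per-row conversions the loop above applies before session.add(); the values are made up and a plain dict stands in for the csv row plus the colName2Index lookup:

row = {'alignmentID': '93', 'total_no_of_reads': '2.5e+08', 'perc_reads_mapped': '98.3',
	'perc_duplicates': '4.1', 'perc_paired': '99.0', 'perc_properly_paired': '96.2',
	'perc_both_mates_mapped': '97.8', 'perc_singletons': '0.5', 'perc_mapped_to_diff_chrs': '1.2'}
print int(float(row['total_no_of_reads']))	#stored as an int; the float() step lets values like '2.5e+08' parse
print float(row['perc_reads_mapped'])	#all perc_* columns are stored as floats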
Example #11
    def run(self):
        """
		2007-02-27
		2007-09-14
			filtering_bits
		-read_data()
		-remove_rows_with_too_many_NAs()
		-remove_cols_with_too_many_NAs()
		-remove_identity_strains()
		-write_data_matrix()
		"""
        if self.debug:
            import pdb

            pdb.set_trace()
        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter
        )
        data_matrix = num.array(data_matrix)
        if self.filtering_bits[0] == "1":
            remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
            rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
            strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
        else:
            rows_with_too_many_NAs_set = Set()
        if self.filtering_bits[1] == "1":
            remove_cols_data = self.remove_cols_with_too_many_NAs(data_matrix, self.col_cutoff, rows_with_too_many_NAs_set)  #presumably self.col_cutoff; a bare col_cutoff is undefined here
            cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
        else:
            cols_with_too_many_NAs_set = Set()
        if self.filtering_bits[2] == "1":
            no_of_rows, no_of_cols = data_matrix.shape
            total_rows_set = Set(range(no_of_rows))
            rows_to_be_checked = total_rows_set - rows_with_too_many_NAs_set
            total_cols_set = Set(range(no_of_cols))
            cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
            identity_strains_to_be_removed = self.remove_identity_strains(
                data_matrix, rows_to_be_checked, cols_to_be_checked
            )
        else:
            identity_strains_to_be_removed = Set()
        rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
        # self.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out, cols_with_too_many_NAs_set, int(self.nt_alphabet_bits[1]))
        write_data_matrix(
            data_matrix,
            self.output_fname,
            header,
            strain_acc_list,
            category_list,
            rows_to_be_tossed_out,
            cols_with_too_many_NAs_set,
            nt_alphabet=int(self.nt_alphabet_bits[1]),
            delimiter=delimiter,
        )
Example #12
	def read_input_fname(self, input_fname):
		sys.stderr.write("Getting gene pairs from %s ..."%input_fname)
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		gene_id_pair_ls = []
		for row in reader:
			gene1_id = int(row[0])
			gene2_id = int(row[1])
			gene_id_pair_ls.append((gene1_id, gene2_id))
		
		sys.stderr.write("Done.\n")
		return gene_id_pair_ls
Example #13
	def putHaplotypeGroupIntoDB(self, session, input_fname, tg_ecotypeid2row, max_snp_typing_error_rate, snp_id_ls):
		"""
		2009-3-31
		2009-4-4
			add argument tg_ecotypeid2row
		"""
		sys.stderr.write("Constructing haplotype groups ...\n")
		pattern_ecotypeid = re.compile(r'(?<=\))\d+')
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		col_name2col_index = getColName2IndexFromHeader(reader.next())
		ecotypeid_idx = col_name2col_index['ecotypeid']
		haplo_name_idx = col_name2col_index['haplogroup']
		geographic_integrity_idx = col_name2col_index['geographic_integrity']
		filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
		counter = 0
		for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
			ecotypeid = int(row[ecotypeid_idx])
			ecotypeid = tg_ecotypeid	#2009-4-4 use tg_ecotypeid instead
			haplo_name = row[haplo_name_idx]
			geographic_integrity_name = row[geographic_integrity_idx]
			filtered_SNPs = row[filtered_SNPs_idx]
			ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
			haplo_group = StockDB.HaploGroup.query.filter_by(short_name=haplo_name).first()
			if not haplo_group:
				haplo_group = StockDB.HaploGroup(short_name=haplo_name, ref_ecotypeid=ref_ecotypeid, max_snp_typing_error_rate=max_snp_typing_error_rate)
				session.save(haplo_group)
				session.flush()
			
			ecotype = StockDB.Ecotype.get(ecotypeid)
			haplo_group.ecotypes.append(ecotype)
			geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(short_name=geographic_integrity_name).first()
			if not geographic_integrity:
				geographic_integrity = StockDB.GeographicIntegrity(short_name=geographic_integrity_name)
				session.save(geographic_integrity)
				session.flush()
			ecotype.geographic_integrity = geographic_integrity
			session.save_or_update(ecotype)
			#one bit of ecotype: link the ecotypeid to tg_ecotype_id
			
			
			#deal with filtered SNPs
			for i in range(len(filtered_SNPs)):
				allele = filtered_SNPs[i]
				if allele=='_':
					continue
				fc = StockDB.FilteredCalls(ecotypeid=ecotypeid, snpid=snp_id_ls[i], allele=allele)
				session.save(fc)
				session.flush()
			counter += 1
			if counter%500==0 and self.report:
				sys.stderr.write('%s%s'%('\x08'*80, counter))
		session.flush()
		sys.stderr.write("Done.\n")
Example #14
    def run(self):
        """
		2008-5-12
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        #database connection and etc
        db = self.db_250k

        session = db.session
        session.begin()

        delimiter = figureOutDelimiter(self.inputFname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.inputFname, delimiter=delimiter)

        if self.snp_id_type == 1:
            #2011-2-27 translate the db_id into chr_pos because the new StrainXSNP dataset uses db_id to identify SNPs.
            # but if col-id is already chr_pos, it's fine.
            new_header = header[:2]
            data_matrix_col_index_to_be_kept = []
            for i in xrange(2, len(header)):
                snp_id = header[i]
                chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id, )
                if chr_pos is not None:
                    data_matrix_col_index_to_be_kept.append(i - 2)
                    new_header.append(chr_pos)
            # to remove no-db_id columns from data matrix
            data_matrix = numpy.array(data_matrix)
            data_matrix = data_matrix[:, data_matrix_col_index_to_be_kept]
            header = new_header

        if self.array_id_2nd_column:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
                data_matrix=data_matrix)
        else:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,\
                data_matrix=data_matrix) #ignore category_list

        rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData,
                                                need_transposeSNPData=1,
                                                report=self.report)
        chromosomes = [
            rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls
        ]
        snpsdata.writeRawSnpsDatasToFile(self.outputFname,
                                         rawSnpsData_ls,
                                         chromosomes=chromosomes,
                                         deliminator=',',
                                         withArrayIds=self.array_id_2nd_column)
	def read_input_fname(self, input_fname):
		sys.stderr.write("Getting gene pairs from %s ..."%input_fname)
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		gene_id_pair_ls = []
		gene_id_set = Set()
		for row in reader:
			gene1_id = int(row[0])
			gene2_id = int(row[1])
			gene_id_pair_ls.append((gene1_id, gene2_id))
			gene_id_set.add(gene1_id)
			gene_id_set.add(gene2_id)
		sys.stderr.write("Done.\n")
		return gene_id_pair_ls, gene_id_set
    def generate_params(self,
                        gene_id_fname,
                        pdata,
                        block_size=1000,
                        **keywords):
        """
		2009-2-12
			use yield to become a generator
		2008-11-25
			read gene ids from gene_id_fname and generate pairs among them
			each node handles a number of pairs, depending on how many SNP pairs it incurs
		"""
        #sys.stderr.write("Generating parameters ...")
        #params_ls = []
        no_of_phenotypes = len(pdata.phenotype_index_ls)
        start_index = 0  #for each computing node: the index of gene >= start_index
        #no_of_genes = len(pdata.gene_id2snps_id_ls)
        no_of_tests_per_node = 0

        reader = csv.reader(open(gene_id_fname),
                            delimiter=figureOutDelimiter(gene_id_fname))
        gene_id_ls = []
        for row in reader:
            gene_id = int(row[0])
            gene_id_ls.append(gene_id)
        del reader

        no_of_genes = len(gene_id_ls)
        gene_id_pairs_for_each_node = []
        for i in range(no_of_genes):
            gene1_id = gene_id_ls[i]
            n1 = len(
                pdata.gene_id2snps_id_ls[gene1_id])  #no_of_snps_of_this_gene
            #for gene2_id in pdata.gene_id2snps_id_ls:	#2009-2-8 another setting: gene_id_fname vs all genes
            for j in range(i + 1, no_of_genes):
                gene2_id = gene_id_ls[j]
                n2 = len(pdata.gene_id2snps_id_ls[gene2_id])
                est_no_of_tests = (
                    n1 * n2
                ) * no_of_phenotypes  #this is the upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
                no_of_tests_per_node += est_no_of_tests
                gene_id_pairs_for_each_node.append((gene1_id, gene2_id))
                if no_of_tests_per_node >= block_size:
                    yield gene_id_pairs_for_each_node
                    #reset gene_id_pairs_for_each_node
                    gene_id_pairs_for_each_node = []
                    no_of_tests_per_node = 0  #reset this to 0

        #pick up the last gene_id_pairs_for_each_node if it's not empty
        if gene_id_pairs_for_each_node:
            yield gene_id_pairs_for_each_node
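A usage sketch for the generator above; `analysis` and the gene-ID file name are hypothetical, and `pdata` is assumed to carry phenotype_index_ls and gene_id2snps_id_ls as the code requires:

for gene_id_pairs_for_one_node in analysis.generate_params('gene_ids.tsv', pdata, block_size=1000):
	#each yielded list of (gene1_id, gene2_id) pairs is sized so the estimated test count stays near block_size
	print len(gene_id_pairs_for_one_node)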
Example #17
	def get_snp_pair2value_type(self, boolean_pair_fname):
		"""
		2008-11-25
		"""
		sys.stderr.write("Getting snp_pair2value_type ...")
		snp_pair2value_type = {}
		reader = csv.reader(open(boolean_pair_fname), delimiter=figureOutDelimiter(boolean_pair_fname))
		reader.next()
		min_value = None
		max_value = None
		for row in reader:
			snp1_id, gene1_id, snp2_id, gene2_id, bool_type, pvalue, count1, count2 = row
			if not snp2_id:
				snp2_id = snp1_id
				continue	#2008-11-26 skip a row if its pvalue is from a single SNP.
			
			if not gene2_id:
				gene2_id = gene1_id
			
			snp1_id = snp1_id.split('_')
			snp1_id = map(int, snp1_id)
			
			snp2_id = snp2_id.split('_')
			snp2_id = map(int, snp2_id)
			
			snp_pair = [tuple(snp1_id), tuple(snp2_id)]
			snp_pair.sort()
			snp_pair = tuple(snp_pair)
			pvalue = -math.log10(float(pvalue))
			
			value = pvalue
			if min_value is None:
				min_value =value
			elif value<min_value:
				min_value = value
				
			if max_value is None:
				max_value = value
			elif value>max_value:
				max_value = value
			
			bool_type = int(bool_type)
			if snp_pair not in snp_pair2value_type:
				snp_pair2value_type[snp_pair] = (pvalue, bool_type)
			else:
				if pvalue>snp_pair2value_type[snp_pair][0]:	#only take maximum
					snp_pair2value_type[snp_pair] = (pvalue, bool_type)
		del reader
		sys.stderr.write("Done.\n")
		return snp_pair2value_type, min_value, max_value
Example #18
    def getNoOfLociFromSNPData(self, inputFname):
        """
		2012.3.2
		"""
        sys.stderr.write("Getting no of loci from %s ..." %
                         (os.path.basename(inputFname)))
        reader = csv.reader(open(inputFname),
                            delimiter=figureOutDelimiter(inputFname))
        header = reader.next()
        first_data_row = reader.next()
        no_of_cols = len(first_data_row) - 2
        del reader
        sys.stderr.write("%s columns.\n" % (no_of_cols))
        return no_of_cols
Example #19
    def generate_params(self,
                        gene_id_fname,
                        pdata,
                        block_size=1000,
                        **keywords):
        """
		2009-2-12
			use yield to make this function a generator
		2009-2-9
			add argument gene_id_fname to restrict analysis on genes from it
		2008-09-09
			estimate the number of tests each gene would encompass, and decide how many genes should be included in a set to send out
		2008-09-06
			each node handles a certain number of genes, identified by the index of the 1st gene and the index of the last gene.
		"""
        no_of_phenotypes = len(pdata.phenotype_index_ls)
        start_index = 0  #for each computing node: the index of gene >= start_index
        no_of_tests_per_node = 0

        #2009-2-9
        if gene_id_fname and os.path.isfile(gene_id_fname):
            reader = csv.reader(open(gene_id_fname),
                                delimiter=figureOutDelimiter(gene_id_fname))
            gene_id_ls = []
            for row in reader:
                gene_id = int(row[0])
                gene_id_ls.append(gene_id)
            del reader
            pdata.gene_id_ls = gene_id_ls  #replace pdata's gene_id_ls

        no_of_genes = len(pdata.gene_id_ls)

        for i in range(no_of_genes):
            gene_id = pdata.gene_id_ls[i]
            n = len(
                pdata.gene_id2snps_id_ls[gene_id])  #no_of_snps_of_this_gene
            est_no_of_tests = (
                n * (n - 1) * 5 / 2.0 + n
            ) * no_of_phenotypes  #this is the upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
            no_of_tests_per_node += est_no_of_tests
            if no_of_tests_per_node >= block_size:
                yield (
                    start_index, i + 1
                )  #the computing node is gonna handle genes from pdata.gene_id_ls[start_index] to pdata.gene_id_ls[i]
                #reset the starting pointer to the index of the next gene
                start_index = i + 1
                no_of_tests_per_node = 0  #reset this to 0
            elif i == no_of_genes - 1:  #this is the last gene, have to include them
                yield (start_index, i + 1)
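This variant yields (start_index, stop_index) slices into pdata.gene_id_ls instead of explicit gene pairs; a usage sketch under the same hypothetical names as above:

for start_index, stop_index in analysis.generate_params('gene_ids.tsv', pdata, block_size=1000):
	genes_for_this_node = pdata.gene_id_ls[start_index:stop_index]	#genes handled by one computing node
	print start_index, stop_index, len(genes_for_this_node)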
	def getSampleID2FamilyCount(self, inputFname):
		"""
		2012.3.29
		"""
		sys.stderr.write("Getting sampleID2FamilyCount from %s ..."%(inputFname))
		reader = csv.reader(open(inputFname, 'r'), delimiter=figureOutDelimiter(inputFname))
		header = reader.next()
		colName2Index = getColName2IndexFromHeader(header)
		sampleID2FamilyCount = {}
		for row in reader:
			individualID = row[colName2Index.get("individualID")]
			familyCount = int(row[colName2Index.get("familyCount")])
			sampleID2FamilyCount[individualID] = familyCount
		sys.stderr.write("%s individuals.\n"%(len(sampleID2FamilyCount)))
		return sampleID2FamilyCount
	def get_isqID2coverage(self, seqCoverageFname, defaultCoverage=None):
		"""
		2011-9-2
		"""
		sys.stderr.write("Fetching sequence coverage info from %s ..."%(seqCoverageFname))
		
		reader = csv.reader(open(seqCoverageFname, 'r'), delimiter=figureOutDelimiter(seqCoverageFname))
		isqID2coverage = {}
		header = reader.next()
		for row in reader:
			isqID = int(row[0])
			coverage = float(row[1])
			isqID2coverage[isqID] = coverage
		sys.stderr.write("%s entries.\n"%len(isqID2coverage))
		return isqID2coverage
    def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
        """
		2009-4-10
			not used. decided to keep all of them.
		2009-4-4
			retain only one row out of duplicated ecotype rows based on ecotypeid2tg_ecotypeid.
				it's not random. usually the one with same ecotype id as tg_ecotypeid unless tg_ecotypeid doesn't appear.
			if duplicated ecotypes belong to different haplotype group, choose the one with tg_ecotypeid otherwise random.
		"""
        sys.stderr.write("Dropping redundant ecotypes ...\n")
        reader = csv.reader(open(input_fname),
                            delimiter=figureOutDelimiter(input_fname))
        col_name2col_index = getColName2IndexFromHeader(reader.next())
        ecotypeid_idx = col_name2col_index['ecotypeid']
        haplo_name_idx = col_name2col_index['haplogroup']
        nativename_idx = col_name2col_index['nativename']
        tg_ecotypeid2row = {}
        no_of_duplicates = 0
        no_of_duplicates_with_different_haplogroups = 0
        counter = 0
        for row in reader:
            ecotypeid = int(row[ecotypeid_idx])
            haplo_name = row[haplo_name_idx]
            nativename = row[nativename_idx]
            if ecotypeid in ecotypeid2tg_ecotypeid:
                tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
                if tg_ecotypeid not in tg_ecotypeid2row:
                    tg_ecotypeid2row[tg_ecotypeid] = row
                else:
                    no_of_duplicates += 1
                    old_row = tg_ecotypeid2row[tg_ecotypeid]
                    old_ecotypeid = int(old_row[ecotypeid_idx])
                    old_haplo_name = old_row[haplo_name_idx]
                    old_nativename = old_row[nativename_idx]  #the previously stored duplicate's nativename comes from old_row
                    if old_haplo_name != haplo_name:
                        sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n"%\
                             (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
                        no_of_duplicates_with_different_haplogroups += 1
                    if ecotypeid == tg_ecotypeid:  #replace if the new ecotypeid matching the tg_ecotypeid whether the haplotype group is same or not.
                        tg_ecotypeid2row[tg_ecotypeid] = row
            else:
                sys.stderr.write(
                    "Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n" %
                    (ecotypeid))
            counter += 1
        sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n"%\
             (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
        return tg_ecotypeid2row
Example #23
	def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
		"""
		2009-2-18
			if gene_id_fname is given and is a file:
				yield (gene1_id, snp_start_index, snp_stop_index)
			else:
				yield (phenotype_index, snp_start_index1, snp_stop_index1, snp_start_index2, snp_stop_index2)
		2009-2-12
			use yield to become a generator
			called by inputNodePrepare()
		2009-1-22
		"""
		no_of_phenotypes = len(pdata.phenotype_index_ls)
		start_index = 0	#for each computing node: the index of gene >= start_index
		#no_of_genes = len(pdata.gene_id2snps_id_ls)
		no_of_tests_per_node = 0
		
		if gene_id_fname and os.path.isfile(gene_id_fname):
			reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
			gene_id_ls = []
			for row in reader:
				gene_id = int(row[0])
				gene_id_ls.append(gene_id)
			del reader
			no_of_genes = len(gene_id_ls)
			no_of_total_snps = len(pdata.snp_info.chr_pos_ls)
			for i in range(no_of_genes):
				gene1_id = gene_id_ls[i]
				n1 = len(pdata.gene_id2snps_id_ls[gene1_id])	#no_of_snps_of_this_gene
				snp_start_index = 0
				while snp_start_index < no_of_total_snps:
					no_of_snps_to_consider = max(1, block_size/(n1*no_of_phenotypes))	#at least 1, otherwise snp_start_index would never advance
					snp_stop_index = snp_start_index+no_of_snps_to_consider
					if snp_stop_index > no_of_total_snps:
						snp_stop_index = no_of_total_snps
					yield (gene1_id, snp_start_index, snp_stop_index)
					snp_start_index += no_of_snps_to_consider
		else:
			#no gene_id_fname. pairwise among all SNPs
			no_of_snps_to_consider = int(math.sqrt(block_size))
			no_of_total_snps = len(pdata.snp_info.chr_pos_ls)
			for phenotype_index in pdata.phenotype_index_ls:
				for snp_start_index1 in range(0, no_of_total_snps, no_of_snps_to_consider):
					snp_stop_index1 = min(no_of_total_snps, snp_start_index1+no_of_snps_to_consider)
					for snp_start_index2 in range(snp_start_index1, no_of_total_snps, no_of_snps_to_consider):
						snp_stop_index2 = min(no_of_total_snps, snp_start_index2+no_of_snps_to_consider)
						yield (phenotype_index, snp_start_index1, snp_stop_index1, snp_start_index2, snp_stop_index2)
Example #24
	def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
		"""
		2009-2-18
			if gene_id_fname is given and is a file:
				yield (gene1_id, snp_start_index, snp_stop_index)
			else:
				yield (phenotype_index, snp_start_index1, snp_stop_index1, snp_start_index2, snp_stop_index2)
		2009-2-12
			use yield to become a generator
			called by inputNodePrepare()
		2009-1-22
		"""
		no_of_phenotypes = len(pdata.phenotype_index_ls)
		start_index = 0	#for each computing node: the index of gene >= start_index
		#no_of_genes = len(pdata.gene_id2snps_id_ls)
		no_of_tests_per_node = 0
		
		if gene_id_fname and os.path.isfile(gene_id_fname):
			reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
			gene_id_ls = []
			for row in reader:
				gene_id = int(row[0])
				gene_id_ls.append(gene_id)
			del reader
			no_of_genes = len(gene_id_ls)
			no_of_total_snps = len(pdata.snp_info.chr_pos_ls)
			for i in range(no_of_genes):
				gene1_id = gene_id_ls[i]
				n1 = len(pdata.gene_id2snps_id_ls[gene1_id])	#no_of_snps_of_this_gene
				snp_start_index = 0
				while snp_start_index < no_of_total_snps:
					no_of_snps_to_consider = max(1, block_size/(n1*no_of_phenotypes))	#at least 1, otherwise snp_start_index would never advance
					snp_stop_index = snp_start_index+no_of_snps_to_consider
					if snp_stop_index > no_of_total_snps:
						snp_stop_index = no_of_total_snps
					yield (gene1_id, snp_start_index, snp_stop_index)
					snp_start_index += no_of_snps_to_consider
		else:
			#no gene_id_fname. pairwise among all SNPs
			no_of_snps_to_consider = int(math.sqrt(block_size))
			no_of_total_snps = len(pdata.snp_info.chr_pos_ls)
			for phenotype_index in pdata.phenotype_index_ls:
				for snp_start_index1 in range(0, no_of_total_snps, no_of_snps_to_consider):
					snp_stop_index1 = min(no_of_total_snps, snp_start_index1+no_of_snps_to_consider)
					for snp_start_index2 in range(snp_start_index1, no_of_total_snps, no_of_snps_to_consider):
						snp_stop_index2 = min(no_of_total_snps, snp_start_index2+no_of_snps_to_consider)
						yield (phenotype_index, snp_start_index1, snp_stop_index1, snp_start_index2, snp_stop_index2)
Example #25
	def run(self):
		"""
		2008-09-10
			in case the whole figure is chopped into blocks, swap col_block_index and row_block_index so that rows come first, columns second
		"""
		from pymodule.yhio.SNP import read_data
		from pymodule.utils import figureOutDelimiter, PassingData
		delimiter = figureOutDelimiter(self.input_fname)
		print delimiter
		header, row_label_ls1, row_label_ls2, data_matrix = read_data(self.input_fname, matrix_data_type=float, delimiter='\t')
		import numpy
		data_matrix = numpy.array(data_matrix)
		min_value = numpy.min(data_matrix)
		if self.min_value_non_negative and min_value < 0:
			min_value = 0
		max_value = numpy.max(data_matrix)
		font = get_font(self.font_path, font_size=self.font_size)
		Value2Color.special_value2color[-2] = self.super_value_color
		value2color_func = lambda x: Value2Color.value2HSLcolor(x, min_value, max_value)
		im_legend = drawContinousLegend(min_value, max_value, self.no_of_ticks, value2color_func, font)
		
		fig_fname_prefix = os.path.splitext(self.fig_fname)[0]
		if self.split_legend_and_matrix:
			im_legend.save('%s_legend.png'%fig_fname_prefix)
		
		no_of_rows, no_of_cols = data_matrix.shape
		passParam = PassingData(value2color_func=value2color_func, im_legend=im_legend, font=font, \
							split_legend_and_matrix=self.split_legend_and_matrix, no_grid=self.no_grid)
		
		if no_of_cols <= self.blockColUnit:
			self._drawMatrix(data_matrix, row_label_ls1, header[2:], self.fig_fname, passParam)
		else:	#split into blocks
			no_of_col_blocks = no_of_cols/self.blockColUnit+1
			no_of_row_blocks = no_of_rows/self.blockRowUnit + 1
			for i in range(no_of_col_blocks):
				col_start_index = i*self.blockColUnit
				col_end_index = (i+1)*self.blockColUnit
				if col_start_index<no_of_cols:
					for j in range(no_of_row_blocks):
						row_start_index = j*self.blockRowUnit
						row_end_index = (j+1)*self.blockRowUnit
						if row_start_index<no_of_rows:
							fig_fname = '%s_%s_%s.png'%(fig_fname_prefix, j, i)	#row first, column 2nd
							self._drawMatrix(data_matrix[row_start_index:row_end_index,col_start_index:col_end_index], row_label_ls1[row_start_index:row_end_index], \
											header[2+col_start_index:2+col_end_index], fig_fname, passParam)
Example #26
	def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
		"""
		2009-2-12
			use yield to become a generator
		2008-11-25
			read gene ids from gene_id_fname and generate pairs among them
			each node handles a number of pairs, depending on how many SNP pairs it incurs
		"""
		#sys.stderr.write("Generating parameters ...")
		#params_ls = []
		no_of_phenotypes = len(pdata.phenotype_index_ls)
		start_index = 0	#for each computing node: the index of gene >= start_index
		#no_of_genes = len(pdata.gene_id2snps_id_ls)
		no_of_tests_per_node = 0
		
		reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
		gene_id_ls = []
		for row in reader:
			gene_id = int(row[0])
			gene_id_ls.append(gene_id)
		del reader
		
		no_of_genes = len(gene_id_ls)
		gene_id_pairs_for_each_node = []
		for i in range(no_of_genes):
			gene1_id = gene_id_ls[i]
			n1 = len(pdata.gene_id2snps_id_ls[gene1_id])	#no_of_snps_of_this_gene
			#for gene2_id in pdata.gene_id2snps_id_ls:	#2009-2-8 another setting: gene_id_fname vs all genes
			for j in range(i+1, no_of_genes):
				gene2_id = gene_id_ls[j]
				n2 = len(pdata.gene_id2snps_id_ls[gene2_id])
				est_no_of_tests = (n1*n2)*no_of_phenotypes	#this is the upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
				no_of_tests_per_node += est_no_of_tests
				gene_id_pairs_for_each_node.append((gene1_id, gene2_id))
				if no_of_tests_per_node>=block_size:
					yield gene_id_pairs_for_each_node
					#reset gene_id_pairs_for_each_node
					gene_id_pairs_for_each_node = []
					no_of_tests_per_node = 0	#reset this to 0
				
		#pick up the last gene_id_pairs_for_each_node if it's not empty
		if gene_id_pairs_for_each_node:
			yield gene_id_pairs_for_each_node
Example #27
	def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
		"""
		2009-4-4
			retain only one row out of duplicated ecotype rows based on ecotypeid2tg_ecotypeid.
				it's not random. usually the one with same ecotype id as tg_ecotypeid unless tg_ecotypeid doesn't appear.
			if duplicated ecotypes belong to different haplotype group, choose the one with tg_ecotypeid otherwise random.
		"""
		sys.stderr.write("Dropping redundant ecotypes ...\n")
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		col_name2col_index = getColName2IndexFromHeader(reader.next())
		ecotypeid_idx = col_name2col_index['ecotypeid']
		haplo_name_idx = col_name2col_index['haplogroup']
		nativename_idx = col_name2col_index['nativename']
		tg_ecotypeid2row = {}
		no_of_duplicates = 0
		no_of_duplicates_with_different_haplogroups = 0
		counter = 0
		for row in reader:
			ecotypeid = int(row[ecotypeid_idx])
			haplo_name = row[haplo_name_idx]
			nativename = row[nativename_idx]
			if ecotypeid in ecotypeid2tg_ecotypeid:
				tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
				if tg_ecotypeid not in tg_ecotypeid2row:
					tg_ecotypeid2row[tg_ecotypeid] = row
				else:
					no_of_duplicates += 1
					old_row = tg_ecotypeid2row[tg_ecotypeid]
					old_ecotypeid = int(old_row[ecotypeid_idx])
					old_haplo_name = old_row[haplo_name_idx]
					old_nativename = old_row[nativename_idx]	#the previously stored duplicate's nativename comes from old_row
					if old_haplo_name!=haplo_name:
						sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n"%\
										 (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
						no_of_duplicates_with_different_haplogroups += 1
					if ecotypeid==tg_ecotypeid:	#replace if the new ecotypeid matching the tg_ecotypeid whether the haplotype group is same or not.
						tg_ecotypeid2row[tg_ecotypeid] = row
			else:
				sys.stderr.write("Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n"%(ecotypeid))
			counter += 1
		sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n"%\
						 (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
		return tg_ecotypeid2row
Example #28
	def run(self):
		"""
		2008-9-7
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
		
		snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
						data_matrix=data_matrix)
		newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)
		
		if self.mapping_fname:	#output allele_index2allele_ls
			self.output_allele2index_ls(snpData, allele_index2allele_ls, self.mapping_fname)
		
		newSnpData.tofile(self.output_fname)
    def readInRefArrayData(self, input_fname, ref_array_id_set=None):
        """
		2010-5-25
		"""

        sys.stderr.write("Getting data matrix for reference arrays.\n")
        reader = csv.reader(open(input_fname),
                            delimiter=figureOutDelimiter(input_fname))
        for i in xrange(3):  # skip first 3 rows
            reader.next()
        data_matrix = []
        for row in reader:
            array_id = int(row[0])
            if array_id in ref_array_id_set:
                data_matrix.append(map(float, row[2:]))
        del reader
        data_matrix = numpy.array(data_matrix)
        sys.stderr.write("%s arrays, %s probes. Done.\n" %
                         (data_matrix.shape[0], data_matrix.shape[1]))
        return data_matrix
	def run(self):
		"""
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		inf = utils.openGzipFile(self.inputFname, openMode='r')
		
		reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
		header = None
		for i in xrange(self.noOfLinesInHeader):
			if i==0:
				header = reader.next()	#first line is taken as header
			else:
				reader.next()
		if header is not None:
			colName2Index = getColName2IndexFromHeader(header)
		
		newHeader = ['alignmentID', 'total_base_count', 'sampled_base_count', 'meanDepth', 'medianDepth', 'modeDepth']
		inputStatLs = []
		
		writer = csv.writer(utils.openGzipFile(self.outputFname, openMode='w'), delimiter='\t')
		writer.writerow(newHeader)
		counter = 0
		real_counter = 0
		for row in reader:
			counter += 1
			if real_counter <= self.maxNumberOfSamplings:
				r = random.random()
				if r<=self.fractionToSample and real_counter<=self.maxNumberOfSamplings:
					inputStatLs.append(float(row[self.whichColumn]))
					real_counter += 1
		
		meanDepth = numpy.mean(inputStatLs)
		medianDepth = numpy.median(inputStatLs)
		modeDepth = scipy.stats.mode(inputStatLs)[0][0]
		outputRow = [self.alignmentID, counter, real_counter, meanDepth, medianDepth, modeDepth]
		writer.writerow(outputRow)
		del writer
Example #31
	def get_snp_region_ls(self, ft_region_fname, snp_info, min_distance=5000):
		sys.stderr.write("Get all snp regions ...")
		delimiter = figureOutDelimiter(ft_region_fname)
		ft_region_reader = csv.reader(open(ft_region_fname, 'r'), delimiter=delimiter)		
		snp_region_ls = []
		for row in ft_region_reader:
			row = map(int, row)
			chr1, pos1, chr2, pos2 = row
			if pos2<pos1:
				pos1, pos2 = pos2, pos1
			span = abs(pos2-pos1)
			if span < min_distance*2:
				extra_span = (min_distance*2-span)/2
				pos1 = max(pos1 - extra_span, 1)
				pos2 = pos2+extra_span
			
			snp_region = self.findSNPsInRegion(snp_info, chr1, pos1, pos2, center_snp_position=None)
			snp_region_ls.append(snp_region)
		del ft_region_reader
		sys.stderr.write("Done.\n")
		return snp_region_ls
Example #32
	def run(self):
		"""
		2008-5-12
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
		
		if self.array_id_2nd_column:
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
							data_matrix=data_matrix)
		else:
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list,\
							data_matrix=data_matrix)	#ignore category_list
		
		rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)
		chromosomes = [rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls]
		snpsdata.writeRawSnpsDatasToFile(self.output_fname, rawSnpsData_ls, chromosomes=chromosomes, deliminator=',', withArrayIds=self.array_id_2nd_column)
	def run(self):
		"""
		2007-02-27
		2007-09-14
			filtering_bits
		-read_data()
		-remove_rows_with_too_many_NAs()
		-remove_cols_with_too_many_NAs()
		-remove_identity_strains()
		-write_data_matrix()
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
		data_matrix = numpy.array(data_matrix)
		if self.filtering_bits[0]=='1':
			remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
			rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
			strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
		else:
			rows_with_too_many_NAs_set = set()
		if self.filtering_bits[1]=='1':
			remove_cols_data = self.remove_cols_with_too_many_NAs(data_matrix, self.col_cutoff, rows_with_too_many_NAs_set)	#presumably self.col_cutoff; a bare col_cutoff is undefined here
			cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set			
		else:
			cols_with_too_many_NAs_set = set()
		if self.filtering_bits[2]=='1':
			no_of_rows, no_of_cols = data_matrix.shape
			total_rows_set = set(range(no_of_rows))
			rows_to_be_checked = total_rows_set - rows_with_too_many_NAs_set
			total_cols_set = set(range(no_of_cols))
			cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
			identity_strains_to_be_removed = self.remove_identity_strains(data_matrix, rows_to_be_checked, cols_to_be_checked)
		else:
			identity_strains_to_be_removed = set()
		rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
		#self.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out, cols_with_too_many_NAs_set, int(self.nt_alphabet_bits[1]))
		write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out, cols_with_too_many_NAs_set, nt_alphabet=int(self.nt_alphabet_bits[1]), delimiter=delimiter)
Example #34
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()

        if self.ecotype_duplicate2tg_ecotypeid_table:
            ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(
                curs, self.ecotype_duplicate2tg_ecotypeid_table)
        else:
            ecotype_duplicate2tg_ecotypeid = None
        from pymodule import figureOutDelimiter
        delimiter = figureOutDelimiter(self.input_fname)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, delimiter=delimiter)

        tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
            strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)

        ecotypeid2nativename = get_ecotypeid2nativename(
            curs, ecotype_table=self.ecotype_table)
        tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix, \
                     ecotypeid2nativename, self.stat_output_fname)

        tg_nativename_ls = []
        for ecotypeid in tg_ecotypeid_ls:
            tg_nativename_ls.append(ecotypeid2nativename[ecotypeid])
        header[1] = 'nativename'
        write_data_matrix(merge_matrix,
                          self.output_fname,
                          header,
                          tg_ecotypeid_ls,
                          tg_nativename_ls,
                          delimiter=delimiter)
	def trioInconsistentRateFileWalker(self, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):	#the body uses self.samplingRate, so this reads as an instance method
		"""
		2012.10.25 only catch exceptions during file opening, not during file reading

		2011-9-30
		"""
		try:
			reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
			header = reader.next()
			col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
		except:
			sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
			import traceback
			traceback.print_exc()
			return
		inconsistent_rate_index = col_name2index.get("inconsistency")
		if run_type==1:
			index_of_x_data = col_name2index.get("stopFrequency")
		elif run_type==2:
			index_of_x_data = col_name2index.get("stop")
		else:
			sys.stderr.write("Unsupported run_type %s in trioInconsistentRateFileWalker().\n"%(run_type))
			sys.exit(3)
		index_of_no_of_total = col_name2index.get("no_of_total")
		inconsistent_rate_ls = []
		x_ls = []
		for row in reader:
			if self.samplingRate<1 and self.samplingRate>=0:
				r = random.random()
				if r>self.samplingRate:
					continue
			no_of_total = int(float(row[index_of_no_of_total]))
			if no_of_total<=minNoOfTotal:
				continue
			inconsistency = float(row[inconsistent_rate_index])
			inconsistent_rate_ls.append(inconsistency)
			x_data = float(row[index_of_x_data])
			x_ls.append(x_data)
		processFunc(x_ls, inconsistent_rate_ls)
		del reader
Example #36
	def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
		"""
		2009-2-12
			use yield to make this function a generator
		2009-2-9
			add argument gene_id_fname to restrict analysis on genes from it
		2008-09-09
			estimate the number of tests each gene would encompass, and decide how many genes should be included in a set to send out
		2008-09-06
			each node handles a certain number of genes, identified by the index of the 1st gene and the index of the last gene.
		"""
		no_of_phenotypes = len(pdata.phenotype_index_ls)
		start_index = 0	#for each computing node: the index of gene >= start_index
		no_of_tests_per_node = 0
		
		#2009-2-9
		if gene_id_fname and os.path.isfile(gene_id_fname):
			reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
			gene_id_ls = []
			for row in reader:
				gene_id = int(row[0])
				gene_id_ls.append(gene_id)
			del reader
			pdata.gene_id_ls = gene_id_ls	#replace pdata's gene_id_ls
		
		no_of_genes = len(pdata.gene_id_ls)
		
		for i in range(no_of_genes):
			gene_id = pdata.gene_id_ls[i]
			n = len(pdata.gene_id2snps_id_ls[gene_id])	#no_of_snps_of_this_gene
			est_no_of_tests = (n*(n-1)*5/2.0 + n)*no_of_phenotypes	#this is the upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
			no_of_tests_per_node += est_no_of_tests
			if no_of_tests_per_node>=block_size:
				yield (start_index, i+1)	#the computing node is gonna handle genes from pdata.gene_id_ls[start_index] to pdata.gene_id_ls[i]
				#reset the starting pointer to the index of the next gene
				start_index = i+1
				no_of_tests_per_node = 0	#reset this to 0
			elif i==no_of_genes-1:	#this is the last gene, have to include them
				yield (start_index, i+1)
Example #37
    def run(self):
        """
		2008-9-7
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, delimiter=delimiter)

        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
            data_matrix=data_matrix)
        newSnpData, allele_index2allele_ls = snpData.convert2Binary(
            self.report)

        if self.mapping_fname:  #output allele_index2allele_ls
            self.output_allele2index_ls(snpData, allele_index2allele_ls,
                                        self.mapping_fname)

        newSnpData.tofile(self.output_fname)
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		try:
			inf = utils.openGzipFile(self.inputFname)
			delimiter = figureOutDelimiter(inf)
			if not delimiter:
				delimiter='\t'
			reader = csv.reader(inf, delimiter=delimiter)
			writer = csv.writer(open(self.outputFname, 'w'), delimiter=delimiter)
			extendHeader = []
			if self.addChrName:
				extendHeader.append(self.chrHeader)
			extendHeader.append(self.chrLengthHeader)
		except:
			sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
			import traceback
			traceback.print_exc()
			print sys.exc_info()
			sys.exit(0)
		try:
			header = self.processHeader(reader=reader, extendHeader=extendHeader, chrLengthHeader = self.chrLengthHeader)
			writer.writerow(header)
			for row in reader:
				new_data_row = self.processRow(row)
				writer.writerow(new_data_row)
			del reader
			del writer
		except:	#in case something wrong (i.e. file is empty)
			sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
			import traceback
			traceback.print_exc()
			print sys.exc_info()
			sys.exit(0)
Example #39
	def get_input(cls, input_fname, data_type=numpy.float32):
		"""
		2009-10-28
			switch the default data_type to numpy.float32 to save memory on 64bit machines
		2009-9-28
			add argument data_type to specify data type of data_matrix.
			default is numpy.float (numpy.float could be float32, float64, float128 depending on the architecture).
				numpy.double is also fine.
		2009-5-18
			become classmethod
		"""
		sys.stderr.write("Getting input from %s ..."%input_fname)
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		commandline = 'wc -l %s'%input_fname
		command_handler = subprocess.Popen(commandline, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
		stdout_content, stderr_content = command_handler.communicate()
		if stderr_content:
			sys.stderr.write('stderr of %s: %s \n'%(commandline, stderr_content))
		no_of_rows = int(stdout_content.split()[0])-1
		
		header = reader.next()
		no_of_cols = len(header)-3
		data_matrix = numpy.zeros([no_of_rows, no_of_cols], data_type)
		probe_id_ls = []
		chr_pos_ls = []
		i=0
		for row in reader:
			
			probe_id = row[0]
			probe_id_ls.append(probe_id)
			chr_pos_ls.append(row[-2:])
			for j in range(1, 1+no_of_cols):
				data_matrix[i][j-1] = float(row[j])
			i += 1
		sys.stderr.write("Done.\n")
		return data_matrix, probe_id_ls, chr_pos_ls, header
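# The row count above comes from `wc -l` minus one header line. A portable fallback
# (a sketch, not part of the original module) counts data rows in pure Python, at the
# cost of one extra pass over the file:
def countDataRows(input_fname):
	no_of_lines = 0
	with open(input_fname) as inf:
		for line in inf:
			no_of_lines += 1
	return max(no_of_lines - 1, 0)	#subtract the header row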
Example #40
	def __init__(self, inputFname=None, **keywords):
		self.ad = ProcessOptions.process_function_arguments(keywords, self.option_default_dict, error_doc=self.__doc__, \
														class_to_have_attr=self)
		if not self.inputFname:
			self.inputFname = inputFname
		if self.inputFname and self.inputFile is None:
			self.inputFile = utils.openGzipFile(self.inputFname, openMode=self.openMode)
		
		self.filename = self.inputFname	#2013.05.03 for easy access
		
		self.csvFile = None
		self.isRealCSV = False
		if self.openMode=='r':	#reading mode
			if self.delimiter is None:
				self.delimiter = figureOutDelimiter(self.inputFile)
			
			if self.delimiter=='\t' or self.delimiter==',':
				self.csvFile = csv.reader(self.inputFile, delimiter=self.delimiter)
				self.isRealCSV = True
			else:
				self.csvFile = self.inputFile
				self.isRealCSV = False
		else:	#writing mode
			if not self.delimiter:
				self.delimiter = '\t'
			self.csvFile = csv.writer(self.inputFile, delimiter=self.delimiter)
			self.isRealCSV = True
			#else:
			#	self.csvFile = self.inputFile
			#	self.isRealCSV = False
		self.col_name2index = None
		
		self._row = None	#2013.08.30 to store the current row being read
		self.headerPattern = re.compile(r'^[a-zA-Z]')	#default header pattern: line beginning with a letter
		self.commentPattern = re.compile(r'^#')	#default comment pattern: line beginning with #
		self.comment_row_list  = []
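# A standalone sketch of the delimiter dispatch used in the reading branch above: only
# tab- or comma-delimited input is wrapped in csv.reader, everything else keeps the raw
# file handle (so callers must split lines themselves).
import csv

def openAsCsvOrRaw(inputFile, delimiter):
	if delimiter == '\t' or delimiter == ',':
		return csv.reader(inputFile, delimiter=delimiter), True	#(reader, isRealCSV)
	return inputFile, False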
Example #41
	def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
		"""
		2009-10-18
			If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
		2009-2-4
			use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
		2008-01-08
			add option skip_1st_line
			stop using csv.reader, use raw file handler instead
			figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
		2008-12-11
			more filtering:
				1. strip the original_name
				2. pick alphanumeric characters out of original_name
			if GeneListType is already in db. check if GeneList has this gene already or not.
		2008-11-20
			use figureOutDelimiter() to get delimiter automatically
		2008-07-15
			if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
		2008-07-15
			use gene_id2original_name to avoid redundancy in gene list
		"""
		import csv, sys, os
		session = db.session
		delimiter=figureOutDelimiter(input_fname)
		inf = open(input_fname)	#2008-11-20
		if skip_1st_line:
			inf.next()	#skips the 1st line
		counter = 0
		success_counter = 0
		gene_id2original_name = {}	#to avoid redundancy in gene list
		for line in inf:
			if line=='\n':	#skip empty lines
				continue
			row = line.split(delimiter)
			original_name = row[0].strip()	#2008-12-11 remove spaces/tabs in the beginning/end
			all_number_p_search_result = self.all_number_p.search(original_name)
			if all_number_p_search_result:	# 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
				gene_id = int(all_number_p_search_result.group(0))
				gene_id_set = set([gene_id])
			else:
				gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)
			
			if gene_id_set==None:
				sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"%(original_name))
			elif len(gene_id_set)==1:
				gene_id = list(gene_id_set)[0]
				if gene_id not in gene_id2original_name:
					gene_id2original_name[gene_id] = original_name
				success_counter += 1
			elif len(gene_id_set)>1:
				sys.stderr.write("Too many gene_ids for %s: %s.\n"%(original_name, gene_id_set))
			elif len(gene_id_set)==0:
				sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n"%(original_name))
			else:
				sys.stderr.write("not supposed to happen: original_name=%s, gene_id_set=%s\n."%(original_name, gene_id_set))
			counter += 1
		del inf
		
		if list_type_name:	#if the short name is given, forget about list_type_id
			glt = GeneListType.query.filter_by(short_name=list_type_name).first()	#try search the db first.
			if not glt:
				glt = GeneListType(short_name=list_type_name)
				session.save(glt)
				session.flush()
		else:	#use the list_type_id to get it
			glt = GeneListType.get(list_type_id)
		glt.original_filename = input_fname	#save the filename
		session.save_or_update(glt)
		
		for gene_id, original_name in gene_id2original_name.iteritems():
			if glt.id:	#2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
				rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
				if rows.count()>0:
					sys.stderr.write("Gene: %s (%s) already with list type %s.\n"%(gene_id, original_name, glt.short_name))
					continue
			gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
			session.save(gl)
		sys.stderr.write("%s/%s linked successfully.\n"%(success_counter, counter))
Example #42
	def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,\
									newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
		"""
		2013.07.03 added argument newSNPDataOutputFormat
			
		2012.10.14
			split out of findSNPPositionOnNewRef()
		"""
		sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
						(querySNPDataFname, newSNPDataOutputFormat))
		"""
Sample  Geno    SNP
1999010 CC      cs_primer1082_247
1999068 CC      cs_primer1082_247
2000022 CT      cs_primer1082_247
2000064 CT      cs_primer1082_247
2000117 CC      cs_primer1082_247

		"""
		inf = utils.openGzipFile(querySNPDataFname)
		reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
		col_name2index = getColName2IndexFromHeader(reader.next())
		
		sampleIndex = col_name2index.get("Sample")
		genotypeIndex = col_name2index.get("Geno")
		SNPIDIndex = col_name2index.get("SNP")
		
		row_id2index = {}
		row_id_ls = []
		col_id_ls = []
		col_id2index = {}
		row_col_index2genotype = {}
		for row in reader:
			sampleID = row[sampleIndex]
			genotype = row[genotypeIndex]
			querySNPID = row[SNPIDIndex]
			if querySNPID in querySNPID2NewReferenceCoordinateLs:
				newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
				if len(newRefCoordinateLs)==1:
					newRefCoordinate = newRefCoordinateLs[0]
					if newSNPDataOutputFormat==2:
						col_id = '%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart)
					else:
						col_id = '%s_%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart, newRefCoordinate.newRefStop)
					queryStrand = newRefCoordinate.queryStrand
					if col_id not in col_id2index:
						col_id2index[col_id] = len(col_id2index)
						col_id_ls.append(col_id)
					if sampleID not in row_id2index:
						row_id2index[sampleID] = len(row_id2index)
						row_id_ls.append(sampleID)
					if queryStrand == "-":
						genotype = SNP.reverseComplement(genotype)
					row_index = row_id2index[sampleID]
					col_index = col_id2index[col_id]
					row_col_index2genotype[(row_index, col_index)] = genotype
				else:
					continue
		data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)
		
		for row_col_index, genotype in row_col_index2genotype.iteritems():
			row_index, col_index = row_col_index[:2]
			data_matrix[row_index, col_index] = SNP.nt2number[genotype]
		sys.stderr.write("\n")
		snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
		snpData.tofile(newSNPDataOutputFname)
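# A small sketch of the new-coordinate column id built above: output format 2 keeps
# chromosome and start only, while the default format also appends the stop position.
def makeNewColumnID(newChr, newRefStart, newRefStop, newSNPDataOutputFormat=1):
	if newSNPDataOutputFormat == 2:
		return '%s_%s'%(newChr, newRefStart)
	return '%s_%s_%s'%(newChr, newRefStart, newRefStop)

#usage: makeNewColumnID('Chr1', 1005, 1005, newSNPDataOutputFormat=2) -> 'Chr1_1005'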
    def putHaplotypeGroupIntoDB(self, session, input_fname,
                                max_snp_typing_error_rate, snp_id_ls):
        """
		2009-4-10
			remove tg_ecotypeid2row
		2009-4-4
			add argument tg_ecotypeid2row
		2009-3-31
		"""
        sys.stderr.write("Constructing haplotype groups ...\n")
        pattern_ecotypeid = re.compile(r'(?<=\))\d+')
        reader = csv.reader(open(input_fname),
                            delimiter=figureOutDelimiter(input_fname))
        col_name2col_index = getColName2IndexFromHeader(reader.next())
        ecotypeid_idx = col_name2col_index['ecotypeid']
        haplo_name_idx = col_name2col_index['haplogroup']
        geographic_integrity_idx = col_name2col_index['geographic_integrity']
        filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
        counter = 0
        #for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
        for row in reader:
            ecotypeid = int(row[ecotypeid_idx])
            #ecotypeid = tg_ecotypeid	#2009-4-4 use tg_ecotypeid instead
            haplo_name = row[haplo_name_idx]
            geographic_integrity_name = row[geographic_integrity_idx]
            filtered_SNPs = row[filtered_SNPs_idx]
            ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
            haplo_group = StockDB.HaploGroup.query.filter_by(
                short_name=haplo_name).first()
            if not haplo_group:
                haplo_group = StockDB.HaploGroup(
                    short_name=haplo_name,
                    ref_ecotypeid=ref_ecotypeid,
                    max_snp_typing_error_rate=max_snp_typing_error_rate)
                session.save(haplo_group)
                session.flush()

            ecotype = StockDB.Ecotype.get(ecotypeid)
            haplo_group.ecotypes.append(ecotype)
            geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(
                short_name=geographic_integrity_name).first()
            if not geographic_integrity:
                geographic_integrity = StockDB.GeographicIntegrity(
                    short_name=geographic_integrity_name)
                session.save(geographic_integrity)
                session.flush()
            ecotype.geographic_integrity = geographic_integrity
            session.save_or_update(ecotype)
            #one bit of ecotype: link the ecotypeid to tg_ecotype_id

            #deal with filtered SNPs
            for i in range(len(filtered_SNPs)):
                allele = filtered_SNPs[i]
                if allele == '_':
                    continue
                fc = StockDB.FilteredCalls(ecotypeid=ecotypeid,
                                           snpid=snp_id_ls[i],
                                           allele=allele)
                session.save(fc)
                session.flush()
            counter += 1
            if counter % 500 == 0 and self.report:
                sys.stderr.write('%s%s' % ('\x08' * 80, counter))
        session.flush()
        sys.stderr.write("Done.\n")
Example #44
	def copyAndReformatResultFile(self, db, inputFname=None, db_entry=None, user=None, output_fname=None):
		"""
		2011-2-22
			Locus are now identified as Snps.id / CNV.id in association result files. (chr, pos) before.
		2009-1-7
			insert float into the middle below
				column_5th=int(float(row[4]))	#int('89.0') would raise an exception
		2008-11-12
			parse lines with column_6(genotype_var_perc) and more (comment)
		2008-09-30
			deal with 5-column file. The 5-th column is minor allele count.
			also return True in the end. return False if error in the middle.
		2008-08-19
			add original_filename to ResultsMethod
		2008-07-16
			if inputFname is neither file name nor file object, exit the program
			better handling of the column_4th and its header
		2008-07-16
			if it's 4-column, the last one is MAF.
			can't deal with segment score anymore.
		2008-05-30
			merged with store_file()
				dump the file onto file system storage if output_fname is given
				db submission is too slow
		2008-05-26
			inputFname from plone is not file object although it has file object interface.
		2008-05-26
			csv.Sniffer() can't figure out delimiter if '\n' is in the string, use own dumb function figureOutDelimiter()
		2008-05-25
			save marker(snps) in database if it's not there.
			use marker id in results table
		2008-05-24
			figure out delimiter automatically
			inputFname could be a file object (from plone)
			phenotype method doesn't go with results anymore. it goes with results_method
		2008-04-28
			changed to use Stock_250kDatabase (SQLAlchemy) to do db submission
		"""
		if (isinstance(inputFname, str) or isinstance(inputFname, unicode)) and os.path.isfile(inputFname):
			sys.stderr.write("Submitting results from %s ..."%(os.path.basename(inputFname)))
			delimiter = figureOutDelimiter(inputFname)
			reader = csv.reader(open(inputFname), delimiter=delimiter)
			db_entry.original_filename = inputFname
		elif hasattr(inputFname, 'readline') or hasattr(inputFname, 'read'):	#inputFname is not a file name, but direct file object. it could also be <ZPublisher.HTTPRequest.FileUpload instance at 0xa1774f4c>
			sys.stderr.write("Submitting results from %s on plone ..."%inputFname.filename)
			cs = csv.Sniffer()
			inputFname.seek(0)	#it's already read by plone to put into data['inputFname'], check results2db_250k.py
			if getattr(inputFname, 'readline', None) is not None:
				test_line = inputFname.readline()
				delimiter = cs.sniff(test_line).delimiter
			else:
				test_line = inputFname.read(200)
				delimiter = figureOutDelimiter(test_line)	#counting is a safer solution. if test_line includes '\n', cs.sniff() won't figure it out.
			inputFname.seek(0)
			reader = csv.reader(inputFname, delimiter=delimiter)
			if getattr(inputFname, 'filename', None):
				db_entry.original_filename = getattr(inputFname, 'filename', None)
			else:
				db_entry.original_filename = getattr(inputFname, 'name', None)
		else:
			sys.stderr.write("Error: %s is neither a file name nor a file object.\n"%inputFname)
			sys.exit(4)
		
		if output_fname:
			if os.path.isfile(output_fname):
				sys.stderr.write("Error: file %s already exists. Skip.\n"%output_fname)
				return False
			writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
		elif self.marker_pos2snp_id is None:
			self.marker_pos2snp_id = self.get_marker_pos2snp_id(db)
		
		header_outputted = 0
		no_of_lines = 0
		
		session = db.session
		for row in reader:
			#check if 1st line is header or not
			if no_of_lines ==0 and self.pa_has_characters.search(row[1]):	#check the 2nd column, which should be strictly digits; the 1st column (chromosome) could be 'X' or similar
				continue
			snp_id = int(row[0])
			if row[1] and row[1]!='0':	#2011-2-22 something on 2nd column. wrong format.
				chr = int(row[0])
				start_pos = int(row[1])
				sys.stderr.write("Error: current version doesn't take chr,pos as marker ID anymore. Has to be one id (either Snps.id or CNV.id).\n")
				sys.exit(4)
			score = row[2]
			stop_pos = None
			column_4th = None
			column_5th = None
			column_6 = None
			rest_of_row = []
			rest_of_header = []
			
			#marker_name = '%s_%s'%(chr, start_pos)	#2011-2-22
			if len(row)>=4:
				column_4th=row[3]
				#stop_pos = int(row[2])
				#score = row[3]
			if len(row)>=5:
				#column_4th=row[3]
				column_5th=int(float(row[4]))	#2009-1-7 int('89.0') would raise an exception
			if len(row)>=6:
				column_6 = row[5]
			if len(row)>=7:
				rest_of_row = row[6:]
				rest_of_header = ['beta%s'%i for i in range(len(rest_of_row))]
				#sys.stderr.write("ERROR: Found %s columns.\n"%(len(row)))
				#return False
			
			if output_fname:	#go to file system
				if not header_outputted:	#3-column or 4-column header
					#if stop_pos is not None:	#2011-2-22
					#	position_header = ['start_position', 'stop_position']
					#else:
					#	position_header = ['position']
					header = ['snp_id', 'none', 'score']	#2011-2-22
					if column_4th is not None:
						header.append('MAF')
					if column_5th is not None:
						header.append('MAC')	#Minor Allele Count
					if column_6 is not None:
						header.append('genotype_var_perc')	#genotype variance percentage
					if rest_of_row:
						header += rest_of_header
					writer.writerow(header)
					header_outputted = 1
				#data_row = [chr, start_pos]	#2011-2-22
				data_row = [snp_id, '']	#2011-2-22
				#if stop_pos is not None:	#2011-2-22
				#	data_row.append(stop_pos)
				data_row.append(score)
				if column_4th is not None:
					data_row.append(column_4th)
				if column_5th is not None:
					data_row.append(column_5th)
					if db_entry.no_of_accessions is None:	#calculate the no_of_accessions based on MAC/MAF
						db_entry.no_of_accessions = int(round(float(column_5th)/float(column_4th)))
				if column_6 is not None:
					data_row.append(column_6)
				if rest_of_row:
					data_row += rest_of_row
				writer.writerow(data_row)
			"""
			# 2011-2-22 store the results directly into db. only for old SNP association results.
			else:
				
				key = (chr, start_pos, stop_pos)
				if key in self.marker_pos2snp_id:
					snps_id = self.marker_pos2snp_id[key]
					if isinstance(snps_id, SNPs):	#it's a new marker object
						r = Results(score=score)
						r.snps = snps_id
					else:	#others are all integer ids
						r = Results(snps_id=snps_id, score=score)
				else:
					#construct a new marker
					marker = SNPs(name=marker_name, chromosome=chr, position=start_pos, end_position=stop_pos, created_by=user)
					#save it in database to get id
					session.add(marker)
					self.marker_pos2snp_id[key] = marker	#for the next time to encounter same marker
					self.is_new_marker_added = True	#set this flag as new marker was inputted into the dict
					r = Results(score=score)
					r.snps = marker
					del marker
				r.results_method = db_entry
				session.add(r)
				del r
			"""
			no_of_lines += 1
		
		del reader
		if output_fname:
			del writer
		sys.stderr.write("Done.\n")
		return 0
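# A standalone sketch of how the reformatted header above grows with the number of
# columns in each input row (column names mirror the code; the 2nd output column is
# intentionally left blank in the new Snps.id/CNV.id format).
def buildOutputHeader(no_of_input_columns):
	header = ['snp_id', 'none', 'score']
	if no_of_input_columns >= 4:
		header.append('MAF')
	if no_of_input_columns >= 5:
		header.append('MAC')	#minor allele count
	if no_of_input_columns >= 6:
		header.append('genotype_var_perc')
	if no_of_input_columns >= 7:
		header += ['beta%s'%i for i in range(no_of_input_columns - 6)]
	return header

#usage: buildOutputHeader(6) -> ['snp_id', 'none', 'score', 'MAF', 'MAC', 'genotype_var_perc']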
Example #45
    def traverse(self):
        """
		2012.1.9
		"""
        newHeader = []
        key2dataLs = {
        }  #key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs
        delimiter = None
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    sys.exit(3)
                else:
                    continue
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
            except:
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()

            try:
                #if isCSVReader:
                header = reader.next()
                #else:
                #	header = inputFile.readline().strip().split()	#whatever splits them
                self.handleNewHeader(header,
                                     newHeader,
                                     self.keyColumnLs,
                                     self.valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:  #2012.8.10
                    inputFile.seek(0)
                    reader = MatrixFile(inputFile=inputFile,
                                        delimiter=delimiter)
            except:  #in case something wrong (i.e. file is empty)
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()

            if reader is not None:
                for row in reader:
                    #if not isCSVReader:
                    #	row = row.strip().split()
                    try:
                        self.handleValueColumns(
                            row,
                            key2dataLs=key2dataLs,
                            keyColumnLs=self.keyColumnLs,
                            valueColumnLs=self.valueColumnLs)
                    except:  #in case something wrong (i.e. file is empty)
                        sys.stderr.write('Ignore this row: %s.\n' % repr(row))
                        sys.stderr.write('Except type: %s\n' %
                                         repr(sys.exc_info()))
                        import traceback
                        traceback.print_exc()
                del reader
        if self.noHeader:  #2012.8.10
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=delimiter,
                                 header=newHeader)
        return returnData
Example #46
	def submit_results(cls, db, input_fname, rm, user, output_fname=None):
		"""
		2009-1-7
			insert float into the middle below
				column_5th=int(float(row[4]))	#int('89.0') would raise an exception
		2008-11-12
			parse lines with column_6(genotype_var_perc) and more (comment)
		2008-09-30
			deal with 5-column file. The 5-th column is minor allele count.
			also return True in the end. return False if error in the middle.
		2008-08-19
			add original_filename to ResultsMethod
		2008-07-16
			if input_fname is neither file name nor file object, exit the program
			better handling of the column_4th and its header
		2008-07-16
			if it's 4-column, the last one is MAF.
			can't deal with segment score anymore.
		2008-05-30
			merged with store_file()
				dump the file onto file system storage if output_fname is given
				db submission is too slow
		2008-05-26
			input_fname from plone is not file object although it has file object interface.
		2008-05-26
			csv.Sniffer() can't figure out delimiter if '\n' is in the string, use own dumb function figureOutDelimiter()
		2008-05-25
			save marker(snps) in database if it's not there.
			use marker id in results table
		2008-05-24
			figure out delimiter automatically
			input_fname could be a file object (from plone)
			phenotype method doesn't go with results anymore. it goes with results_method
		2008-04-28
			changed to use Stock_250kDatabase (SQLAlchemy) to do db submission
		"""
		if isinstance(input_fname, str) and os.path.isfile(input_fname):
			sys.stderr.write("Submitting results from %s ..."%(os.path.basename(input_fname)))
			delimiter = figureOutDelimiter(input_fname)
			reader = csv.reader(open(input_fname), delimiter=delimiter)
			rm.original_filename = input_fname
		elif hasattr(input_fname, 'readline') or hasattr(input_fname, 'read'):	#input_fname is not a file name, but direct file object. it could also be <ZPublisher.HTTPRequest.FileUpload instance at 0xa1774f4c>
			sys.stderr.write("Submitting results from %s on plone ..."%input_fname.filename)
			cs = csv.Sniffer()
			input_fname.seek(0)	#it's already read by plone to put into data['input_fname'], check results2db_250k.py
			if getattr(input_fname, 'readline', None) is not None:
				test_line = input_fname.readline()
				delimiter = cs.sniff(test_line).delimiter
			else:
				test_line = input_fname.read(200)
				delimiter = figureOutDelimiter(test_line)	#counting is a safer solution. if test_line includes '\n', cs.sniff() won't figure it out.
			input_fname.seek(0)
			reader = csv.reader(input_fname, delimiter=delimiter)
			if getattr(input_fname, 'filename', None):
				rm.original_filename = getattr(input_fname, 'filename', None)
			else:
				rm.original_filename = getattr(input_fname, 'name', None)
		else:
			sys.stderr.write("Error: %s is neither a file name nor a file object.\n"%input_fname)
			sys.exit(4)
		
		if output_fname:
			writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
		elif cls.marker_pos2snp_id is None:
			cls.marker_pos2snp_id = cls.get_marker_pos2snp_id(db)
		
		header_outputted = 0
		no_of_lines = 0
		
		session = db.session
		for row in reader:
			#check if 1st line is header or not
			if no_of_lines ==0 and cls.pa_has_characters.search(row[1]):	#check the 2nd column, which should be strictly digits; the 1st column (chromosome) could be 'X' or similar
				continue
			chr = int(row[0])
			start_pos = int(row[1])
			score = row[2]
			stop_pos = None
			column_4th = None
			column_5th = None
			column_6 = None
			rest_of_row = []
			rest_of_header = []
			
			marker_name = '%s_%s'%(chr, start_pos)
			if len(row)>=4:
				column_4th=row[3]
				#stop_pos = int(row[2])
				#score = row[3]
			if len(row)>=5:
				#column_4th=row[3]
				column_5th=int(float(row[4]))	#2009-1-7 int('89.0') would raise an exception
			if len(row)>=6:
				column_6 = row[5]
			if len(row)>=7:
				rest_of_row = row[6:]
				rest_of_header = ['beta%s'%i for i in range(len(rest_of_row))]
				#sys.stderr.write("ERROR: Found %s columns.\n"%(len(row)))
				#return False
			
			if output_fname:	#go to file system
				if not header_outputted:	#3-column or 4-column header
					if stop_pos is not None:
						position_header = ['start_position', 'stop_position']
					else:
						position_header = ['position']
					header = ['chromosome'] + position_header + ['score']
					if column_4th is not None:
						header.append('MAF')
					if column_5th is not None:
						header.append('MAC')	#Minor Allele Count
					if column_6 is not None:
						header.append('genotype_var_perc')	#genotype variance percentage
					if rest_of_row:
						header += rest_of_header
					writer.writerow(header)
					header_outputted = 1
				data_row = [chr, start_pos]
				if stop_pos is not None:
					data_row.append(stop_pos)
				data_row.append(score)
				if column_4th is not None:
					data_row.append(column_4th)
				if column_5th is not None:
					data_row.append(column_5th)
				if column_6 is not None:
					data_row.append(column_6)
				if rest_of_row:
					data_row += rest_of_row
				writer.writerow(data_row)
			else:
				key = (chr, start_pos, stop_pos)
				if key in cls.marker_pos2snp_id:
					snps_id = cls.marker_pos2snp_id[key]
					if isinstance(snps_id, SNPs):	#it's a new marker object
						r = Results(score=score)
						r.snps = snps_id
					else:	#others are all integer ids
						r = Results(snps_id=snps_id, score=score)
				else:
					#construct a new marker
					marker = SNPs(name=marker_name, chromosome=chr, position=start_pos, end_position=stop_pos, created_by=user)
					#save it in database to get id
					session.save(marker)
					cls.marker_pos2snp_id[key] = marker	#for the next time to encounter same marker
					cls.is_new_marker_added = True	#set this flag as new marker was inputted into the dict
					r = Results(score=score)
					r.snps = marker
					del marker
				r.results_method = rm
				session.save(r)
				del r
			no_of_lines += 1
		
		del reader
		if output_fname:
			del writer
		sys.stderr.write("Done.\n")
		return True
Example #47
    def run(self):

        if self.debug:
            import pdb
            pdb.set_trace()

        no_of_result1_peaks_ls = []
        no_of_result2_peaks_ls = []
        fraction_of_result1_peaks_in_result2_ls = []
        fraction_of_result2_peaks_in_result1_ls = []
        no_of_combined_peaks_ls = []
        fraction_of_overlap_in_combined_peaks_ls = []

        for inputFname in self.inputFnameLs:
            reader = csv.reader(open(inputFname),
                                delimiter=figureOutDelimiter(inputFname))
            header = reader.next()
            col_name2index = getColName2IndexFromHeader(header,
                                                        skipEmptyColumn=True)
            no_of_result1_peaks_index = col_name2index.get(
                "no_of_result1_peaks")
            no_of_result2_peaks_index = col_name2index.get(
                "no_of_result2_peaks")
            no_of_result1_peaks_in_result2_index = col_name2index.get(
                "no_of_result1_peaks_in_result2")
            no_of_result2_peaks_in_result1_index = col_name2index.get(
                "no_of_result2_peaks_in_result1")
            for row in reader:
                no_of_result1_peaks = float(row[no_of_result1_peaks_index])
                no_of_result2_peaks = float(row[no_of_result2_peaks_index])
                no_of_result1_peaks_in_result2 = float(
                    row[no_of_result1_peaks_in_result2_index])
                no_of_result2_peaks_in_result1 = float(
                    row[no_of_result2_peaks_in_result1_index])
                no_of_result1_peaks_ls.append(no_of_result1_peaks)
                no_of_result2_peaks_ls.append(no_of_result2_peaks)
                fraction_of_result1_peaks_in_result2_ls.append(
                    no_of_result1_peaks_in_result2 / no_of_result1_peaks)
                fraction_of_result2_peaks_in_result1_ls.append(
                    no_of_result2_peaks_in_result1 / no_of_result2_peaks)
                no_of_combined_peaks_ls.append(no_of_result1_peaks +
                                               no_of_result2_peaks)
                fraction_of_overlap_in_combined_peaks_ls.append(
                    (no_of_result1_peaks_in_result2 +
                     no_of_result2_peaks_in_result1) /
                    (no_of_result1_peaks + no_of_result2_peaks))
            del reader

        title = "%s pairs" % (len(fraction_of_result1_peaks_in_result2_ls))
        if len(fraction_of_result1_peaks_in_result2_ls) > 10:
            medianFraction = numpy.median(
                fraction_of_result1_peaks_in_result2_ls)
            title += " median %.3f" % (medianFraction)
        yh_matplotlib.drawHist(fraction_of_result1_peaks_in_result2_ls, title=title, \
            xlabel_1D="fraction of result1 peaks in result2", xticks=None, \
            outputFname="%s_hist_of_fraction_of_result1_peaks_in_result2.png"%self.outputFnamePrefix, \
            min_no_of_data_points=20, needLog=False, \
            dpi=200)
        title = "%s pairs" % (len(fraction_of_result2_peaks_in_result1_ls))
        if len(fraction_of_result2_peaks_in_result1_ls) > 10:
            medianFraction = numpy.median(
                fraction_of_result2_peaks_in_result1_ls)
            title += " median %.3f" % (medianFraction)
        yh_matplotlib.drawHist(fraction_of_result2_peaks_in_result1_ls, title=title, \
            xlabel_1D="fraction of result2 peaks in result1", xticks=None, \
            outputFname="%s_hist_of_fraction_of_result2_peaks_in_result1.png"%self.outputFnamePrefix, \
            min_no_of_data_points=20, needLog=False, \
            dpi=200)

        title = "%s pairs" % (len(fraction_of_overlap_in_combined_peaks_ls))
        if len(fraction_of_overlap_in_combined_peaks_ls) > 10:
            medianFraction = numpy.median(
                fraction_of_overlap_in_combined_peaks_ls)
            title += " median %.3f" % (medianFraction)
        yh_matplotlib.drawHist(fraction_of_overlap_in_combined_peaks_ls, title=title, \
            xlabel_1D="fraction of recurrent peaks in combined", xticks=None, \
            outputFname="%s_hist_of_fraction_of_recurrent_peaks_in_combined.png"%self.outputFnamePrefix, \
            min_no_of_data_points=20, needLog=False, \
            dpi=200)

        title = "%s results" % (len(no_of_result1_peaks_ls))
        yh_matplotlib.drawScatter(no_of_result1_peaks_ls, no_of_result2_peaks_ls, \
          fig_fname="%s_no_of_peaks_result1_vs_result2.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks in result1', \
          ylabel='No. of peaks in result2', dpi=300)

        title = "%s results" % (len(no_of_result1_peaks_ls))
        yh_matplotlib.drawScatter(no_of_result1_peaks_ls, fraction_of_result1_peaks_in_result2_ls, \
          fig_fname="%s_result1_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks in result1', \
          ylabel='Fraction found in result2', dpi=300)

        title = "%s results" % (len(no_of_result2_peaks_ls))
        yh_matplotlib.drawScatter(no_of_result2_peaks_ls, fraction_of_result2_peaks_in_result1_ls, \
          fig_fname="%s_result2_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks in result2', \
          ylabel='Fraction found in result1', dpi=300)

        title = "%s pairs" % (len(fraction_of_result1_peaks_in_result2_ls))
        yh_matplotlib.drawScatter(fraction_of_result1_peaks_in_result2_ls, fraction_of_result2_peaks_in_result1_ls, \
          fig_fname="%s_1_fraction_in2_vs_2_fraction_in1.png"%self.outputFnamePrefix, \
          title=title, xlabel='result1 fraction found in result2', \
          ylabel='result2 fraction found in result1', dpi=300)

        title = "%s pairs" % (len(no_of_combined_peaks_ls))
        yh_matplotlib.drawScatter(no_of_combined_peaks_ls, fraction_of_overlap_in_combined_peaks_ls, \
          fig_fname="%s_combined_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks combined', \
          ylabel='Fraction recurrent', dpi=300)
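# A small check of the combined-overlap fraction plotted above (made-up counts): with
# 100 result1 peaks, 80 result2 peaks, 60 of result1 found in result2 and 50 of result2
# found in result1, the recurrent fraction is (60 + 50)/(100 + 80) ~= 0.611.
def combinedOverlapFraction(no_of_result1_peaks, no_of_result2_peaks, no_of_result1_peaks_in_result2, no_of_result2_peaks_in_result1):
	return (no_of_result1_peaks_in_result2 + no_of_result2_peaks_in_result1)/float(no_of_result1_peaks + no_of_result2_peaks)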
Example #48
	def traverse(self):
		"""
		2012.8.10 handle self.noHeader
		2012.1.9
		"""
		newHeader = []
		key2dataLs = {}	#key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs 
		delimiter = None
		noOfDataColumnsFromPriorFiles = 0
		for inputFname in self.inputFnameLs:
			if not os.path.isfile(inputFname):
				if self.exitNonZeroIfAnyInputFileInexistent:
					sys.exit(3)
				else:
					continue
			reader = None
			try:
				inputFile = utils.openGzipFile(inputFname)
				delimiter = figureOutDelimiter(inputFile)
				reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
			
			valueColumnLs = []
			try:
				header = reader.next()
				self.handleNewHeader(header, newHeader, self.keyColumnLs, valueColumnLs, keyColumnSet=self.keyColumnSet)
				if self.noHeader:	#2012.8.10
					inputFile.seek(0)
					reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
			except:	#in case something wrong (i.e. file is empty)
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
			
			if reader is not None and valueColumnLs:
				visitedKeySet = set()
				for row in reader:
					try:
						self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs, \
								valueColumnLs=valueColumnLs, noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles, \
								visitedKeySet=visitedKeySet)
					except:	#in case something wrong (i.e. file is empty)
						sys.stderr.write('Ignore this row: %s.\n'%repr(row))
						sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
						import traceback
						traceback.print_exc()
				del reader
				#append empty data to keys who are not present in this current "reader" file
				totalKeySet = set(key2dataLs.keys())
				unvisitedKeySet = totalKeySet - visitedKeySet
				for key in unvisitedKeySet:
					for i in valueColumnLs:
						key2dataLs[key].append('')
			noOfDataColumnsFromPriorFiles += len(valueColumnLs)
		if self.noHeader:	#2012.8.10
			newHeader = None
		returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
		return returnData
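# A minimal sketch of the padding step above: keys already accumulated from earlier
# files but absent from the file just read get one empty string per new value column,
# so every key ends up with the same number of data columns.
def padUnvisitedKeys(key2dataLs, visitedKeySet, no_of_new_value_columns):
	for key in set(key2dataLs.keys()) - visitedKeySet:
		key2dataLs[key].extend(['']*no_of_new_value_columns)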
    def putGeneListIntoDb(self,
                          input_fname,
                          list_type_id,
                          list_type_name,
                          gene_symbol2gene_id_set,
                          db,
                          skip_1st_line=False):
        """
		2009-10-18
			If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
		2009-2-4
			use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
		2008-01-08
			add option skip_1st_line
			stop using csv.reader, use raw file handler instead
			figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
		2008-12-11
			more filtering:
				1. strip the original_name
				2. pick alphanumeric characters out of original_name
			if GeneListType is already in db. check if GeneList has this gene already or not.
		2008-11-20
			use figureOutDelimiter() to get delimiter automatically
		2008-07-15
			if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
		2008-07-15
			use gene_id2original_name to avoid redundancy in gene list
		"""
        import csv, sys, os
        session = db.session
        delimiter = figureOutDelimiter(input_fname)
        inf = open(input_fname)  #2008-11-20
        if skip_1st_line:
            inf.next()  #skips the 1st line
        counter = 0
        success_counter = 0
        gene_id2original_name = {}  #to avoid redundancy in gene list
        for line in inf:
            if line == '\n':  #skip empty lines
                continue
            row = line.split(delimiter)
            original_name = row[0].strip(
            )  #2008-12-11 remove spaces/tabs in the beginning/end
            all_number_p_search_result = self.all_number_p.search(
                original_name)
            if all_number_p_search_result:  # 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
                gene_id = int(all_number_p_search_result.group(0))
                gene_id_set = set([gene_id])
            else:
                gene_id_set = getGeneIDSetGivenAccVer(original_name,
                                                      gene_symbol2gene_id_set)

            if gene_id_set == None:
                sys.stderr.write(
                    "Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"
                    % (original_name))
            elif len(gene_id_set) == 1:
                gene_id = list(gene_id_set)[0]
                if gene_id not in gene_id2original_name:
                    gene_id2original_name[gene_id] = original_name
                success_counter += 1
            elif len(gene_id_set) > 1:
                sys.stderr.write("Too many gene_ids for %s: %s.\n" %
                                 (original_name, gene_id_set))
            elif len(gene_id_set) == 0:
                sys.stderr.write(
                    "Linking to gene id failed for %s. gene_id_set is empty.\n"
                    % (original_name))
            else:
                sys.stderr.write(
                    "not supposed to happen: original_name=%s, gene_id_set=%s\n."
                    % (original_name, gene_id_set))
            counter += 1
        del inf

        if list_type_name:  #if the short name is given, forget about list_type_id
            glt = GeneListType.query.filter_by(
                short_name=list_type_name).first()  #try search the db first.
            if not glt:
                glt = GeneListType(short_name=list_type_name)
                session.save(glt)
                session.flush()
        else:  #use the list_type_id to get it
            glt = GeneListType.get(list_type_id)
        glt.original_filename = input_fname  #save the filename
        session.save_or_update(glt)

        for gene_id, original_name in gene_id2original_name.iteritems():
            if glt.id:  #2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
                rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(
                    list_type_id=glt.id)
                if rows.count() > 0:
                    sys.stderr.write(
                        "Gene: %s (%s) already with list type %s.\n" %
                        (gene_id, original_name, glt.short_name))
                    continue
            gl = GeneList(gene_id=gene_id,
                          list_type=glt,
                          original_name=original_name)
            session.save(gl)
        sys.stderr.write("%s/%s linked successfully.\n" %
                         (success_counter, counter))
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#['trio_set', 'chromosome', 'pos', 'depthOfFather','depthOfMother', 'depthOfChild', 'isInconsistent']
		
		chr_pos2inconsistentData = {}	#key is (chr,pos),
		#value is (noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo)
		sys.stderr.write("Reading from %s files ...\n"%(len(self.inputFnameLs)))
		for inputFname in self.inputFnameLs:
			if not os.path.isfile(inputFname):
				continue
			reader = None
			trioSetStrIndex = None
			chromosomeIndex = None
			posIndex = None
			isInconsistentIndex = None
			try:
				inputFile = utils.openGzipFile(inputFname)
				delimiter = figureOutDelimiter(inputFile)
				reader = csv.reader(inputFile, delimiter=delimiter)
				header = reader.next()
				col_name2index = getColName2IndexFromHeader(header)
				
				trioSetStrIndex = col_name2index.get("#trio_set")
				chromosomeIndex = col_name2index.get("chromosome")
				posIndex = col_name2index.get("pos")
				isInconsistentIndex = col_name2index.get("isInconsistent")
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
			if reader is not None and isInconsistentIndex is not None:
				for row in reader:
					trio_set_str = row[trioSetStrIndex]
					chromosome = row[chromosomeIndex]
					pos = int(row[posIndex])
					isInconsistent = int(row[isInconsistentIndex])
					chr_pos = (chromosome, pos)
					if chr_pos not in chr_pos2inconsistentData:
						chr_pos2inconsistentData[chr_pos] = [0, 0, 0, 0]
					#trio_set_ls = trio_set_str.split(',')
					if trio_set_str.find("0")==0 or trio_set_str.find(",0")!=-1:	#it's a duo. one parent is missing.
						chr_pos2inconsistentData[chr_pos][2] += isInconsistent
						chr_pos2inconsistentData[chr_pos][3] += 1
					else:	#it's a trio
						chr_pos2inconsistentData[chr_pos][0] += isInconsistent
						chr_pos2inconsistentData[chr_pos][1] += 1
						
		sys.stderr.write("Done.\n")
		
		sys.stderr.write("Outputting ...")
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		writer.writerow(['#chromosome', 'pos', 'noOfInconsistencyInTrio', 'noOfTotalInTrio', 'inconsistencyRateInTrio',\
						'noOfInconsistencyInDuo', 'noOfTotalInDuo', 'inconsistencyRateInDuo'])
		chr_pos_ls = chr_pos2inconsistentData.keys()
		chr_pos_ls.sort()
		for chr_pos in chr_pos_ls:
			chromosome, pos = chr_pos
			noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo = chr_pos2inconsistentData.get(chr_pos)
			if noOfTotalInTrio>0:
				inconsistencyRateInTrio = noOfInconsistencyInTrio/float(noOfTotalInTrio)
			else:
				inconsistencyRateInTrio = -1
			if noOfTotalInDuo>0:
				inconsistencyRateInDuo = noOfInconsistencyInDuo/float(noOfTotalInDuo)
			else:
				inconsistencyRateInDuo = -1
			writer.writerow([chromosome, pos, noOfInconsistencyInTrio, noOfTotalInTrio, inconsistencyRateInTrio,\
							noOfInconsistencyInDuo, noOfTotalInDuo, inconsistencyRateInDuo])
		
		del writer
		sys.stderr.write("Done.\n")
	def get_snp_pair2value_type(self, boolean_pair_fname, gene_id_set=None):
		"""
		2009-1-20
			add argument gene_id_set, to limit input only to those genes
			report the progress
		2008-11-25
		"""
		sys.stderr.write("Getting snp_pair2value_type ...\n")
		snp_pair2value_type = {}
		reader = csv.reader(open(boolean_pair_fname), delimiter=figureOutDelimiter(boolean_pair_fname))
		reader.next()
		min_value = None
		max_value = None
		counter = 0
		real_counter = 0
		for row in reader:
			snp1_id, gene1_id, snp2_id, gene2_id, bool_type, pvalue, count1, count2 = row[:8]
			counter += 1
			if not snp2_id:
				snp2_id = snp1_id
				continue	#2008-11-26 skip this row if its p-value comes from a single SNP
			gene1_id = int(gene1_id)
			if not gene2_id:
				gene2_id = gene1_id
			else:
				gene2_id = int(gene2_id)
			pvalue = float(pvalue)
			if pvalue==0:
				pvalue = 15
				bool_type = -1
			else:
				pvalue = -math.log10(float(pvalue))
			
			value = pvalue
			if min_value is None:
				min_value =value
			elif value<min_value:
				min_value = value
			
			if max_value is None:
				max_value = value
			elif value>max_value:
				max_value = value
			
			if gene_id_set is not None and (gene1_id not in gene_id_set or gene2_id not in gene_id_set):
				continue
			real_counter += 1
			
			snp1_id = snp1_id.split('_')
			snp1_id = map(int, snp1_id)
			if len(snp1_id)==2:
				snp1_id.append(0)
			
			snp2_id = snp2_id.split('_')
			snp2_id = map(int, snp2_id)
			if len(snp2_id)==2:
				snp2_id.append(0)
			
			snp_pair = [tuple(snp1_id), tuple(snp2_id)]
			snp_pair.sort()
			snp_pair = tuple(snp_pair)
			
			if bool_type:
				bool_type = int(bool_type)
			else:
				bool_type = 0
			if snp_pair not in snp_pair2value_type:
				snp_pair2value_type[snp_pair] = (pvalue, bool_type)
			else:
				if pvalue>snp_pair2value_type[snp_pair][0]:	#only take maximum
					snp_pair2value_type[snp_pair] = (pvalue, bool_type)
			sys.stderr.write("%s%s\t%s"%('\x08'*40, counter, real_counter))
		del reader
		sys.stderr.write("Done.\n")
		return snp_pair2value_type, min_value, max_value
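# A worked example of the p-value transform above: a p-value of exactly 0 is capped at a
# score of 15 (and its bool_type flagged as -1 in the caller), otherwise the score is
# -log10(p), e.g. p = 1e-4 -> 4.0.
import math

def pvalueToScore(pvalue):
	if pvalue == 0:
		return 15.0
	return -math.log10(pvalue)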
    def run(self):
        """
		2010-5-25
		"""
        if self.debug:
            #for one-node testing purpose
            import pdb
            pdb.set_trace()

        self.communicator = MPI.world.duplicate()
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size -
                                     1)  #exclude the 1st and last node
        free_computing_node_set = Set(free_computing_nodes)
        output_node_rank = self.communicator.size - 1

        # 2010-5-25 to hold final data
        array_id2no_of_blocks_returned = {}
        array_id2col_index2intensity_ls = {}

        if node_rank == 0:
            db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                           username=self.db_user,
                                           password=self.db_passwd,
                                           hostname=self.hostname,
                                           database=self.dbname,
                                           schema=self.schema)
            db.setup(create_tables=False)

            reader = csv.reader(open(self.input_fname),
                                delimiter=figureOutDelimiter(self.input_fname))

            probe_id_ls = reader.next()[2:]
            probe_id_ls = map(int, probe_id_ls)
            chr_ls = reader.next()[2:]
            pos_ls = reader.next()[2:]
            chr_pos_ls = zip(chr_ls, pos_ls)

            commonData = self.prepareCommonData(db, self.blockSize, self.jumpStep, \
               self.x_range, self.y_range, self.minNoOfProbesPerBlock, \
               array_file_directory=self.array_file_directory, probe_id_ls=probe_id_ls,\
               chr_pos_ls=chr_pos_ls, probeType=2, \
               probes_blockData_picklef=self.probes_blockData_picklef)
            param_ls = self.generate_params(reader, blockDataCodedIndex_ls=commonData.blockDataCodedIndex_ls, \
                ref_array_id_set=self.ref_array_id_set) #must be behind prepareCommonData()
            refDataMatrix = self.readInRefArrayData(
                self.input_fname, ref_array_id_set=self.ref_array_id_set)

            if self.communicator.size == 1:  # single-node serial run
                blockDataCodedIndex_ls = commonData.blockDataCodedIndex_ls

                output_dir = os.path.split(self.output_fname)[0]
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir)
                writer = csv.writer(open(self.output_fname, 'w'),
                                    delimiter='\t')
                probe_id_ls = commonData.probe_id_ls
                chr_pos_ls = commonData.chr_pos_ls
                self.writeHeader(writer, probe_id_ls, chr_pos_ls)
                for array_id, ecotype_id, blockIndex, blockIntensity_ls in param_ls:
                    result_ls = self.ltsOneBlockAgainstAllRef(array_id, ecotype_id, blockIndex, blockIntensity_ls,\
                           refDataMatrix, blockDataCodedIndex_ls)
                    self.handleComputingOutputData(result_ls, blockDataCodedIndex_ls, array_id2no_of_blocks_returned, \
                      array_id2col_index2intensity_ls, writer)
                sys.exit(0)

            commonData_pickle = cPickle.dumps(commonData, protocol=-1)
            sys.stderr.write("Passing data to output node %s from %s ... " % (
                output_node_rank,
                node_rank,
            ))
            self.communicator.send(commonData_pickle, output_node_rank, 0)
            sys.stderr.write(".\n")

            refDataMatrix_pickle = cPickle.dumps(refDataMatrix, protocol=-1)
            for node in free_computing_nodes:  #send it to the computing_node
                sys.stderr.write(
                    "passing initial data to nodes from %s to %s ... " %
                    (node_rank, node))
                self.communicator.send(commonData_pickle, node, 0)
                self.communicator.send(refDataMatrix_pickle, node, 0)
                sys.stderr.write(".\n")
            if len(commonData.blockDataCodedIndex_ls) == 0:
                sys.stderr.write("Not a single block is formed. Exit!")
                sys.exit(0)
            del commonData, commonData_pickle, refDataMatrix, refDataMatrix_pickle

        elif node_rank in free_computing_node_set:
            data, source, tag = self.communicator.receiveString(0, 0)
            commonData = cPickle.loads(data)
            if len(commonData.blockDataCodedIndex_ls) == 0:
                sys.stderr.write("Not a single block is formed. Exit!")
                sys.exit(0)
            blockDataCodedIndex_ls = commonData.blockDataCodedIndex_ls
            del data, commonData

            data, source, tag, = self.communicator.receiveString(0, 0)
            refDataMatrix = cPickle.loads(data)
            del data
        else:
            data, source, tag = self.communicator.receiveString(0, 0)
            commonData = cPickle.loads(data)
            if len(commonData.blockDataCodedIndex_ls) == 0:
                sys.stderr.write("Not a single block is formed. Exit!")
                sys.exit(0)
            probe_id_ls = commonData.probe_id_ls
            chr_pos_ls = commonData.chr_pos_ls
            blockDataCodedIndex_ls = commonData.blockDataCodedIndex_ls
            del data, commonData

        self.synchronize()
        if node_rank == 0:
            param_obj = PassingData(param_ls=param_ls,
                                    output_node_rank=output_node_rank,
                                    report=self.report,
                                    counter=0)
            self.inputNode(param_obj,
                           free_computing_nodes,
                           param_generator=param_ls,
                           message_size=self.message_size)
        elif node_rank in free_computing_node_set:
            computing_parameter_obj = PassingData(refDataMatrix=refDataMatrix, \
                     blockDataCodedIndex_ls=blockDataCodedIndex_ls)
            self.computing_node(computing_parameter_obj,
                                self.computing_node_handler,
                                output_node_rank=output_node_rank)
        else:
            output_dir = os.path.split(self.output_fname)[0]
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
            self.writeHeader(writer, probe_id_ls, chr_pos_ls)

            output_param_obj = PassingData(writer=writer, array_id2no_of_blocks_returned=array_id2no_of_blocks_returned,\
                   array_id2col_index2intensity_ls=array_id2col_index2intensity_ls,\
                   blockDataCodedIndex_ls=blockDataCodedIndex_ls)
            self.output_node(free_computing_nodes, output_param_obj,
                             self.output_node_handler)
            del writer
        self.synchronize()  #to avoid some node early exits
Example #53
    def putQCIntoDB(self, session, input_fname, no_of_lines_to_skip, data_source_obj, cnv_type_obj, cnv_method_obj=None, \
       run_type=1, original_id=None, version=1):
        """
		2009-10-28
		"""
        sys.stderr.write("Putting QC data into database ... \n")
        reader = csv.reader(open(input_fname),
                            delimiter=figureOutDelimiter(input_fname))
        input_file_basename = os.path.basename(input_fname)

        if run_type == 7:
            # 2010-6-14 need to skip first 4 lines (3 comment-lines + 1 header)  for nucmer coords file
            no_of_lines_to_skip = 4
        elif run_type == 8:  # skip 2 lines (1 comment-line + 1 header) for breakdancer output from Quan Long
            no_of_lines_to_skip = 2

        col_name2index = None

        for i in range(no_of_lines_to_skip):
            header = reader.next()

        # the last line to be skipped will be the header
        if col_name2index is None:
            col_name2index = getColName2IndexFromHeader(header)

        counter = 0
        for row in reader:
            if run_type == 1:
                db_obj = self.generateCNVQCCallObjFromClark2007(
                    session, row, data_source_obj, cnv_type_obj,
                    cnv_method_obj)
            elif run_type == 2:
                db_obj = self.generateCNVQCCallObjFromSchneebergerOssowski(
                    session, row, data_source_obj, cnv_type_obj,
                    cnv_method_obj)
            elif run_type == 3:
                db_obj = self.generateCNVQCCallObjFromBobSchmitzData(session, row, data_source_obj, cnv_type_obj,\
                              cnv_method_obj, original_id=original_id)
            elif run_type == 4:
                db_obj = self.generateCNVQCCallObjFromLerContigDerivedCNVs(session, row, data_source_obj, cnv_type_obj, \
                           cnv_method_obj=cnv_method_obj, \
                           original_id=original_id, col_name2index=col_name2index)

            elif run_type == 5:
                db_obj = self.generateSequenceFragmentRefPosObjFromLerContigSpansOverCol(session, row, data_source_obj, cnv_type_obj, \
                           cnv_method_obj=cnv_method_obj, \
                           original_id=original_id, col_name2index=col_name2index, version=version)
            elif run_type == 6:
                db_obj = self.generateSequenceFragment2ProbeObj(session, row, data_source_obj, cnv_type_obj, \
                          cnv_method_obj=cnv_method_obj, \
                          original_id=original_id, col_name2index=col_name2index)
            elif run_type == 7:
                db_obj = self.generateSequenceFragmentRefPosObjFromNucmerLerContigSpansOverCol(session, row, data_source_obj, \
                         cnv_type_obj, \
                         cnv_method_obj=cnv_method_obj, \
                         original_id=original_id, col_name2index=col_name2index,\
                         version=version, comment=input_file_basename)
            elif run_type == 8:
                db_obj = self.generateCNVQCCallObjFromQuanLongBreakDancerOutput(session, row, data_source_obj, \
                         cnv_type_obj, cnv_method_obj=cnv_method_obj,\
                         original_id=original_id, col_name2index=col_name2index)
            elif run_type == 9:
                db_obj = self.generateCNVQCCallObjFromQuanLongCoverageDerived(session, row, data_source_obj, \
                        cnv_type_obj=cnv_type_obj, \
                        cnv_method_obj=cnv_method_obj,
                        col_name2index=col_name2index)
            else:
                sys.stderr.write("Run type %s not supported.\n" % run_type)
            if db_obj:
                session.add(db_obj)
                session.flush()
            counter += 1
            if counter % 5000 == 0:
                sys.stderr.write("%s%s" % ('\x08' * 40, counter))
        session.flush()
        sys.stderr.write("%s records. Done.\n" % counter)
Example #54
    def run(self):
        """
		2008-12-02
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, delimiter=delimiter)

        if self.array_id_2nd_column:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                category_list=category_list, data_matrix=data_matrix)
        else:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                data_matrix=data_matrix)  # ignore category_list

        newSnpData, allele_index2allele_ls = snpData.convert2Binary(
            self.report)

        if self.phenotype_fname and self.phenotype_method_id:
            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0)
            phenData = SNPData(
                header=header_phen,
                strain_acc_list=newSnpData.strain_acc_list,
                data_matrix=data_matrix_phen
            )  # row labels come from the SNP matrix because the phenotype matrix is re-ordered to match it below
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                newSnpData.row_id_ls, strain_acc_list_phen,
                phenData.data_matrix)  # tricky: the lookup uses strain_acc_list_phen

            phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
                phenData, Set([self.phenotype_method_id]))[0]
            phenotype_label = phenData.col_id_ls[phenotype_col_index]
            phenotype_f = open(
                '%s_%s.pheno' %
                (self.output_fname_prefix, phenotype_label.replace('/', '_')),
                'w')
            for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                if self.phenotype_is_binary:  #binary and non-binary have different NA designator
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                else:
                    if numpy.isnan(phenotype_value):
                        phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
            del phenotype_f

        genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
        ind_writer = csv.writer(open('%s.ind' % self.output_fname_prefix, 'w'),
                                delimiter='\t')
        snp_writer = csv.writer(open('%s.snp' % self.output_fname_prefix, 'w'),
                                delimiter='\t')

        # transpose so that SNPs become rows and accessions become columns
        newSnpData = transposeSNPData(newSnpData)

        no_of_rows = len(newSnpData.data_matrix)
        no_of_cols = len(newSnpData.data_matrix[0])
        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chr, pos = snp_id.split('_')
            allele1 = allele_index2allele_ls[i][0]  # major allele
            allele2 = allele_index2allele_ls[i][1]  # minor allele
            # .snp columns: SNP id, chromosome, genetic map position (unknown here, so 0.0), physical position, major allele, minor allele
            snp_writer.writerow([snp_id, chr, 0.0, pos, allele1, allele2])
            geno_line = ''
            for j in range(no_of_cols):
                if i == 0:  #write out the accessions
                    ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])
                allele = newSnpData.data_matrix[i][j]
                if allele == 0:
                    geno_line += '0'
                elif allele == 1:
                    geno_line += '2'
                else:
                    geno_line += '9'
            geno_line += '\n'
            genotype_f.write(geno_line)

        del genotype_f, ind_writer, snp_writer
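The loop above writes EIGENSTRAT-style .geno/.ind/.snp files, recoding each binary call as '0' (allele index 0, the major allele), '2' (allele index 1, the minor allele) or '9' (missing). A tiny self-contained illustration of that recoding on made-up data (the toy column and its missing-value code are assumptions for the sketch, not values taken from the run above):

toy_column = [0, 1, 1, -2, 0]    # one SNP across five accessions; -2 stands in for a missing call
geno_line = ''
for allele in toy_column:
    if allele == 0:
        geno_line += '0'
    elif allele == 1:
        geno_line += '2'
    else:
        geno_line += '9'
assert geno_line == '02290'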
Example #55
	def predictALLSegments(self, input_fname, array_id2model_array_id_ls, array_id2model,\
						max_amplitude=-0.1, param_obj=None):
		"""
		2010-7-25
			handle the situation that any arrays has >=3 model-arrays
		2010-7-1
		"""
		sys.stderr.write('Predicting for all segments from %s ... \n'%(input_fname))
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		
		header = reader.next()
		col_name2index = getColName2IndexFromHeader(header)
		median_col_index = col_name2index.get('median')
		ecotype_id_idx = col_name2index.get('ecotype_id', col_name2index.get('array_id'))
		counter = 0
		no_of_segments_in_model = 0
		no_of_predicted_deletions = 0
		for row in reader:
			counter += 1
			amplitude = float(row[col_name2index['amplitude']])
			if amplitude>max_amplitude:
				continue
			cnv_ecotype_id = int(row[ecotype_id_idx])
			array_id = int(row[col_name2index.get('array_id')])
			if array_id not in array_id2model_array_id_ls:
				continue
			no_of_probes = int(row[col_name2index['length']])
			
			start_probe = row[col_name2index['start_probe']].split('_')	# split chr_pos
			start_probe = map(int, start_probe)
			start_probe_id = row[col_name2index['start_probe_id']]
			stop_probe = row[col_name2index['end_probe']].split('_')
			stop_probe = map(int, stop_probe)
			stop_probe_id = row[col_name2index['end_probe_id']]
			
			segment_chromosome = start_probe[0]
			if start_probe[0]!=stop_probe[0]:	# spurious segment: start and stop probes are on different chromosomes
				continue
			segment_start_pos = start_probe[1]-12
			segment_stop_pos = stop_probe[1]+12
			segment_length = abs(segment_stop_pos-segment_start_pos+1)
			
			if median_col_index is not None:
				median_intensity = float(row[median_col_index])
			else:
				median_intensity = None
			cnv_segment_obj = PassingData(ecotype_id=cnv_ecotype_id, start_probe=start_probe, stop_probe=stop_probe,\
												no_of_probes=no_of_probes, amplitude=amplitude, segment_length=segment_length,\
												segment_chromosome=segment_chromosome, array_id=array_id,\
												start_probe_id=start_probe_id, stop_probe_id=stop_probe_id,\
												segment_start_pos=segment_start_pos, segment_stop_pos=segment_stop_pos,\
												median_intensity=median_intensity)
			model_array_id_ls = array_id2model_array_id_ls.get(array_id)
			no_of_segments_in_model += 1
			label_predicted, label_predicted2probability = self.predictOneSegmentByMultipleModels(cnv_segment_obj, \
																	model_array_id_ls, array_id2model)
			if label_predicted==-1:	# predicted to be deletion.
				cnv_segment_obj.probability = label_predicted2probability[-1]
				cnv_segment_obj.comment = 'model arrays: %s'%(repr(model_array_id_ls)[1:-1])
				self.saveSegmentObj(param_obj, cnv_segment_obj)
				no_of_predicted_deletions += 1
			if counter%5000==0:	# report progress every 5000 segments
				sys.stderr.write('%s%s\t%s\t%s'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
		sys.stderr.write('%s%s\t%s\t%s\n'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
		sys.stderr.write('%s out of %s segments were used in prediction. %s predicted deletions.\n'%\
						(no_of_segments_in_model, counter, no_of_predicted_deletions))
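A worked example of the segment-coordinate arithmetic in predictALLSegments: the start position is padded by -12 and the stop by +12 before computing the length (plausibly to span the full 25-mer probe centered on the recorded position; that rationale is an assumption, the code does not state why). Toy numbers only:

# toy probe positions, not taken from real data
start_probe_pos = 1000
stop_probe_pos = 1500
segment_start_pos = start_probe_pos - 12    # 988
segment_stop_pos = stop_probe_pos + 12      # 1512
segment_length = abs(segment_stop_pos - segment_start_pos + 1)
assert segment_length == 525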