Code example #1
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		inconsistent_rate_ls = []
		for inputFname in self.inputFnameLs:
			if os.path.isfile(inputFname):
				try:
					reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
					header = reader.next()
					col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
					inconsistent_rate_index = col_name2index.get("inconsistency")
					for row in reader:
						inconsistency = float(row[inconsistent_rate_index])
						inconsistent_rate_ls.append(inconsistency)
					del reader
				except:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
		
		if self.title is None:
			title = "histogram of inconsistent rate from %s refs"%(len(inconsistent_rate_ls))
		else:
			title = self.title
		if len(inconsistent_rate_ls)>10:
			medianInconsistentRate = numpy.median(inconsistent_rate_ls)
			title += " median %.4f"%(medianInconsistentRate)
		yh_matplotlib.drawHist(inconsistent_rate_ls, title=title, \
									xlabel_1D="Inconsistent Rate", xticks=None, outputFname=self.outputFname, min_no_of_data_points=20, needLog=False, \
									dpi=200)
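
Every example in this collection leans on two helpers, figureOutDelimiter and getColName2IndexFromHeader, whose definitions are not included here. Below is a minimal sketch of what they plausibly do, inferred only from how they are called in these examples; treat it as an assumption, not the real pymodule implementation.

def figureOutDelimiter(input_fname_or_handle):
	# Sniff the delimiter from the first line; accept a path or an open file.
	# The real helper must rewind the handle, since callers pass the same
	# handle straight to csv.reader() afterwards (see code example #21).
	if hasattr(input_fname_or_handle, 'read'):
		first_line = input_fname_or_handle.readline()
		input_fname_or_handle.seek(0)
	else:
		with open(input_fname_or_handle) as f:
			first_line = f.readline()
	for candidate in ('\t', ',', ' '):
		if candidate in first_line:
			return candidate
	return '\t'

def getColName2IndexFromHeader(header, skipEmptyColumn=False):
	# Map each column name in the header row to its 0-based index.
	col_name2index = {}
	for index, name in enumerate(header):
		if skipEmptyColumn and not name:
			continue
		col_name2index[name] = index
	return col_name2index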
Code example #2
	def readDataMatrix(self, inputFname, minExprSumPerGene=180):
		"""
		2012.5.8
		"""
		sys.stderr.write("Reading the gene expression matrix from %s ..."%(inputFname))
		
		suffix = os.path.splitext(inputFname)[1]
		if suffix=='.gz':
			import gzip
			inf = gzip.open(inputFname, 'r')
		else:
			inf = open(inputFname, 'r')
		
		reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
		header = reader.next()	#first line is taken as header
		colName2Index = getColName2IndexFromHeader(header)
		data_matrix = []
		row_id_ls = []
		counter = 0
		real_counter = 0
		for row in reader:
			data_row = row[1:]
			data_row = map(float, data_row)
			exprSumPerGene = sum(data_row)
			counter += 1
			if exprSumPerGene>=minExprSumPerGene:
				real_counter += 1
				row_id_ls.append(row[0])
				data_matrix.append(data_row)
		data_matrix = numpy.array(data_matrix)
		sys.stderr.write("%s rows out of %s selected. %s rows , %s columns.\n"%(real_counter, counter, \
																	len(row_id_ls), len(header)-1))
		return PassingData(row_id_ls=row_id_ls, header=header, data_matrix=data_matrix)
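
The loop above keeps a gene only when its expression values sum to at least minExprSumPerGene. The same filter as a self-contained sketch; a tab-delimited input with the gene ID in the first column is an assumption here.

import csv
import numpy

def filterGenesByTotalExpression(inputFname, minExprSumPerGene=180):
	# Keep rows (genes) whose expression values sum to at least the cutoff.
	reader = csv.reader(open(inputFname), delimiter='\t')
	header = next(reader)	#first line is taken as header
	row_id_ls, data_matrix = [], []
	for row in reader:
		data_row = [float(cell) for cell in row[1:]]
		if sum(data_row) >= minExprSumPerGene:
			row_id_ls.append(row[0])
			data_matrix.append(data_row)
	return row_id_ls, numpy.array(data_matrix)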
Code example #3
	def trioInconsistentRateFileWalker(self, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):
		"""
		2011-11-2
			remove the maxDepth filter. apply afterwards through filterDataByDepth().
		2011-9-30
		
		"""
		reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
		header = reader.next()
		col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
		isInconsistent_index = col_name2index.get("isInconsistent")
		index_of_fa_depth = col_name2index.get("depthOfFather")
		index_of_mo_depth = col_name2index.get('depthOfMother')
		index_of_child_depth = col_name2index.get('depthOfChild')
		for row in reader:
			fa_depth = int(float(row[index_of_fa_depth]))
			mo_depth = int(float(row[index_of_mo_depth]))
			child_depth = int(float(row[index_of_child_depth]))
			isInconsistent = float(row[isInconsistent_index])
			#if fa_depth<=self.maxDepth and mo_depth <=self.maxDepth and child_depth<=self.maxDepth:
			self.fa_depth_ls.append(fa_depth)
			self.mo_depth_ls.append(mo_depth)
			self.child_depth_ls.append(child_depth)
			self.inconsistent_ls.append(isInconsistent)
		del reader
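
The docstring says the maxDepth filter was removed and is applied afterwards through filterDataByDepth(), which is not shown in this excerpt. A plausible sketch of what that method might do over the parallel lists filled above, assuming it keeps entries whose three depths are all within the cap:

def filterDataByDepth(self, maxDepth=50):
	# Keep only entries whose father, mother and child depths are all <= maxDepth.
	keep = [i for i in range(len(self.inconsistent_ls))
		if self.fa_depth_ls[i] <= maxDepth and self.mo_depth_ls[i] <= maxDepth
		and self.child_depth_ls[i] <= maxDepth]
	self.fa_depth_ls = [self.fa_depth_ls[i] for i in keep]
	self.mo_depth_ls = [self.mo_depth_ls[i] for i in keep]
	self.child_depth_ls = [self.child_depth_ls[i] for i in keep]
	self.inconsistent_ls = [self.inconsistent_ls[i] for i in keep]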
Code example #4
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		writer.writerow(['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
		for inputFname in self.inputFnameLs:
			inputFile = utils.openGzipFile(inputFname)
			delimiter = figureOutDelimiter(inputFile)
			reader = csv.reader(inputFile, delimiter=delimiter)
			header = reader.next()
			col_name2index = getColName2IndexFromHeader(header)
			
			intervalIDIndex = col_name2index.get("Target")
			#only the first read group in the output is used, so don't run DepthOfCoverageWalker over multi-read-group bam files
			avgCoverageIndex = 4
			sampleID = header[avgCoverageIndex][:-9]	#this column header is like $sampleID_mean_cvg. so get rid of _mean_cvg
			medianCoverageIndex = 6
			
			for row in reader:
				intervalID = row[intervalIDIndex]
				writer.writerow([sampleID, intervalID, row[avgCoverageIndex], row[medianCoverageIndex]])
		del writer
		sys.stderr.write("Done.\n")
Code example #5
	def putHaplotypeGroupIntoDB(self, session, input_fname, tg_ecotypeid2row, max_snp_typing_error_rate, snp_id_ls):
		"""
		2009-3-31
		2009-4-4
			add argument tg_ecotypeid2row
		"""
		sys.stderr.write("Constructing haplotype groups ...\n")
		pattern_ecotypeid = re.compile(r'(?<=\))\d+')
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		col_name2col_index = getColName2IndexFromHeader(reader.next())
		ecotypeid_idx = col_name2col_index['ecotypeid']
		haplo_name_idx = col_name2col_index['haplogroup']
		geographic_integrity_idx = col_name2col_index['geographic_integrity']
		filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
		counter = 0
		for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
			ecotypeid = int(row[ecotypeid_idx])
			ecotypeid = tg_ecotypeid	#2009-4-4 use tg_ecotypeid instead
			haplo_name = row[haplo_name_idx]
			geographic_integrity_name = row[geographic_integrity_idx]
			filtered_SNPs = row[filtered_SNPs_idx]
			ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
			haplo_group = StockDB.HaploGroup.query.filter_by(short_name=haplo_name).first()
			if not haplo_group:
				haplo_group = StockDB.HaploGroup(short_name=haplo_name, ref_ecotypeid=ref_ecotypeid, max_snp_typing_error_rate=max_snp_typing_error_rate)
				session.save(haplo_group)
				session.flush()
			
			ecotype = StockDB.Ecotype.get(ecotypeid)
			haplo_group.ecotypes.append(ecotype)
			geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(short_name=geographic_integrity_name).first()
			if not geographic_integrity:
				geographic_integrity = StockDB.GeographicIntegrity(short_name=geographic_integrity_name)
				session.save(geographic_integrity)
				session.flush()
			ecotype.geographic_integrity = geographic_integrity
			session.save_or_update(ecotype)
			#one bit of ecotype: link the ecotypeid to tg_ecotype_id
			
			
			#deal with filtered SNPs
			for i in range(len(filtered_SNPs)):
				allele = filtered_SNPs[i]
				if allele=='_':
					continue
				fc = StockDB.FilteredCalls(ecotypeid=ecotypeid, snpid=snp_id_ls[i], allele=allele)
				session.save(fc)
				session.flush()
			counter += 1
			if counter%500==0 and self.report:
				sys.stderr.write('%s%s'%('\x08'*80, counter))
		session.flush()
		sys.stderr.write("Done.\n")
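
The query-then-create dance around StockDB.HaploGroup and StockDB.GeographicIntegrity is the classic get-or-create pattern. A generic sketch under the same assumptions as the code above (an Elixir-style model with a .query attribute and a session exposing save()/flush()):

def getOrCreate(session, model_class, **kwargs):
	# Fetch the first row matching kwargs, or create, save and flush a new one.
	instance = model_class.query.filter_by(**kwargs).first()
	if not instance:
		instance = model_class(**kwargs)
		session.save(instance)	# session.add() in modern SQLAlchemy
		session.flush()
	return instance

Note that the original creates the HaploGroup with attributes beyond the lookup key (ref_ecotypeid, max_snp_typing_error_rate), so a faithful refactor would separate the lookup keys from the creation defaults.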
Code example #6
File: DrawLD.py Project: bopopescu/gwasmodules
    def drawLD(self,
               axe_LD,
               LD_fname,
               row_snp_region,
               col_snp_region,
               min_MAF,
               which_LD_statistic,
               min_gap=100000):
        """
		2008-10-03
		"""
        sys.stderr.write("Drawing LD from %s ...\n" % (LD_fname))
        reader = csv.reader(open(LD_fname), delimiter='\t')
        col_name2index = getColName2IndexFromHeader(reader.next())
        counter = 0
        real_counter = 0
        xylim_data = PassingData(xlim=[-1, -1], ylim=[-1, -1])
        for row in reader:
            snp1 = row[col_name2index['snp1']].split('_')
            snp1 = tuple(map(int, snp1))
            snp2 = row[col_name2index['snp2']].split('_')
            snp2 = tuple(map(int, snp2))
            allele1_freq = float(row[col_name2index['allele1_freq']])
            allele2_freq = float(row[col_name2index['allele2_freq']])
            if allele1_freq >= min_MAF and allele2_freq >= min_MAF:  #meet the minimum minor-allele-frequency
                LD_stat = float(row[col_name2index[LD_statistic.get_name(
                    which_LD_statistic)]])
                LD_stat = abs(LD_stat)
                fc = self.r2ToRGBColor(LD_stat)
                if snp1 in row_snp_region.chr_pos2adjacent_window and snp2 in col_snp_region.chr_pos2adjacent_window:
                    if snp1[0] == snp2[0] and abs(
                            snp1[1] - snp2[1]) <= min_gap:  #too close, ignore
                        continue
                    self.addOnePolygon(axe_LD, row_snp_region.chr_pos2adjacent_window[snp1], col_snp_region.chr_pos2adjacent_window[snp2], \
                        fc, xylim_data)
                    real_counter += 1
                if snp2 in row_snp_region.chr_pos2adjacent_window and snp1 in col_snp_region.chr_pos2adjacent_window:
                    if snp1[0] == snp2[0] and abs(
                            snp1[1] - snp2[1]) <= min_gap:  #too close, ignore
                        continue
                    self.addOnePolygon(axe_LD, row_snp_region.chr_pos2adjacent_window[snp2], col_snp_region.chr_pos2adjacent_window[snp1], \
                        fc, xylim_data)
                    real_counter += 1
            counter += 1
            if counter % 100000 == 0:
                sys.stderr.write('%s\t%s' % ('\x08' * 100, counter))
                if counter % 500000 == 0 and self.debug > 0:
                    break
        del reader
        sys.stderr.write("%s LD drawn. Done.\n" % real_counter)
        return xylim_data
Code example #7
	def getSampleID2FamilyCount(self, inputFname):
		"""
		2012.3.29
		"""
		sys.stderr.write("Getting sampleID2FamilyCount from %s ..."%(inputFname))
		reader = csv.reader(open(inputFname, 'r'), delimiter=figureOutDelimiter(inputFname))
		header = reader.next()
		colName2Index = getColName2IndexFromHeader(header)
		sampleID2FamilyCount = {}
		for row in reader:
			individualID = row[colName2Index.get("individualID")]
			familyCount = int(row[colName2Index.get("familyCount")])
			sampleID2FamilyCount[individualID] = familyCount
		sys.stderr.write("%s individuals.\n"%(len(sampleID2FamilyCount)))
		return sampleID2FamilyCount
Code example #8
    def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
        """
		2009-4-10
			not used. decided to keep all of them.
		2009-4-4
			retain only one row out of duplicated ecotype rows based on ecotypeid2tg_ecotypeid.
				it's not random. usually the one with same ecotype id as tg_ecotypeid unless tg_ecotypeid doesn't appear.
			if duplicated ecotypes belong to different haplotype group, choose the one with tg_ecotypeid otherwise random.
		"""
        sys.stderr.write("Dropping redundant ecotypes ...\n")
        reader = csv.reader(open(input_fname),
                            delimiter=figureOutDelimiter(input_fname))
        col_name2col_index = getColName2IndexFromHeader(reader.next())
        ecotypeid_idx = col_name2col_index['ecotypeid']
        haplo_name_idx = col_name2col_index['haplogroup']
        nativename_idx = col_name2col_index['nativename']
        tg_ecotypeid2row = {}
        no_of_duplicates = 0
        no_of_duplicates_with_different_haplogroups = 0
        counter = 0
        for row in reader:
            ecotypeid = int(row[ecotypeid_idx])
            haplo_name = row[haplo_name_idx]
            nativename = row[nativename_idx]
            if ecotypeid in ecotypeid2tg_ecotypeid:
                tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
                if tg_ecotypeid not in tg_ecotypeid2row:
                    tg_ecotypeid2row[tg_ecotypeid] = row
                else:
                    no_of_duplicates += 1
                    old_row = tg_ecotypeid2row[tg_ecotypeid]
                    old_ecotypeid = int(old_row[ecotypeid_idx])
                    old_haplo_name = old_row[haplo_name_idx]
                    old_nativename = old_row[nativename_idx]
                    if old_haplo_name != haplo_name:
                        sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n"%\
                             (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
                        no_of_duplicates_with_different_haplogroups += 1
                    if ecotypeid == tg_ecotypeid:  #replace if the new ecotypeid matching the tg_ecotypeid whether the haplotype group is same or not.
                        tg_ecotypeid2row[tg_ecotypeid] = row
            else:
                sys.stderr.write(
                    "Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n" %
                    (ecotypeid))
            counter += 1
        sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n"%\
             (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
        return tg_ecotypeid2row
Code example #9
    def save_LD(self, session, LD_fname, call_method_id, commit=0):
        """
		2008-10-15
			adapted from DrawSNPRegion.get_LD() 
		"""
        sys.stderr.write("Reading in LD info from %s ...\n" % (LD_fname))
        reader = csv.reader(open(LD_fname), delimiter='\t')
        col_name2index = getColName2IndexFromHeader(reader.next())
        counter = 0
        for row in reader:
            snp1 = row[col_name2index['snp1']].split('_')
            snp1 = map(int, snp1)
            snp2 = row[col_name2index['snp2']].split('_')
            snp2 = map(int, snp2)
            allele1_freq = float(row[col_name2index['allele1_freq']])
            allele2_freq = float(row[col_name2index['allele2_freq']])
            r2 = float(row[col_name2index['r2']])
            D_prime = float(row[col_name2index['D_prime']])
            D = float(row[col_name2index['D']])
            if snp1 < snp2:
                snp_pair = (snp1[0], snp1[1], snp2[0], snp2[1])
            else:
                snp_pair = (snp2[0], snp2[1], snp1[0], snp1[1])
            no_of_pairs = int(
                float(row[col_name2index['no_of_pairs']]) / 2
            )  #MpiLD.py outputs this double (due to haploid regarded as diploid)
            ld = Stock_250kDB.LD(snp1_maf=allele1_freq,
                                 snp2_maf=allele2_freq,
                                 d=D,
                                 d_prime=D_prime,
                                 r2=r2,
                                 no_of_pairs=no_of_pairs)
            ld.chr1 = snp_pair[0]
            ld.pos1 = snp_pair[1]
            ld.chr2 = snp_pair[2]
            ld.pos2 = snp_pair[3]
            ld.call_method_id = call_method_id
            if commit:
                session.save(ld)
                session.flush()
            counter += 1
            if counter % 100000 == 0:
                sys.stderr.write('%s\t%s' % ('\x08' * 100, counter))
            if counter % 1000 == 0 and self.debug > 0:
                break
        sys.stderr.write("%s entries. Done.\n" % counter)
Code example #10
    def getScoreRankFromRBG(self, rbg, candidate_gene_set, results_directory):
        """
		2008-09-28
			rename getScoreRank to getScoreRankFromRBG
		"""
        sys.stderr.write("Getting score & rank list ...")
        if results_directory:  #given a directory where all results are.
            result_fname = os.path.join(results_directory,
                                        os.path.basename(rbg.filename))
        else:
            result_fname = rbg.filename
        if not os.path.isfile(result_fname):
            sys.stderr.write("%s doesn't exist.\n" % result_fname)
            return None
        #if rbg.results_method.analysis_method_id==13:
        #	sys.stderr.write("Skip analysis_method_id=13.\n")
        #	return None
        reader = csv.reader(open(result_fname), delimiter='\t')
        col_name2index = getColName2IndexFromHeader(reader.next())
        counter = 0
        candidate_score_ls = []
        non_candidate_score_ls = []
        candidate_rank_ls = []
        non_candidate_rank_ls = []
        for row in reader:
            gene_id = int(row[col_name2index['gene_id']])
            score = float(row[col_name2index['score']])
            if gene_id in candidate_gene_set:
                candidate_score_ls.append(score)
                candidate_rank_ls.append(counter)
            else:
                non_candidate_score_ls.append(score)
                non_candidate_rank_ls.append(counter)
            counter += 1
        del reader
        analysis_method = Stock_250kDB.AnalysisMethod.get(
            rbg.results_method.analysis_method_id)

        score_rank_data = PassingData(candidate_score_ls=candidate_score_ls, candidate_rank_ls=candidate_rank_ls,\
              non_candidate_score_ls=non_candidate_score_ls, non_candidate_rank_ls=non_candidate_rank_ls,\
              analysis_method=analysis_method)

        sys.stderr.write("Done.\n")
        return score_rank_data
Code example #11
	def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
		"""
		2009-4-4
			retain only one row out of duplicated ecotype rows based on ecotypeid2tg_ecotypeid.
				it's not random. usually the one with same ecotype id as tg_ecotypeid unless tg_ecotypeid doesn't appear.
			if duplicated ecotypes belong to different haplotype group, choose the one with tg_ecotypeid otherwise random.
		"""
		sys.stderr.write("Dropping redundant ecotypes ...\n")
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		col_name2col_index = getColName2IndexFromHeader(reader.next())
		ecotypeid_idx = col_name2col_index['ecotypeid']
		haplo_name_idx = col_name2col_index['haplogroup']
		nativename_idx = col_name2col_index['nativename']
		tg_ecotypeid2row = {}
		no_of_duplicates = 0
		no_of_duplicates_with_different_haplogroups = 0
		counter = 0
		for row in reader:
			ecotypeid = int(row[ecotypeid_idx])
			haplo_name = row[haplo_name_idx]
			nativename = row[nativename_idx]
			if ecotypeid in ecotypeid2tg_ecotypeid:
				tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
				if tg_ecotypeid not in tg_ecotypeid2row:
					tg_ecotypeid2row[tg_ecotypeid] = row
				else:
					no_of_duplicates += 1
					old_row = tg_ecotypeid2row[tg_ecotypeid]
					old_ecotypeid = int(old_row[ecotypeid_idx])
					old_haplo_name = old_row[haplo_name_idx]
					old_nativename = old_row[nativename_idx]
					if old_haplo_name!=haplo_name:
						sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n"%\
										 (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
						no_of_duplicates_with_different_haplogroups += 1
					if ecotypeid==tg_ecotypeid:	#replace if the new ecotypeid matching the tg_ecotypeid whether the haplotype group is same or not.
						tg_ecotypeid2row[tg_ecotypeid] = row
			else:
				sys.stderr.write("Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n"%(ecotypeid))
			counter += 1
		sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n"%\
						 (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
		return tg_ecotypeid2row
Code example #12
	def getScoreRankFromRBG(self, rbg, candidate_gene_set, results_directory):
		"""
		2008-09-28
			rename getScoreRank to getScoreRankFromRBG
		"""
		sys.stderr.write("Getting score & rank list ...")
		if results_directory:	#given a directory where all results are.
			result_fname = os.path.join(results_directory, os.path.basename(rbg.filename))
		else:
			result_fname = rbg.filename
		if not os.path.isfile(result_fname):
			sys.stderr.write("%s doesn't exist.\n"%result_fname)
			return None
		#if rbg.results_method.analysis_method_id==13:
		#	sys.stderr.write("Skip analysis_method_id=13.\n")
		#	return None
		reader = csv.reader(open(result_fname), delimiter='\t')
		col_name2index = getColName2IndexFromHeader(reader.next())
		counter = 0
		candidate_score_ls = []
		non_candidate_score_ls = []
		candidate_rank_ls = []
		non_candidate_rank_ls = []
		for row in reader:
			gene_id = int(row[col_name2index['gene_id']])
			score = float(row[col_name2index['score']])
			if gene_id in candidate_gene_set:
				candidate_score_ls.append(score)
				candidate_rank_ls.append(counter)
			else:
				non_candidate_score_ls.append(score)
				non_candidate_rank_ls.append(counter)
			counter += 1
		del reader
		analysis_method = Stock_250kDB.AnalysisMethod.get(rbg.results_method.analysis_method_id)
		
		score_rank_data = PassingData(candidate_score_ls=candidate_score_ls, candidate_rank_ls=candidate_rank_ls,\
								non_candidate_score_ls=non_candidate_score_ls, non_candidate_rank_ls=non_candidate_rank_ls,\
								analysis_method=analysis_method)
		
		sys.stderr.write("Done.\n")
		return score_rank_data
Code example #13
	def run(self):
		"""
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		inf = utils.openGzipFile(self.inputFname, openMode='r')
		
		reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
		header = None
		for i in xrange(self.noOfLinesInHeader):
			if i==0:
				header = reader.next()	#first line is taken as header
			else:
				reader.next()
		if header is not None:
			colName2Index = getColName2IndexFromHeader(header)
		
		newHeader = ['alignmentID', 'total_base_count', 'sampled_base_count', 'meanDepth', 'medianDepth', 'modeDepth']
		inputStatLs = []
		
		writer = csv.writer(utils.openGzipFile(self.outputFname, openMode='w'), delimiter='\t')
		writer.writerow(newHeader)
		counter = 0
		real_counter = 0
		for row in reader:
			counter += 1
			if real_counter <= self.maxNumberOfSamplings:
				r = random.random()
				if r<=self.fractionToSample:
					inputStatLs.append(float(row[self.whichColumn]))
					real_counter += 1
		
		meanDepth = numpy.mean(inputStatLs)
		medianDepth = numpy.median(inputStatLs)
		modeDepth = scipy.stats.mode(inputStatLs)[0][0]
		outputRow = [self.alignmentID, counter, real_counter, meanDepth, medianDepth, modeDepth]
		writer.writerow(outputRow)
		del writer
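
The sampler above accepts each row with probability fractionToSample and simply stops accepting once maxNumberOfSamplings is reached, which biases the sample toward early rows. If a uniform fixed-size sample is wanted, standard reservoir sampling (Algorithm R; not part of the original code) does it in one pass:

import random

def reservoirSample(iterable, sampleSize):
	# Keep a uniform random sample of sampleSize items from a stream of
	# unknown length, replacing earlier picks with decreasing probability.
	reservoir = []
	for i, item in enumerate(iterable):
		if i < sampleSize:
			reservoir.append(item)
		else:
			j = random.randint(0, i)
			if j < sampleSize:
				reservoir[j] = item
	return reservoir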
Code example #14
	def save_LD(self, session, LD_fname, call_method_id, commit=0):
		"""
		2008-10-15
			adapted from DrawSNPRegion.get_LD() 
		"""
		sys.stderr.write("Reading in LD info from %s ...\n"%(LD_fname))
		reader = csv.reader(open(LD_fname), delimiter='\t')
		col_name2index = getColName2IndexFromHeader(reader.next())
		counter = 0
		for row in reader:
			snp1 = row[col_name2index['snp1']].split('_')
			snp1 = map(int, snp1)
			snp2 = row[col_name2index['snp2']].split('_')
			snp2 = map(int, snp2)
			allele1_freq = float(row[col_name2index['allele1_freq']])
			allele2_freq = float(row[col_name2index['allele2_freq']])
			r2 = float(row[col_name2index['r2']])
			D_prime = float(row[col_name2index['D_prime']])
			D = float(row[col_name2index['D']])
			if snp1<snp2:
				snp_pair = (snp1[0], snp1[1], snp2[0], snp2[1])
			else:
				snp_pair = (snp2[0], snp2[1], snp1[0], snp1[1])
			no_of_pairs = int(float(row[col_name2index['no_of_pairs']])/2)	#MpiLD.py outputs this double (due to haploid regarded as diploid)
			ld = Stock_250kDB.LD(snp1_maf=allele1_freq, snp2_maf=allele2_freq, d=D, d_prime=D_prime, r2=r2, no_of_pairs=no_of_pairs)
			ld.chr1 = snp_pair[0]
			ld.pos1 = snp_pair[1]
			ld.chr2 = snp_pair[2]
			ld.pos2 = snp_pair[3]
			ld.call_method_id = call_method_id
			if commit:
				session.save(ld)
				session.flush()
			counter += 1
			if counter%100000==0:
				sys.stderr.write('%s\t%s'%('\x08'*100, counter))
			if counter%1000==0 and self.debug>0:
				break
		sys.stderr.write("%s entries. Done.\n"%counter)
Code example #15
	def trioInconsistentRateFileWalker(self, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):
		"""
		2012.10.25 only skip except during file opening, not file reading

		2011-9-30
		"""
		try:
			reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
			header = reader.next()
			col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
		except:
			sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
			import traceback
			traceback.print_exc()
			return
		inconsistent_rate_index = col_name2index.get("inconsistency")
		if run_type==1:
			index_of_x_data = col_name2index.get("stopFrequency")
		elif run_type==2:
			index_of_x_data = col_name2index.get("stop")
		else:
			sys.stderr.write("Unsupported run_type %s in trioInconsistentRateFileWalker().\n"%(run_type))
			sys.exit(3)
		index_of_no_of_total = col_name2index.get("no_of_total")
		inconsistent_rate_ls = []
		x_ls = []
		for row in reader:
			if self.samplingRate<1 and self.samplingRate>=0:
				r = random.random()
				if r>self.samplingRate:
					continue
			no_of_total = int(float(row[index_of_no_of_total]))
			if no_of_total<=minNoOfTotal:
				continue
			inconsistency = float(row[inconsistent_rate_index])
			inconsistent_rate_ls.append(inconsistency)
			x_data = float(row[index_of_x_data])
			x_ls.append(x_data)
		processFunc(x_ls, inconsistent_rate_ls)
		del reader
Code example #16
	def getTopGeneSet(self, rbg, results_directory, no_of_top_genes=1000):
		"""
		2008-09-29
			get a set of top genes
		"""
		if results_directory:	#given a directory where all results are.
			result_fname = os.path.join(results_directory, os.path.basename(rbg.filename))
		else:
			result_fname = rbg.filename
		if not os.path.isfile(result_fname):
			sys.stderr.write("%s doesn't exist.\n"%result_fname)
			return None
		reader = csv.reader(open(result_fname), delimiter='\t')
		col_name2index = getColName2IndexFromHeader(reader.next())
		counter = 0
		gene_set = set()
		for row in reader:
			gene_id = int(row[col_name2index['gene_id']])
			gene_set.add(gene_id)
			counter += 1
			if no_of_top_genes is not None and counter>=no_of_top_genes:
				break
		del reader
		return gene_set
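
Since the result file is assumed to be pre-sorted by rank, the counter/break loop above is equivalent to taking the first no_of_top_genes data rows. A compact sketch of the same idea:

import csv
from itertools import islice

def getTopGeneSet(result_fname, no_of_top_genes=1000):
	# Take gene IDs from the first no_of_top_genes data rows; the file
	# is assumed to be tab-delimited and sorted by rank already.
	reader = csv.reader(open(result_fname), delimiter='\t')
	gene_id_index = next(reader).index('gene_id')
	return set(int(row[gene_id_index]) for row in islice(reader, no_of_top_genes))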
Code example #17
	def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,\
									newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
		"""
		2013.07.03 added argument newSNPDataOutputFormat
			
		2012.10.14
			split out of findSNPPositionOnNewRef()
		"""
		sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
						(querySNPDataFname, newSNPDataOutputFormat))
		"""
Sample  Geno    SNP
1999010 CC      cs_primer1082_247
1999068 CC      cs_primer1082_247
2000022 CT      cs_primer1082_247
2000064 CT      cs_primer1082_247
2000117 CC      cs_primer1082_247

		"""
		inf = utils.openGzipFile(querySNPDataFname)
		reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
		col_name2index = getColName2IndexFromHeader(reader.next())
		
		sampleIndex = col_name2index.get("Sample")
		genotypeIndex = col_name2index.get("Geno")
		SNPIDIndex = col_name2index.get("SNP")
		
		row_id2index = {}
		row_id_ls = []
		col_id_ls = []
		col_id2index = {}
		row_col_index2genotype = {}
		for row in reader:
			sampleID = row[sampleIndex]
			genotype = row[genotypeIndex]
			querySNPID = row[SNPIDIndex]
			if querySNPID in querySNPID2NewReferenceCoordinateLs:
				newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
				if len(newRefCoordinateLs)==1:
					newRefCoordinate = newRefCoordinateLs[0]
					if newSNPDataOutputFormat==2:
						col_id = '%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart)
					else:
						col_id = '%s_%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart, newRefCoordinate.newRefStop)
					queryStrand = newRefCoordinate.queryStrand
					if col_id not in col_id2index:
						col_id2index[col_id] = len(col_id2index)
						col_id_ls.append(col_id)
					if sampleID not in row_id2index:
						row_id2index[sampleID] = len(row_id2index)
						row_id_ls.append(sampleID)
					if queryStrand == "-":
						genotype = SNP.reverseComplement(genotype)
					row_index = row_id2index[sampleID]
					col_index = col_id2index[col_id]
					row_col_index2genotype[(row_index, col_index)] = genotype
				else:
					continue
		data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)
		
		for row_col_index, genotype in row_col_index2genotype.iteritems():
			row_index, col_index = row_col_index[:2]
			data_matrix[row_index, col_index] = SNP.nt2number[genotype]
		sys.stderr.write("\n")
		snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
		snpData.tofile(newSNPDataOutputFname)
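
SNP.reverseComplement() flips a genotype when the query aligned to the minus strand; its definition is not shown in this excerpt. A minimal stand-in sketch:

def reverseComplement(genotype):
	# Complement each base and reverse the string, e.g. "CT" -> "AG".
	complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
	return ''.join(complement[base] for base in reversed(genotype.upper()))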
Code example #18
    def putQCIntoDB(self, session, input_fname, no_of_lines_to_skip, data_source_obj, cnv_type_obj, cnv_method_obj=None, \
       run_type=1, original_id=None, version=1):
        """
		2009-10-28
		"""
        sys.stderr.write("Putting QC data into database ... \n")
        reader = csv.reader(open(input_fname),
                            delimiter=figureOutDelimiter(input_fname))
        input_file_basename = os.path.basename(input_fname)

        if run_type == 7:
            # 2010-6-14 need to skip first 4 lines (3 comment-lines + 1 header)  for nucmer coords file
            no_of_lines_to_skip = 4
        elif run_type == 8:  # skip 2 lines (1 comment-line + 1 header) for breakdancer output from Quan Long
            no_of_lines_to_skip = 2

        col_name2index = None

        for i in range(no_of_lines_to_skip):
            header = reader.next()

        # the last line to be skipped will be the header
        if col_name2index is None:
            col_name2index = getColName2IndexFromHeader(header)

        counter = 0
        for row in reader:
            if run_type == 1:
                db_obj = self.generateCNVQCCallObjFromClark2007(
                    session, row, data_source_obj, cnv_type_obj,
                    cnv_method_obj)
            elif run_type == 2:
                db_obj = self.generateCNVQCCallObjFromSchneebergerOssowski(
                    session, row, data_source_obj, cnv_type_obj,
                    cnv_method_obj)
            elif run_type == 3:
                db_obj = self.generateCNVQCCallObjFromBobSchmitzData(session, row, data_source_obj, cnv_type_obj,\
                              cnv_method_obj, original_id=original_id)
            elif run_type == 4:
                db_obj = self.generateCNVQCCallObjFromLerContigDerivedCNVs(session, row, data_source_obj, cnv_type_obj, \
                           cnv_method_obj=cnv_method_obj, \
                           original_id=original_id, col_name2index=col_name2index)

            elif run_type == 5:
                db_obj = self.generateSequenceFragmentRefPosObjFromLerContigSpansOverCol(session, row, data_source_obj, cnv_type_obj, \
                           cnv_method_obj=cnv_method_obj, \
                           original_id=original_id, col_name2index=col_name2index, version=version)
            elif run_type == 6:
                db_obj = self.generateSequenceFragment2ProbeObj(session, row, data_source_obj, cnv_type_obj, \
                          cnv_method_obj=cnv_method_obj, \
                          original_id=original_id, col_name2index=col_name2index)
            elif run_type == 7:
                db_obj = self.generateSequenceFragmentRefPosObjFromNucmerLerContigSpansOverCol(session, row, data_source_obj, \
                         cnv_type_obj, \
                         cnv_method_obj=cnv_method_obj, \
                         original_id=original_id, col_name2index=col_name2index,\
                         version=version, comment=input_file_basename)
            elif run_type == 8:
                db_obj = self.generateCNVQCCallObjFromQuanLongBreakDancerOutput(session, row, data_source_obj, \
                         cnv_type_obj, cnv_method_obj=cnv_method_obj,\
                         original_id=original_id, col_name2index=col_name2index)
            elif run_type == 9:
                db_obj = self.generateCNVQCCallObjFromQuanLongCoverageDerived(session, row, data_source_obj, \
                        cnv_type_obj=cnv_type_obj, \
                        cnv_method_obj=cnv_method_obj,
                        col_name2index=col_name2index)
            else:
                sys.stderr.write("Run type %s not supported.\n" % run_type)
            if db_obj:
                session.add(db_obj)
                session.flush()
            counter += 1
            if counter % 5000 == 0:
                sys.stderr.write("%s%s" % ('\x08' * 40, counter))
        session.flush()
        sys.stderr.write("%s records. Done.\n" % counter)
Code example #19
	def findSNPPositionOnNewRef(self, SNPFlankSequenceFname=None, blastHitResultFname=None, \
							querySNPDataFname=None,\
							querySNPID2NewRefCoordinateOutputFname=None, newSNPDataOutputFname=None, minAlignmentSpan=10):
		"""
		2012.10.8
			argument minAlignmentSpan: the number of bases involved in the blast query-target alignment
		2012.8.19
			newSNPDataOutputFname will contain the individual X SNP matrix.
		"""
		if SNPFlankSequenceFname:
			querySNPID2attributes = self.getQuerySNPID2attributes(SNPFlankSequenceFname=SNPFlankSequenceFname)
		else:
			querySNPID2attributes = None
		
		sys.stderr.write("Finding blast reference coordinates for SNPs from %s ... \n"%(blastHitResultFname))
		reader = csv.reader(open(blastHitResultFname), delimiter='\t')
		header = reader.next()
		col_name2index = getColName2IndexFromHeader(header)
		
		#every coordinate in blastHitResultFname is 1-based.
		"""
queryID queryStart      queryEnd        queryLength     targetChr       targetStart     targetStop      targetLength    noOfIdentities  noOfMismatches  identityPercentage
34804_309       1       417     417     Contig293       2551654 2552070 3001801 413     4       0.9904076738609112
43608_166       1       574     574     Contig269       1565599 1566170 3181654 565     9       0.9843205574912892
44412_392       2       580     580     Contig269       1776095 1776673 3181654 577     3       0.9948275862068966

		"""
		queryIDIndex = col_name2index['queryID']
		queryStartIndex = col_name2index['queryStart']
		queryEndIndex = col_name2index['queryEnd']
		
		targetChrIndex = col_name2index['targetChr']
		targetStartIndex = col_name2index['targetStart']
		targetStopIndex = col_name2index['targetStop']
		querySNPID2NewReferenceCoordinateLs = {}
		counter = 0
		real_counter = 0
		queryIDSet= set()
		for row in reader:
			queryID = row[queryIDIndex].split()[0]	##get rid of extra comment
			queryStart = int(row[queryStartIndex])
			queryEnd = int(row[queryEndIndex])
			
			targetChr = row[targetChrIndex]
			targetStart = int(row[targetStartIndex])
			targetStop = int(row[targetStopIndex])
			
			queryIDSet.add(queryID)
			
			queryAlignmentSpan = abs(queryEnd-queryStart) + 1
			targetAlignmentSpan = abs(targetStop-targetStart) + 1
			if queryAlignmentSpan == targetAlignmentSpan:
				if querySNPID2attributes and queryID in querySNPID2attributes:
					parseData = querySNPID2attributes.get(queryID)
					locusSpan = parseData.locusSpan
				else:
					parseData = self.parseQueryLocusID(queryID)
					start = parseData.start
					stop = parseData.stop
					if start is not None and stop is not None:
						stop = int(stop)
						start = int(start)
						locusSpan = abs(int(stop)-start)	#length-1
					else:
						locusSpan = None
				positionInFlank = parseData.positionInFlank
				queryRefBase = parseData.refBase
				queryAltBase = parseData.altBase
				if positionInFlank is not None and locusSpan is not None:
					positionInFlank = int(positionInFlank)
					if targetAlignmentSpan>=minAlignmentSpan and queryAlignmentSpan>=minAlignmentSpan:
						if queryStart <queryEnd and positionInFlank>queryStart and positionInFlank<queryEnd:
							#locus must be in the middle of queryStart and queryEnd.
							newRefStart =  targetStart + (positionInFlank - queryStart)
							newRefStop =  targetStop - (queryEnd - positionInFlank-locusSpan)
							queryStrand = "+"
							#query alignment start/stop are always in ascending order, regardless of strand
							queryAlignmentStart = max(1, parseData.start - (positionInFlank-1) + (queryStart-1))
							queryAlignmentStop = queryAlignmentStart + targetAlignmentSpan-1
						
						elif queryStart >queryEnd and positionInFlank<queryStart and positionInFlank>queryEnd:
							#could happen. on the opposite strand. targetStart is always bigger than targetStop
							#locus must be in the middle of queryStart and queryEnd.
							newRefStart=  targetStop - (positionInFlank-queryEnd)
							newRefStop =  targetStart + (queryStart - positionInFlank-locusSpan)
							queryStrand = "-"
							#query alignment start/stop are always in ascending order, regardless of strand
							queryAlignmentStart = max(1, parseData.start - (positionInFlank-1) + (queryEnd-1))
							queryAlignmentStop = queryAlignmentStart + targetAlignmentSpan-1
						
						else:
							newRefStart = None
							newRefStop = None
						if newRefStart is not None and newRefStop is not None:
							if queryID not in querySNPID2NewReferenceCoordinateLs:
								querySNPID2NewReferenceCoordinateLs[queryID] = []
							
							newRefCoordinate = PassingData(newChr=targetChr, newRefStart=newRefStart, newRefStop=newRefStop, \
												queryStrand=queryStrand, newRefBase="", \
												targetAlignmentSpan=targetAlignmentSpan,\
												targetAlignmentStart=targetStart,\
												targetAlignmentStop=targetStop,\
												queryAlignmentSpan=queryAlignmentSpan,\
												queryAlignmentStart=queryAlignmentStart,\
												queryAlignmentStop=queryAlignmentStop,\
												queryChromosome=parseData.chromosome, \
												queryStart=parseData.start, queryStop=parseData.stop,\
												queryRefBase=queryRefBase, queryAltBase=queryAltBase )
							querySNPID2NewReferenceCoordinateLs[queryID].append(newRefCoordinate)
							real_counter += 1
						
			counter += 1
		sys.stderr.write(" from %s blast results. %s/%s SNPs found blast-reference coordinates.\n"%\
						(counter, real_counter, len(queryIDSet)))
		
		
		if querySNPDataFname and newSNPDataOutputFname:
			self.outputSNPDataInNewCoordinate(querySNPDataFname=querySNPDataFname, \
									querySNPID2NewReferenceCoordinateLs=querySNPID2NewReferenceCoordinateLs, \
									newSNPDataOutputFname=newSNPDataOutputFname, \
									newSNPDataOutputFormat=self.newSNPDataOutputFormat)
		return querySNPID2NewReferenceCoordinateLs
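
On the plus strand, the SNP's new coordinate is the alignment's target start plus the SNP's offset inside the flank. A worked check using the first row of the sample blast output in the docstring above; positionInFlank and locusSpan are invented for illustration.

# Flank 34804_309 aligns query 1..417 onto Contig293 2551654..2552070 (plus strand).
queryStart, queryEnd = 1, 417
targetStart, targetStop = 2551654, 2552070
positionInFlank, locusSpan = 209, 0	# single-base SNP at flank position 209 (assumed)
newRefStart = targetStart + (positionInFlank - queryStart)
newRefStop = targetStop - (queryEnd - positionInFlank - locusSpan)
assert newRefStart == newRefStop == 2551862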
Code example #20
    def putHaplotypeGroupIntoDB(self, session, input_fname,
                                max_snp_typing_error_rate, snp_id_ls):
        """
		2009-4-10
			remove tg_ecotypeid2row
		2009-4-4
			add argument tg_ecotypeid2row
		2009-3-31
		"""
        sys.stderr.write("Constructing haplotype groups ...\n")
        pattern_ecotypeid = re.compile(r'(?<=\))\d+')
        reader = csv.reader(open(input_fname),
                            delimiter=figureOutDelimiter(input_fname))
        col_name2col_index = getColName2IndexFromHeader(reader.next())
        ecotypeid_idx = col_name2col_index['ecotypeid']
        haplo_name_idx = col_name2col_index['haplogroup']
        geographic_integrity_idx = col_name2col_index['geographic_integrity']
        filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
        counter = 0
        #for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
        for row in reader:
            ecotypeid = int(row[ecotypeid_idx])
            #ecotypeid = tg_ecotypeid	#2009-4-4 use tg_ecotypeid instead
            haplo_name = row[haplo_name_idx]
            geographic_integrity_name = row[geographic_integrity_idx]
            filtered_SNPs = row[filtered_SNPs_idx]
            ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
            haplo_group = StockDB.HaploGroup.query.filter_by(
                short_name=haplo_name).first()
            if not haplo_group:
                haplo_group = StockDB.HaploGroup(
                    short_name=haplo_name,
                    ref_ecotypeid=ref_ecotypeid,
                    max_snp_typing_error_rate=max_snp_typing_error_rate)
                session.save(haplo_group)
                session.flush()

            ecotype = StockDB.Ecotype.get(ecotypeid)
            haplo_group.ecotypes.append(ecotype)
            geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(
                short_name=geographic_integrity_name).first()
            if not geographic_integrity:
                geographic_integrity = StockDB.GeographicIntegrity(
                    short_name=geographic_integrity_name)
                session.save(geographic_integrity)
                session.flush()
            ecotype.geographic_integrity = geographic_integrity
            session.save_or_update(ecotype)
            #one bit of ecotype: link the ecotypeid to tg_ecotype_id

            #deal with filtered SNPs
            for i in range(len(filtered_SNPs)):
                allele = filtered_SNPs[i]
                if allele == '_':
                    continue
                fc = StockDB.FilteredCalls(ecotypeid=ecotypeid,
                                           snpid=snp_id_ls[i],
                                           allele=allele)
                session.save(fc)
                session.flush()
            counter += 1
            if counter % 500 == 0 and self.report:
                sys.stderr.write('%s%s' % ('\x08' * 80, counter))
        session.flush()
        sys.stderr.write("Done.\n")
Code example #21
	def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None, run_type=1, \
									chrColumnHeader='CHR', minChrLength=1000000, chrLengthColumnHeader='chrLength',\
									xColumnHeader="BIN_START", valueForNonPositiveYValue=-1):
		"""
		2012.10.26 skip sites if chr_cumu_start is not available
		2012.10.25 only skip except during file opening, not file reading
		2012.9.18 chrLengthColumnHeader could be nothing
		2012.8.31 add argument valueForNonPositiveYValue
		2012.8.13 bugfix. pass inf to figureOutDelimiter
		2012.8.1
		2011-11-2
			remove the maxDepth filter. apply afterwards through filterDataByDepth().
		2011-9-30
		
		"""
		sys.stderr.write("walking through %s ..."%(inputFname))
		counter =0
		chr2xy_ls = self.chr2xy_ls
		try:
			inf = utils.openGzipFile(inputFname)
			delimiter=figureOutDelimiter(inf)	#2012.8.13 bugfix. pass inf to figureOutDelimiter
			sys.stderr.write(" delimiter is '%s'  "%(delimiter))
			reader = csv.reader(inf, delimiter=delimiter)
			header = reader.next()
			col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
		except:	#in case something wrong (i.e. file is empty)
			sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
			import traceback
			traceback.print_exc()
			print sys.exc_info()
			return
		
		chr_id_index = col_name2index.get(chrColumnHeader, None)
		if chr_id_index is None:
			chr_id_index = col_name2index.get("CHROM", None)
		if chr_id_index is None:
			chr_id_index = col_name2index.get("CHR", None)
		if chr_id_index is None:
			sys.stderr.write("Error chr_id_index is None.\n")
			sys.exit(3)
		bin_start_index = col_name2index.get(xColumnHeader, None)
		if chrLengthColumnHeader:	#could be nothing
			chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
		else:
			chrLength_index = None
		if self.whichColumnHeader:
			whichColumn = col_name2index.get(self.whichColumnHeader, None)
		else:
			whichColumn = self.whichColumn
		
		for row in reader:
			if self.samplingRate<1 and self.samplingRate>=0:
				r = random.random()
				if r>self.samplingRate:
					continue
			if chrLength_index is not None:
				chrLength = int(row[chrLength_index])
				if chrLength<minChrLength:
					continue
			chr_id = row[chr_id_index]
			bin_start = int(float(row[bin_start_index]))
			
			yValue = row[whichColumn]
			yValue = self.handleYValue(yValue)
			
			if chr_id not in chr2xy_ls:
				chr2xy_ls[chr_id] = [[],[]]
			chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
			if chr_cumu_start is None:	#2012.10.26 skip sites
				sys.stderr.write("Chromosome %s does not have chr_cumu_start.\n"%(chr_id))
				continue
			chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
			chr2xy_ls[chr_id][1].append(yValue)
			counter += 1
		del reader
		inf.close()
		sys.stderr.write("%s data.\n"%(counter))
Code example #22
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#['trio_set', 'chromosome', 'pos', 'depthOfFather','depthOfMother', 'depthOfChild', 'isInconsistent']
		
		chr_pos2inconsistentData = {}	#key is (chr,pos),
		#value is (noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo)
		sys.stderr.write("Reading from %s files ...\n"%(len(self.inputFnameLs)))
		for inputFname in self.inputFnameLs:
			if not os.path.isfile(inputFname):
				continue
			reader = None
			trioSetStrIndex = None
			chromosomeIndex = None
			posIndex = None
			isInconsistentIndex = None
			try:
				inputFile = utils.openGzipFile(inputFname)
				delimiter = figureOutDelimiter(inputFile)
				reader = csv.reader(inputFile, delimiter=delimiter)
				header = reader.next()
				col_name2index = getColName2IndexFromHeader(header)
				
				trioSetStrIndex = col_name2index.get("#trio_set")
				chromosomeIndex = col_name2index.get("chromosome")
				posIndex = col_name2index.get("pos")
				isInconsistentIndex = col_name2index.get("isInconsistent")
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
			if reader is not None and isInconsistentIndex is not None:
				for row in reader:
					trio_set_str = row[trioSetStrIndex]
					chromosome = row[chromosomeIndex]
					pos = int(row[posIndex])
					isInconsistent = int(row[isInconsistentIndex])
					chr_pos = (chromosome, pos)
					if chr_pos not in chr_pos2inconsistentData:
						chr_pos2inconsistentData[chr_pos] = [0, 0, 0, 0]
					#trio_set_ls = trio_set_str.split(',')
					if trio_set_str.find("0")==0 or trio_set_str.find(",0")!=-1:	#it's a duo. one parent is missing.
						chr_pos2inconsistentData[chr_pos][2] += isInconsistent
						chr_pos2inconsistentData[chr_pos][3] += 1
					else:	#it's a trio
						chr_pos2inconsistentData[chr_pos][0] += isInconsistent
						chr_pos2inconsistentData[chr_pos][1] += 1
						
		sys.stderr.write("Done.\n")
		
		sys.stderr.write("Outputting ...")
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		writer.writerow(['#chromosome', 'pos', 'noOfInconsistencyInTrio', 'noOfTotalInTrio', 'inconsistencyRateInTrio',\
						'noOfInconsistencyInDuo', 'noOfTotalInDuo', 'inconsistencyRateInDuo'])
		chr_pos_ls = chr_pos2inconsistentData.keys()
		chr_pos_ls.sort()
		for chr_pos in chr_pos_ls:
			chromosome, pos = chr_pos
			noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo = chr_pos2inconsistentData.get(chr_pos)
			if noOfTotalInTrio>0:
				inconsistencyRateInTrio = noOfInconsistencyInTrio/float(noOfTotalInTrio)
			else:
				inconsistencyRateInTrio = -1
			if noOfTotalInDuo>0:
				inconsistencyRateInDuo = noOfInconsistencyInDuo/float(noOfTotalInDuo)
			else:
				inconsistencyRateInDuo = -1
			writer.writerow([chromosome, pos, noOfInconsistencyInTrio, noOfTotalInTrio, inconsistencyRateInTrio,\
							noOfInconsistencyInDuo, noOfTotalInDuo, inconsistencyRateInDuo])
		
		del writer
		sys.stderr.write("Done.\n")
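
Both rate computations above guard against an empty denominator and use -1 as a "no data" sentinel. Factored into a tiny helper (hypothetical, not in the original):

def safeRate(numerator, denominator, default=-1):
	# Return numerator/denominator, or the sentinel when there is no data,
	# so downstream consumers can tell "no observations" apart from a rate of 0.
	if denominator > 0:
		return numerator / float(denominator)
	return default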
Code example #23
	def predictALLSegments(self, input_fname, array_id2model_array_id_ls, array_id2model,\
						max_amplitude=-0.1, param_obj=None):
		"""
		2010-7-25
			handle the situation that any arrays has >=3 model-arrays
		2010-7-1
		"""
		sys.stderr.write('Predicting for all segments from %s ... \n'%(input_fname))
		reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
		
		header = reader.next()
		col_name2index = getColName2IndexFromHeader(header)
		median_col_index = col_name2index.get('median')
		ecotype_id_idx = col_name2index.get('ecotype_id', col_name2index.get('array_id'))
		counter = 0
		no_of_segments_in_model = 0
		no_of_predicted_deletions = 0
		for row in reader:
			counter += 1
			amplitude = float(row[col_name2index['amplitude']])
			if amplitude>max_amplitude:
				continue
			cnv_ecotype_id = int(row[ecotype_id_idx])
			array_id = int(row[col_name2index.get('array_id')])
			if array_id not in array_id2model_array_id_ls:
				continue
			no_of_probes = int(row[col_name2index['length']])
			
			start_probe = row[col_name2index['start_probe']].split('_')	# split chr_pos
			start_probe = map(int, start_probe)
			start_probe_id = row[col_name2index['start_probe_id']]
			stop_probe = row[col_name2index['end_probe']].split('_')
			stop_probe = map(int, stop_probe)
			stop_probe_id = row[col_name2index['end_probe_id']]
			
			segment_chromosome = start_probe[0]
			if start_probe[0]!=stop_probe[0]:	#spurious. on different chromosomes.
				continue
			segment_start_pos = start_probe[1]-12
			segment_stop_pos = stop_probe[1]+12
			segment_length = abs(segment_stop_pos-segment_start_pos+1)
			
			if median_col_index is not None:
				median_intensity = float(row[median_col_index])
			else:
				median_intensity = None
			cnv_segment_obj = PassingData(ecotype_id=cnv_ecotype_id, start_probe=start_probe, stop_probe=stop_probe,\
												no_of_probes=no_of_probes, amplitude=amplitude, segment_length=segment_length,\
												segment_chromosome=segment_chromosome, array_id=array_id,\
												start_probe_id=start_probe_id, stop_probe_id=stop_probe_id,\
												segment_start_pos=segment_start_pos, segment_stop_pos=segment_stop_pos,\
												median_intensity=median_intensity)
			model_array_id_ls = array_id2model_array_id_ls.get(array_id)
			no_of_segments_in_model += 1
			label_predicted, label_predicted2probability = self.predictOneSegmentByMultipleModels(cnv_segment_obj, \
																	model_array_id_ls, array_id2model)
			if label_predicted==-1:	# predicted to be deletion.
				cnv_segment_obj.probability = label_predicted2probability[-1]
				cnv_segment_obj.comment = 'model arrays: %s'%(repr(model_array_id_ls)[1:-1])
				self.saveSegmentObj(param_obj, cnv_segment_obj)
				no_of_predicted_deletions += 1
			if no_of_predicted_deletions%5000==0:
				sys.stderr.write('%s%s\t%s\t%s'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
		sys.stderr.write('%s%s\t%s\t%s\n'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
		sys.stderr.write('%s out of %s segments were used in prediction. %s predicted deletions.\n'%\
						(no_of_segments_in_model, counter, no_of_predicted_deletions))
Code example #24
    def run(self):

        if self.debug:
            import pdb
            pdb.set_trace()

        no_of_result1_peaks_ls = []
        no_of_result2_peaks_ls = []
        fraction_of_result1_peaks_in_result2_ls = []
        fraction_of_result2_peaks_in_result1_ls = []
        no_of_combined_peaks_ls = []
        fraction_of_overlap_in_combined_peaks_ls = []

        for inputFname in self.inputFnameLs:
            reader = csv.reader(open(inputFname),
                                delimiter=figureOutDelimiter(inputFname))
            header = reader.next()
            col_name2index = getColName2IndexFromHeader(header,
                                                        skipEmptyColumn=True)
            no_of_result1_peaks_index = col_name2index.get(
                "no_of_result1_peaks")
            no_of_result2_peaks_index = col_name2index.get(
                "no_of_result2_peaks")
            no_of_result1_peaks_in_result2_index = col_name2index.get(
                "no_of_result1_peaks_in_result2")
            no_of_result2_peaks_in_result1_index = col_name2index.get(
                "no_of_result2_peaks_in_result1")
            for row in reader:
                no_of_result1_peaks = float(row[no_of_result1_peaks_index])
                no_of_result2_peaks = float(row[no_of_result2_peaks_index])
                no_of_result1_peaks_in_result2 = float(
                    row[no_of_result1_peaks_in_result2_index])
                no_of_result2_peaks_in_result1 = float(
                    row[no_of_result2_peaks_in_result1_index])
                no_of_result1_peaks_ls.append(no_of_result1_peaks)
                no_of_result2_peaks_ls.append(no_of_result2_peaks)
                fraction_of_result1_peaks_in_result2_ls.append(
                    no_of_result1_peaks_in_result2 / no_of_result1_peaks)
                fraction_of_result2_peaks_in_result1_ls.append(
                    no_of_result2_peaks_in_result1 / no_of_result2_peaks)
                no_of_combined_peaks_ls.append(no_of_result1_peaks +
                                               no_of_result2_peaks)
                fraction_of_overlap_in_combined_peaks_ls.append(
                    (no_of_result1_peaks_in_result2 +
                     no_of_result2_peaks_in_result1) /
                    (no_of_result1_peaks + no_of_result2_peaks))
            del reader

        title = "%s pairs" % (len(fraction_of_result1_peaks_in_result2_ls))
        if len(fraction_of_result1_peaks_in_result2_ls) > 10:
            medianFraction = numpy.median(
                fraction_of_result1_peaks_in_result2_ls)
            title += " median %.3f" % (medianFraction)
        yh_matplotlib.drawHist(fraction_of_result1_peaks_in_result2_ls, title=title, \
            xlabel_1D="fraction of result1 peaks in result2", xticks=None, \
            outputFname="%s_hist_of_fraction_of_result1_peaks_in_result2.png"%self.outputFnamePrefix, \
            min_no_of_data_points=20, needLog=False, \
            dpi=200)
        title = "%s pairs" % (len(fraction_of_result2_peaks_in_result1_ls))
        if len(fraction_of_result2_peaks_in_result1_ls) > 10:
            medianFraction = numpy.median(
                fraction_of_result2_peaks_in_result1_ls)
            title += " median %.3f" % (medianFraction)
        yh_matplotlib.drawHist(fraction_of_result2_peaks_in_result1_ls, title=title, \
            xlabel_1D="fraction of result2 peaks in result1", xticks=None, \
            outputFname="%s_hist_of_fraction_of_result2_peaks_in_result1.png"%self.outputFnamePrefix, \
            min_no_of_data_points=20, needLog=False, \
            dpi=200)

        title = "%s pairs" % (len(fraction_of_overlap_in_combined_peaks_ls))
        if len(fraction_of_overlap_in_combined_peaks_ls) > 10:
            medianFraction = numpy.median(
                fraction_of_overlap_in_combined_peaks_ls)
            title += " median %.3f" % (medianFraction)
        yh_matplotlib.drawHist(fraction_of_overlap_in_combined_peaks_ls, title=title, \
            xlabel_1D="fraction of recurrent peaks in combined", xticks=None, \
            outputFname="%s_hist_of_fraction_of_recurrent_peaks_in_combined.png"%self.outputFnamePrefix, \
            min_no_of_data_points=20, needLog=False, \
            dpi=200)

        title = "%s results" % (len(no_of_result1_peaks_ls))
        yh_matplotlib.drawScatter(no_of_result1_peaks_ls, no_of_result2_peaks_ls, \
          fig_fname="%s_no_of_peaks_result1_vs_result2.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks in result1', \
          ylabel='No. of peaks in result2', dpi=300)

        title = "%s results" % (len(no_of_result1_peaks_ls))
        yh_matplotlib.drawScatter(no_of_result1_peaks_ls, fraction_of_result1_peaks_in_result2_ls, \
          fig_fname="%s_result1_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks in result1', \
          ylabel='Fraction found in result2', dpi=300)

        title = "%s results" % (len(no_of_result2_peaks_ls))
        yh_matplotlib.drawScatter(no_of_result2_peaks_ls, fraction_of_result2_peaks_in_result1_ls, \
          fig_fname="%s_result2_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks in result2', \
          ylabel='Fraction found in result1', dpi=300)

        title = "%s pairs" % (len(fraction_of_result1_peaks_in_result2_ls))
        yh_matplotlib.drawScatter(fraction_of_result1_peaks_in_result2_ls, fraction_of_result2_peaks_in_result1_ls, \
          fig_fname="%s_1_fraction_in2_vs_2_fraction_in1.png"%self.outputFnamePrefix, \
          title=title, xlabel='result1 fraction found in result2', \
          ylabel='result2 fraction found in result1', dpi=300)

        title = "%s pairs" % (len(no_of_combined_peaks_ls))
        yh_matplotlib.drawScatter(no_of_combined_peaks_ls, fraction_of_overlap_in_combined_peaks_ls, \
          fig_fname="%s_combined_no_of_peak_vs_fraction.png"%self.outputFnamePrefix, \
          title=title, xlabel='No. of peaks combined', \
          ylabel='Fraction recurrent', dpi=300)