def run(self):
		"""
		Draw a histogram of the "inconsistency" column pooled over all input files.
		
		Every path in self.inputFnameLs that is an existing file is parsed as a delimited table
		(delimiter chosen by figureOutDelimiter()). Each value of its "inconsistency" column is
		converted to float and pooled. A file without that column is skipped with a warning.
		Any other failure while reading a file is reported on stderr (with traceback) and the
		remaining files are still processed; values read before the failure are kept.
		
		The histogram is written to self.outputFname. The title is self.title if it is not None,
		otherwise a default that mentions the number of pooled values. The median is appended
		to the title when more than 10 values were pooled.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		inconsistent_rate_ls = []
		for inputFname in self.inputFnameLs:
			if not os.path.isfile(inputFname):
				continue
			try:
				#"with" guarantees the handle is closed, even if parsing fails half-way
				with open(inputFname) as inf:
					reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
					header = next(reader)	#built-in next() works on python 2.6+ and 3
					col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
					inconsistent_rate_index = col_name2index.get("inconsistency")
					if inconsistent_rate_index is None:
						#say what is wrong instead of failing later on row[None]
						sys.stderr.write("Warning: no 'inconsistency' column in %s. File skipped.\n"%(inputFname))
						continue
					for row in reader:
						inconsistent_rate_ls.append(float(row[inconsistent_rate_index]))
			except Exception:
				#best-effort: report and carry on with the next file.
				#Exception (not a bare except) lets Ctrl-C and SystemExit through.
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
		
		if self.title is None:
			title = "histogram of inconsistent rate from %s refs"%(len(inconsistent_rate_ls))
		else:
			title = self.title
		if len(inconsistent_rate_ls)>10:
			medianInconsistentRate = numpy.median(inconsistent_rate_ls)
			title += " median %.4f"%(medianInconsistentRate)
		yh_matplotlib.drawHist(inconsistent_rate_ls, title=title,
							xlabel_1D="Inconsistent Rate", xticks=None, outputFname=self.outputFname,
							min_no_of_data_points=20, needLog=False,
							dpi=200)
	def constructPedigreeGraphFromPlinkIBD(self, inputFname=None, maxDistanceToPOVector=0.04, drawDistribution=False, outputFnamePrefix=None):
		"""
		Build a directed parent->child graph from a plink IBD table.
		
		The IID1, IID2, Z0, Z1 and Z2 columns of inputFname are read through MatrixFile.
		For every row, the Euclidean distance between (Z0, Z1, Z2) and the vector (0, 1, 0)
		is computed. A row whose distance is <= maxDistanceToPOVector becomes an edge,
		weighted by that distance, from the numerically smaller ID (parent) to the larger
		ID (child). NOTE(review): this presumes a larger ID means a younger animal - confirm.
		
		When both drawDistribution and outputFnamePrefix are true, the distances of all rows
		are plotted to <outputFnamePrefix>_IBDVector2POVectorDist_hist.png.
		
		Returns a PassingData holding DG (the nx.DiGraph) and childNodeSet (set of child IDs).
		
		2012.8.14
		"""
		sys.stderr.write("Constructing pedigree-graph out of plink-ibd %s ..."%(inputFname))
		pedigreeGraph = nx.DiGraph()
		childIDSet = set()
		ibdFile = MatrixFile(inputFname)
		ibdFile.constructColName2IndexFromHeader()
		
		#resolve all column positions once, before walking through the rows
		firstIDColumn = ibdFile.getColIndexGivenColHeader("IID1")
		secondIDColumn = ibdFile.getColIndexGivenColHeader("IID2")
		z0Column = ibdFile.getColIndexGivenColHeader("Z0")
		z1Column = ibdFile.getColIndexGivenColHeader("Z1")
		z2Column = ibdFile.getColIndexGivenColHeader("Z2")
		
		parentOffspringVector = numpy.array([0,1,0.0])
		keepDistances = drawDistribution and outputFnamePrefix
		noOfRows = 0
		noOfPOPairs = 0
		distance_ls = []
		
		for row in ibdFile:
			#IDs are cast to int so that the two of them can be ordered numerically
			firstID = int(row[firstIDColumn])
			secondID = int(row[secondIDColumn])
			ibdVector = numpy.array([float(row[z0Column]), float(row[z1Column]), float(row[z2Column])])
			distance = numpy.linalg.norm(parentOffspringVector-ibdVector)
			if keepDistances:
				distance_ls.append(distance)
			if distance<=maxDistanceToPOVector:
				if firstID>secondID:
					childID, parentID = firstID, secondID
				else:
					childID, parentID = secondID, firstID
				pedigreeGraph.add_edge(parentID, childID, weight=distance)
				childIDSet.add(childID)
				noOfPOPairs += 1
			noOfRows += 1
		del ibdFile
		
		noOfComponents = nx.number_connected_components(pedigreeGraph.to_undirected())
		sys.stderr.write("%s out of %s lines become PO pairs. %s children, %s nodes. %s edges. %s connected components.\n"%(
							noOfPOPairs, noOfRows, len(childIDSet), pedigreeGraph.number_of_nodes(),
							pedigreeGraph.number_of_edges(), noOfComponents))
		if keepDistances:
			histogramFname = '%s_IBDVector2POVectorDist_hist.png'%(outputFnamePrefix)
			yh_matplotlib.drawHist(distance_ls, title='',
								xlabel_1D="dist(ZVector,POVector)", xticks=None,
								outputFname=histogramFname, min_no_of_data_points=10,
								needLog=True,
								dpi=200, min_no_of_bins=25)
		return PassingData(DG=pedigreeGraph, childNodeSet=childIDSet)
	def drawKinshipIBDDeltaVectorHistogram(self, kinshipIBDDeltaData=None, row_id=None, outputFnamePrefix=None):
		"""
		Draw a histogram of the unmasked values in one row of kinshipIBDDeltaData.
		
		The row is fetched with kinshipIBDDeltaData.getRowVectorGivenRowID(row_id=row_id).
		Nothing happens if that returns None. Otherwise the unmasked entries are collected
		and, if there are more than 10 of them, plotted to
		<outputFnamePrefix>_monkey_<row_id>_kinship_ibd_hist.png.
		
		The row is presumably a numpy masked array (TODO confirm against the data class).
		numpy.ma.getmaskarray() is used rather than indexing vector.mask directly because a
		masked array with no masked entry carries the scalar "nomask", which cannot be
		indexed. A plain (non-masked) array is treated as fully unmasked.
		
		2012.8.22
		"""
		import numpy	#local import keeps this method self-contained
		vector = kinshipIBDDeltaData.getRowVectorGivenRowID(row_id=row_id)
		if vector is None:
			return
		#always a full boolean array of the same shape as vector
		mask = numpy.ma.getmaskarray(vector)
		data_ls = [vector[i] for i, isMasked in enumerate(mask) if not isMasked]
		if len(data_ls)>10:
			outputFname = '%s_monkey_%s_kinship_ibd_hist.png'%(outputFnamePrefix, row_id)
			yh_matplotlib.drawHist(data_ls, title='',
							xlabel_1D="%s kinship-ibd"%(row_id), xticks=None,
							outputFname=outputFname, min_no_of_data_points=10,
							needLog=True,
							dpi=200, min_no_of_bins=25)
	def drawQualityData(self, qualityDataStructure, outputFnamePrefix, sequence_id=''):
		"""
		Produce five plots from the attributes of qualityDataStructure.
		
		Attributes read: quality_ls, quality_ls_per_position, no_of_bases_per_position,
		diNuc2count and diNuc2quality_ls. Output files, all prefixed by outputFnamePrefix:
			_qualityHist.png               histogram of quality_ls
			_quality_per_position.png      box plot of quality_ls_per_position
			_no_of_bases_per_position.png  bar chart of no_of_bases_per_position
			_diNuc_count.png               bar chart of diNuc2count, keys in sorted order
			_diNuc_quality.png             box plot of diNuc2quality_ls, same key order
		sequence_id is only used in the plot titles.
		
		2011-8-15
		"""
		sys.stderr.write("Making plots on quality data ...")
		
		yh_matplotlib.drawHist(qualityDataStructure.quality_ls, title='histogram of phredScore from %s'%(sequence_id), xlabel_1D=None,
							outputFname='%s_qualityHist.png'%(outputFnamePrefix),
							min_no_of_data_points=50, needLog=False, dpi=200)
		
		yh_matplotlib.drawBoxPlot(qualityDataStructure.quality_ls_per_position, title='quality box plot from %s'%(sequence_id),
								xlabel_1D='base position in read', xticks=None, outputFname='%s_quality_per_position.png'%(outputFnamePrefix),
								dpi=200)
		
		no_of_bases_per_position = qualityDataStructure.no_of_bases_per_position
		readLength = len(no_of_bases_per_position)
		#list(...) so that a real list is passed regardless of the python version
		yh_matplotlib.drawBarChart(list(range(1, readLength+1)), no_of_bases_per_position, title='no of base calls from %s'%(sequence_id),
						xlabel_1D='base position in read', xticks=None, outputFname='%s_no_of_bases_per_position.png'%(outputFnamePrefix),
						bottom=0, needLog=False, dpi=200)
		
		diNuc2count = qualityDataStructure.diNuc2count
		diNuc2quality_ls = qualityDataStructure.diNuc2quality_ls
		
		#sorted() gives the same ordered key list as keys()+sort(), and also works
		#where keys() does not return a list
		diNuc_key_ls = sorted(diNuc2count)
		diNuc_count_ls = [diNuc2count.get(diNuc) for diNuc in diNuc_key_ls]
		#.get() on purpose: a di-nucleotide without quality data yields None, as before
		diNuc_quality_ls_ls = [diNuc2quality_ls.get(diNuc) for diNuc in diNuc_key_ls]
		
		yh_matplotlib.drawBarChart(list(range(1, len(diNuc_count_ls)+1)), diNuc_count_ls, title='di-nucleotide counts from %s'%(sequence_id),
						xlabel_1D=None, xticks=diNuc_key_ls, outputFname='%s_diNuc_count.png'%(outputFnamePrefix),
						bottom=0, needLog=False, dpi=200)
		
		yh_matplotlib.drawBoxPlot(diNuc_quality_ls_ls, title='di-Nucleotide quality box plot from %s'%(sequence_id),
								xlabel_1D=None, xticks=diNuc_key_ls, outputFname='%s_diNuc_quality.png'%(outputFnamePrefix),
								dpi=200)
		
		sys.stderr.write("Done.\n")
    def drawBridgeChromosomalLengthHist(self, bridge_ls=None):
        """
        Draw two histograms describing a list of bridges.

        Each bridge is an indexable record. Element [3] is plotted as the
        chromosomal length and element [2] as the number of loci. The
        figures are written to the fixed paths
        /tmp/chromosomal_length_hist.png and /tmp/no_of_loci_hist.png.

        bridge_ls=None (the default) is treated as an empty list rather than
        failing on len(None). The drawHist calls are still made in that case;
        what they do with fewer than min_no_of_data_points values is up to
        yh_matplotlib.drawHist.

        2011-4-18
        """
        if bridge_ls is None:
            bridge_ls = []
        no_of_bridges = len(bridge_ls)
        sys.stderr.write(
            "Drawing histogram of chromosomal length for %s bridges ... \n" %
            (no_of_bridges))
        bridge_chr_length_ls = [bridge[3] for bridge in bridge_ls]
        no_of_loci_per_bridge_ls = [bridge[2] for bridge in bridge_ls]

        yh_matplotlib.drawHist(bridge_chr_length_ls, title='Histogram of bridge chromosomal length',
             xlabel_1D='chromosomal length',
             outputFname='/tmp/chromosomal_length_hist.png', min_no_of_data_points=50, needLog=True)
        yh_matplotlib.drawHist(no_of_loci_per_bridge_ls, title='Histogram of no-of-loci per bridge',
             xlabel_1D='no-of-loci',
             outputFname='/tmp/no_of_loci_hist.png', min_no_of_data_points=50, needLog=True)
        sys.stderr.write("Done.\n")
	def run(self):
		"""
		Choose extra monkeys to sample, write them out and plot their pairwise distances.
		
		Steps, in order:
			1. read the already-sequenced monkeys and, if self.inputFname is set, the
				pre-chosen ones; merge both into one set.
			2. read candidate monkeys (self.newSampleMonkeyCountryIDList, restricted by
				self.maxLongitude and self.addOnlyVWPMonkeys) and all monkeys of
				self.sequencedMonkeyCountryIDList (used for output).
			3. build a neighbor graph for each of the two collections (self.maxDist).
			4. compute the distance-vector data between candidates and the chosen set.
			5. run chooseExtraSamples() once, write the chosen monkeys to a .tsv and draw a
				histogram of their pairwise distances within the all-monkey graph.
		Output file names start with self.outputFnamePrefix.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#if self.monkeyCoverageFname and os.path.isfile(self.monkeyCoverageFname):
		#	monkey_id2coverage = cls.getMonkeyID2Coverage(self.monkeyCoverageFname)
		#else:
		db_vervet = self.db_vervet
		
		sequencedIDSet = self.readInSequencedMonkeys(db_vervet, countryIDList=self.sequencedMonkeyCountryIDList)
		
		chosenIDSet = self.readInChosenOnes(self.inputFname) if self.inputFname else set()
		#in-place union, so whatever object readInChosenOnes() returned is the one updated
		chosenIDSet |= sequencedIDSet
		
		#candidate monkeys (with latitudes/longitudes)
		candidateID2Info = self.readInMonkeysFromDB(
			db_vervet, countryIDList=self.newSampleMonkeyCountryIDList,
			maxLongitude=self.maxLongitude, addOnlyVWPMonkeys=self.addOnlyVWPMonkeys)
		
		#every monkey of the sequenced countries; only needed for output
		everyID2Info = self.readInMonkeysFromDB(
			db_vervet, countryIDList=self.sequencedMonkeyCountryIDList,
			maxLongitude=None)
		
		candidateGraph = self.constructNeighborGraph(candidateID2Info, maxDist=self.maxDist)
		everyMonkeyGraph = self.constructNeighborGraph(everyID2Info, maxDist=self.maxDist)
		
		#disabled: draw the graph and check how many monkeys have degree=1
		#	pos=nx.graphviz_layout(graph, prog="neato")
		#	#nx.draw_shell(graph)
		#	nx.draw(graph, pos, with_labels=True)
		#	#node_size=40,
		#	pylab.savefig('%s_graphNeatoLayout.png'%(self.outputFnamePrefix), dpi=150)
		
		distanceVectorData = self.constructNewMonkeyToChosenSetDistanceVector(
			graph=candidateGraph, preChosenMonkeyIDSet=chosenIDSet,
			minShortestDistance=self.minShortestDistance)
		#probabilitySpanRBDict = self.constructNewMonkeyToChosenSetDistanceVector(graph=graph, preChosenMonkeyIDSet=preChosenMonkeyIDSet)
		
		#a single sampling round for now (the loop makes it easy to run more)
		for sampleIndex in xrange(1):
			chosenIDDict = self.chooseExtraSamples(
				candidateGraph, preChosenMonkeyIDSet=chosenIDSet,
				noOfMonkeysToChoose=self.noOfMonkeysToChoose,
				shortestDistanceVectorData=distanceVectorData,
				minShortestDistance=self.minShortestDistance)
			
			chosenOnesFname = '%s_sample%s_%sMonkeys.tsv'%(self.outputFnamePrefix, sampleIndex, len(chosenIDDict))
			self.outputChosenMonkeys(monkeyID2Info=everyID2Info, chosenMonkeyIDDict=chosenIDDict,
									outputFname=chosenOnesFname)
			
			pairwiseDistance_ls = self.getPairwiseDistanceWithinGraphOfChosenMonkey(
				graph=everyMonkeyGraph, chosenMonkeyIDDict=chosenIDDict)
			
			#file name is built only now, as in the original order of evaluation
			histogramFname = '%s_sample%s_%sMonkeys_pairwise_distance_hist.png'%(
				self.outputFnamePrefix, sampleIndex, len(chosenIDDict))
			yh_matplotlib.drawHist(data_ls=pairwiseDistance_ls, title=None,
					xlabel_1D="pairwise distance within graph",
					xticks=None, outputFname=histogramFname,
					min_no_of_data_points=10, needLog=True,
					dpi=200, max_no_of_bins=40)
# Example no. 7
# 0
    def run(self):
        """
        Pool peak-overlap statistics from all input files and plot them.

        Each file in self.inputFnameLs is a delimited table with the columns
        no_of_result1_peaks, no_of_result2_peaks,
        no_of_result1_peaks_in_result2 and no_of_result2_peaks_in_result1.
        For every row (one pair of results) three fractions are derived:
        result1 peaks found in result2, result2 peaks found in result1, and
        the overlap within the combined peak count.

        A row in which either peak count is zero has no defined fraction. It
        is skipped with a warning on stderr instead of raising
        ZeroDivisionError, which keeps all pooled lists the same length.

        Output, all prefixed by self.outputFnamePrefix: three histograms (one
        per fraction; the median is put in the title when there are more than
        10 values) and five scatter plots.
        """
        import sys  # local import: only needed for the skipped-row warning

        if self.debug:
            import pdb
            pdb.set_trace()

        no_of_result1_peaks_ls = []
        no_of_result2_peaks_ls = []
        fraction_of_result1_peaks_in_result2_ls = []
        fraction_of_result2_peaks_in_result1_ls = []
        no_of_combined_peaks_ls = []
        fraction_of_overlap_in_combined_peaks_ls = []

        for inputFname in self.inputFnameLs:
            # "with" closes the handle even if a row fails to parse
            with open(inputFname) as inf:
                reader = csv.reader(inf,
                                    delimiter=figureOutDelimiter(inputFname))
                header = next(reader)
                col_name2index = getColName2IndexFromHeader(
                    header, skipEmptyColumn=True)
                no_of_result1_peaks_index = col_name2index.get(
                    "no_of_result1_peaks")
                no_of_result2_peaks_index = col_name2index.get(
                    "no_of_result2_peaks")
                no_of_result1_peaks_in_result2_index = col_name2index.get(
                    "no_of_result1_peaks_in_result2")
                no_of_result2_peaks_in_result1_index = col_name2index.get(
                    "no_of_result2_peaks_in_result1")
                for row in reader:
                    no_of_result1_peaks = float(row[no_of_result1_peaks_index])
                    no_of_result2_peaks = float(row[no_of_result2_peaks_index])
                    no_of_result1_peaks_in_result2 = float(
                        row[no_of_result1_peaks_in_result2_index])
                    no_of_result2_peaks_in_result1 = float(
                        row[no_of_result2_peaks_in_result1_index])
                    if no_of_result1_peaks == 0 or no_of_result2_peaks == 0:
                        sys.stderr.write(
                            "Warning: row with zero peaks skipped in %s: %s\n" %
                            (inputFname, repr(row)))
                        continue
                    no_of_combined_peaks = (no_of_result1_peaks +
                                            no_of_result2_peaks)
                    no_of_result1_peaks_ls.append(no_of_result1_peaks)
                    no_of_result2_peaks_ls.append(no_of_result2_peaks)
                    fraction_of_result1_peaks_in_result2_ls.append(
                        no_of_result1_peaks_in_result2 / no_of_result1_peaks)
                    fraction_of_result2_peaks_in_result1_ls.append(
                        no_of_result2_peaks_in_result1 / no_of_result2_peaks)
                    no_of_combined_peaks_ls.append(no_of_combined_peaks)
                    fraction_of_overlap_in_combined_peaks_ls.append(
                        (no_of_result1_peaks_in_result2 +
                         no_of_result2_peaks_in_result1) /
                        no_of_combined_peaks)

        # (data, x-axis label, output file name template)
        hist_spec_ls = [
            (fraction_of_result1_peaks_in_result2_ls,
             "fraction of result1 peaks in result2",
             "%s_hist_of_fraction_of_result1_peaks_in_result2.png"),
            (fraction_of_result2_peaks_in_result1_ls,
             "fraction of result2 peaks in result1",
             "%s_hist_of_fraction_of_result2_peaks_in_result1.png"),
            (fraction_of_overlap_in_combined_peaks_ls,
             "fraction of recurrent peaks in combined",
             "%s_hist_of_fraction_of_recurrent_peaks_in_combined.png"),
        ]
        for data_ls, xlabel, fname_template in hist_spec_ls:
            title = "%s pairs" % (len(data_ls))
            if len(data_ls) > 10:
                title += " median %.3f" % (numpy.median(data_ls))
            yh_matplotlib.drawHist(
                data_ls, title=title, xlabel_1D=xlabel, xticks=None,
                outputFname=fname_template % self.outputFnamePrefix,
                min_no_of_data_points=20, needLog=False, dpi=200)

        # (x data, y data, unit used in the title, file name template,
        #  x-axis label, y-axis label); the title counts the x data
        scatter_spec_ls = [
            (no_of_result1_peaks_ls, no_of_result2_peaks_ls, "results",
             "%s_no_of_peaks_result1_vs_result2.png",
             'No. of peaks in result1', 'No. of peaks in result2'),
            (no_of_result1_peaks_ls, fraction_of_result1_peaks_in_result2_ls,
             "results", "%s_result1_no_of_peak_vs_fraction.png",
             'No. of peaks in result1', 'Fraction found in result2'),
            (no_of_result2_peaks_ls, fraction_of_result2_peaks_in_result1_ls,
             "results", "%s_result2_no_of_peak_vs_fraction.png",
             'No. of peaks in result2', 'Fraction found in result1'),
            (fraction_of_result1_peaks_in_result2_ls,
             fraction_of_result2_peaks_in_result1_ls, "pairs",
             "%s_1_fraction_in2_vs_2_fraction_in1.png",
             'result1 fraction found in result2',
             'result2 fraction found in result1'),
            (no_of_combined_peaks_ls, fraction_of_overlap_in_combined_peaks_ls,
             "pairs", "%s_combined_no_of_peak_vs_fraction.png",
             'No. of peaks combined', 'Fraction recurrent'),
        ]
        for x_ls, y_ls, unit, fname_template, xlabel, ylabel in scatter_spec_ls:
            yh_matplotlib.drawScatter(
                x_ls, y_ls,
                fig_fname=fname_template % self.outputFnamePrefix,
                title="%s %s" % (len(x_ls), unit),
                xlabel=xlabel, ylabel=ylabel, dpi=300)
# Example no. 8
# 0
	def run(self):
		"""
		Plot per-chromosome (x, y) data along the genome, plus a histogram of all y values.
		
		Steps, in order:
			1. connect to the "genome" schema and fetch the genome layout for self.tax_id;
				its chr_id2cumu_start is stored on self.
			2. feed every existing file of self.inputFnameLs to
				self.vcftoolsOutputStatFileWalker(). The plotting below reads
				self.chr2xy_ls, presumably filled by that walker (TODO confirm).
			3. for each chromosome of the genome layout that has data, plot its points
				and keep track of the overall min/max of y.
			4. set the y-limits according to self.ylim_type (1: from 0 to the current upper
				limit; 2: min/max of y padded by 1/12 of their span).
			5. save <self.outputFnamePrefix>.png (and .svg if self.need_svg), then draw
				<self.outputFnamePrefix>_hist.png from all plotted y values.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#without commenting out db_vervet connection code. schema "genome" wont' be default path.
		db_genome = GenomeDB.GenomeDatabase(
			drivername=self.drivername, username=self.db_user, password=self.db_passwd,
			hostname=self.hostname, database=self.dbname, schema="genome")
		db_genome.setup(create_tables=False)
		#chrOrder=2 means chromosomes are not ordered alphabetically but by their sizes (descendingly)
		genomeLayout = db_genome.getOneGenomeData(
			tax_id=self.tax_id, chr_gap=0, chrOrder=self.chrOrder,
			sequence_type_id=self.sequence_type_id)
		#result is not used below; the query itself is kept as it was
		chr2size = db_genome.getTopNumberOfChomosomes(
			contigMaxRankBySize=80000, contigMinRankBySize=1, tax_id=self.tax_id,
			sequence_type_id=self.sequence_type_id)
		
		self.chr_id2cumu_start = genomeLayout.chr_id2cumu_start
		#disabled:
		#	size_chr_id_ls = [(value, key) for key, value in chr2size.iteritems()]
		#	size_chr_id_ls.sort()
		#	size_chr_id_ls.reverse()
		sys.stderr.write("Reading in data ...")
		for inputFname in self.inputFnameLs:
			if os.path.isfile(inputFname):
				self.vcftoolsOutputStatFileWalker(
					inputFname, processFunc=None, chrColumnHeader=self.chrColumnHeader,
					minChrLength=self.minChrLength, chrLengthColumnHeader=self.chrLengthColumnHeader,
					xColumnHeader=self.xColumnHeader, valueForNonPositiveYValue=self.valueForNonPositiveYValue)
		sys.stderr.write("Done.\n")
		
		pylab.clf()
		yh_matplotlib.setPlotDimension(left=0.025, right=0.985, bottom=0.1, top=0.9)
		figure = pylab.figure(figsize=(30,2))
		#ax = pylab.axes()
		ax = figure.gca()
		
		overallMaxY = None
		overallMinY = None
		allYValue_ls = []
		for chromosome in genomeLayout.chr_id_ls:
			xy_ls = self.chr2xy_ls.get(chromosome)
			if not xy_ls:
				#no data for this chromosome
				continue
			xValue_ls = xy_ls[0]
			yValue_ls = xy_ls[1]
			chromosomeMaxY = max(yValue_ls)
			overallMaxY = chromosomeMaxY if overallMaxY is None else max(overallMaxY, chromosomeMaxY)
			chromosomeMinY = min(yValue_ls)
			overallMinY = chromosomeMinY if overallMinY is None else min(overallMinY, chromosomeMinY)
			ax.plot(xValue_ls, yValue_ls, '.', markeredgewidth=0, markersize=4, alpha=0.8)
			allYValue_ls += yValue_ls
		#separate each chromosome
		#for chromosome in chr_ls[:-1]:
		#	print chromosome
		#	ax.axvline(chr_id2cumu_size[chromosome], linestyle='--', color='k', linewidth=0.8)
		
		#draw the bonferroni line
		#bonferroni_value = -math.log10(0.01/len(genome_wide_result.data_obj_ls))
		#ax.axhline(bonferroni_value, linestyle='--', color='k', linewidth=0.8)
		
		ax.set_xlabel(self.xColumnPlotLabel)
		ax.set_ylabel(self.whichColumnPlotLabel)
		#ax.set_xlim([0, chr_id2cumu_size[chr_ls[-1]]])
		ylimType = self.ylim_type
		if ylimType==1:
			#anchor the lower limit at 0, keep the upper one matplotlib picked
			currentYLim = ax.get_ylim()
			ax.set_ylim([0, currentYLim[1]])
		elif ylimType==2 and overallMaxY is not None and overallMinY is not None:
			#pad both ends by 1/12 of the data span
			padding = abs(overallMaxY-overallMinY)/12.0
			ax.set_ylim([overallMinY-padding, overallMaxY+padding])
		
		#outputFnamePrefix = os.path.splitext(self.outputFname)[0]
		outputFnamePrefix = self.outputFnamePrefix
		pylab.savefig('%s.png'%outputFnamePrefix, dpi=self.figureDPI)
		if self.need_svg:
			pylab.savefig('%s.svg'%outputFnamePrefix, dpi=self.figureDPI)
		histogramFname = '%s_hist.png'%(outputFnamePrefix)
		yh_matplotlib.drawHist(allYValue_ls, title='',
				xlabel_1D=self.whichColumnPlotLabel, xticks=None,
				outputFname=histogramFname, min_no_of_data_points=self.minNoOfTotal,
				needLog=self.logCount,
				dpi=self.figureDPI, min_no_of_bins=40)