def run(self):
    """
    Pool the "inconsistency" column of all input files and draw one histogram.

    Reads from self:
        debug: if true, drop into pdb before doing anything.
        inputFnameLs: paths of delimited text files with a header line;
            paths that are not existing files are skipped.
        title: plot title; when None a default title with the number of
            values is generated.
        outputFname: where the histogram is written.

    A file that fails to parse is reported on stderr (with traceback) and
    the remaining files are still processed; values read from that file
    before the failure are kept.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    inconsistent_rate_ls = []
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            continue
        try:
            # "with" closes the handle even when a row fails to parse
            with open(inputFname) as inf:
                reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
                header = next(reader)
                col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
                inconsistent_rate_index = col_name2index.get("inconsistency")
                for row in reader:
                    inconsistent_rate_ls.append(float(row[inconsistent_rate_index]))
        except Exception:
            # best-effort per file: report and move on, but let
            # KeyboardInterrupt/SystemExit propagate
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()

    if self.title is None:
        title = "histogram of inconsistent rate from %s refs"%(len(inconsistent_rate_ls))
    else:
        title = self.title
    # the median is only appended to the title when there are more than 10 values
    if len(inconsistent_rate_ls)>10:
        medianInconsistentRate = numpy.median(inconsistent_rate_ls)
        title += " median %.4f"%(medianInconsistentRate)
    yh_matplotlib.drawHist(inconsistent_rate_ls, title=title,
        xlabel_1D="Inconsistent Rate", xticks=None, outputFname=self.outputFname,
        min_no_of_data_points=20, needLog=False, dpi=200)
def constructPedigreeGraphFromPlinkIBD(self, inputFname=None, maxDistanceToPOVector=0.04, drawDistribution=False, outputFnamePrefix=None):
    """
    2012.8.14
        Build a directed parent -> child graph from a plink IBD table.

        For every row, the (Z0, Z1, Z2) vector is compared to the
        parent-offspring vector (0, 1, 0). Pairs whose Euclidean distance is
        <= maxDistanceToPOVector become an edge (weight = distance) pointing
        from the smaller integer ID to the larger one.

        inputFname: table with IID1, IID2, Z0, Z1, Z2 columns, read through MatrixFile.
        drawDistribution, outputFnamePrefix: when both are true-ish, a histogram
            of all distances is written to
            <outputFnamePrefix>_IBDVector2POVectorDist_hist.png.

        Returns PassingData with DG (the nx.DiGraph) and childNodeSet.
    """
    sys.stderr.write("Constructing pedigree-graph out of plink-ibd %s ..."%(inputFname))
    pedigreeGraph = nx.DiGraph()
    childNodeSet = set()

    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    id1Col = reader.getColIndexGivenColHeader("IID1")
    id2Col = reader.getColIndexGivenColHeader("IID2")
    z0Col = reader.getColIndexGivenColHeader("Z0")
    z1Col = reader.getColIndexGivenColHeader("Z1")
    z2Col = reader.getColIndexGivenColHeader("Z2")

    poVector = numpy.array([0,1,0.0])
    keepDistances = drawDistribution and outputFnamePrefix
    noOfLines = 0
    noOfPOPairs = 0
    distance_ls = []
    for row in reader:
        # IDs are compared as integers: the larger ID is taken as the child
        firstID = int(row[id1Col])
        secondID = int(row[id2Col])
        ibdVector = numpy.array([float(row[z0Col]), float(row[z1Col]), float(row[z2Col])])
        dist = numpy.linalg.norm(poVector-ibdVector)
        if keepDistances:
            distance_ls.append(dist)
        if dist<=maxDistanceToPOVector:
            parentID = min(firstID, secondID)
            childID = max(firstID, secondID)
            pedigreeGraph.add_edge(parentID, childID, weight=dist)
            childNodeSet.add(childID)
            noOfPOPairs += 1
        noOfLines += 1
    del reader

    noOfComponents = nx.number_connected_components(pedigreeGraph.to_undirected())
    sys.stderr.write("%s out of %s lines become PO pairs. %s children, %s nodes. %s edges. %s connected components.\n"%(
        noOfPOPairs, noOfLines, len(childNodeSet), pedigreeGraph.number_of_nodes(),
        pedigreeGraph.number_of_edges(), noOfComponents))

    if keepDistances:
        histFname = '%s_IBDVector2POVectorDist_hist.png'%(outputFnamePrefix)
        yh_matplotlib.drawHist(distance_ls, title='',
            xlabel_1D="dist(ZVector,POVector)", xticks=None,
            outputFname=histFname, min_no_of_data_points=10,
            needLog=True,
            dpi=200, min_no_of_bins=25)
    return PassingData(DG=pedigreeGraph, childNodeSet=childNodeSet)
def drawKinshipIBDDeltaVectorHistogram(self, kinshipIBDDeltaData=None, row_id=None, outputFnamePrefix=None):
    """
    2012.8.22
        Draw a histogram of the non-masked entries of one row of
        kinshipIBDDeltaData.

        kinshipIBDDeltaData: object offering getRowVectorGivenRowID(row_id=...);
            the returned vector is presumably a 1-D numpy masked array
            (the original code read its .mask) -- confirm against the caller.
        row_id: ID of the row to plot; also used in the x-label and file name.
        outputFnamePrefix: the plot goes to
            <outputFnamePrefix>_monkey_<row_id>_kinship_ibd_hist.png

        Nothing is drawn when no vector is returned or when it holds 10 or
        fewer non-masked values. Returns None.
    """
    import numpy    # local import: needed only to normalise the mask below

    vector = kinshipIBDDeltaData.getRowVectorGivenRowID(row_id=row_id)
    if vector is None:
        return

    # A masked array without any masked value exposes the scalar "nomask" as
    # its .mask, which cannot be indexed; getmaskarray() always returns a full
    # boolean array of the vector's shape.
    mask = numpy.ma.getmaskarray(vector)
    data_ls = [vector[i] for i, is_masked in enumerate(mask) if not is_masked]
    if len(data_ls)>10:
        outputFname = '%s_monkey_%s_kinship_ibd_hist.png'%(outputFnamePrefix, row_id)
        yh_matplotlib.drawHist(data_ls, title='',
            xlabel_1D="%s kinship-ibd"%(row_id), xticks=None,
            outputFname=outputFname, min_no_of_data_points=10,
            needLog=True,
            dpi=200, min_no_of_bins=25)
def drawQualityData(self, qualityDataStructure, outputFnamePrefix, sequence_id=''):
    """
    2011-8-15
        Write five plots derived from qualityDataStructure, all named
        <outputFnamePrefix>_<suffix>.png:
            qualityHist: histogram of quality_ls
            quality_per_position: box plot of quality_ls_per_position
            no_of_bases_per_position: bar chart of no_of_bases_per_position
            diNuc_count: bar chart of diNuc2count, keys in sorted order
            diNuc_quality: box plot of diNuc2quality_ls, same key order

        sequence_id only appears in the plot titles.
    """
    sys.stderr.write("Making plots on quality data ...")

    yh_matplotlib.drawHist(qualityDataStructure.quality_ls,
        title='histogram of phredScore from %s'%(sequence_id), xlabel_1D=None,
        outputFname='%s_qualityHist.png'%(outputFnamePrefix),
        min_no_of_data_points=50, needLog=False, dpi=200)

    yh_matplotlib.drawBoxPlot(qualityDataStructure.quality_ls_per_position,
        title='quality box plot from %s'%(sequence_id),
        xlabel_1D='base position in read', xticks=None,
        outputFname='%s_quality_per_position.png'%(outputFnamePrefix),
        dpi=200)

    baseCount_ls = qualityDataStructure.no_of_bases_per_position
    # x positions are 1-based read positions
    position_ls = range(1, len(baseCount_ls)+1)
    yh_matplotlib.drawBarChart(position_ls, baseCount_ls,
        title='no of base calls from %s'%(sequence_id),
        xlabel_1D='base position in read', xticks=None,
        outputFname='%s_no_of_bases_per_position.png'%(outputFnamePrefix),
        bottom=0, needLog=False, dpi=200)

    diNuc2count = qualityDataStructure.diNuc2count
    diNuc2quality_ls = qualityDataStructure.diNuc2quality_ls
    # one sorted key list drives both di-nucleotide plots so their x-axes agree
    sortedDiNuc_ls = sorted(diNuc2count.keys())
    count_ls = [diNuc2count.get(diNuc) for diNuc in sortedDiNuc_ls]
    quality_ls_ls = [diNuc2quality_ls.get(diNuc) for diNuc in sortedDiNuc_ls]

    yh_matplotlib.drawBarChart(range(1, len(count_ls)+1), count_ls,
        title='di-nucleotide counts from %s'%(sequence_id),
        xlabel_1D=None, xticks=sortedDiNuc_ls,
        outputFname='%s_diNuc_count.png'%(outputFnamePrefix),
        bottom=0, needLog=False, dpi=200)

    yh_matplotlib.drawBoxPlot(quality_ls_ls,
        title='di-Nucleotide quality box plot from %s'%(sequence_id),
        xlabel_1D=None, xticks=sortedDiNuc_ls,
        outputFname='%s_diNuc_quality.png'%(outputFnamePrefix),
        dpi=200)

    sys.stderr.write("Done.\n")
def drawBridgeChromosomalLengthHist(self, bridge_ls=None, chrLengthOutputFname='/tmp/chromosomal_length_hist.png', \
    noOfLociOutputFname='/tmp/no_of_loci_hist.png'):
    """
    2011-4-18
        Draw two histograms over a list of bridges: one of element [3] of
        each bridge (plotted as "chromosomal length") and one of element [2]
        (plotted as "no-of-loci").

        bridge_ls: sequence of indexable bridge records; None is treated as
            an empty list instead of raising on len(None).
        chrLengthOutputFname, noOfLociOutputFname: output paths of the two
            plots; the defaults are the locations that used to be hard-coded.
    """
    if bridge_ls is None:
        bridge_ls = []
    no_of_bridges = len(bridge_ls)
    sys.stderr.write("Drawing histogram of chromosomal length for %s bridges ... \n" % (no_of_bridges))

    bridge_chr_length_ls = [bridge[3] for bridge in bridge_ls]
    no_of_loci_per_bridge_ls = [bridge[2] for bridge in bridge_ls]

    yh_matplotlib.drawHist(bridge_chr_length_ls, title='Histogram of bridge chromosomal length',
        xlabel_1D='chromosomal length',
        outputFname=chrLengthOutputFname, min_no_of_data_points=50, needLog=True)
    yh_matplotlib.drawHist(no_of_loci_per_bridge_ls, title='Histogram of no-of-loci per bridge',
        xlabel_1D='no-of-loci',
        outputFname=noOfLociOutputFname, min_no_of_data_points=50, needLog=True)
    sys.stderr.write("Done.\n")
def run(self):
    """
    Choose extra monkeys to sample, write them out and plot how far apart
    the chosen ones are.

    Steps:
        1. The pre-chosen set is the monkeys read from self.inputFname (if
           given) plus the already-sequenced monkeys from the database.
        2. Two neighbor graphs are built with self.maxDist: one over the
           candidate monkeys (newSampleMonkeyCountryIDList, restricted by
           maxLongitude / addOnlyVWPMonkeys) and one over the monkeys of
           sequencedMonkeyCountryIDList (used for output and distances).
        3. One sampling round (round index 0) picks self.noOfMonkeysToChoose
           monkeys, writes <prefix>_sample<i>_<n>Monkeys.tsv and the
           histogram <prefix>_sample<i>_<n>Monkeys_pairwise_distance_hist.png.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_vervet = self.db_vervet

    sequencedMonkeyIDSet = self.readInSequencedMonkeys(db_vervet, countryIDList=self.sequencedMonkeyCountryIDList)
    preChosenMonkeyIDSet = self.readInChosenOnes(self.inputFname) if self.inputFname else set()
    preChosenMonkeyIDSet |= sequencedMonkeyIDSet

    # candidates: monkeys with latitudes/longitudes
    monkeyID2Info = self.readInMonkeysFromDB(db_vervet, countryIDList=self.newSampleMonkeyCountryIDList,
        maxLongitude=self.maxLongitude, addOnlyVWPMonkeys=self.addOnlyVWPMonkeys)
    # allMonkeyID2Info is for output purposes
    allMonkeyID2Info = self.readInMonkeysFromDB(db_vervet, countryIDList=self.sequencedMonkeyCountryIDList,
        maxLongitude=None)

    graph = self.constructNeighborGraph(monkeyID2Info, maxDist=self.maxDist)
    allMonkeyGraph = self.constructNeighborGraph(allMonkeyID2Info, maxDist=self.maxDist)

    # (disabled) to inspect how many monkeys have degree=1, draw the graph with
    # nx.graphviz_layout(graph, prog="neato") + nx.draw(...) and save it to
    # '%s_graphNeatoLayout.png'%(self.outputFnamePrefix)

    shortestDistanceVectorData = self.constructNewMonkeyToChosenSetDistanceVector(graph=graph,
        preChosenMonkeyIDSet=preChosenMonkeyIDSet,
        minShortestDistance=self.minShortestDistance)

    noOfSamplingRounds = 1
    for i in range(noOfSamplingRounds):
        finalChosenMonkeyIDDict = self.chooseExtraSamples(graph, preChosenMonkeyIDSet=preChosenMonkeyIDSet,
            noOfMonkeysToChoose=self.noOfMonkeysToChoose,
            shortestDistanceVectorData=shortestDistanceVectorData,
            minShortestDistance=self.minShortestDistance)
        noOfChosen = len(finalChosenMonkeyIDDict)

        self.outputChosenMonkeys(monkeyID2Info=allMonkeyID2Info,
            chosenMonkeyIDDict=finalChosenMonkeyIDDict,
            outputFname='%s_sample%s_%sMonkeys.tsv'%(self.outputFnamePrefix, i, noOfChosen))

        distance_ls = self.getPairwiseDistanceWithinGraphOfChosenMonkey(graph=allMonkeyGraph,
            chosenMonkeyIDDict=finalChosenMonkeyIDDict)
        histFname = '%s_sample%s_%sMonkeys_pairwise_distance_hist.png'%(self.outputFnamePrefix, i, noOfChosen)
        yh_matplotlib.drawHist(data_ls=distance_ls, title=None,
            xlabel_1D="pairwise distance within graph",
            xticks=None, outputFname=histFname,
            min_no_of_data_points=10, needLog=True,
            dpi=200, max_no_of_bins=40)
def run(self):
    """
    Summarise the peak overlap between paired results and plot it.

    Every file in self.inputFnameLs is a delimited table with the columns
    no_of_result1_peaks, no_of_result2_peaks, no_of_result1_peaks_in_result2
    and no_of_result2_peaks_in_result1. From the pooled rows this writes,
    all prefixed by self.outputFnamePrefix:
        three histograms (fraction of result1 peaks in result2, fraction of
            result2 peaks in result1, fraction of recurrent peaks in the
            combined set), and
        five scatter plots relating peak counts and those fractions.

    Rows in which either result has zero peaks are skipped, because their
    fractions are undefined; the number of skipped rows is reported on stderr.
    """
    import sys    # local import: only used for the skipped-row warning

    if self.debug:
        import pdb
        pdb.set_trace()

    no_of_result1_peaks_ls = []
    no_of_result2_peaks_ls = []
    fraction_of_result1_peaks_in_result2_ls = []
    fraction_of_result2_peaks_in_result1_ls = []
    no_of_combined_peaks_ls = []
    fraction_of_overlap_in_combined_peaks_ls = []
    no_of_skipped_rows = 0

    for inputFname in self.inputFnameLs:
        # "with" closes the handle even if a row fails to parse
        with open(inputFname) as inf:
            reader = csv.reader(inf, delimiter=figureOutDelimiter(inputFname))
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
            no_of_result1_peaks_index = col_name2index.get("no_of_result1_peaks")
            no_of_result2_peaks_index = col_name2index.get("no_of_result2_peaks")
            no_of_result1_peaks_in_result2_index = col_name2index.get("no_of_result1_peaks_in_result2")
            no_of_result2_peaks_in_result1_index = col_name2index.get("no_of_result2_peaks_in_result1")
            for row in reader:
                no_of_result1_peaks = float(row[no_of_result1_peaks_index])
                no_of_result2_peaks = float(row[no_of_result2_peaks_index])
                no_of_result1_peaks_in_result2 = float(row[no_of_result1_peaks_in_result2_index])
                no_of_result2_peaks_in_result1 = float(row[no_of_result2_peaks_in_result1_index])
                if no_of_result1_peaks == 0 or no_of_result2_peaks == 0:
                    # skip the whole row so all lists stay aligned for the scatter plots
                    no_of_skipped_rows += 1
                    continue
                no_of_combined_peaks = no_of_result1_peaks + no_of_result2_peaks

                no_of_result1_peaks_ls.append(no_of_result1_peaks)
                no_of_result2_peaks_ls.append(no_of_result2_peaks)
                fraction_of_result1_peaks_in_result2_ls.append(
                    no_of_result1_peaks_in_result2 / no_of_result1_peaks)
                fraction_of_result2_peaks_in_result1_ls.append(
                    no_of_result2_peaks_in_result1 / no_of_result2_peaks)
                no_of_combined_peaks_ls.append(no_of_combined_peaks)
                fraction_of_overlap_in_combined_peaks_ls.append(
                    (no_of_result1_peaks_in_result2 + no_of_result2_peaks_in_result1) / no_of_combined_peaks)

    if no_of_skipped_rows:
        sys.stderr.write("Warning: %s rows skipped because result1 or result2 has zero peaks.\n" % (no_of_skipped_rows))

    def drawFractionHist(fraction_ls, xlabel_1D, outputFnameSuffix):
        # histogram of one fraction list; the median goes into the title
        # only when there are more than 10 values
        title = "%s pairs" % (len(fraction_ls))
        if len(fraction_ls) > 10:
            title += " median %.3f" % (numpy.median(fraction_ls))
        yh_matplotlib.drawHist(fraction_ls, title=title,
            xlabel_1D=xlabel_1D, xticks=None,
            outputFname="%s_%s.png" % (self.outputFnamePrefix, outputFnameSuffix),
            min_no_of_data_points=20, needLog=False,
            dpi=200)

    drawFractionHist(fraction_of_result1_peaks_in_result2_ls,
        "fraction of result1 peaks in result2",
        "hist_of_fraction_of_result1_peaks_in_result2")
    drawFractionHist(fraction_of_result2_peaks_in_result1_ls,
        "fraction of result2 peaks in result1",
        "hist_of_fraction_of_result2_peaks_in_result1")
    drawFractionHist(fraction_of_overlap_in_combined_peaks_ls,
        "fraction of recurrent peaks in combined",
        "hist_of_fraction_of_recurrent_peaks_in_combined")

    title = "%s results" % (len(no_of_result1_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result1_peaks_ls, no_of_result2_peaks_ls,
        fig_fname="%s_no_of_peaks_result1_vs_result2.png" % self.outputFnamePrefix,
        title=title, xlabel='No. of peaks in result1',
        ylabel='No. of peaks in result2', dpi=300)

    title = "%s results" % (len(no_of_result1_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result1_peaks_ls, fraction_of_result1_peaks_in_result2_ls,
        fig_fname="%s_result1_no_of_peak_vs_fraction.png" % self.outputFnamePrefix,
        title=title, xlabel='No. of peaks in result1',
        ylabel='Fraction found in result2', dpi=300)

    title = "%s results" % (len(no_of_result2_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result2_peaks_ls, fraction_of_result2_peaks_in_result1_ls,
        fig_fname="%s_result2_no_of_peak_vs_fraction.png" % self.outputFnamePrefix,
        title=title, xlabel='No. of peaks in result2',
        ylabel='Fraction found in result1', dpi=300)

    title = "%s pairs" % (len(fraction_of_result1_peaks_in_result2_ls))
    yh_matplotlib.drawScatter(fraction_of_result1_peaks_in_result2_ls, fraction_of_result2_peaks_in_result1_ls,
        fig_fname="%s_1_fraction_in2_vs_2_fraction_in1.png" % self.outputFnamePrefix,
        title=title, xlabel='result1 fraction found in result2',
        ylabel='result2 fraction found in result1', dpi=300)

    title = "%s pairs" % (len(no_of_combined_peaks_ls))
    yh_matplotlib.drawScatter(no_of_combined_peaks_ls, fraction_of_overlap_in_combined_peaks_ls,
        fig_fname="%s_combined_no_of_peak_vs_fraction.png" % self.outputFnamePrefix,
        title=title, xlabel='No. of peaks combined',
        ylabel='Fraction recurrent', dpi=300)
def run(self):
    """
    Plot the values gathered from all input files along the genome and
    draw a histogram of the same values.

    Steps:
        1. Connect to the "genome" schema and fetch the genome layout for
           self.tax_id; its chr_id2cumu_start is stored on self.
        2. Feed every existing file of self.inputFnameLs to
           self.vcftoolsOutputStatFileWalker (presumably this is what fills
           self.chr2xy_ls -- confirm in that method).
        3. For each chromosome of the genome layout that has data in
           self.chr2xy_ls, plot its (x, y) lists as dots on one wide figure.
        4. Save <outputFnamePrefix>.png (plus .svg if self.need_svg) and
           <outputFnamePrefix>_hist.png.

    self.ylim_type: 1 forces the y-axis to start at 0; 2 pads the observed
    min/max by 1/12 of their span; any other value leaves the axis alone.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # a dedicated connection with schema="genome"; the original note says that
    # without it "genome" would not be the default schema path
    db_genome = GenomeDB.GenomeDatabase(drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema="genome")
    db_genome.setup(create_tables=False)

    # chrOrder=2 means chromosomes are ordered by size (descending), not alphabetically
    oneGenomeData = db_genome.getOneGenomeData(tax_id=self.tax_id, chr_gap=0, chrOrder=self.chrOrder,
        sequence_type_id=self.sequence_type_id)
    # the result is not used further down; the call is kept as in the original
    chr2size = db_genome.getTopNumberOfChomosomes(contigMaxRankBySize=80000, contigMinRankBySize=1,
        tax_id=self.tax_id, sequence_type_id=self.sequence_type_id)
    self.chr_id2cumu_start = oneGenomeData.chr_id2cumu_start

    sys.stderr.write("Reading in data ...")
    for inputFname in self.inputFnameLs:
        if os.path.isfile(inputFname):
            self.vcftoolsOutputStatFileWalker(inputFname, processFunc=None, chrColumnHeader=self.chrColumnHeader,
                minChrLength=self.minChrLength, chrLengthColumnHeader=self.chrLengthColumnHeader,
                xColumnHeader=self.xColumnHeader, valueForNonPositiveYValue=self.valueForNonPositiveYValue)
    sys.stderr.write("Done.\n")

    pylab.clf()
    yh_matplotlib.setPlotDimension(left=0.025, right=0.985, bottom=0.1, top=0.9)
    fig = pylab.figure(figsize=(30,2))
    ax = fig.gca()

    max_y = None
    min_y = None
    value_ls = []
    for chromosome in oneGenomeData.chr_id_ls:
        xy_ls = self.chr2xy_ls.get(chromosome)
        if not xy_ls:
            continue
        x_ls = xy_ls[0]
        y_ls = xy_ls[1]
        # running extremes over all plotted chromosomes
        chromosomeMax = max(y_ls)
        max_y = chromosomeMax if max_y is None else max(max_y, chromosomeMax)
        chromosomeMin = min(y_ls)
        min_y = chromosomeMin if min_y is None else min(min_y, chromosomeMin)
        ax.plot(x_ls, y_ls, '.', markeredgewidth=0, markersize=4, alpha=0.8)
        value_ls += y_ls

    # (disabled in the original) vertical separators between chromosomes and
    # a horizontal bonferroni line

    ax.set_xlabel(self.xColumnPlotLabel)
    ax.set_ylabel(self.whichColumnPlotLabel)
    if self.ylim_type==1:
        ylim = ax.get_ylim()
        ax.set_ylim([0, ylim[1]])
    elif self.ylim_type==2:
        if max_y is not None and min_y is not None:
            delta = abs(max_y-min_y)/12.0
            ax.set_ylim([min_y-delta, max_y+delta])

    outputFnamePrefix = self.outputFnamePrefix
    pylab.savefig('%s.png'%outputFnamePrefix, dpi=self.figureDPI)
    if self.need_svg:
        pylab.savefig('%s.svg'%outputFnamePrefix, dpi=self.figureDPI)

    histFname = '%s_hist.png'%(outputFnamePrefix)
    yh_matplotlib.drawHist(value_ls, title='',
        xlabel_1D=self.whichColumnPlotLabel, xticks=None,
        outputFname=histFname, min_no_of_data_points=self.minNoOfTotal,
        needLog=self.logCount,
        dpi=self.figureDPI, min_no_of_bins=40)