def render_token(query, token='Yoga', output_file='out/yoga_density.png',
                 bounds=(-150, 150, -150, 150)):
    """Render a kernel-density map of the locations of ``token`` and save it.

    Parameters
    ----------
    query : object
        Must provide ``locations_for_token(token)`` returning a sequence of
        (x, y) pairs.
    token : str
        Token whose locations are plotted; also used in the plot caption.
    output_file : str
        Path handed to ``savefig``.
    bounds : tuple
        ``(xmin, xmax, ymin, ymax)`` of the evaluated/plotted area.  The
        default reproduces the previously hard-coded limits so that all
        plots share one scale; pass other limits to zoom.
    """
    locations = numpy.array(query.locations_for_token(token))
    m1 = locations[:, 0]  # x-coords
    m2 = locations[:, 1]  # y-coords

    xmin, xmax, ymin, ymax = bounds

    # Perform a kernel density estimate of the coords on a 100x100 grid.
    X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = c_[X.ravel(), Y.ravel()]
    values = c_[m1, m2]
    kernel = stats.gaussian_kde(values.T)
    Z = reshape(kernel(positions.T).T, X.T.shape)

    # Clear the current figure first: needed when this function is called
    # more than once, otherwise plots pile up on the same figure.
    clf()
    imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax])

    # Plot the locations (assumes each 'mention' has the same weight).
    plot(m1, m2, 'k.', markersize=1)
    # Caption sits 5% of the height below the top edge (15 units for the
    # default 300-unit range, as before).
    text(xmin, ymax - (ymax - ymin) / 20.0, "density map for '" + token + "'")
    axis('equal')

    # Single formatted string: prints the same text on Python 2 and 3.
    print('saving density map for  %s to  %s' % (token, output_file))
    savefig(output_file)
def random_rot_90(k, img):
    """Randomly rotate ``img`` by ``k`` quarter turns in its last two axes.

    One uniform random number is drawn; when it exceeds 0.5, ``k`` is printed
    and a C-contiguous rotated copy is returned.  Otherwise ``img`` itself is
    returned unchanged.
    """
    if np.random.rand() <= .5:
        return img
    print(k)
    rotated = rot90(img, k, axes=(-2, -1))
    return np.ascontiguousarray(rotated)
def plot_density(self, plot_filename="out/density.png"):
    """Plot a kernel-density map of the loaded points and save it.

    ``self.load_data()`` must return ``(x, y, labels)``.  When
    ``self.has_labels`` is true, up to ``self.max_labels`` labels are drawn
    at their coordinates; otherwise the points are drawn as dots.

    Parameters
    ----------
    plot_filename : str
        Path handed to ``savefig``.
    """
    x, y, labels = self.load_data()

    figure(figsize=(self.fig_width, self.fig_height), dpi=80)

    # Perform a kernel density estimate of the coords in data.
    # This section (down to imshow) can be removed if no density map is needed.
    space_factor = 1.2

    def _lower(value):
        # Push a lower bound outwards whatever its sign.  Multiplying a
        # positive minimum by 1.2 would move it *into* the data range.
        return space_factor * value if value < 0 else value / space_factor

    def _upper(value):
        # Same idea for an upper bound.
        return space_factor * value if value > 0 else value / space_factor

    xmin = _lower(x.min())
    xmax = _upper(x.max())
    ymin = _lower(y.min())
    ymax = _upper(y.max())

    X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = c_[X.ravel(), Y.ravel()]
    values = c_[x, y]
    kernel = stats.gaussian_kde(values.T)
    Z = reshape(kernel(positions.T).T, X.T.shape)
    imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax])

    if self.has_labels:
        # Never index past the shortest of the parallel sequences.
        num_labels_to_plot = min(len(labels), self.max_labels, len(x), len(y))
        for i in range(num_labels_to_plot):
            # assumes coordinate order matches label order
            text(x[i], y[i], labels[i])
    else:
        plot(x, y, "k.", markersize=1)

    axis("equal")
    axis("off")
    savefig(plot_filename)
    print("wrote %s" % (plot_filename))
def get_text(images, patterns):
    """Recognise characters in each image by FFT template matching.

    Each value of ``images`` unpacks into two arrays; both are deskewed.
    Every pattern (``patterns[key][0]`` is the template, ``patterns[key][1]``
    the character) is cross-correlated with the first array.  Positions whose
    correlation is at least 99% of the maximum count as hits: the character
    is stored per text line/column, and a 10x10 patch ending at the hit is
    painted white on the second array.  The characters are then assembled
    into one string per text line, with a space wherever neighbouring
    characters are further apart than ``line_height``, and passed to
    ``save_results(image, imn, lnns)``.

    ``line_height`` and ``save_results`` are module-level names.
    """
    from lab.lab9.zad2.deskew import deskew

    for image, (im, imn) in images.items():
        line_map = {}
        im = deskew(im)
        imn = deskew(imn)

        def record_hit(row, col, char):
            # Store the character under its (text line, column) position.
            half = line_height // 2
            text_line = (row - half) // line_height
            line_map.setdefault(text_line, {})[col - half] = char
            # Paint the matched patch white on the output image.
            for px in range(row - 10, row):
                for py in range(col - 10, col):
                    for channel in (0, 1, 2):
                        imn[px, py][channel] = 255

        for entry in patterns.values():
            template = entry[0]
            char = entry[1]

            # Correlation == convolution with the 180-degree rotated template.
            template_spectrum = np.fft.fft2(rot90(template, 2), im.shape)
            image_spectrum = np.fft.fft2(im)
            corr = np.abs(np.fft.ifft2(template_spectrum * image_spectrum))
            corr = corr.astype(float)

            # Keep only the responses within 1% of the peak.
            corr[corr < 0.99 * np.amax(corr)] = 0

            # argwhere walks the array in the same row-major order as a
            # nested row/column loop.
            for hit_row, hit_col in np.argwhere(corr > 0):
                hit_row, hit_col = int(hit_row), int(hit_col)
                print(corr[hit_row, hit_col])
                record_hit(hit_row, hit_col, char)

        lnns = []
        for text_line in sorted(line_map):
            columns = sorted(line_map[text_line])
            pieces = []
            for idx, column in enumerate(columns):
                pieces.append(line_map[text_line][column])
                has_next = idx + 1 < len(columns)
                if has_next and abs(columns[idx] - columns[idx + 1]) > line_height:
                    pieces.append(' ')
            lnns.append(''.join(pieces))

        save_results(image, imn, lnns)
def contour(self, p, dat):
    """Draw a KDE density image of ``dat`` on the unit square using ``p``.

    ``dat[0, :]`` and ``dat[1, :]`` are the x and y samples.  The density is
    evaluated on a 100x100 grid over [0, 1] x [0, 1], shown with
    ``p.imshow``, overlaid with the samples as red dots, and the axes are
    fixed to the unit square.
    """
    xs = dat[0, :]
    ys = dat[1, :]

    grid_x, grid_y = mgrid[0.0:1.0:100j, 0.0:1.0:100j]
    grid_points = c_[grid_x.ravel(), grid_y.ravel()]
    samples = c_[xs, ys]

    estimator = stats.kde.gaussian_kde(samples.T)
    density = reshape(estimator(grid_points.T).T, grid_x.T.shape)

    p.imshow(rot90(density), cmap=p.cm.YlGnBu, extent=[0, 1, 0, 1])
    p.plot(xs, ys, 'r.')
    p.axis([0.0, 1.0, 0.0, 1.0])
def plotmapheat(heat, title, outpath, plotdots=False):
    """Save a heat map with place markers as a PNG file.

    ``heat`` is rotated by 90 degrees and drawn over the module-level
    ``extent``.  With ``plotdots`` the module-level ``lons``/``lats`` points
    are overlaid and the output resolution is raised from 150 to 300 dpi.
    Each entry of the module-level ``placemarkers`` is read as
    (lat, lon, label), drawn as a dot and annotated with its label.
    """
    plt.figure(figsize=(4, 5))
    plt.imshow(rot90(heat), cmap=cm.gist_earth_r, extent=extent, aspect='auto')

    if plotdots:
        plt.plot(lons, lats, ',', markersize=0.2, color=(0, 0, 0, 0.3))
        dpi = 300
    else:
        dpi = 150

    plt.title(title, fontsize=10)

    # city markers:
    marker_lons = [pm[1] for pm in placemarkers]
    marker_lats = [pm[0] for pm in placemarkers]
    plt.plot(marker_lons, marker_lats, '.', markersize=2, color=(1, 1, 1, 0.3))
    for pm in placemarkers:
        plt.text(pm[1], pm[0], pm[2], {'fontsize': 4}, color=(0.95, 0.95, 0.75, 0.3))

    plt.savefig(outpath, papertype='A4', format='png', dpi=dpi)
def show_planes(im):
    r"""
    Create a quick montage showing a 3D image in all three directions

    Parameters
    ----------
    im : ND-array
        A 3D image of the porous material

    Returns
    -------
    image : ND-array
        A 2D array containing the views.  This single image can be viewed
        using ``matplotlib.pyplot.imshow``.

    Raises
    ------
    Exception
        If ``im`` has fewer than 3 dimensions.

    """
    # The NumPy aliases of the scipy namespace (sp.array, sp.zeros, sp.rot90,
    # sp.squeeze) are deprecated/removed, so call NumPy directly.
    import numpy as np

    if im.ndim < 3:
        raise Exception('This view is only necessary for 3D images')

    # Index of the middle slice along each axis.
    x, y, z = (np.array(im.shape) / 2).astype(int)
    im_xy = im[:, :, z]
    im_xz = im[:, y, :]
    im_yz = np.rot90(im[x, :, :])

    # Canvas: the three views plus a 10-pixel border and 10-pixel gaps.
    new_x = im_xy.shape[0] + im_yz.shape[0] + 10
    new_y = im_xy.shape[1] + im_xz.shape[1] + 10
    new_im = np.zeros([new_x + 20, new_y + 20], dtype=im.dtype)

    # xy view in the first row/column block
    new_im[10:im_xy.shape[0] + 10, 10:im_xy.shape[1] + 10] = im_xy

    x_off = im_xy.shape[0] + 20
    y_off = im_xy.shape[1] + 20

    # xz view: same rows as xy, shifted along the second axis
    new_im[10:10 + im_xz.shape[0], y_off:y_off + im_xz.shape[1]] = im_xz

    # yz view: same columns as xy, shifted along the first axis
    new_im[x_off:x_off + im_yz.shape[0], 10:10 + im_yz.shape[1]] = im_yz

    return new_im
# now combine delaunay with KDE colours = np.zeros((gridsize, gridsize, 4)) kdeZmin = np.min(kdeZ) kdeZmax = np.max(kdeZ) confdepth = 0.45 for x in range(gridsize): for y in range(gridsize): conf = (kdeZ[x, y] - kdeZmin) / (kdeZmax - kdeZmin) val = min(1., max(0., interped[x, y])) colour = list(cm.rainbow(val)) # now fade it out to white according to conf for index in [0, 1, 2]: colour[index] = (colour[index] * conf) + (1.0 * (1. - conf)) colours[x, y, :] = colour #colours[x,y,:] = np.hstack((hls_to_rgb(val, 0.5 + confdepth - (confdepth * conf), 1.0), 1.0)) #colours[x,y,:] = [conf, conf, 1.0-conf, val] print colours plt.imshow(rot90(colours), cmap=cm.rainbow, norm=LogNorm(\ vmin=zmin, vmax=zmax)) plt.title("interpolated & confidence-shaded") plt.ylim([ymin, ymax]) plt.xlim([xmin, xmax]) plt.xticks(fontsize=fontsize) plt.yticks(fontsize=fontsize) ############################################ plt.savefig("plot_heati_simple.svg", format='SVG')
def gcbias(filelist, fileoutlist, bedfilelist):
    """Draw mean coverage as a function of GC content, one figure per BAM file.

    Input:
        filelist: list of strings, each containing the full path of the bam
            file to analyze.
        fileoutlist: list of strings, each containing the full path of the
            file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the path of the bed
            file used for the bam file at the same index.

    Output: one figure per bam file, saved at the matching ``fileoutlist``
        entry, showing a density map of GC content (%) against mean coverage.

    Relies on the module-level names ``normalize``, ``TMP``, ``BEDTOOLSPATH``,
    ``REF``, ``run`` and ``region_coverage``.
    """
    pid = str(os.getpid())
    numpy.random.seed(1)

    bamlist = []
    for filename in filelist:
        # pysam needs a .bai index next to the bam file; create it if missing.
        if not os.path.isfile(filename + '.bai'):
            print('Creating index for ' + filename)
            pysam.index(filename)
            print(' Done.')
        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()
    print('The smaller bam is ' + filelist[sizes.argmin()] + ' and contains ' + str(minsize) + ' reads.')

    # Process each file and draw its figure.
    for i, bamfile in enumerate(bamlist):
        print('Processing ' + bamfile.filename)
        print('Results will be written at ' + fileoutlist[i])

        # Optionally downsample every bam to the size of the smallest one.
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        coveragefile = TMP + '/' + pid + '.coverage'
        print('Calculating coverage per position...')
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename + ' -b ' + bedfilelist[i] + ' > ' + coveragefile)
        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each
        # exon, keyed by (chromosome, start, end) and expressed in percent.
        gccontent = {}
        for entry in ntcontent:
            region = (entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))
            gccontent[region] = float(entry.fields[-8]) * 100
        print(' Done.')

        region_ids = list(coverage)
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimate on the results.
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per file (a second, unused figure used to
        # be opened here and was never closed).
        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # The extent must match the grid the density was evaluated on,
        # otherwise the image is stretched and the x axis is mislabelled.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r,
                       extent=[xmin, xmax, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def rot90(self, i):
    """Rotate every array in ``self.data`` by ``i`` quarter turns, in place.

    When ``CXP.reconstruction.verbose`` is set, the rotation angle in degrees
    is logged first.
    """
    verbose = CXP.reconstruction.verbose
    if verbose:
        angle = i * 90
        CXP.log.info('Rotating data by {:d}'.format(angle))
    for index, frame in enumerate(self.data):
        rotated = sp.rot90(frame, i)
        self.data[index] = rotated
def math_func(table):
    """Test a table of expert scores with Kendall's coefficient of concordance.

    ``table`` has one row per expert and one column per quality, e.g.::

        table = [[1, 2], [2, 3]]

        |               | First quality | Second quality |
        | First expert  | 2             | 3              |
        | Second expert | 2             | 4              |

    Returns
    -------
    bool
        ``False`` when the chi-square statistic ``W * m * (n - 1)`` is greater
        than or equal to the critical value at significance level 0.05 with
        ``n - 1`` degrees of freedom, ``True`` otherwise.
        NOTE(review): a statistic at or above the critical value normally
        means the concordance is significant -- confirm the polarity callers
        expect.
    """
    # The NumPy aliases of the scipy namespace (scipy.array, scipy.sum,
    # scipy.average, scipy.rot90) are deprecated/removed; use NumPy directly.
    import numpy as np

    print(table)

    # 1. Convert each expert's scores into ranks (ties get the average rank).
    ranks = np.array([scipy.stats.mstats.rankdata(a) for a in table])
    m, n = ranks.shape  # number of experts, number of qualities

    # Rank 1 corresponds to the highest score.  A rank of 0 (which
    # mstats.rankdata uses for masked entries) is left at 0.
    rank_table = np.where(ranks > 0, n + 1 - ranks, 0)
    print(rank_table)

    # Critical point of the chi-square distribution for significance level
    # alpha = 0.05 and K = n - 1 degrees of freedom.
    critical_chi = scipy.stats.chi2.isf(0.05, n - 1)

    # 2. Rank normalisation must hold: each expert's ranks sum to the sum of
    #    the natural numbers 1..n (guaranteed by average ranks for ties).

    # 3. Sum of ranks for each quality (column sums).
    sum_rank_for_q = rank_table.sum(axis=0)

    # 6. Mean value of the per-quality rank sums.  Float literal so the
    #    division is not truncated under Python 2.
    average_value_sum_r = m * (n + 1) / 2.0

    # 7.-9. Sum of squared deviations of the rank sums from their mean.
    actual_deviation = sum_rank_for_q - average_value_sum_r
    sum_square = (actual_deviation ** 2).sum()

    # 10. Largest possible value of that sum of squares.
    max_sum_square = m ** 2 * (n ** 3 - n) / 12.0

    # 11. Sample value of Kendall's coefficient of concordance.
    W = sum_square / max_sum_square

    # 12. Sample value of Pearson's chi-square.
    chi2 = W * m * (n - 1)
    print("chi2", chi2)
    print("critical", critical_chi)

    # 13. Compare the statistic with the critical value.
    if chi2 >= critical_chi:
        return False
    return True
def gcbias_lite(coveragefile, bedfilename, reference, fileout, graphtitle=None, executiongranted=None, status=None, bedTools=False):
    """Draw mean coverage as a function of GC content.

    Improved version of gcbias that can avoid the use of bedtools (pybedtools).

    Input:
        coveragefile: string containing the full path of the bam.coverage file
            to analyze. This file has been built according to 1-base format.
        bedfilename: target file -> assumes original-standard bed file.
        reference: fasta file with reference genome.
        fileout: string containing the full path of the file where the
            resulting figure will be saved.
        graphtitle: optional figure title; titles longer than 25 characters
            are truncated and suffixed with '...'. No title is set when None.
        executiongranted: optional lock/semaphore-like object; it is acquired
            on entry and always released on exit, including error exits.
        status: optional object whose ``value`` attribute is set to True when
            the mean GC content lies in [45, 55], False otherwise.
        bedTools: whether pybedtools is used instead of the own method.

    Output: a figure saved at ``fileout`` showing a density map of GC content
        against mean coverage. Nothing is drawn when the coverage file holds
        at most one region.

    Calls ``sys.exit(1)`` when the own method cannot compute any GC content.
    """
    if executiongranted is not None:
        executiongranted.acquire()

    try:
        coverage = region_coverage(coveragefile)  # Mean coverage per region

        if len(coverage) <= 1:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
            return

        if not bedTools:  # Own method
            # Chromosomes to be examined (the ones contained in the target file)
            chromosomes = set([currentKey[0] for currentKey in coverage])

            # Coverage information is in 1-base format, so the BED file is
            # transformed to base 1 (non-standard BED) before being loaded.
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1
            finalBed = nonOverlappingBed.my_sort_bed()
            finalBed.load_custom(-1)  # Load chromosomes and positions in base 1

            # Scan the FASTA file, accumulating the sequence of each wanted
            # chromosome and measuring its GC content once fully read.
            storeSequence = False
            sequenceParts = []  # joined once per chromosome (linear time)
            currentChromosome = ''
            gccontent = {}
            with open(reference, 'r') as fastaFile:
                for line in fastaFile:
                    if line.startswith('>'):  # Header: a new chromosome starts
                        if storeSequence:  # Previous chromosome is complete
                            gccontent.update(measureGCbias(''.join(sequenceParts), currentChromosome, finalBed))
                            storeSequence = False
                        # Format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                        currentChromosome = re.split(' +', line)[0]
                        currentChromosome = currentChromosome.split('>')[1].strip()
                        if currentChromosome in chromosomes:
                            storeSequence = True
                            sequenceParts = []
                    elif storeSequence and '>' not in line:
                        sequenceParts.append(line.rstrip())  # Drop the newline
                if storeSequence:  # For the last chromosome
                    gccontent.update(measureGCbias(''.join(sequenceParts), currentChromosome, finalBed))

            region_ids = list(coverage)

            if len(gccontent) == 0:
                print('ERROR: G+C content values can not be calculated. Probably the provided reference file ' + reference + ' does not match with ')
                print(' the target file ' + bedfilename + '. That is, sequences of regions in the target file are probably not included within the')
                print(' reference file.')
                sys.exit(1)
        else:
            print('Calculating nt content by means of pybedtools...')
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
            bedfd = pybedtools.BedTool(finalBed.filename)
            # Remove negative coordinates or features with length=0, which do
            # not work with bedtools.
            bedfd = bedfd.remove_invalid()
            pybedtools._bedtools_installed = True
            pybedtools.set_bedtools_path(BEDTOOLSPATH)
            ntcontent = bedfd.nucleotide_content(reference)

            # gccontent keys: (chromosome, exon init, exon end); values in %.
            gccontent = {}
            for entry in ntcontent:
                region = (entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))
                gccontent[region] = float(entry.fields[-8]) * 100
            print(' Done.')

            # Zero-length regions were dropped by remove_invalid above, so
            # skip them here as well.
            region_ids = [currentKey for currentKey in coverage if currentKey[1] != currentKey[2]]

        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimate on the results.
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # The extent must match the grid the density was evaluated on,
        # otherwise the image is stretched and the x axis is mislabelled.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r,
                       extent=[xmin, xmax, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')

        # graphtitle defaults to None; len(None) used to raise TypeError.
        if graphtitle is not None:
            if len(graphtitle) > 25:
                ax.set_title(graphtitle[:25] + '...')
            else:
                ax.set_title(graphtitle)

        fig.savefig(fileout)
        matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)
    finally:
        # Always hand the slot back, even on sys.exit() or an exception.
        if executiongranted is not None:
            executiongranted.release()
def rot90(self, i):
    """Rotate every array in ``self.data`` by ``i`` quarter turns, in place.

    When ``CXP.reconstruction.verbose`` is set, the rotation angle in degrees
    is logged first.
    """
    verbose = CXP.reconstruction.verbose
    if verbose:
        angle = i*90
        CXP.log.info('Rotating data by {:d}'.format(angle))
    for index, frame in enumerate(self.data):
        rotated = sp.rot90(frame, i)
        self.data[index] = rotated
def gcbias_lite(coveragefile,
                bedfilename,
                reference,
                fileout,
                graphtitle=None,
                executiongranted=None,
                status=None,
                bedTools=False):
    """Draw mean coverage as a function of GC content.

    Improved version of gcbias that can avoid the use of bedtools (pybedtools).

    Input:
        coveragefile: string containing the full path of the bam.coverage file
            to analyze. This file has been built according to 1-base format.
        bedfilename: target file -> assumes original-standard bed file.
        reference: fasta file with reference genome.
        fileout: string containing the full path of the file where the
            resulting figure will be saved.
        graphtitle: optional figure title; titles longer than 25 characters
            are truncated and suffixed with '...'. No title is set when None.
        executiongranted: optional lock/semaphore-like object; it is acquired
            on entry and always released on exit, including error exits.
        status: optional object whose ``value`` attribute is set to True when
            the mean GC content lies in [45, 55], False otherwise.
        bedTools: whether pybedtools is used instead of the own method.

    Output: a figure saved at ``fileout`` showing a density map of GC content
        against mean coverage. Nothing is drawn when the coverage file holds
        at most one region.

    Calls ``sys.exit(1)`` when the own method cannot compute any GC content.
    """
    if executiongranted is not None:
        executiongranted.acquire()

    try:
        # Calculate mean coverage per region
        coverage = region_coverage(coveragefile)

        if len(coverage) <= 1:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
            return

        if not bedTools:  # Own method
            # Chromosomes to be examined (the ones contained in the target file)
            chromosomes = set([currentKey[0] for currentKey in coverage])

            # Coverage information is in 1-base format, so the BED file is
            # transformed to base 1 (non-standard BED) before being loaded.
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1
            finalBed = nonOverlappingBed.my_sort_bed()
            finalBed.load_custom(-1)  # Load chromosomes and positions in base 1

            # Scan the FASTA file, accumulating the sequence of each wanted
            # chromosome and measuring its GC content once fully read.
            storeSequence = False
            sequenceParts = []  # joined once per chromosome (linear time)
            currentChromosome = ''
            gccontent = {}
            with open(reference, 'r') as fastaFile:
                for line in fastaFile:
                    if line.startswith('>'):  # Header: a new chromosome starts
                        if storeSequence:  # Previous chromosome is complete
                            gccontent.update(
                                measureGCbias(''.join(sequenceParts),
                                              currentChromosome, finalBed))
                            storeSequence = False
                        # Format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                        currentChromosome = re.split(' +', line)[0]
                        currentChromosome = currentChromosome.split('>')[1].strip()
                        if currentChromosome in chromosomes:
                            storeSequence = True
                            sequenceParts = []
                    elif storeSequence and '>' not in line:
                        sequenceParts.append(line.rstrip())  # Drop the newline
                if storeSequence:  # For the last chromosome
                    gccontent.update(
                        measureGCbias(''.join(sequenceParts),
                                      currentChromosome, finalBed))

            region_ids = list(coverage)

            if len(gccontent) == 0:
                print('ERROR: G+C content values can not be calculated. Probably the provided reference file ' + reference + ' does not match with ')
                print(' the target file ' + bedfilename + '. That is, sequences of regions in the target file are probably not included within the')
                print(' reference file.')
                sys.exit(1)
        else:
            print('Calculating nt content by means of pybedtools...')
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
            bedfd = pybedtools.BedTool(finalBed.filename)
            # Remove negative coordinates or features with length=0, which do
            # not work with bedtools.
            bedfd = bedfd.remove_invalid()
            pybedtools._bedtools_installed = True
            pybedtools.set_bedtools_path(BEDTOOLSPATH)
            ntcontent = bedfd.nucleotide_content(reference)

            # gccontent keys: (chromosome, exon init, exon end); values in %.
            gccontent = {}
            for entry in ntcontent:
                region = (entry.fields[0], int(entry.fields[1]),
                          int(entry.fields[2]))
                gccontent[region] = float(entry.fields[-8]) * 100
            print(' Done.')

            # Zero-length regions were dropped by remove_invalid above, so
            # skip them here as well.
            region_ids = [
                currentKey for currentKey in coverage
                if currentKey[1] != currentKey[2]
            ]

        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array(
            [gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimate on the results.
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # The extent must match the grid the density was evaluated on,
        # otherwise the image is stretched and the x axis is mislabelled.
        sc = ax.imshow(
            rot90(Z),
            cmap=cm.gist_earth_r,
            extent=[xmin, xmax, ymin, ymax],
            aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')

        # graphtitle defaults to None; len(None) used to raise TypeError.
        if graphtitle is not None:
            if len(graphtitle) > 25:
                ax.set_title(graphtitle[:25] + '...')
            else:
                ax.set_title(graphtitle)

        fig.savefig(fileout)
        matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)
    finally:
        # Always hand the slot back, even on sys.exit() or an exception.
        if executiongranted is not None:
            executiongranted.release()
def gcbias(filelist, fileoutlist, bedfilelist):
    """Draw mean coverage as a function of GC content, one figure per BAM file.

    Input:
        filelist: list of strings, each containing the full path of the bam
            file to analyze.
        fileoutlist: list of strings, each containing the full path of the
            file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the path of the bed
            file used for the bam file at the same index.

    Output: one figure per bam file, saved at the matching ``fileoutlist``
        entry, showing a density map of GC content (%) against mean coverage.

    Relies on the module-level names ``normalize``, ``TMP``, ``BEDTOOLSPATH``,
    ``REF``, ``run`` and ``region_coverage``.
    """
    pid = str(os.getpid())
    numpy.random.seed(1)

    bamlist = []
    for filename in filelist:
        # pysam needs a .bai index next to the bam file; create it if missing.
        if not os.path.isfile(filename+'.bai'):
            print('Creating index for '+filename)
            pysam.index(filename)
            print(' Done.')
        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()
    print('The smaller bam is '+filelist[sizes.argmin()]+' and contains '+str(minsize)+' reads.')

    # Process each file and draw its figure.
    for i, bamfile in enumerate(bamlist):
        print('Processing '+bamfile.filename)
        print('Results will be written at '+fileoutlist[i])

        # Optionally downsample every bam to the size of the smallest one.
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        coveragefile = TMP+'/'+pid+'.coverage'
        print('Calculating coverage per position...')
        run(BEDTOOLSPATH+'coverageBed -d -abam '+normalizedbam.filename+' -b '+bedfilelist[i]+' > '+coveragefile)
        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each
        # exon, keyed by (chromosome, start, end) and expressed in percent.
        gccontent = {}
        for entry in ntcontent:
            region = (entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))
            gccontent[region] = float(entry.fields[-8])*100
        print(' Done.')

        region_ids = list(coverage)
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimate on the results.
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per file (a second, unused figure used to
        # be opened here and was never closed).
        fig = pyplot.figure(figsize=(6,6))
        ax = fig.add_subplot(111)
        # The extent must match the grid the density was evaluated on,
        # otherwise the image is stretched and the x axis is mislabelled.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r,
                       extent=[xmin, xmax, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low','High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def fftcorrelatend(template, A):
    """Cross-correlate ``template`` with ``A`` through ``fftconvolve``.

    The template is rotated by 180 degrees in its first two axes, which turns
    the convolution into a 2D correlation.
    """
    flipped_template = rot90(template, 2)
    return fftconvolve(flipped_template, A)
# Script fragment: finishes the "scatter" panel, then draws the point-density
# panel and prepares the interpolated grid.  xmin, xmax, ymin, ymax, gridsize,
# fontsize and data are expected to be defined earlier in the script; rows 0
# and 1 of data are used as coordinates and row 2 as the value to interpolate.
plt.title("scatter", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)

##################################################
# now make a KDE of it and plot that
fig = plt.subplot(2,2,2)
# gridsize x gridsize evaluation grid over the plotted area
kdeX, kdeY = mgrid[xmin:xmax:gridsize*1j, ymin:ymax:gridsize*1j]
positions = c_[kdeX.ravel(), kdeY.ravel()]
values = c_[data[0,:], data[1,:]]
kernel = stats.kde.gaussian_kde(values.T)
# density evaluated on the grid
kdeZ = reshape(kernel(positions.T).T, kdeX.T.shape)
plt.imshow(rot90(kdeZ), cmap=cm.binary, aspect='auto')
plt.title("density of points", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)

##################################################
# now make a delaunay triangulation of it and plot that
fig = plt.subplot(2,2,3)
# NOTE(review): matplotlib.delaunay is not available in current matplotlib
# releases -- confirm the matplotlib version this script targets.
tt = matplotlib.delaunay.triangulate.Triangulation(data[0,:], data[1,:])
#triang = tri.Triangulation(data[0,:], data[1,:])
#plt.triplot(triang, 'bo-')
# this plots the actual triangles of the triangulation. I'm more interested in their interpolated values
#extrap = tt.linear_extrapolator(data[2,:])
extrap = tt.nn_extrapolator(data[2,:])
# values of data[2,:] interpolated on the same grid as the KDE
interped = extrap[xmin:xmax:gridsize*1j, ymin:ymax:gridsize*1j]
def fftcorrelate2d(template, A):
    """Cross-correlate ``template`` with ``A`` through ``fftconvolve2d``.

    The template is rotated by 180 degrees, which turns the convolution into
    a 2D correlation.
    """
    flipped_template = rot90(template, 2)
    return fftconvolve2d(flipped_template, A)
# Script fragment: builds an RGBA image in which the hue encodes the
# interpolated value and the fade towards white encodes low point density.
# gridsize, kdeZ, interped, zmin, zmax, xmin, xmax, ymin, ymax and fontsize
# are expected to be defined earlier in the script.
colours = np.zeros((gridsize, gridsize, 4))
kdeZmin = np.min(kdeZ)
kdeZmax = np.max(kdeZ)
confdepth = 0.45  # only referenced by the commented-out HLS variant below
for x in range(gridsize):
    for y in range(gridsize):
        # Density rescaled to [0, 1].
        # NOTE(review): divides by zero when kdeZ is constant.
        conf = (kdeZ[x,y] - kdeZmin) / (kdeZmax - kdeZmin)
        # Clamp the interpolated value into the colormap's [0, 1] domain.
        val = min(1., max(0., interped[x,y]))
        colour = list(cm.rainbow(val))
        # now fade it out to white according to conf
        # (conf == 1 keeps the colour, conf == 0 gives pure white)
        for index in [0,1,2]:
            colour[index] = (colour[index] * conf) + (1.0 * (1. -conf))
        colours[x,y,:] = colour
        #colours[x,y,:] = np.hstack((hls_to_rgb(val, 0.5 + confdepth - (confdepth * conf), 1.0), 1.0))
        #colours[x,y,:] = [conf, conf, 1.0-conf, val]
print colours
# NOTE(review): colours is already RGBA; matplotlib documents cmap and norm
# as ignored for RGB(A) input -- confirm they can be dropped.
plt.imshow(rot90(colours), cmap=cm.rainbow, norm=LogNorm(\
    vmin=zmin, vmax=zmax))
plt.title("interpolated & confidence-shaded")
plt.ylim([ymin,ymax])
plt.xlim([xmin,xmax])
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)
############################################
plt.savefig("plot_heati_simple.svg", format='SVG')
# The array helpers are NumPy objects; importing them through the top-level
# scipy namespace is deprecated, so take them from numpy directly.
from numpy import c_, mgrid, random, reshape, rot90
from scipy import stats


def measure(n):
    """Measurement model, return two coupled measurements.

    Draws ``n`` samples ``m1`` from a standard normal and ``n`` samples
    ``m2`` from a normal with scale 0.5, and returns the pair
    ``(m1 + m2, m1 - m2)``.
    """
    m1 = random.normal(size=n)
    m2 = random.normal(scale=0.5, size=n)
    return m1 + m2, m1 - m2


# Draw experiments and plot the results
m1, m2 = measure(500)
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()  # the y range belongs to m2 (was taken from m1 by mistake)

# Perform a kernel density estimator on the results
X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = c_[X.ravel(), Y.ravel()]
values = c_[m1, m2]
# gaussian_kde is the public name; the stats.kde submodule path is deprecated.
kernel = stats.gaussian_kde(values.T)
Z = reshape(kernel(positions.T).T, X.T.shape)

figure(figsize=(3, 3))
# rot90 puts x along columns with y increasing upwards, matching extent.
imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax])
plot(m1, m2, 'k.', markersize=2)
show()
def __init__(self, **kw):
    """Build a discrete paraboloid of revolution and plot it with gnuplot.

    Keyword arguments (all optional):
        E: edge halflength of the containing cube (default 12.0).
        dE: edge increment length (default 0.2).
        dR: radial increment for coincidence (default 0.1).
        verbose: when true, print each newly stored ring and its grid mask.

    Side effects: writes ``graph/E<E>e<dE>r<dR>.dat`` (a gnuplot script with
    inline vector data) and the matching ``.obj`` vertex list, then runs
    ``gnuplot`` on the ``.dat`` file.  The ``graph`` directory is not
    created here.
    """
    self.kw = kw
    self.E = kw.get('E', 12.0)    # Edge halflength of the containing cube.
    self.dE = kw.get('dE', 0.2)   # Edge increment length.
    self.dR = kw.get('dR', 0.1)   # Radial increment for coincidence.

    # Edge values, shifted so the first and last value are symmetric about 0.
    # A new array is built (no in-place "-=") so integer E/dE keep working.
    rng = arange(-self.E, self.E + self.dE, self.dE)
    rng = rng - (rng[-1] + rng[0]) / 2.0

    self.X = array([rng] * len(rng))        # x values plane segment
    self.Y = rot90(self.X)                  # y values plane segment
    self.r = sqrt(self.X**2 + self.Y**2)    # r values plane segment

    # persistent information about nodes, rings of nodes, and edges between
    self.ring = {}      # radius: ring dictionary (duplicates)
    self.unique = {}    # radius: ring dictionary (unique)
    self.edges = []     # pairs of vertices (not filled in by this method)
    self.point = {}     # unique points in paraboloid
    self.keyed = {}     # keyed access to rings of unique points
    last = None

    # Build a ring stack comprising a discrete paraboloid of revolution.
    z = -self.dE
    zdiv = 30
    self.zmax = 0
    while True:
        z += self.dE
        self.zmax = z / zdiv
        r = around(sqrt(z / self.E), decimals=1)
        if r > self.E:
            break
        Q = fabs(self.r - r)    # distance of every grid cell from radius r
        near = Q <= self.dR
        # NOTE(review): "if c" drops cells whose rounded distance is exactly
        # 0 -- confirm that excluding on-ring cells is intended.
        this = [(x, y, z) for x, y, c in zip(
                    around(self.X[near], decimals=2).flatten(),
                    around(self.Y[near], decimals=2).flatten(),
                    around(Q[near], decimals=2).flatten())
                if c]
        if last is None or this != last:
            # TODO: connect vertices from last to this
            last = this
            self.ring[r] = this
            self.unique[r] = this
            self.keyed[r] = []
            for i, j, k in this:
                xykey = '%+3.2e %+3.2e' % (i, j)
                # Test for novelty BEFORE recording the point; testing after
                # the insert made this branch unreachable.
                if xykey not in self.point:
                    n = sqrt(i*i + j*j)
                    n += float(n == 0.0)    # same zero-norm guard as the writer below
                    self.keyed[r].append(
                        {'key': xykey, 'xy': (i, j), 'tip': (i/n, j/n)})
                # Keep a dictionary of unique points.
                self.point[xykey] = (i, j)
            if kw.get('verbose', False):
                print('%s %s' % (r, array(self.ring[r])))
                print(where(Q <= self.dR, '*', ' '))
        else:
            # Don't eliminate duplicates.
            self.ring[r] = last

    filename = 'graph/E%3.1fe%3.1fr%3.2f' % (self.E, self.dE, self.dR)
    #terminal = "png nocrop enhanced font verdana 12 size 640,480"
    terminal = "png nocrop size 640,480"
    print('%s to %s' % (filename, filename + '.png'))
    with open(filename + '.obj', 'w') as objfile:
        with open(filename + '.dat', 'w') as datfile:
            datfile.write("# %s (this file)\n" % (filename))
            datfile.write("# %f %f %f\n" % (self.E, self.dE, self.dR))
            datfile.write("# %d points\n" % (len(self.point)))
            datfile.write("# %d z values\n" % (len(self.ring.keys())))
            datfile.write("# %d (x,y) sets\n" % (len(self.unique.keys())))
            datfile.write("set terminal %s\n" % (terminal))
            datfile.write("set output '%s.png'\n" % (filename))
            datfile.write("set zrange[0:%d]\n" % (self.zmax + 1))
            datfile.write("set isosamples 100\n")
            datfile.write("splot '-' with vectors title 'Paraboloid'\n")
            datfile.write("\n")
            for layer in self.unique.values():
                for i, j, kl in layer:
                    n = sqrt(i*i + j*j)
                    n += float(n == 0.0)    # avoid dividing by zero at the origin
                    k = float(kl) / zdiv
                    datfile.write(
                        '%+3.2e ' * 6 % (i, j, k, i/n, j/n, 0) + '\n')
                    objfile.write(('v' + ' %+3.2e' * 3) % (i, j, k) + '\n')
    # Both files are closed (and therefore flushed) before gnuplot reads them.
    call(['gnuplot', filename + '.dat'])
def measure(n):
    """Measurement model, return two coupled measurements.

    Draws ``n`` samples ``m1`` from a standard normal and ``n`` samples
    ``m2`` from a normal with scale 0.5, and returns the pair
    ``(m1 + m2, m1 - m2)``.
    """
    m1 = random.normal(size=n)
    m2 = random.normal(scale=0.5, size=n)
    return m1 + m2, m1 - m2


# Draw experiments and plot the results
m1, m2 = measure(500)
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()  # the y range belongs to m2 (was taken from m1 by mistake)

# Perform a kernel density estimator on the results
X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = c_[X.ravel(), Y.ravel()]
values = c_[m1, m2]
# gaussian_kde is the public name; the stats.kde submodule path is deprecated.
kernel = stats.gaussian_kde(values.T)
Z = reshape(kernel(positions.T).T, X.T.shape)

figure(figsize=(3, 3))
# rot90 puts x along columns with y increasing upwards, matching extent.
imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax])
plot(m1, m2, 'k.', markersize=2)
show()
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage

# Locate copies of a small pattern inside a larger image by FFT-based
# correlation and paint the strongest responses white.
# NOTE(review): scipy.ndimage.imread has been removed from recent SciPy
# releases; this script needs an older SciPy or a replacement reader --
# confirm against the pinned version before relying on it.
pattern = ndimage.imread('/res/fish1.png', flatten=True)
im = ndimage.imread('/res/school.jpg', flatten=True)
imn = ndimage.imread('/res/school.jpg')
print(im.shape)

# Correlation == convolution with the pattern rotated by 180 degrees; the
# pattern is zero-padded to the image shape by fft2.
fp = np.fft.fft2(np.rot90(pattern, 2), im.shape)
fi = np.fft.fft2(im)
m = np.multiply(fp, fi)
corr = np.fft.ifft2(m)
corr = np.abs(corr)
corr = corr.astype(float)
print(corr.size)

i_M, j_M = corr.shape
it = corr.size  # number of pixels examined (was counted inside the loop)

# Keep only responses of at least half the maximum.
corr[corr < 0.5 * np.amax(corr)] = 0
hits = corr > 0
# Boolean indexing walks the array in row-major order, i.e. the same order
# in which the former nested i/j loops printed the values.
for value in corr[hits]:
    print(value)
# Set the first three channels of every hit pixel to white.
imn[hits, :3] = 255

plt.imshow(imn)
plt.show()
plt.title("scatter", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)

##################################################
# now make a KDE of it and plot that
# data[0, :] / data[1, :] are used as the point coordinates; the density is
# evaluated on a gridsize x gridsize lattice spanning [xmin, xmax] x [ymin, ymax].
fig = plt.subplot(2, 2, 2)
kdeX, kdeY = mgrid[xmin:xmax:gridsize * 1j, ymin:ymax:gridsize * 1j]
positions = c_[kdeX.ravel(), kdeY.ravel()]
values = c_[data[0, :], data[1, :]]
# gaussian_kde is the public entry point; reaching it through the
# stats.kde submodule is deprecated in current SciPy.
kernel = stats.gaussian_kde(values.T)
kdeZ = reshape(kernel(positions.T).T, kdeX.T.shape)
# rot90 lays the grid out with x along columns and y increasing upwards;
# extent labels the axes in data units instead of pixel indices.
plt.imshow(rot90(kdeZ), cmap=cm.binary, aspect='auto',
           extent=[xmin, xmax, ymin, ymax])
plt.title("density of points", fontsize=fontsize)
plt.xticks(fontsize=fontsize)
plt.yticks(fontsize=fontsize)

##################################################
# now make a delaunay triangulation of it and plot that
# NOTE(review): matplotlib.delaunay was deprecated in favour of matplotlib.tri
# and may be missing from the installed Matplotlib -- confirm the pinned version.
fig = plt.subplot(2, 2, 3)
tt = matplotlib.delaunay.triangulate.Triangulation(data[0, :], data[1, :])
#triang = tri.Triangulation(data[0,:], data[1,:])
#plt.triplot(triang, 'bo-')   # this plots the actual triangles of the triangulation. I'm more interested in their interpolated values
#extrap = tt.linear_extrapolator(data[2,:])
extrap = tt.nn_extrapolator(data[2, :])
# Sample the extrapolator built from data[2, :] on the same lattice as the KDE.
interped = extrap[xmin:xmax:gridsize * 1j, ymin:ymax:gridsize * 1j]