#==============================================================================
# Prepare parameters
#==============================================================================
# Build a genome-wide histogram of grouped positions: for every chromosome the
# 'LOC' values are grouped with grouper(), each group is reduced to its mean
# position (shifted by the cumulative size of the chromosomes processed so
# far), and a 10-bin histogram of those means is appended to `genome_hist`.
#
# Disabled alternative: derive the chromosome list from the data and
# natural-sort it instead of using the hard-coded order below.
#chromosomes = pd.Series.unique(data['CHR'])
#chromosomes = natsorted(chromosomes, alg=ns.IGNORECASE) #natural sorting
# NOTE(review): the order is hard-coded; going by the plot title it is meant
# to be "sorted by chromosome length" -- confirm against chr_size.
chromosomes = [
    'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chrX',
    'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
    'chr16', 'chr17', 'chr18', 'chr20', 'chrY', 'chr19', 'chr22', 'chr21',
]

ind = 0
genome_hist = []
genome_edges = []
start = 0  # cumulative offset: summed chr_size of chromosomes already processed
for chromosome in chromosomes:
    chrdata = data[data.CHR == chromosome]
    positions = chrdata['LOC']
    # Key every group yielded by grouper() with a 1-based index.
    # NOTE(review): 2000 is presumably the grouping distance threshold --
    # confirm against grouper's definition.
    out = dict(enumerate(grouper(positions, 2000), 1))
    # Mean position of each group in genome-wide coordinates.
    # Materialised as a list on purpose: np.histogram expects an array-like,
    # and a dict view (what .values() returns on Python 3) is not one.
    out_mean = [np.mean(v) + start for v in out.values()]
    hist, edges = np.histogram(out_mean, bins=10)
    # 10 bins per chromosome, concatenated in `chromosomes` order.
    genome_hist = np.hstack((genome_hist, hist))
    ind = ind + 1
    start = start + chr_size[chromosome]

plt.figure()
plt.stem(genome_hist)
plt.title('genome sorted by chromosomes length for '+sys.argv[2])
plt.xlabel('genome')
plt.ylabel('DSB counts')
#plt.show()
def dsb(z):
    """Return how many groups ``grouper`` yields for one chromosome.

    ``z`` is an indexable pair: ``z[0]`` is the value matched against
    ``data.CHR`` and ``z[1]`` is passed to ``grouper`` as its second
    argument (presumably the grouping threshold -- confirm against
    grouper's definition).
    """
    chrom_name = z[0]
    group_arg = z[1]
    loci = data[data.CHR == chrom_name]['LOC']
    # Count the yielded groups directly instead of indexing them in a dict.
    return sum(1 for _ in grouper(loci, group_arg))
#chromosomes = pd.Series.unique(data['CHR']) #chromosomes = natsorted(chromosomes, alg=ns.IGNORECASE) #natural sorting chromosomes = [ 'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chrX', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr20', 'chrY', 'chr19', 'chr22', 'chr21' ] GW_span = [] start = 0 threshold = 2000 for chromosome in chromosomes: chrdata = data[data.CHR == chromosome] positions = chrdata['LOC'] out = dict(enumerate(grouper(positions, threshold), 1)) out_span = {k: max(v) - min(v) for k, v in out.items()} out_span = out_span.values() GW_span.extend(out_span) # remove the zeros # GW_span = [x for x in GW_span if x != 0] #plt.figure() #plt.hist(GW_span,bins=100,range=[0,50000]) #plt.title('GW histogram of the max span for hotspots at threshold = ' + str(threshold)) #plt.show() outfile = '/home/garner1/Work/dataset/rm35/outdata/rm35.hotspot_diameter.threshold2000.txt' with open(outfile, 'w') as f: