Beispiel #1
0
#==============================================================================
# Prepare parameters
#==============================================================================
#chromosomes = pd.Series.unique(data['CHR'])
#chromosomes = natsorted(chromosomes, alg=ns.IGNORECASE) #natural sorting
# Chromosome names in the order their histograms are concatenated along the
# x axis of the stem plot drawn at the end of this section.
chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chrX','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15',
               'chr16','chr17','chr18','chr20','chrY','chr19','chr22','chr21']

ind = 0            # number of chromosomes processed so far
genome_hist = []   # per-chromosome histogram counts, concatenated in order
genome_edges = []  # never filled in this section
start = 0          # running sum of chr_size over the chromosomes already processed
for chromosome in chromosomes:
    # Rows of `data` whose CHR column equals the current chromosome name.
    chrdata = data[data.CHR == chromosome]
    positions = chrdata['LOC']
    # Number the groups yielded by grouper(positions, 2000), starting at 1.
    # NOTE(review): presumably 2000 is a distance threshold used to cluster
    # neighbouring positions -- confirm against grouper's definition.
    out = dict(enumerate(grouper(positions, 2000), 1))

    # Mean of every group, shifted by `start` so that values from different
    # chromosomes do not overlap on a common axis.
    # The result is materialised as a list on purpose: np.histogram expects
    # an array-like, and a Python 3 dict view is not converted element-wise
    # by NumPy, which made the original `.values()` call fail there.
    out_mean = [np.mean(v) + start for v in out.values()]
    hist, edges = np.histogram(out_mean, bins=10)
    genome_hist = np.hstack((genome_hist, hist))

    ind = ind + 1
    start = start + chr_size[chromosome]

# One stem per histogram bin: 10 bins for each chromosome, in list order.
plt.figure()
plt.stem(genome_hist)
plt.title('genome sorted by chromosomes length for '+sys.argv[2])
plt.xlabel('genome')
plt.ylabel('DSB counts')
#plt.show()
Beispiel #2
0
def dsb(z):
    """Return how many groups ``grouper`` yields for one entry of ``data``.

    ``z`` is an indexable pair: ``z[0]`` is compared against ``data.CHR`` to
    select rows, and ``z[1]`` is forwarded to ``grouper`` as its second
    argument.
    """
    on_chromosome = data.CHR == z[0]
    loc_values = data[on_chromosome]['LOC']
    # Each group is counted exactly once, matching the size of a dict keyed
    # by a running index over the groups.
    return sum(1 for _ in grouper(loc_values, z[1]))
Beispiel #3
0
def dsb(z):
    """Count the groups produced by ``grouper`` for the rows selected by ``z``.

    ``z[0]`` is matched against the ``CHR`` column of ``data``; ``z[1]`` is
    passed to ``grouper`` together with the ``LOC`` values of the matching
    rows. The number of resulting groups is returned.
    """
    selected_rows = data[data.CHR == z[0]]
    groups = list(grouper(selected_rows['LOC'], z[1]))
    return len(groups)
Beispiel #4
0
#==============================================================================
# Prepare parameters
#==============================================================================
#chromosomes = pd.Series.unique(data['CHR'])
#chromosomes = natsorted(chromosomes, alg=ns.IGNORECASE) #natural sorting
# Chromosome names in the order their histograms are concatenated along the
# x axis of the stem plot drawn at the end of this section.
chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chrX','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15',
               'chr16','chr17','chr18','chr20','chrY','chr19','chr22','chr21']

ind = 0            # number of chromosomes processed so far
genome_hist = []   # per-chromosome histogram counts, concatenated in order
genome_edges = []  # never filled in this section
start = 0          # running sum of chr_size over the chromosomes already processed
for chromosome in chromosomes:
    # Rows of `data` whose CHR column equals the current chromosome name.
    chrdata = data[data.CHR == chromosome]
    positions = chrdata['LOC']
    # Number the groups yielded by grouper(positions, 2000), starting at 1.
    # NOTE(review): presumably 2000 is a distance threshold used to cluster
    # neighbouring positions -- confirm against grouper's definition.
    out = dict(enumerate(grouper(positions, 2000), 1))

    # Mean of every group, shifted by `start` so that values from different
    # chromosomes do not overlap on a common axis.
    # The result is materialised as a list on purpose: np.histogram expects
    # an array-like, and a Python 3 dict view is not converted element-wise
    # by NumPy, which made the original `.values()` call fail there.
    out_mean = [np.mean(v) + start for v in out.values()]
    hist, edges = np.histogram(out_mean, bins=10)
    genome_hist = np.hstack((genome_hist, hist))

    ind = ind + 1
    start = start + chr_size[chromosome]

# One stem per histogram bin: 10 bins for each chromosome, in list order.
plt.figure()
plt.stem(genome_hist)
plt.title('genome sorted by chromosomes length for '+sys.argv[2])
plt.xlabel('genome')
plt.ylabel('DSB counts')
#plt.show()
Beispiel #5
0
#chromosomes = pd.Series.unique(data['CHR'])
#chromosomes = natsorted(chromosomes, alg=ns.IGNORECASE) #natural sorting
# Chromosome names, visited in exactly this order.
chromosomes = [
    'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chrX',
    'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
    'chr16', 'chr17', 'chr18', 'chr20', 'chrY', 'chr19', 'chr22', 'chr21',
]

threshold = 2000  # second argument handed to grouper for every chromosome
start = 0         # initialised here but not updated in this section
GW_span = []      # max - min of every group, accumulated over all chromosomes

for chromosome in chromosomes:
    chrdata = data[data.CHR == chromosome]
    positions = chrdata['LOC']

    # Key each group by a 1-based running index, in the order grouper
    # yields them.
    out = {}
    for group_index, group in enumerate(grouper(positions, threshold), 1):
        out[group_index] = group

    # Span of a group: distance between its largest and smallest value.
    out_span = [max(members) - min(members) for members in out.values()]
    GW_span += out_span

# remove the zeros
# GW_span = [x for x in GW_span if x != 0]

#plt.figure()
#plt.hist(GW_span,bins=100,range=[0,50000])
#plt.title('GW histogram of the max span for hotspots at threshold = ' + str(threshold))
#plt.show()

outfile = '/home/garner1/Work/dataset/rm35/outdata/rm35.hotspot_diameter.threshold2000.txt'
with open(outfile, 'w') as f: