Ejemplo n.º 1
0
def mode_2(exp_matrix):
    """Write one tab-separated ``region_<name>.data`` file per region set.

    Every output line describes one peak returned by the gene association:
    chromosome, start, end, the value stored for that peak and the
    comma-separated names of the genes associated with it ('NA' when the
    peak has no gene).

    Relies on the module-level names ``GenomicRegionSet``, ``gene_file`` and
    ``genome_file``.

    Args:
        exp_matrix: object whose ``get_regionsets()`` returns iterable region
            sets exposing ``name`` and ``fileName``; the regions they yield
            expose ``chrom``, ``initial``, ``final`` and ``data``.

    Raises:
        KeyError: if the association reports a peak whose coordinates were
            not seen in any region set of ``exp_matrix``.
    """
    # Remember the bedgraph value of every peak, keyed by its coordinates.
    value = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            value[(region.chrom, region.initial, region.final)] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if the association or a
        # lookup below raises.
        with open("region_" + str(region.name) + ".data", 'w') as out:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, threshDist=2000)

            for peak, assigned in gene_peaks_mapping.items():
                # Peak keys look like "<chrom>:<start>-<end>".
                chrom, raw_positions = peak.split(':')
                start, end = [int(x) for x in raw_positions.split('-')]

                # An unassigned peak shows up as an empty gene name; drop
                # every such placeholder without mutating the mapping.
                gene_names = [g for g in assigned if g != ""]
                gene_field = ','.join(gene_names) if gene_names else 'NA'

                print(chrom, start, end, value[(chrom, start, end)],
                      gene_field, sep='\t', file=out)
Ejemplo n.º 2
0
def mode_3(exp_matrix):
    """Write, per region set, the mean peak score of every associated gene.

    For each region set a file ``region_<name>.data`` is written with one
    tab-separated line per gene: gene name, the arithmetic mean of the scores
    of all peaks associated with that gene, and the ", "-joined peak ids.

    Relies on the module-level names ``GenomicRegionSet``, ``gene_file`` and
    ``genome_file``.

    Args:
        exp_matrix: object whose ``get_regionsets()`` returns iterable region
            sets exposing ``name`` and ``fileName``; the regions they yield
            expose ``chrom``, ``initial``, ``final`` and ``data``.

    Raises:
        KeyError: if the association reports a peak id that was not built
            from any region of ``exp_matrix``.
        ValueError: if a stored score cannot be converted to ``float``.
    """
    # Remember the bedgraph score of every peak, keyed "<chrom>:<start>-<end>"
    # to match the peak ids used by the gene/peak mapping.
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            peak_id = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            score[peak_id] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if something below raises.
        with open("region_" + str(region.name) + ".data", 'w') as out:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, threshDist=2000)

            # Invert the peak -> genes mapping into gene -> peaks and collect
            # the scores of all peaks assigned to each gene.
            peaks_by_gene = {}
            scores_by_gene = {}
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:  # empty name: peak without an assigned gene
                        continue
                    peaks_by_gene.setdefault(gen, set()).add(peak)
                    scores_by_gene.setdefault(gen, []).append(score[peak])

            for gen, peaks in peaks_by_gene.items():
                gene_scores = [float(x) for x in scores_by_gene[gen]]
                # Every gene has at least one score, so the divisor is > 0.
                avg = sum(gene_scores) / float(len(gene_scores))
                print(gen, avg, ", ".join(str(t) for t in peaks), sep='\t', file=out)
Ejemplo n.º 3
0
def mode_3(exp_matrix):
    """Write the average peak score per gene for each region set.

    One file ``region_<name>.data`` is produced per region set. Each line is
    tab-separated: gene name, mean score over the peaks associated with the
    gene, and those peaks' ids joined with ", ".

    Relies on the module-level names ``GenomicRegionSet``, ``gene_file`` and
    ``genome_file``.

    Args:
        exp_matrix: object whose ``get_regionsets()`` returns iterable region
            sets exposing ``name`` and ``fileName``; the regions they yield
            expose ``chrom``, ``initial``, ``final`` and ``data``.

    Raises:
        KeyError: if the association reports a peak id that was not built
            from any region of ``exp_matrix``.
        ValueError: if a stored score cannot be converted to ``float``.
    """
    # Remember the bedgraph score of each peak under the same
    # "<chrom>:<start>-<end>" id that the gene/peak mapping uses as key.
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            key = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            score[key] = region.data

    for region in exp_matrix.get_regionsets():
        # `with` guarantees the handle is released on errors as well.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, thresh_dist=2000)

            gene_scores = {}  # gene -> scores of all peaks assigned to it
            genes = {}        # gene -> set of peak ids assigned to it

            # Reverse the mapping peak -> genes into gene -> peaks. Nothing
            # mutates the mapping here, so no defensive copy is required.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:  # empty name: peak without an assigned gene
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    gene_scores.setdefault(gen, []).append(score[peak])

            for gen in genes:
                values = [float(x) for x in gene_scores[gen]]
                # A gene is only recorded together with a score, so
                # `values` is never empty here.
                avg = sum(values) / float(len(values))
                print(gen, avg, ", ".join(str(t) for t in genes[gen]), sep='\t', file=f)
Ejemplo n.º 4
0
def mode_2(exp_matrix):
    """Dump every peak of each region set with its value and associated genes.

    For each region set a file ``region_<name>.data`` is written. Each line is
    tab-separated: chromosome, start, end, the value stored for the peak and
    the comma-separated associated gene names ('NA' if there are none).

    Relies on the module-level names ``GenomicRegionSet``, ``gene_file`` and
    ``genome_file``.

    Args:
        exp_matrix: object whose ``get_regionsets()`` returns iterable region
            sets exposing ``name`` and ``fileName``; the regions they yield
            expose ``chrom``, ``initial``, ``final`` and ``data``.

    Raises:
        KeyError: if the association reports a peak whose coordinates were
            not seen in any region set of ``exp_matrix``.
    """
    # Remember the bedgraph value of each peak, keyed by (chrom, start, end).
    value = {
        (region.chrom, region.initial, region.final): region.data
        for regions in exp_matrix.get_regionsets()
        for region in regions
    }

    for region in exp_matrix.get_regionsets():
        # `with` closes the output file even when an error interrupts the loop.
        with open("region_" + str(region.name) + ".data", 'w') as out:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, thresh_dist=2000)

            for peak, assigned in gene_peaks_mapping.items():
                # Peak keys look like "<chrom>:<start>-<end>".
                chrom, raw_positions = peak.split(':')
                start, end = (int(x) for x in raw_positions.split('-'))

                # If a peak is not assigned, an empty string occurs; filter
                # those out instead of mutating the mapping.
                gene_names = [g for g in assigned if g != ""]

                # Deliberately not named `list`: binding that name anywhere in
                # the function makes it local everywhere, so any call to the
                # builtin `list(...)` would fail with UnboundLocalError.
                gene_field = ','.join(gene_names) if gene_names else 'NA'

                print(chrom, start, end, value[(chrom, start, end)],
                      gene_field, sep='\t', file=out)
Ejemplo n.º 5
0
def mode_1(exp_matrix):
    """Print the mapped-gene count and the gene list of every region set.

    For each region set returned by ``exp_matrix.get_regionsets()`` a fresh
    ``GenomicRegionSet`` is associated with genes (distance threshold 50000)
    and two lines are printed: the third value returned by the association,
    and the region set's name followed by the tab-separated ``genes`` of the
    associated set.

    Relies on the module-level names ``GenomicRegionSet``, ``gene_file`` and
    ``genome_file``.
    """
    for current in exp_matrix.get_regionsets():
        associated = GenomicRegionSet("")
        association = associated.filter_by_gene_association(
            current.fileName, None, gene_file, genome_file, thresh_dist=50000)
        # Only the third of the five returned values is reported.
        _, _, mapped_total, _, _ = association
        print('#number of mapped genes:', mapped_total)

        gene_columns = "\t".join(associated.genes)
        print(current.name + "\t" + gene_columns)
Ejemplo n.º 6
0
 allgenes.append(r.name)
# Script section (Python 2 syntax: uses the `print` statement).
# Compares, per gene set and region set, the gene counts returned by the
# gene association against counts obtained from a background peak file.

# Drop duplicate gene names.
allgenes=list(set(allgenes))

genesets=exps.get_genesets()

# Optional third command-line argument: path of a background peak BED file.
if len(sys.argv) > 3:
    back=True
    backGroundPeaks = sys.argv[3]
    backBed=GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaks)


# NOTE(review): `backGroundPeaks` is only assigned inside the `if` above, yet
# it is used unconditionally from here on -- unless it is defined earlier in
# the file, running without a third argument raises NameError. The two lines
# below also repeat the read already done inside the `if`.
backBed=GenomicRegionSet("BACK")    
backBed.read_bed(backGroundPeaks)
backUP=GenomicRegionSet("BACKUP")
# Background association; always computed against the first gene set only.
[back_de_genes,back_de_peak_genes, back_mappedGenes, back_totalPeaks] = backUP.filter_by_gene_association(backGroundPeaks,genesets[0],geneFile,genomeFile)
prop_back=back_mappedGenes/float(len(allgenes))

for g in genesets:
    for region in exps.get_regionsets():
        bed = GenomicRegionSet("")
        [degenes,de_peak_genes, mappedGenes, totalPeaks] = bed.filter_by_gene_association(region.fileName,g,geneFile,genomeFile)
        #print degenes
        #print bed.genes
        # Four counts handed to pvalue():
        #   a: de_peak_genes
        #   b: degenes minus de_peak_genes
        #   c: back_mappedGenes minus de_peak_genes
        #   d: remainder of len(allgenes)
        a=de_peak_genes
        b=degenes-de_peak_genes
        c=back_mappedGenes-de_peak_genes
        d=len(allgenes)-b-c-a
        # NOTE(review): raises ZeroDivisionError when degenes is 0.
        prop_de=de_peak_genes/float(degenes)
        # presumably a Fisher exact test on the 2x2 table (a, b, c, d), given
        # the right_tail/left_tail attributes -- confirm against the import.
        p= pvalue(a,b,c,d)
        # NOTE(review): prop_de/prop_back raises ZeroDivisionError when
        # prop_back is 0.
        print region.name,g.name,a,b,c,d,degenes,mappedGenes,len(allgenes),prop_de,prop_back,prop_de/prop_back,p.right_tail,p.left_tail
Ejemplo n.º 7
0
def mode_1(exp_matrix):
    """Report mapped genes for each region set of ``exp_matrix``.

    Every region set is run through ``filter_by_gene_association`` on a new
    ``GenomicRegionSet`` (distance threshold 50000). The third returned value
    is printed as the number of mapped genes, followed by a line holding the
    region set's name and the tab-separated ``genes`` of the new set.

    Relies on the module-level names ``GenomicRegionSet``, ``gene_file`` and
    ``genome_file``.
    """
    tab = "\t"
    for regionset in exp_matrix.get_regionsets():
        gene_holder = GenomicRegionSet("")
        result = gene_holder.filter_by_gene_association(
            regionset.fileName, None, gene_file, genome_file, threshDist=50000)
        # Five values come back; only the mapped-gene count is used here.
        (_, _, n_mapped, _, _) = result
        print('#number of mapped genes:', n_mapped)
        print(regionset.name + tab + tab.join(gene_holder.genes))
Ejemplo n.º 8
0
# Script section: prepares background counts and, per gene set and region
# set, the four counts a, b, c, d derived from the gene association.

# Drop duplicate gene names.
allgenes = list(set(allgenes))

genesets = exps.get_genesets()

# Optional third command-line argument: path of a background peak BED file.
if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaks)

# NOTE(review): `backGroundPeaks` is only assigned inside the `if` above, yet
# it is used unconditionally from here on -- unless it is defined earlier in
# the file, running without a third argument raises NameError. The two lines
# below also repeat the read already done inside the `if`.
backBed = GenomicRegionSet("BACK")
backBed.read_bed(backGroundPeaks)
backUP = GenomicRegionSet("BACKUP")
# Background association; always computed against the first gene set only.
[back_de_genes, back_de_peak_genes, back_mappedGenes,
 back_totalPeaks] = backUP.filter_by_gene_association(backGroundPeaks,
                                                      genesets[0], geneFile,
                                                      genomeFile)
prop_back = back_mappedGenes / float(len(allgenes))

for g in genesets:
    for region in exps.get_regionsets():
        bed = GenomicRegionSet("")
        [degenes, de_peak_genes, mappedGenes,
         totalPeaks] = bed.filter_by_gene_association(region.fileName, g,
                                                      geneFile, genomeFile)
        #print degenes
        #print bed.genes
        # Four counts derived from the association results:
        #   a: de_peak_genes
        #   b: degenes minus de_peak_genes
        #   c: back_mappedGenes minus de_peak_genes
        #   d: remainder of len(allgenes)
        # presumably the cells of a 2x2 contingency table -- confirm against
        # the code that consumes them.
        a = de_peak_genes
        b = degenes - de_peak_genes
        c = back_mappedGenes - de_peak_genes
        d = len(allgenes) - b - c - a
Ejemplo n.º 9
0
            print len(backBed), len(region)
            br = backBed.random_subregions(len(region))
        else:
            br = region.random_regions('hg19',
                                       total_size=len(region),
                                       overlap_result=True,
                                       overlap_input=True)

        br.write_bed(str(j) + "random.bed")
    for g in genesets:
        #print region,g
        bed = GenomicRegionSet("")
        [degenes, de_peak_genes, mappedGenes, totalPeaks,
         bla] = bed.filter_by_gene_association(region.fileName,
                                               g.genes,
                                               geneFile,
                                               genomeFile,
                                               threshDist=distance)
        randomRes = []
        #backBed=GenomicRegionSet("BACK")
        #backBed.read_bed(backGroundPeaks)
        for j, n in enumerate(range(randomize)):
            backUP = GenomicRegionSet("BACKUP")
            [
                back_de_genes, back_de_peak_genes, back_mappedGenes,
                back_totalPeaks, bla
            ] = backUP.filter_by_gene_association(str(j) + "random.bed",
                                                  g.genes,
                                                  geneFile,
                                                  genomeFile,
                                                  threshDist=distance)
Ejemplo n.º 10
0
  
# Script section (Python 2 syntax: uses the `print` statement).
# For every region set, compares the observed de_peak_genes count with the
# counts obtained from `randomize` random region sets.
print genesets

for region in exps.get_regionsets():
    # Write `randomize` random region sets to "<j>random.bed".
    # NOTE(review): the file names depend only on j, so each region set
    # overwrites the files written for the previous one.
    for j,n in enumerate(range(randomize)):
            if backGroundPeaks:
              # Draw as many random subregions from the background set as
              # the current region set has entries.
              print len(backBed), len(region)
              br=backBed.random_subregions(len(region))
            else:
              # No background given: draw random regions for 'hg19'.
              br=region.random_regions('hg19',total_size=len(region),overlap_result=True, overlap_input=True)

            br.write_bed(str(j)+"random.bed")
    for g in genesets:
        #print region,g
        bed = GenomicRegionSet("")
        # Observed association for the real region set; the fifth returned
        # value (`bla`) is not used in the visible code.
        [degenes,de_peak_genes, mappedGenes, totalPeaks,bla] = bed.filter_by_gene_association(region.fileName,g.genes,geneFile,genomeFile,threshDist=distance)
        randomRes=[]
        #backBed=GenomicRegionSet("BACK")
        #backBed.read_bed(backGroundPeaks)
        # Repeat the association on each random BED file written above and
        # collect its de_peak_genes count.
        for j,n in enumerate(range(randomize)):
            backUP=GenomicRegionSet("BACKUP")
            [back_de_genes,back_de_peak_genes, back_mappedGenes, back_totalPeaks,bla] = backUP.filter_by_gene_association(str(j)+"random.bed",g.genes,geneFile,genomeFile,threshDist=distance)
            randomRes.append(back_de_peak_genes)
            #print str(j)+"random.bed"
        randomRes=numpy.array(randomRes)
        #print randomRes
        # z-score of the observed count against the random counts.
        a=de_peak_genes
        m=numpy.mean(randomRes)
        s=numpy.std(randomRes)
        # NOTE(review): s is 0 when all random counts are equal, which makes
        # this a division by zero -- confirm how that case should be handled.
        z=(a-m)/s
        # NOTE(review): raises ZeroDivisionError when degenes is 0.
        prop_de=de_peak_genes/float(degenes)