Beispiel #1
0
def bed_seq_data(gap_chr_data, chr_size_file, out_bed_folder, seq_len=524288, shift=524288//2, cutting_chr_edges = 100000, test_fraction=0.2, chr_list="all"):
    """Tile chromosomes into fixed-length windows that avoid gaps and split them into train/test.

    If a previously written ``train_test_shift*.bed`` file for the same
    parameters exists in ``out_bed_folder`` it is loaded through ``BedReader``
    and returned instead of being recomputed.

    Args:
        gap_chr_data: Mapping of chromosome name to an object whose ``"start"``
            and ``"end"`` entries are iterables of gap coordinates.
            Chromosomes missing from the mapping are treated as gap-free.
        chr_size_file: Tab-separated file without header: chromosome name, size.
        out_bed_folder: Output path prefix; file names are appended to it
            verbatim, so it should end with a path separator.
        seq_len: Length of every generated window.
        shift: Distance between the starts of consecutive windows.
        cutting_chr_edges: Number of positions skipped at both chromosome ends.
        test_fraction: Fraction of each chromosome's windows labelled "test".
            The test windows form one contiguous run at a random offset.
        chr_list: ``"all"`` to use every chromosome of ``chr_size_file``, or an
            iterable of chromosome names.

    Returns:
        dict mapping chromosome name to a DataFrame with the columns
        ``chr``, ``start``, ``end`` and ``train_test`` (or ``BedReader.chr_data``
        when the dump already exists).
    """
    # File names must be derived from the caller-supplied chr_list. Building
    # them after "all" has been expanded to a pandas Series would embed the
    # Series repr in the name and the dump check below could never match.
    suffix = "_shift" + str(seq_len//shift) + "_" + str(chr_list) + ".bed"
    train_test_file = out_bed_folder + "train_test" + suffix
    train_file = out_bed_folder + "train" + suffix
    test_file = out_bed_folder + "test" + suffix

    if os.path.exists(train_test_file):
        logging.info("Found dump for seq bed file " + train_test_file)
        bed_reader = BedReader(train_test_file)
        bed_reader.read_file(renamer = {"0":"chr","1":"start","2":"end", "3":"train_test"})
        return bed_reader.chr_data

    chr_size_data = pd.read_csv(chr_size_file, header=None, names=["chr", "size"], sep="\t")
    if isinstance(chr_list, str) and chr_list == "all":
        chr_names = chr_size_data["chr"]
    else:
        chr_names = chr_list

    chr_data = dict()
    for chrom in chr_names:
        chr_size = chr_size_data[chr_size_data["chr"] == chrom]
        assert len(chr_size) == 1
        # Select the size by column label; integer keys on a label-indexed
        # Series are deprecated in pandas.
        chr_len = int(chr_size["size"].iloc[0])
        if chrom not in gap_chr_data:
            logging.getLogger(__name__).warning("There are no gaps on " + chrom + " chromosome")
            # A zero-length pseudo gap at the trimmed chromosome end lets the
            # loop below cover the whole chromosome as a single region.
            gaps = [(chr_len - cutting_chr_edges, chr_len - cutting_chr_edges)]
        else:
            gaps = list(zip(gap_chr_data[chrom]["start"], gap_chr_data[chrom]["end"]))

        starts = []
        start_seq = cutting_chr_edges
        for gap in sorted(gaps):
            end_seq = gap[0] - seq_len  # start of last seq in region between gaps
            # A region is tiled only if it leaves room for at least seq_len of
            # window starts, i.e. it spans at least 2 * seq_len.
            if end_seq - start_seq >= seq_len:
                starts.extend(range(start_seq, end_seq + 1, shift))
            start_seq = gap[1]

        # Region between the last gap and the trimmed chromosome end. This is
        # handled after the loop so it is not skipped when the region before
        # the last gap was too short.
        end_seq = chr_len - cutting_chr_edges - seq_len
        if end_seq - start_seq > seq_len:
            starts.extend(range(start_seq, end_seq, shift))

        data = pd.DataFrame({"chr": [chrom] * len(starts),
                             "start": starts,
                             "end": [start + seq_len for start in starts]})
        data["train_test"] = ["train"] * len(data)
        random.seed()
        # Mark one contiguous block of windows, at a random offset, as test.
        n_test = round(len(data) * test_fraction)
        rand_int = random.randint(0, len(data) - n_test)
        data.iloc[rand_int:rand_int + n_test, 3] = "test"
        chr_data[chrom] = data

    conc_data = pd.concat(list(chr_data.values()))
    conc_data.to_csv(train_test_file, sep="\t", header=False, index=False)
    conc_data[conc_data["train_test"] == "train"][["chr", "start", "end"]].to_csv(train_file, header=False, index=False, sep="\t")
    conc_data[conc_data["train_test"] == "test"][["chr", "start", "end"]].to_csv(test_file, header=False, index=False, sep="\t")
    return chr_data
Beispiel #2
0
 def addUTR(self, filename, genes):
     """Attach the UTR intervals from a BED file to ``genes``.

     Every record is matched to a gene by its name field. Records naming a
     gene not yet present create a new ``BedGene`` that is appended to
     ``genes`` (the list is modified in place). After all records are read,
     ``coalesce()`` is called on every gene in the list.

     Args:
         filename: Path of the BED file to read.
         genes: List of gene objects; extended in place.

     Raises:
         Exception: If a record fails ``isBed6()``.
     """
     # hashGenes() is defined elsewhere; it is used here as a mutable
     # name -> gene mapping (presumably built from `genes`).
     genesByName = self.hashGenes(genes)
     reader = BedReader(filename)
     try:
         while True:
             record = reader.nextRecord()
             if not record:
                 break
             if not record.isBed6():
                 raise Exception("BED file has too few fields")
             name = record.name
             gene = genesByName.get(name, None)
             if not gene:
                 gene = BedGene(name, record.chr, record.strand)
                 genes.append(gene)
                 genesByName[name] = gene
             gene.addUTR(record.interval)
     finally:
         # Release the reader even when a malformed record aborts the loop.
         reader.close()
     for gene in genes:
         gene.coalesce()
Beispiel #3
0
 def addUTR(self,filename,genes):
     """Read UTR records from a BED file and add them to the given genes.

     Records are matched to genes by name; an unknown name creates a new
     BedGene, which is appended to `genes` in place.  Once the file has
     been consumed, coalesce() is invoked on every gene in `genes`.

     Raises Exception if a record fails isBed6().
     """
     # Name -> gene lookup supplied by hashGenes() (defined elsewhere;
     # presumably built from `genes`).
     byName=self.hashGenes(genes)
     reader=BedReader(filename)
     try:
         while True:
             record=reader.nextRecord()
             if not record: break
             if not record.isBed6():
                 raise Exception("BED file has too few fields")
             geneId=record.name
             gene=byName.get(geneId,None)
             if not gene:
                 gene=BedGene(geneId,record.chr,record.strand)
                 genes.append(gene)
                 byName[geneId]=gene
             gene.addUTR(record.interval)
     finally:
         # Always close the reader, including when a bad record raises.
         reader.close()
     for gene in genes:
         gene.coalesce()
Beispiel #4
0
 def readCDS(self, filename):
     """Build genes from the CDS intervals of a BED file.

     Records sharing the same name field are collected into one
     ``BedGene``; each record's interval is added with ``addCDS``.

     Args:
         filename: Path of the BED file to read.

     Returns:
         List of ``BedGene`` objects in order of first appearance in the file.

     Raises:
         Exception: If a record fails ``isBed6()``.
     """
     reader = BedReader(filename)
     genes = []
     genesByName = {}
     try:
         while True:
             record = reader.nextRecord()
             if not record:
                 break
             if not record.isBed6():
                 raise Exception("BED file has too few fields")
             name = record.name
             gene = genesByName.get(name, None)
             if not gene:
                 gene = BedGene(name, record.chr, record.strand)
                 genesByName[name] = gene
                 genes.append(gene)
             gene.addCDS(record.interval)
     finally:
         # Close the reader on the error path too, not only after a clean read.
         reader.close()
     return genes
Beispiel #5
0
 def readCDS(self,filename):
     """Read CDS records from a BED file and group them into genes.

     Records with the same name end up in the same BedGene; every
     record's interval is passed to addCDS().  Returns the list of genes
     in order of first appearance.

     Raises Exception if a record fails isBed6().
     """
     reader=BedReader(filename)
     genes=[]
     genesByName={}
     try:
         while True:
             record=reader.nextRecord()
             if not record: break
             if not record.isBed6():
                 raise Exception("BED file has too few fields")
             geneId=record.name
             gene=genesByName.get(geneId,None)
             if not gene:
                 gene=BedGene(geneId,record.chr,record.strand)
                 genesByName[geneId]=gene
                 genes.append(gene)
             gene.addCDS(record.interval)
     finally:
         # Ensure the reader is closed even if a malformed record raises.
         reader.close()
     return genes
Beispiel #6
0
def generate_gaps(chr_list,
                  cool_file,
                  output_gap_folder,
                  zero_proc_in_line=97,
                  bins_min_gap_between=3,
                  bins_min_gap=3):
    """Detect gap regions per chromosome from the zero content of a cooler matrix.

    A matrix row whose percentage of zero entries is at least
    ``zero_proc_in_line`` is recorded as a candidate gap; the candidates of a
    chromosome are then passed to ``merge_gaps``. The combined result is
    written to a BED file in ``output_gap_folder``. If that file already
    exists it is loaded through ``BedReader`` instead of being recomputed.

    Args:
        chr_list: Iterable of chromosome names to process. Its ``str()`` is
            part of the output file name.
        cool_file: Path handed to ``cooler.Cooler``.
        output_gap_folder: Output path prefix; the file name is appended to
            it verbatim.
        zero_proc_in_line: Minimum percentage (0-100) of zeros in a row for
            the row to count as a gap.
        bins_min_gap_between: Multiplied by the bin size and passed to
            ``merge_gaps`` as its second argument.
        bins_min_gap: Multiplied by the bin size and passed to ``merge_gaps``
            as its third argument.

    Returns:
        dict mapping chromosome name to the value returned by ``merge_gaps``
        (or ``BedReader.chr_data`` when the dump already exists).
    """
    output_gap_file = output_gap_folder + "gaps+chr" + str(
        chr_list) + "_proc" + str(zero_proc_in_line) + ".bed"
    if os.path.exists(output_gap_file):
        logging.info("Found dump for gap file " + output_gap_file)
        bed_reader = BedReader(output_gap_file)
        bed_reader.read_file()
        return bed_reader.chr_data

    c = cooler.Cooler(cool_file)
    # The bin size and the thresholds derived from it do not depend on the
    # chromosome, so compute them once.
    bin_size = c.binsize
    minimum_gap_between_length = bin_size * bins_min_gap_between
    min_gap_length = bin_size * bins_min_gap

    chr_data = dict()
    for chrom in chr_list:
        matrix = c.matrix(balance=False).fetch(chrom)
        gaps = []
        for n, row in enumerate(matrix):
            # Percentage of zero entries in this row.
            zero_count = len(row) - np.count_nonzero(row)
            proc_zero = zero_count / len(row) * 100
            if proc_zero >= zero_proc_in_line:
                # NOTE(review): coordinates use (n + 1) * bin_size as the gap
                # start, i.e. one bin past row n's own offset - confirm this
                # shift is intended.
                gaps.append((chrom, (n + 1) * bin_size,
                             (n + 1) * bin_size + bin_size))
        chr_data[chrom] = merge_gaps(gaps, minimum_gap_between_length,
                                     min_gap_length, output_gap_file)
        logging.info("Found " + str(len(chr_data[chrom])) + " gaps on chr " +
                     chrom)
    conc_data = pd.concat(list(chr_data.values()))
    conc_data.to_csv(output_gap_file, sep="\t", header=False, index=False)
    return chr_data
def addPeaks(bedDir,timepoint,direction,expression):
    """Load peaks from "<bedDir>/<timepoint>_<direction>.bed" into `expression`.

    Each record's interval is collapsed to a single position starting at
    its intCenter(), tagged with type "peak" and the given direction, and
    appended to the list stored in `expression` under the record's
    chromosome (the list is created on first use).  `expression` is
    modified in place; nothing is returned.
    """
    bedPath=bedDir+"/"+timepoint+"_"+direction+".bed"
    for rec in BedReader.readAll(bedPath):
        peaks=expression.get(rec.chr,None)
        if peaks is None:
            peaks=[]
            expression[rec.chr]=peaks
        peak=rec.interval
        # Shrink the interval to one position at its center.
        peak.begin=peak.intCenter()
        peak.end=peak.begin+1
        peak.type="peak"
        peak.dir=direction
        peaks.append(peak)
Beispiel #8
0
#!/usr/bin/env python
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
# License (GPL) version 3, as described at www.opensource.org.
# Copyright (C)2016 William H. Majoros ([email protected]).
#=========================================================================
from __future__ import (absolute_import, division, print_function,
                        unicode_literals, generators, nested_scopes,
                        with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii, chr,
                      hex, input, next, oct, open, pow, round, super, filter,
                      map, zip)
from BedReader import BedReader

# Input location; hard-coded to a directory on the author's machine.
BASE = "/Users/bmajoros/python/test/data"
filename = BASE + "/DEGs_downreg.FDR_0.1.TSS.protein_coding.bed"

# Print one line per BED record: "<begin>-<end>\t<name>\t<score>\t<strand>".
reader = BedReader(filename)
rec = reader.nextRecord()
while rec is not None:
    begin, end = rec.getBegin(), rec.getEnd()
    # The coordinates are written without a trailing newline so that the
    # remaining fields continue on the same output line.
    print(begin, "-", end, sep="", end="")
    print("\t" + rec.name + "\t" + str(rec.score) + "\t" + rec.strand)
    rec = reader.nextRecord()
reader.close()
Beispiel #9
0
#!/usr/bin/env python
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
# License (GPL) version 3, as described at www.opensource.org.
# Copyright (C)2016 William H. Majoros ([email protected]).
#=========================================================================
from __future__ import (absolute_import, division, print_function, 
   unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
from BedReader import BedReader

# Test input; the path is hard-coded to the author's machine.
BASE="/Users/bmajoros/python/test/data"
filename=BASE+"/DEGs_downreg.FDR_0.1.TSS.protein_coding.bed"

# Walk the BED file and print "<begin>-<end>\t<name>\t<score>\t<strand>"
# for every record.
reader=BedReader(filename)
rec=reader.nextRecord()
while rec is not None:
    begin,end=rec.getBegin(),rec.getEnd()
    # First print leaves the line open; the second one completes it.
    print(begin,"-",end,sep="",end="")
    print("\t"+rec.name+"\t"+str(rec.score)+"\t"+rec.strand)
    rec=reader.nextRecord()
reader.close()