def __init__(self, genomeBuild="hg18"):
    """Set up the FASTA location, chromosome ends and an empty chromosome cache.

    genomeBuild -- name of the genome build; it is inserted into the
                   per-build FASTA directory path and passed to
                   ChromosomeEnds.
    """
    # Per-build FASTA directory underneath the user's home directory
    fastaDirectory = "~/mount/publicdata/" + genomeBuild + "/assembly/fasta/"
    self.sequenceBase = os.path.expanduser(fastaDirectory)

    self.chromosomeEnds = ChromosomeEnds(genomeBuild)

    # Starts out empty; nothing is stored for any chromosome yet
    self.chrms = dict()
def __init__(self, build, annotation, exons, minExons, numbExons, tsses, chunks, ud, ui, dd, di):
    """Store the region settings and load the Ensembl gene data for a build.

    build      -- genome assembly name, passed to Ensembl.EnsemblGenes and
                  ChromosomeEnds
    annotation -- annotation passed to Ensembl.EnsemblGenes
    exons      -- exon mode flag; must not be set together with tsses
    minExons   -- stored as self.minExons
    numbExons  -- stored as self.numbExons
    tsses      -- TSS mode flag; must not be set together with exons
    chunks     -- number of gene chunks (None -> 20)
    ud         -- upstream distance (None -> 5000)
    ui         -- upstream interval (None -> 1000)
    dd         -- downstream distance (None -> 5000)
    di         -- downstream interval (None -> 1000)

    Raises AssertionError when both exons and tsses are truthy.
    """
    assert not (exons and tsses), "exons and tsses cannot both be selected"

    # Stores all parameters
    self.exons = exons
    self.minExons = minExons
    self.numbExons = numbExons
    self.tsses = tsses

    # None means "not supplied": fall back to the default for that setting
    self.geneNumbChunks = 20 if chunks is None else chunks
    self.upstreamDistance = 5000 if ud is None else ud
    self.upstreamInterval = 1000 if ui is None else ui
    self.downstreamDistance = 5000 if dd is None else dd
    self.downstreamInterval = 1000 if di is None else di

    # Gets dictionaries of genes, exons, transcripts etc from the Ensembl class.
    # (A former restriction to the hg18 build is disabled here.)
    # We keep this as a non-instance variable as well so that the nested
    # class below can access it through the closure.
    genedata = Ensembl.EnsemblGenes(assembly=build, annotation=annotation)
    self.genedata = genedata

    # Gets the lengths of chromosomes
    self.chromosomeEnds = ChromosomeEnds(build)

    class EnsemblNameAndIDFormatter(list):
        """List of gene IDs built from the entries of a GeneList.

        Entries that genedata contains are kept as they are; any other
        entry is looked up with genedata.getGeneIDs and each ID not
        already collected is added.
        """

        def __init__(self, genesToUseLocation):
            geneList = GeneList(genesToUseLocation)
            self.seengenes = set()

            for gene in geneList:
                if gene in self.seengenes:
                    # Seen this gene id before (generally shouldn't be the
                    # case, as GeneList guarantees this for the source list
                    # at least)
                    continue

                if gene in genedata:
                    # Not seen yet, and it is in the Ensembl ids list
                    self.append(gene)
                    self.seengenes.add(gene)
                    continue

                # Not in the Ensembl ids list, so it could be a gene name
                found = False
                for geneid in genedata.getGeneIDs(gene):
                    if geneid not in self.seengenes:
                        self.append(geneid)
                        self.seengenes.add(geneid)
                        found = True
                if not found:
                    # Single-argument parenthesised print: same output
                    # whether print is a statement or a function
                    print("No GeneID for:" + gene)

            print(genesToUseLocation + ":" + str(len(self)))

    # The class itself (not an instance) is what callers use to build the
    # list of regions from a gene list location
    self.regionIterator = EnsemblNameAndIDFormatter
def __init__(self, build, ud, ui, dd, di):
    """Store the flanking-window settings for point-based regions.

    build -- genome build name, passed to ChromosomeEnds
    ud    -- upstream distance (None -> 100000)
    ui    -- upstream interval (None -> 1000)
    dd    -- downstream distance (None -> 100000)
    di    -- downstream interval (None -> 1000)
    """
    # None means "not supplied": fall back to the default for that setting
    self.upstreamDistance = 100000 if ud is None else ud
    self.upstreamInterval = 1000 if ui is None else ui
    self.downstreamDistance = 100000 if dd is None else dd
    self.downstreamInterval = 1000 if di is None else di

    self.chromosomeEnds = ChromosomeEnds(build)

    # The PointList class itself (not an instance) is stored
    self.regionIterator = PointList
def __init__(self, build, chunks, ud, ui, dd, di):
    """Store the chunking and flanking-window settings for BED regions.

    build  -- genome build name, passed to ChromosomeEnds
    chunks -- number of chunks (None -> 20)
    ud     -- upstream distance (None -> 100000)
    ui     -- upstream interval (None -> 1000)
    dd     -- downstream distance (None -> 100000)
    di     -- downstream interval (None -> 1000)

    The resolved settings are printed on one line, separated by spaces.
    """
    # None means "not supplied": fall back to the default for that setting
    self.numbChunks = 20 if chunks is None else chunks
    self.upstreamDistance = 100000 if ud is None else ud
    self.upstreamInterval = 1000 if ui is None else ui
    self.downstreamDistance = 100000 if dd is None else dd
    self.downstreamInterval = 1000 if di is None else di

    # One pre-formatted string gives the same space-separated output
    # whether print is a statement or a function
    print("%s %s %s %s %s" % (
        self.numbChunks,
        self.upstreamDistance,
        self.upstreamInterval,
        self.downstreamDistance,
        self.downstreamInterval,
    ))

    self.chromosomeEnds = ChromosomeEnds(build)

    # The BedList class itself (not an instance) is stored
    self.regionIterator = BedList
print str(err) # will print something like "option -a not recognized" sys.exit(2) # add executing directory as part of path sys.path.append(sys.path[0]) direction = True assembly = "hg18" for opt, arg in opts: if opt == "-a": a = arg elif opt == "-b": b = arg elif opt == "--assembly": assembly = arg gaps = ChromosomeGaps(assembly) ends = ChromosomeEnds(assembly) effectiveGenomeSize = 0 for chr in ChrList(assembly): effectiveGenomeSize += ends[chr] - gaps.chrmgaps[chr] print "---" print a.split("/")[-1] + " which have " + b.split("/")[-1] print "---" compareTwo(a, b, effectiveGenomeSize, ChrList(assembly))
import gc
# Automatic garbage collection is switched off before anything else is loaded
gc.disable()

from bed.treatment import SimpleBed
from bed.treatment import Bed as BedIntervalTree
from datastructures.genomeintervaltree import GenomeIntervalTree
from genemapping.gaps import ChromosomeGaps
from genemapping.chrmEnds import ChromosomeEnds
from genemapping.chrList import ChrList
from multiprocessing import Pool
import random
import collections

# Single place to name the genome build used for all lookups below
_GENOME_BUILD = "hg18"

gaps = ChromosomeGaps(_GENOME_BUILD)
ends = ChromosomeEnds(_GENOME_BUILD)

from datastructures.wrg import WeightedRandomGenerator

# Effective genome size: for every chromosome of the build, its end minus
# its gap total, summed up.  A generator with its own variable name is used
# so that the builtin chr() is not shadowed at module level.
effectiveGenomeSize = sum(
    ends[chrom] - gaps.chrmgaps[chrom] for chrom in ChrList(_GENOME_BUILD)
)

# Weighted chromosome selection is optional and removed for now (it allows
# regions to move between chromosomes).  It would need per-chromosome weights:
#weights = collections.defaultdict(int)
#weights[chrom] = ends[chrom] - gaps.chrmgaps[chrom]   (for each chromosome)
#assert weights[chrom] > 0
#randomChromosome = WeightedRandomGenerator(weights)

print("Effective Genome Size:" + str(effectiveGenomeSize))
sys.exit(2) # add executing directory as part of path sys.path.append(sys.path[0]) vstepWidth = 200 #default pointlist = False for o, a in opts: if (o=="-s"): vstepWidth = int(a) elif (o=="-d"): debug = True elif (o=="-c"): chromosomeEnds = ChromosomeEnds(a) elif (o=="-g"): GeneSummary.surroundingDistance = int(a) for o, a in opts: if o == "-s": pass # we dealt with this already elif (o=="-d"): pass # we dealt with this already elif (o=="-c"): pass # we dealt with this already elif (o=="-g"): pass # we dealt with this already # gene mappings / point list settings (for how to deal with the arguments representing which genes / points to plot) elif o == "-a":
def __init__(self,filename,build):
    """Wrap a BED file and expose its range lookup as getValues.

    filename -- passed to BedFile
    build    -- genome build name, passed to ChromosomeEnds
    """
    bedTreatment = BedFile(filename)
    self.treatment = bedTreatment

    # getValues is simply the treatment's own range query method
    self.getValues = bedTreatment.getIntervalsInRange

    self.chromosomeEnds = ChromosomeEnds(build)
def __init__(self,build):
    """Create the Genome and chromosome ends for a build.

    build -- genome build name, passed to Genome (as genomeBuild) and to
             ChromosomeEnds
    """
    buildGenome = Genome(genomeBuild = build)
    self.genome = buildGenome

    # Missing values are handled with the missingValuesDontCount behaviour
    self.valuesBehaviour = missingValuesDontCount

    buildEnds = ChromosomeEnds(build)
    self.chromosomeEnds = buildEnds
joinedfile = a print "Joined file: ", a elif o == "--binsize": binsize = int(a) print "Binsize: ", a #assert False, "Two different chromosomes not implemented yet" elif o == "--genome": genome = a print "Genome: ", genome elif o == "--minsize": minsize = int(a) print "Minsize: ", minsize assert joinedfile != None chrmEnds = ChromosomeEnds(genome) chrmList = ChrList(genome) chrmMatrix = defaultdict(dict) for chrmA in chrmList: for chrmB in chrmList: chrmMatrix[chrmA][chrmB] = defaultdict(int) chrmMatrix[chrmB][chrmA] = defaultdict(int) reads = csv.reader(open(joinedfile, "r"), delimiter="\t") for read in reads: #print read[lread_chr],read[lread_pos],read[rread_chr],read[rread_pos] lread_chr = read[lread_chr_col] rread_chr = read[rread_chr_col]
outfile = fname + ".extended" print "writing to outfile", outfile writer = csv.writer(open(outfile, "w"), delimiter="\t") for row in reader: chr = row[0] start = int(row[1]) end = int(row[2]) strand = row[5] if strand == "+": row[2] = min(ends[chr], start + extendlen) elif strand == "-": row[1] = max(0, end - extendlen) writer.writerow(row) if __name__ == "__main__": from glob import glob import sys if len(sys.argv) != 4: print "usage: extendbed.py <beddir> <genomebuild> <extendlength>" sys.exit(1) ends = ChromosomeEnds(sys.argv[2]) beddir = sys.argv[1] extendlen = int(sys.argv[3]) allfiles = glob(beddir + "/*.bed") for fname in allfiles: extendOne(fname, extendlen)