Exemple #1
0
    def __init__(self, genomeBuild="hg18"):
        """Set up the sequence directory, chromosome ends and ``chrms`` dict.

        genomeBuild -- assembly name (default "hg18"); it is spliced into the
                       FASTA directory path and passed to ChromosomeEnds.
        """
        # Per-build FASTA directory; a leading "~" is expanded to the user's
        # home directory by os.path.expanduser.
        fastaDir = "~/mount/publicdata/" + genomeBuild + "/assembly/fasta/"
        self.sequenceBase = os.path.expanduser(fastaDir)

        self.chromosomeEnds = ChromosomeEnds(genomeBuild)

        # Starts out empty; the constructor itself stores nothing in it.
        # NOTE(review): presumably filled per chromosome later -- confirm.
        self.chrms = dict()
Exemple #2
0
    def __init__(self, build, annotation, exons, minExons, numbExons, tsses,
                 chunks, ud, ui, dd, di):
        """Store the region settings and load the gene data for ``build``.

        ``exons`` and ``tsses`` must not both be truthy (asserted below).
        ``exons``, ``minExons``, ``numbExons`` and ``tsses`` are stored as
        given.  For ``chunks``, ``ud``, ``ui``, ``dd`` and ``di`` a value of
        ``None`` selects the default: 20 chunks, 5000 for the upstream and
        downstream distances, 1000 for the upstream and downstream intervals.

        ``build`` and ``annotation`` are passed to ``Ensembl.EnsemblGenes``;
        ``build`` is also passed to ``ChromosomeEnds``.

        ``self.regionIterator`` is set to a ``list`` subclass (the class, not
        an instance).  Constructing it with a gene-list location fills it with
        the unique gene IDs that could be resolved against the gene data.
        """
        assert not (exons and tsses)

        # Stores all parameters
        self.exons = exons
        self.minExons = minExons
        self.numbExons = numbExons

        self.tsses = tsses

        # Identity tests: only a real ``None`` means "use the default", and
        # the check cannot be hijacked by an argument's custom ``__eq__``.
        self.geneNumbChunks = 20 if chunks is None else chunks
        self.upstreamDistance = 5000 if ud is None else ud
        self.upstreamInterval = 1000 if ui is None else ui
        self.downstreamDistance = 5000 if dd is None else dd
        self.downstreamInterval = 1000 if di is None else di

        # Gets dictionaries of genes, exons transcripts etc from Ensembl class
        #assert build == "hg18", "Non hg-18 not supported for Ensembl regions at the moment"

        # Kept as a plain local as well as an attribute so that the nested
        # class below can reach it through the closure.
        genedata = Ensembl.EnsemblGenes(assembly=build, annotation=annotation)
        self.genedata = genedata

        # Gets the lengths of chromosomes
        self.chromosomeEnds = ChromosomeEnds(build)

        class EnsemblNameAndIDFormatter(list):
            """List of unique gene IDs resolved from a GeneList source."""

            def __init__(self, genesToUseLocation):
                geneList = GeneList(genesToUseLocation)

                # Every ID already appended, so each appears at most once.
                self.seengenes = set()

                for gene in geneList:
                    if gene in self.seengenes:
                        # Seen this gene id before.  Generally should not
                        # happen: GeneList is expected to de-duplicate the
                        # source list already.
                        pass
                    elif gene in genedata:
                        # Not seen yet and it is a known gene id.
                        self.append(gene)
                        self.seengenes.add(gene)
                    else:
                        # Not a known id, so it could be a gene name: add
                        # every id it maps to that has not been added yet.
                        found = False
                        for geneid in genedata.getGeneIDs(gene):
                            if geneid not in self.seengenes:
                                self.append(geneid)
                                self.seengenes.add(geneid)
                                found = True
                        if not found:
                            # Also reached when every id the name maps to had
                            # already been added.
                            # Parenthesised single expression: same output
                            # under Python 2, valid syntax under Python 3.
                            print("No GeneID for:" + gene)

                print(genesToUseLocation + ":" + str(len(self)))

        # The class itself is exposed; callers instantiate it with a location.
        self.regionIterator = EnsemblNameAndIDFormatter
Exemple #3
0
    def __init__(self, build, ud, ui, dd, di):
        """Configure the flanking windows for regions iterated with PointList.

        ``ud`` / ``dd`` are the upstream / downstream distances (default
        100000) and ``ui`` / ``di`` the upstream / downstream intervals
        (default 1000).  Passing ``None`` selects the default; any other
        value is stored as given.  ``build`` is passed to ``ChromosomeEnds``.
        """
        # Identity tests: only a real ``None`` means "use the default", and
        # the check cannot be hijacked by an argument's custom ``__eq__``.
        self.upstreamDistance = 100000 if ud is None else ud
        self.upstreamInterval = 1000 if ui is None else ui
        self.downstreamDistance = 100000 if dd is None else dd
        self.downstreamInterval = 1000 if di is None else di

        self.chromosomeEnds = ChromosomeEnds(build)

        # The class itself is stored, not an instance.
        self.regionIterator = PointList
Exemple #4
0
    def __init__(self, build, chunks, ud, ui, dd, di):
        """Configure chunking and flanking windows for regions iterated with BedList.

        ``chunks`` is the number of chunks (default 20), ``ud`` / ``dd`` the
        upstream / downstream distances (default 100000) and ``ui`` / ``di``
        the upstream / downstream intervals (default 1000).  Passing ``None``
        selects the default; any other value is stored as given.  ``build``
        is passed to ``ChromosomeEnds``.  The effective settings are printed
        to stdout.
        """
        # Identity tests: only a real ``None`` means "use the default", and
        # the check cannot be hijacked by an argument's custom ``__eq__``.
        self.numbChunks = 20 if chunks is None else chunks
        self.upstreamDistance = 100000 if ud is None else ud
        self.upstreamInterval = 1000 if ui is None else ui
        self.downstreamDistance = 100000 if dd is None else dd
        self.downstreamInterval = 1000 if di is None else di

        # Echo the effective settings, space separated.  A single formatted
        # string keeps the output of the old multi-value print statement while
        # also being valid syntax on Python 3.
        print("%s %s %s %s %s" % (self.numbChunks,
                                  self.upstreamDistance,
                                  self.upstreamInterval,
                                  self.downstreamDistance,
                                  self.downstreamInterval))

        self.chromosomeEnds = ChromosomeEnds(build)

        # The class itself is stored, not an instance.
        self.regionIterator = BedList
Exemple #5
0
        print str(err)  # will print something like "option -a not recognized"
        sys.exit(2)

    # add executing directory as part of path
    sys.path.append(sys.path[0])

    direction = True

    assembly = "hg18"

    for opt, arg in opts:
        if opt == "-a":
            a = arg
        elif opt == "-b":
            b = arg
        elif opt == "--assembly":
            assembly = arg

    gaps = ChromosomeGaps(assembly)
    ends = ChromosomeEnds(assembly)

    effectiveGenomeSize = 0
    for chr in ChrList(assembly):
        effectiveGenomeSize += ends[chr] - gaps.chrmgaps[chr]

    print "---"
    print a.split("/")[-1] + " which have " + b.split("/")[-1]
    print "---"

    compareTwo(a, b, effectiveGenomeSize, ChrList(assembly))
Exemple #6
0
import gc

gc.disable()

from bed.treatment import SimpleBed
from bed.treatment import Bed as BedIntervalTree
from datastructures.genomeintervaltree import GenomeIntervalTree
from genemapping.gaps import ChromosomeGaps
from genemapping.chrmEnds import ChromosomeEnds
from genemapping.chrList import ChrList
from multiprocessing import Pool
import random
import collections

# Module-level setup: load the hg18 gap and chromosome-end tables once at
# import time and derive the effective genome size from them.
gaps = ChromosomeGaps("hg18")
ends = ChromosomeEnds("hg18")

from datastructures.wrg import WeightedRandomGenerator

# Per-chromosome weights for the (currently disabled) weighted chromosome
# selection further down.
#weights = collections.defaultdict(int)

# Sum, over every chromosome in ChrList("hg18"), of the chromosome end minus
# that chromosome's entry in gaps.chrmgaps.
# NOTE(review): presumably "end" is the chromosome length and "chrmgaps" the
# total gap length per chromosome -- confirm against those classes.
# NOTE(review): the loop variable `chr` shadows the builtin chr() for the rest
# of this module; left unchanged because later code may rely on the name.
effectiveGenomeSize = 0
for chr in ChrList("hg18"):
    effectiveGenomeSize += ends[chr] - gaps.chrmgaps[chr]
    # below is needed for weighted chromosome selection
    #weights[chr] = ends[chr] - gaps.chrmgaps[chr]
    #assert weights[chr] > 0
# Python 2 print statement: reports the total on stdout at import time.
print "Effective Genome Size:" + str(effectiveGenomeSize)

# optional - removed for now.  allows regions to move between chromosomes
#randomChromosome = WeightedRandomGenerator(weights)
        sys.exit(2)

    # add executing directory as part of path
    sys.path.append(sys.path[0])
    
    vstepWidth = 200 #default
    
    pointlist = False
    
    for o, a in opts:
        if (o=="-s"):
            vstepWidth = int(a)
        elif (o=="-d"):    
            debug = True
        elif (o=="-c"):
            chromosomeEnds = ChromosomeEnds(a)
        elif (o=="-g"):
            GeneSummary.surroundingDistance = int(a)

    for o, a in opts:
        if o == "-s":
            pass # we dealt with this already
        elif (o=="-d"):
            pass # we dealt with this already
        elif (o=="-c"):
            pass # we dealt with this already
        elif (o=="-g"):
            pass # we dealt with this already

        # gene mappings / point list settings (for how to deal with the arguments representing which genes / points to plot)
        elif o == "-a":
Exemple #8
0
 def __init__(self, filename, build):
     """Load the bed file ``filename`` and the chromosome ends for ``build``.

     ``getValues`` is an alias for the loaded file's
     ``getIntervalsInRange`` attribute.
     """
     bedData = BedFile(filename)
     self.treatment = bedData
     # Expose the range lookup under a generic name.
     self.getValues = bedData.getIntervalsInRange
     self.chromosomeEnds = ChromosomeEnds(build)
Exemple #9
0
 def __init__(self, build):
     """Create the Genome and chromosome ends for ``build``.

     ``valuesBehaviour`` is initialised to ``missingValuesDontCount``.
     """
     # Genome takes the build through its ``genomeBuild`` keyword.
     genomeForBuild = Genome(genomeBuild=build)
     self.genome = genomeForBuild

     self.valuesBehaviour = missingValuesDontCount
     self.chromosomeEnds = ChromosomeEnds(build)
            joinedfile = a
            print "Joined file: ", a
        elif o == "--binsize":
            binsize = int(a)
            print "Binsize: ", a
            #assert False, "Two different chromosomes not implemented yet"
        elif o == "--genome":
            genome = a
            print "Genome: ", genome
        elif o == "--minsize":
            minsize = int(a)
            print "Minsize: ", minsize

    assert joinedfile != None

    chrmEnds = ChromosomeEnds(genome)
    chrmList = ChrList(genome)

    chrmMatrix = defaultdict(dict)
    for chrmA in chrmList:
        for chrmB in chrmList:
            chrmMatrix[chrmA][chrmB] = defaultdict(int)
            chrmMatrix[chrmB][chrmA] = defaultdict(int)

    reads = csv.reader(open(joinedfile, "r"), delimiter="\t")

    for read in reads:
        #print read[lread_chr],read[lread_pos],read[rread_chr],read[rread_pos]

        lread_chr = read[lread_chr_col]
        rread_chr = read[rread_chr_col]
Exemple #11
0
    outfile = fname + ".extended"
    print "writing to outfile", outfile
    writer = csv.writer(open(outfile, "w"), delimiter="\t")
    for row in reader:
        chr = row[0]
        start = int(row[1])
        end = int(row[2])
        strand = row[5]
        if strand == "+":
            row[2] = min(ends[chr], start + extendlen)
        elif strand == "-":
            row[1] = max(0, end - extendlen)

        writer.writerow(row)


if __name__ == "__main__":
    # Command-line entry point: run extendOne() on every *.bed file found
    # directly inside the given directory.
    import sys
    from glob import glob

    if len(sys.argv) != 4:
        print("usage: extendbed.py <beddir> <genomebuild> <extendlength>")
        sys.exit(1)

    # Exactly three positional arguments are present at this point.
    beddir, genomeBuild, extendArg = sys.argv[1:]

    # Must stay a module-level name: the row-extension code above reads a
    # global called ``ends`` (``ends[chr]``).
    ends = ChromosomeEnds(genomeBuild)

    extendlen = int(extendArg)

    for fname in glob(beddir + "/*.bed"):
        extendOne(fname, extendlen)