Esempio n. 1
0
    def __init__(self, build, annotation, exons, minExons, numbExons, tsses,
                 chunks, ud, ui, dd, di):

        assert not (exons and tsses)

        #Stores all parameters
        self.exons = exons
        self.minExons = minExons
        self.numbExons = numbExons

        self.tsses = tsses

        self.geneNumbChunks = 20 if (chunks == None) else chunks
        self.upstreamDistance = 5000 if (ud == None) else ud
        self.upstreamInterval = 1000 if (ui == None) else ui
        self.downstreamDistance = 5000 if (dd == None) else dd
        self.downstreamInterval = 1000 if (di == None) else di

        # Gets dictionaries of genes, exons transcripts etc from Ensembl class
        #assert build == "hg18", "Non hg-18 not supported for Ensembl regions at the moment"

        #Gets the lengths of chromosomes
        # we keep this as a non instance variable as well so that the subclass below can access it
        genedata = Ensembl.EnsemblGenes(assembly=build, annotation=annotation)
        self.genedata = genedata

        self.chromosomeEnds = ChromosomeEnds(build)

        # Gets an object of the genelist class
        class EnsemblNameAndIDFormatter(list):
            def __init__(self, genesToUseLocation):
                geneList = GeneList(genesToUseLocation)

                self.seengenes = set()

                for gene in geneList:
                    if gene in self.seengenes:
                        # seen this gene id before (generally shouldnt be the case as using GeneList which guarantees this
                        # for the source list at least
                        pass
                    elif gene in genedata:
                        # we've not seen the gene but it's in the ensembl ids list
                        self.append(gene)
                        self.seengenes.add(gene)
                    else:
                        # it's not in the ensembl ids list, it could be a gene name
                        found = False
                        for geneid in genedata.getGeneIDs(gene):
                            if geneid not in self.seengenes:
                                self.append(geneid)
                                self.seengenes.add(geneid)
                                found = True
                        if not found:
                            print "No GeneID for:" + gene

                print genesToUseLocation + ":" + str(len(self))

        self.regionIterator = EnsemblNameAndIDFormatter
try:
    opts, args = getopt.getopt(sys.argv[1:], "g:", [])
except getopt.GetoptError, err:
    # print help information and exit:
    print str(err)  # will print something like "option -a not recognized"
    sys.exit(2)

UPSTREAM_PROMOTOR_DIST = 5000

genelists = []

for o, a in opts:
    if o == "-g":
        genelists.append(GeneList(a))

assert len(genelists) > 0

genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

for genelist in genelists:
    print genelist.getFullName()
    for gene in genedata:
        # does the gene match the pattern
        for pattern in genelist:
            if re.match(pattern, genedata[gene].name):
                start, stop = genedata[gene].getGeneWithPromotor(
                    upstreamPadding=UPSTREAM_PROMOTOR_DIST)

                print genedata[gene].id, genedata[gene].name, genedata[
                    gene].chr, start, stop
                break
Esempio n. 3
0
        print str(err)  # will print something like "option -a not recognized"
        sys.exit(2)

    # Defaults; the option loop below can replace ensemblids and fastaFile.
    assembly = "hg19"
    ensemblids = []
    fastaFile = None

    # The --ensemblids value is handed to GeneList, the --fasta value to
    # FastAFile.
    for o, a in opts:
        if o == "--ensemblids":
            ensemblids = GeneList(a)
        elif o == "--fasta":
            fastaFile = FastAFile(a)

    # Nothing is loaded or printed unless at least one id was supplied.
    if len(ensemblids) > 0:

        genedata = Ensembl.EnsemblGenes(assembly=assembly)

        # Column names, printed space-separated on one line.
        print "ensemblid", "chr", "start", "stop", "transcripts", "stemloops", "polyas"

        for ensemblid in ensemblids:

            # Counters reset for every id.
            # NOTE(review): nothing in the visible part of the loop updates
            # them - presumably that happens further down; confirm.
            stemloops = 0
            polyas = 0

            # Iterating a gene entry yields the keys used to index it below.
            for transcriptid in genedata[ensemblid]:
                # http://genome-euro.ucsc.edu/cgi-bin/hgc?db=hg19&g=htcGeneMrna&i=ENST00000314332&o=ensGene&table=ensGene

                # start and stop are converted to strings; chrm is kept as read.
                chrm = genedata[ensemblid][transcriptid].chr
                start = str(genedata[ensemblid][transcriptid].start)
                stop = str(genedata[ensemblid][transcriptid].end)
Esempio n. 4
0
    vstepWidth = 200  #default
    numbSlices = 20  # default

    for o, a in opts:
        # need a vstep width
        if o == "-s":
            vstepWidth = int(a)
        # need a vstep file
        elif o == "-v":
            vstepFile = a
        # need a list of genes -> coords
        elif o == "-e":
            genesFileName = a  # eg : "genes-and-exons-human-NCBI36.csv"
            print "Loading gene mapping"
            genesmapping = Ensembl.GenesMapping(genesFileName)
        # need an output folder
        elif o == "-o":
            outputFolder = a
        # need a number of slices
        elif o == "-p":
            numbSlices = a
        elif o == "-f":
            friendlyGenesNames = Ensembl.FriendlyGeneNames(a)
        elif o == "-w":
            webroot = a

    # need lists of genes
    geneListLocations = args

    # create output folder
Esempio n. 5
0
        print str(err)  # will print something like "option -a not recognized"
        print "Usage: main.py  [Space seperated list of gene id sets]"
        sys.exit(2)

    # Copy each recognised option into its variable; any other option is
    # reported and the script exits with status 2.
    # NOTE(review): gtfFile, differences and outputFile are only bound when
    # the matching option is given - confirm they are validated elsewhere.
    for o, a in opts:
        if (o == "-G") or (o == "--gtf"):
            gtfFile = a
        elif (o == "-D") or (o == "--gene-expression-difference"):
            differences = a
        elif (o == "-o") or (o == "--output"):
            outputFile = a
        else:
            print "Unknown parameter: " + o + " " + a
            sys.exit(2)

    genedata = Ensembl.EnsemblGenes(assembly="hg19",
                                    annotation="EnsemblGenes73")

    # The GTF file is read as tab-separated rows.
    # NOTE(review): the file handle is not closed in the visible code.
    gtfReader = csv.reader(open(gtfFile, "r"), delimiter="\t")

    # Unknown keys start out as an empty set.
    cuffGeneIdsToEnsemblGeneIds = collections.defaultdict(set)

    missing = set()
    for row in gtfReader:
        # Ninth column; parsed below as items separated by "; ".
        # NOTE(review): a row with fewer than nine columns raises IndexError
        # here - confirm the input never contains such rows.
        detailsColumn = row[8]
        # Double quotes are removed and each item is split on single spaces
        # into a (key, value) pair.
        # NOTE(review): an item that does not split into exactly two parts
        # makes dict() raise ValueError, and a ";" ending the column stays
        # attached to the last value - verify against the input files.
        details = dict(
            item.replace("\"", "").split(" ")
            for item in detailsColumn.split("; "))

        # Rows without a nearest_ref attribute are skipped.
        if "nearest_ref" not in details:
            continue
Esempio n. 6
0
        rnaSeqExpressionData = IndexedCSV(a, key="test_id")
    elif o == "-a":
        assembly = a

# Padding values handed to the Ensembl mappings below (as tssPadding,
# upstreamPadding and downstreamPadding).
UPSTREAM_PROMOTOR_DIST = 2000
DOWNSTREAM_PROMOTOR_DIST = 2000

# Tab-delimited output written to outfile.
# NOTE(review): the file handle is not closed in the visible code.
writer = csv.writer(open(outfile, "w"), delimiter="\t")

genome = Genome(assembly)

###

# load data

# Every mapping below is built from this one gene data object.
genedata = Ensembl.EnsemblGenes(assembly=assembly)

genes = Ensembl.ReverseGeneMapping(genedata)

# Same mapping class as `genes`, with tssPadding set to the upstream distance.
genespluspromotor = Ensembl.ReverseGeneMapping(
    genedata, tssPadding=UPSTREAM_PROMOTOR_DIST)

# Uses both the upstream and the downstream padding.
genepromotors = Ensembl.ReversePromotorMapping(
    genedata,
    upstreamPadding=UPSTREAM_PROMOTOR_DIST,
    downstreamPadding=DOWNSTREAM_PROMOTOR_DIST)

exons = Ensembl.ReverseExonMapping(genedata)

transcriptionSites = Ensembl.TranscriptionSites(genedata)
Esempio n. 7
0
    opts, args = getopt.getopt(sys.argv[1:], "", [])
except getopt.GetoptError, err:
    # print help information and exit:
    print str(err)  # will print something like "option -a not recognized"
    sys.exit(2)

# UPSTREAM_PROMOTOR_DIST is handed to ReverseGeneMapping as tssPadding below.
# NOTE(review): DOWNSTREAM_PROMOTOR_DIST is not used in the lines shown here -
# presumably it is consumed further down; confirm.
UPSTREAM_PROMOTOR_DIST = 5000
DOWNSTREAM_PROMOTOR_DIST = 1000

# probably want to change this to be exons rather than genes

###

# load data

genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

# Sum (end - start) over every gene in genedata and print the total.
bp = 0
for gene in genedata:
    bp += genedata[gene].end - genedata[gene].start

print bp

#genedata = Ensembl.GenesMapping(os.path.expanduser("~/mount/publicdata/hg18/ncbi36.1/genes-and-exons-human-NCBI36.1.csv"))

genes = Ensembl.ReverseGeneMapping(genedata)

# Same mapping class as `genes`, with tssPadding set to the upstream distance.
genespluspromotor = Ensembl.ReverseGeneMapping(
    genedata, tssPadding=UPSTREAM_PROMOTOR_DIST)

genepromotors = Ensembl.ReversePromotorMapping(
Esempio n. 8
0
# Tab-delimited input read from infile and tab-delimited output written to
# outfile.
# NOTE(review): neither file handle is closed in the visible code.
reader = csv.reader(open(infile), delimiter="\t")

writer = csv.writer(open(outfile, "w"), delimiter="\t")

###

# Two distances and the "<n>kb" labels derived from them (value / 1000).
# Floor division keeps the labels integral ("5kb", "1kb") whether or not true
# division is in effect (Python 3, or "from __future__ import division");
# on plain Python 2 integers the result is the same as with "/".
TSS_TTS_Distance = 5000
TTS_TTS_Distance_Human = str(TSS_TTS_Distance // 1000) + "kb"

Small_TSS_TTS_Distance = 1000
Small_TTS_TTS_Distance_Human = str(Small_TSS_TTS_Distance // 1000) + "kb"

# load data

# Every Ensembl mapping below is built from this one gene data object.
genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

genes = Ensembl.ReverseGeneMapping(genedata)

exons = Ensembl.ReverseExonMapping(genedata)

transcriptionSites = Ensembl.TranscriptionSites(genedata)

# BED file looked up under the current user's home directory ("~" expanded).
cpgIslands = ExtendedBed(
    os.path.expanduser(
        "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"))

affyannotation = NetAffxAnnotation()

# Same mapping class as `genes`, with tssPadding set to TSS_TTS_Distance.
paddedGenes = Ensembl.ReverseGeneMapping(genedata, tssPadding=TSS_TTS_Distance)