Esempio n. 1
0
    def __init__(self, build, annotation, exons, minExons, numbExons, tsses,
                 chunks, ud, ui, dd, di):
        """Store the region settings and load the Ensembl gene data.

        Args:
            build: Assembly name, forwarded to ``Ensembl.EnsemblGenes`` and
                ``ChromosomeEnds``.
            annotation: Annotation name, forwarded to ``Ensembl.EnsemblGenes``.
            exons: Stored as ``self.exons``. Must not be truthy at the same
                time as ``tsses``.
            minExons: Stored as ``self.minExons``.
            numbExons: Stored as ``self.numbExons``.
            tsses: Stored as ``self.tsses``. Must not be truthy at the same
                time as ``exons``.
            chunks: Stored as ``self.geneNumbChunks``; ``None`` selects 20.
            ud: Stored as ``self.upstreamDistance``; ``None`` selects 5000.
            ui: Stored as ``self.upstreamInterval``; ``None`` selects 1000.
            dd: Stored as ``self.downstreamDistance``; ``None`` selects 5000.
            di: Stored as ``self.downstreamInterval``; ``None`` selects 1000.

        Raises:
            AssertionError: If both ``exons`` and ``tsses`` are truthy.
        """
        assert not (exons and tsses), \
            "exons and tsses cannot both be requested"

        # Stores all parameters
        self.exons = exons
        self.minExons = minExons
        self.numbExons = numbExons

        self.tsses = tsses

        # Only None selects the default; other falsy values such as 0 are
        # kept as given.  Identity comparison avoids invoking a custom __eq__.
        self.geneNumbChunks = 20 if chunks is None else chunks
        self.upstreamDistance = 5000 if ud is None else ud
        self.upstreamInterval = 1000 if ui is None else ui
        self.downstreamDistance = 5000 if dd is None else dd
        self.downstreamInterval = 1000 if di is None else di

        # Gets dictionaries of genes, exons, transcripts etc. from the Ensembl
        # class.  (A check restricting this to the hg18 build used to live
        # here and is no longer enforced.)
        # The data is also kept in a local (non-instance) variable so that the
        # nested class below can reach it through the enclosing scope.
        genedata = Ensembl.EnsemblGenes(assembly=build, annotation=annotation)
        self.genedata = genedata

        # Chromosome end information for the build
        self.chromosomeEnds = ChromosomeEnds(build)

        class EnsemblNameAndIDFormatter(list):
            """List of distinct gene ids resolved from a ``GeneList`` source.

            Each entry of the gene list is kept as-is when ``genedata``
            contains it; otherwise it is looked up via
            ``genedata.getGeneIDs`` and every id not added before is
            appended.
            """

            def __init__(self, genesToUseLocation):
                self.seengenes = set()

                for gene in GeneList(genesToUseLocation):
                    if gene in self.seengenes:
                        # Seen this gene id before (generally should not
                        # happen, as GeneList guarantees uniqueness for the
                        # source list at least).
                        continue

                    if gene in genedata:
                        # Not seen yet, and it is a known Ensembl id.
                        self.append(gene)
                        self.seengenes.add(gene)
                        continue

                    # Not in the Ensembl ids list, so it could be a gene
                    # name: add every id it maps to that we have not added.
                    found = False
                    for geneid in genedata.getGeneIDs(gene):
                        if geneid not in self.seengenes:
                            self.append(geneid)
                            self.seengenes.add(geneid)
                            found = True

                    # NOTE: this is also reported when the entry maps only
                    # to ids that were already added earlier.
                    if not found:
                        print("No GeneID for:" + gene)

                # Summary line: source location and number of ids collected.
                print(genesToUseLocation + ":" + str(len(self)))

        # The class itself (not an instance) is stored; it is instantiated
        # with a gene list location by whoever uses it.
        self.regionIterator = EnsemblNameAndIDFormatter
# Command-line script: for every gene list passed with -g, print the Ensembl
# genes whose name matches one of the list's patterns, together with the
# start/stop returned by getGeneWithPromotor for the configured padding.
try:
    opts, args = getopt.getopt(sys.argv[1:], "g:", [])
except getopt.GetoptError as err:
    # Print the parse error (something like "option -a not recognized")
    # and exit with status 2.
    print(str(err))
    sys.exit(2)

# Passed as upstreamPadding to getGeneWithPromotor.
UPSTREAM_PROMOTOR_DIST = 5000

# One GeneList per -g option, in command-line order.
genelists = [GeneList(a) for o, a in opts if o == "-g"]

assert genelists, "at least one gene list must be given with -g"

genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

for genelist in genelists:
    print(genelist.getFullName())
    for gene in genedata:
        # Look the record up once instead of on every attribute access.
        record = genedata[gene]

        # Each gene list entry is used as a regular expression matched
        # against the start of the gene name; the gene is reported at most
        # once, as soon as any pattern matches.
        if any(re.match(pattern, record.name) for pattern in genelist):
            start, stop = record.getGeneWithPromotor(
                upstreamPadding=UPSTREAM_PROMOTOR_DIST)

            # Space-separated fields, same layout as a multi-item print.
            print("%s %s %s %s %s" % (record.id, record.name, record.chr,
                                      start, stop))
Esempio n. 3
0
        # Report the error held in `err` and exit with status 2.
        # (The enclosing try/except header is outside this excerpt.)
        print str(err)  # will print something like "option -a not recognized"
        sys.exit(2)

    # Values used when the corresponding command-line option is not given.
    assembly = "hg19"
    ensemblids = []
    fastaFile = None

    # --ensemblids: argument is wrapped in a GeneList.
    # --fasta: argument is wrapped in a FastAFile.
    # Any other option is silently ignored here.
    for o, a in opts:
        if o == "--ensemblids":
            ensemblids = GeneList(a)
        elif o == "--fasta":
            fastaFile = FastAFile(a)

    # Nothing is loaded or printed unless at least one id was supplied.
    if len(ensemblids) > 0:

        genedata = Ensembl.EnsemblGenes(assembly=assembly)

        # Header row; the print statement separates the items with spaces.
        print "ensemblid", "chr", "start", "stop", "transcripts", "stemloops", "polyas"

        for ensemblid in ensemblids:

            # Per-gene counters, reset for every id.
            stemloops = 0
            polyas = 0

            # Iterating a gene entry yields transcript ids, which index back
            # into the same entry.
            for transcriptid in genedata[ensemblid]:
                # http://genome-euro.ucsc.edu/cgi-bin/hgc?db=hg19&g=htcGeneMrna&i=ENST00000314332&o=ensGene&table=ensGene

                # Location of this transcript; start and stop are converted
                # to strings.
                chrm = genedata[ensemblid][transcriptid].chr
                start = str(genedata[ensemblid][transcriptid].start)
                stop = str(genedata[ensemblid][transcriptid].end)
Esempio n. 4
0
        # Report the error held in `err`, show the usage line and exit with
        # status 2.  (The enclosing try/except header is outside this excerpt.)
        print str(err)  # will print something like "option -a not recognized"
        print "Usage: main.py  [Space seperated list of gene id sets]"
        sys.exit(2)

    # Map the recognised options onto local variables; any other option
    # aborts the run with status 2.
    # NOTE(review): gtfFile / differences / outputFile are only assigned here;
    # unless they get defaults before this excerpt, omitting an option leaves
    # the name unbound - confirm.
    for o, a in opts:
        if (o == "-G") or (o == "--gtf"):
            gtfFile = a
        elif (o == "-D") or (o == "--gene-expression-difference"):
            differences = a
        elif (o == "-o") or (o == "--output"):
            outputFile = a
        else:
            print "Unknown parameter: " + o + " " + a
            sys.exit(2)

    genedata = Ensembl.EnsemblGenes(assembly="hg19",
                                    annotation="EnsemblGenes73")

    # The GTF file is read as tab-separated rows.
    # NOTE(review): the file object is never bound to a name, so it cannot be
    # closed explicitly here.
    gtfReader = csv.reader(open(gtfFile, "r"), delimiter="\t")

    # Each key maps to a set that is created on first access.
    cuffGeneIdsToEnsemblGeneIds = collections.defaultdict(set)

    missing = set()
    for row in gtfReader:
        # The ninth column holds "; "-separated items; each item has its
        # double quotes removed and is split on a single space into a
        # key/value pair.
        # NOTE(review): dict() raises ValueError if an item does not split
        # into exactly two parts (e.g. a value containing a space) - confirm
        # the input never does.
        detailsColumn = row[8]
        details = dict(
            item.replace("\"", "").split(" ")
            for item in detailsColumn.split("; "))

        # Rows without a "nearest_ref" attribute are skipped.
        if "nearest_ref" not in details:
            continue