def __init__(self, build, annotation, exons, minExons, numbExons, tsses, chunks, ud, ui, dd, di):
    """Store the region settings and load the Ensembl gene data for ``build``.

    ``exons`` and ``tsses`` must not both be truthy (checked by assertion).
    ``exons``, ``minExons``, ``numbExons`` and ``tsses`` are stored unchanged.

    The remaining settings fall back to a default when passed as ``None``:

    * ``chunks`` -> ``self.geneNumbChunks``     (default 20)
    * ``ud``     -> ``self.upstreamDistance``   (default 5000)
    * ``ui``     -> ``self.upstreamInterval``   (default 1000)
    * ``dd``     -> ``self.downstreamDistance`` (default 5000)
    * ``di``     -> ``self.downstreamInterval`` (default 1000)

    ``build`` and ``annotation`` are handed to ``Ensembl.EnsemblGenes`` as
    ``assembly`` and ``annotation``; ``build`` is also used to construct
    ``ChromosomeEnds``.

    ``self.regionIterator`` is set to a ``list`` subclass (the class itself,
    not an instance). Instantiating it with a gene-list location yields the
    de-duplicated Ensembl gene ids for that list.
    """
    assert not (exons and tsses), "exons and tsses cannot both be set"

    # Stores all parameters
    self.exons = exons
    self.minExons = minExons
    self.numbExons = numbExons
    self.tsses = tsses

    # None means "not supplied"; identity comparison is the correct test for
    # the None singleton and cannot be hijacked by a custom __eq__.
    self.geneNumbChunks = 20 if chunks is None else chunks
    self.upstreamDistance = 5000 if ud is None else ud
    self.upstreamInterval = 1000 if ui is None else ui
    self.downstreamDistance = 5000 if dd is None else dd
    self.downstreamInterval = 1000 if di is None else di

    # Gets dictionaries of genes, exons, transcripts etc from the Ensembl class.
    # NOTE: an earlier assertion restricting build to "hg18" has been disabled,
    # so no check on the build is made here.
    # We keep this as a non-instance variable as well so that the nested class
    # below can access it through the closure.
    genedata = Ensembl.EnsemblGenes(assembly=build, annotation=annotation)
    self.genedata = genedata

    # Gets the lengths of chromosomes
    self.chromosomeEnds = ChromosomeEnds(build)

    class EnsemblNameAndIDFormatter(list):
        """List of unique Ensembl gene ids built from a gene-list location.

        Each entry of ``GeneList(genesToUseLocation)`` is either taken as-is
        (when it is already a key of ``genedata``) or passed to
        ``genedata.getGeneIDs`` to be expanded into gene ids.
        """

        def __init__(self, genesToUseLocation):
            self.seengenes = set()

            for gene in GeneList(genesToUseLocation):
                if gene in self.seengenes:
                    # Seen this gene id before (generally shouldn't happen, as
                    # GeneList guarantees uniqueness for the source list).
                    continue

                if gene in genedata:
                    # Not seen yet, and it is in the Ensembl ids list.
                    self.append(gene)
                    self.seengenes.add(gene)
                    continue

                # Not in the Ensembl ids list, so it could be a gene name.
                # ``found`` records whether it contributed at least one new id.
                found = False
                for geneid in genedata.getGeneIDs(gene):
                    if geneid not in self.seengenes:
                        self.append(geneid)
                        self.seengenes.add(geneid)
                        found = True
                if not found:
                    print("No GeneID for:" + gene)

            # Summary line: "<location>:<number of ids collected>"
            print(genesToUseLocation + ":" + str(len(self)))

    self.regionIterator = EnsemblNameAndIDFormatter
# Lists, for every gene list given with -g, the Ensembl genes whose name
# matches one of the list's patterns, together with the gene-plus-promoter
# interval returned by getGeneWithPromotor.
try:
    opts, args = getopt.getopt(sys.argv[1:], "g:", [])
except getopt.GetoptError as err:
    # print help information and exit:
    print(str(err))  # will print something like "option -a not recognized"
    sys.exit(2)

# Value passed as upstreamPadding to getGeneWithPromotor.
UPSTREAM_PROMOTOR_DIST = 5000

# "-g" is the only option getopt accepts here; each one names a gene list.
genelists = [GeneList(a) for o, a in opts if o == "-g"]

# Explicit check instead of an assert, which is stripped under "python -O".
# sys.exit(<str>) writes the message to stderr and exits with status 1.
if not genelists:
    sys.exit("No gene lists given: pass at least one with -g")

genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

for genelist in genelists:
    print(genelist.getFullName())
    for gene in genedata:
        entry = genedata[gene]  # look the gene up once instead of per field
        # Does the gene name match any pattern? Each entry of the gene list is
        # used as a regular expression anchored at the start (re.match), and
        # the gene is reported at most once per list.
        if any(re.match(pattern, entry.name) for pattern in genelist):
            start, stop = entry.getGeneWithPromotor(
                upstreamPadding=UPSTREAM_PROMOTOR_DIST)
            # Space-separated: id name chr start stop
            print("%s %s %s %s %s" % (entry.id, entry.name, entry.chr, start, stop))
print str(err) # will print something like "option -a not recognized" sys.exit(2) assembly = "hg19" ensemblids = [] fastaFile = None for o, a in opts: if o == "--ensemblids": ensemblids = GeneList(a) elif o == "--fasta": fastaFile = FastAFile(a) if len(ensemblids) > 0: genedata = Ensembl.EnsemblGenes(assembly=assembly) print "ensemblid", "chr", "start", "stop", "transcripts", "stemloops", "polyas" for ensemblid in ensemblids: stemloops = 0 polyas = 0 for transcriptid in genedata[ensemblid]: # http://genome-euro.ucsc.edu/cgi-bin/hgc?db=hg19&g=htcGeneMrna&i=ENST00000314332&o=ensGene&table=ensGene chrm = genedata[ensemblid][transcriptid].chr start = str(genedata[ensemblid][transcriptid].start) stop = str(genedata[ensemblid][transcriptid].end)
print str(err) # will print something like "option -a not recognized" print "Usage: main.py [Space seperated list of gene id sets]" sys.exit(2) for o, a in opts: if (o == "-G") or (o == "--gtf"): gtfFile = a elif (o == "-D") or (o == "--gene-expression-difference"): differences = a elif (o == "-o") or (o == "--output"): outputFile = a else: print "Unknown parameter: " + o + " " + a sys.exit(2) genedata = Ensembl.EnsemblGenes(assembly="hg19", annotation="EnsemblGenes73") gtfReader = csv.reader(open(gtfFile, "r"), delimiter="\t") cuffGeneIdsToEnsemblGeneIds = collections.defaultdict(set) missing = set() for row in gtfReader: detailsColumn = row[8] details = dict( item.replace("\"", "").split(" ") for item in detailsColumn.split("; ")) if "nearest_ref" not in details: continue