def __init__(self, build, annotation, exons, minExons, numbExons, tsses, chunks, ud, ui, dd, di):
    """Store the region settings and load the Ensembl gene data.

    Args:
        build: assembly name, passed to Ensembl.EnsemblGenes and ChromosomeEnds.
        annotation: annotation name, passed to Ensembl.EnsemblGenes.
        exons: stored as-is; must not be truthy together with ``tsses``.
        minExons: stored as-is.
        numbExons: stored as-is.
        tsses: stored as-is; must not be truthy together with ``exons``.
        chunks: number of gene chunks; None selects the default of 20.
        ud: upstream distance; None selects the default of 5000.
        ui: upstream interval; None selects the default of 1000.
        dd: downstream distance; None selects the default of 5000.
        di: downstream interval; None selects the default of 1000.

    Raises:
        AssertionError: if both ``exons`` and ``tsses`` are truthy.
    """
    assert not (exons and tsses)

    # Stores all parameters
    self.exons = exons
    self.minExons = minExons
    self.numbExons = numbExons
    self.tsses = tsses

    # Only None triggers a default, so an explicit 0 is kept as given.
    self.geneNumbChunks = 20 if chunks is None else chunks
    self.upstreamDistance = 5000 if ud is None else ud
    self.upstreamInterval = 1000 if ui is None else ui
    self.downstreamDistance = 5000 if dd is None else dd
    self.downstreamInterval = 1000 if di is None else di

    # Gets dictionaries of genes, exons, transcripts etc from the Ensembl class
    #assert build == "hg18", "Non hg-18 not supported for Ensembl regions at the moment"
    # We keep this as a local (non-instance) variable as well so that the
    # nested class below can reach it through the closure.
    genedata = Ensembl.EnsemblGenes(assembly=build, annotation=annotation)
    self.genedata = genedata

    # Gets the lengths of chromosomes
    self.chromosomeEnds = ChromosomeEnds(build)

    class EnsemblNameAndIDFormatter(list):
        """List of unique gene ids resolved from the entries of a GeneList.

        Each entry is kept directly when it is found in ``genedata``;
        otherwise it is looked up through ``genedata.getGeneIDs`` (the entry
        may be a gene name) and every id not seen before is added.
        """

        def __init__(self, genesToUseLocation):
            geneList = GeneList(genesToUseLocation)
            self.seengenes = set()
            for gene in geneList:
                if gene in self.seengenes:
                    # Seen this gene id before (generally shouldn't be the
                    # case, as GeneList guarantees this for the source list
                    # at least).
                    pass
                elif gene in genedata:
                    # We've not seen the gene but it's in the ensembl ids list
                    self.append(gene)
                    self.seengenes.add(gene)
                else:
                    # It's not in the ensembl ids list, it could be a gene name
                    found = False
                    for geneid in genedata.getGeneIDs(gene):
                        if geneid not in self.seengenes:
                            self.append(geneid)
                            self.seengenes.add(geneid)
                            found = True
                    if not found:
                        print("No GeneID for:" + gene)
            print(genesToUseLocation + ":" + str(len(self)))

    # The class itself (not an instance) is exposed.
    self.regionIterator = EnsemblNameAndIDFormatter
# For every gene list given with -g, print id, name, chromosome and the
# start/stop returned by getGeneWithPromotor for each Ensembl gene whose name
# matches one of the list's patterns.
try:
    opts, args = getopt.getopt(sys.argv[1:], "g:", [])
except getopt.GetoptError as err:
    # print help information and exit:
    print(str(err))  # will print something like "option -a not recognized"
    sys.exit(2)

UPSTREAM_PROMOTOR_DIST = 5000

genelists = []
for o, a in opts:
    if o == "-g":
        genelists.append(GeneList(a))

# Validate explicitly: an assert is stripped under ``python -O``, which would
# let the script run to completion without printing anything.
if not genelists:
    print("No gene lists given: pass at least one with -g")
    sys.exit(2)

genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

for genelist in genelists:
    print(genelist.getFullName())
    for gene in genedata:
        # Look the gene up once instead of on every attribute access.
        entry = genedata[gene]
        # does the gene match the pattern
        for pattern in genelist:
            if re.match(pattern, entry.name):
                start, stop = entry.getGeneWithPromotor(
                    upstreamPadding=UPSTREAM_PROMOTOR_DIST)
                print("%s %s %s %s %s" % (entry.id, entry.name, entry.chr,
                                          start, stop))
                # One output line per gene, even if several patterns match.
                break
print str(err) # will print something like "option -a not recognized" sys.exit(2) assembly = "hg19" ensemblids = [] fastaFile = None for o, a in opts: if o == "--ensemblids": ensemblids = GeneList(a) elif o == "--fasta": fastaFile = FastAFile(a) if len(ensemblids) > 0: genedata = Ensembl.EnsemblGenes(assembly=assembly) print "ensemblid", "chr", "start", "stop", "transcripts", "stemloops", "polyas" for ensemblid in ensemblids: stemloops = 0 polyas = 0 for transcriptid in genedata[ensemblid]: # http://genome-euro.ucsc.edu/cgi-bin/hgc?db=hg19&g=htcGeneMrna&i=ENST00000314332&o=ensGene&table=ensGene chrm = genedata[ensemblid][transcriptid].chr start = str(genedata[ensemblid][transcriptid].start) stop = str(genedata[ensemblid][transcriptid].end)
# Translate the parsed command-line options (``opts``) into run settings.
# Options without a default below are only bound when their flag is given.
vstepWidth = 200  # default
numbSlices = 20  # default

for o, a in opts:
    # need a vstep width
    if o == "-s":
        vstepWidth = int(a)
    # need a vstep file
    elif o == "-v":
        vstepFile = a
    # need a list of genes -> coords
    elif o == "-e":
        genesFileName = a  # eg : "genes-and-exons-human-NCBI36.csv"
        print("Loading gene mapping")
        genesmapping = Ensembl.GenesMapping(genesFileName)
    # need an output folder
    elif o == "-o":
        outputFolder = a
    # need a number of slices
    elif o == "-p":
        # getopt hands back strings; convert so the value has the same type
        # as the integer default (and as vstepWidth above).
        numbSlices = int(a)
    elif o == "-f":
        friendlyGenesNames = Ensembl.FriendlyGeneNames(a)
    elif o == "-w":
        webroot = a

# need lists of genes
geneListLocations = args

# create output folder
print str(err) # will print something like "option -a not recognized" print "Usage: main.py [Space seperated list of gene id sets]" sys.exit(2) for o, a in opts: if (o == "-G") or (o == "--gtf"): gtfFile = a elif (o == "-D") or (o == "--gene-expression-difference"): differences = a elif (o == "-o") or (o == "--output"): outputFile = a else: print "Unknown parameter: " + o + " " + a sys.exit(2) genedata = Ensembl.EnsemblGenes(assembly="hg19", annotation="EnsemblGenes73") gtfReader = csv.reader(open(gtfFile, "r"), delimiter="\t") cuffGeneIdsToEnsemblGeneIds = collections.defaultdict(set) missing = set() for row in gtfReader: detailsColumn = row[8] details = dict( item.replace("\"", "").split(" ") for item in detailsColumn.split("; ")) if "nearest_ref" not in details: continue
rnaSeqExpressionData = IndexedCSV(a, key="test_id") elif o == "-a": assembly = a UPSTREAM_PROMOTOR_DIST = 2000 DOWNSTREAM_PROMOTOR_DIST = 2000 writer = csv.writer(open(outfile, "w"), delimiter="\t") genome = Genome(assembly) ### # load data genedata = Ensembl.EnsemblGenes(assembly=assembly) genes = Ensembl.ReverseGeneMapping(genedata) genespluspromotor = Ensembl.ReverseGeneMapping( genedata, tssPadding=UPSTREAM_PROMOTOR_DIST) genepromotors = Ensembl.ReversePromotorMapping( genedata, upstreamPadding=UPSTREAM_PROMOTOR_DIST, downstreamPadding=DOWNSTREAM_PROMOTOR_DIST) exons = Ensembl.ReverseExonMapping(genedata) transcriptionSites = Ensembl.TranscriptionSites(genedata)
opts, args = getopt.getopt(sys.argv[1:], "", []) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" sys.exit(2) UPSTREAM_PROMOTOR_DIST = 5000 DOWNSTREAM_PROMOTOR_DIST = 1000 # probably want to change this to be exons rather than genes ### # load data genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1") bp = 0 for gene in genedata: bp += genedata[gene].end - genedata[gene].start print bp #genedata = Ensembl.GenesMapping(os.path.expanduser("~/mount/publicdata/hg18/ncbi36.1/genes-and-exons-human-NCBI36.1.csv")) genes = Ensembl.ReverseGeneMapping(genedata) genespluspromotor = Ensembl.ReverseGeneMapping( genedata, tssPadding=UPSTREAM_PROMOTOR_DIST) genepromotors = Ensembl.ReversePromotorMapping(
# Tab-separated rows are read from ``infile`` and written to ``outfile``.
reader = csv.reader(open(infile), delimiter="\t")
writer = csv.writer(open(outfile, "w"), delimiter="\t")

###
# Padding distances, each with a "<n>kb" label derived from it.
TSS_TTS_Distance = 5000
TTS_TTS_Distance_Human = "%skb" % (TSS_TTS_Distance / 1000)

Small_TSS_TTS_Distance = 1000
Small_TTS_TTS_Distance_Human = "%skb" % (Small_TSS_TTS_Distance / 1000)

# load data
genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

# Lookups built on top of the loaded gene data.
genes = Ensembl.ReverseGeneMapping(genedata)
exons = Ensembl.ReverseExonMapping(genedata)
transcriptionSites = Ensembl.TranscriptionSites(genedata)

cpgIslands = ExtendedBed(os.path.expanduser(
    "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"))

affyannotation = NetAffxAnnotation()

# Same gene mapping as ``genes``, but built with tssPadding set.
paddedGenes = Ensembl.ReverseGeneMapping(genedata, tssPadding=TSS_TTS_Distance)