def twilio():
    """Handle an incoming SMS webhook and answer with a TwiML message.

    The message text is read from the ``Body`` form field and split on
    whitespace:

    * exactly three tokens are treated as ``<buy/sell> TICKER #OFSHARES``;
      the order is passed to ``utils.order`` and a confirmation is sent;
    * otherwise, a first token of ``status`` (any case) replies with
      ``utils.get_portfolio_val(g)``;
    * anything else replies with the usage hint.

    Returns:
        The TwiML response serialised with ``str()``.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so "buy  AAPL 5 " still yields three tokens
    # (split(' ') produced empty tokens and rejected such messages).
    body = request.form['Body'].split()
    resp = twiml.Response()
    myText = "unknown command. Please enter: <buy/sell TICKER #OFSHARES> to make a trade, or enter <status> to see the current portfolio value."

    if len(body) != 3:
        # A blank message yields no tokens at all, hence the `body and`.
        if body and body[0].lower() == "status":
            myText = str(utils.get_portfolio_val(g))
        resp.message(myText)
        return str(resp)

    order, sym, value = body
    myText = "Nice! You put in an order to " + order.lower() + " " + value + " shares of " + sym.upper()
    resp.message(myText)
    utils.order(order, sym, value, g)
    return str(resp)
def promptio():
    """Handle a chat-bot webhook and answer with a JSON reply.

    The ``message`` field of the posted JSON is split on whitespace:

    * exactly three tokens are treated as ``<buy/sell> TICKER #OFSHARES``;
      the order is passed to ``utils.order`` and a confirmation is sent;
    * otherwise, a first token of ``status`` (any case) replies with
      ``utils.get_portfolio_val(g)``;
    * anything else replies with the usage hint.

    Returns:
        A 200 response whose JSON body carries the reply in both ``text``
        and ``speech``.
    """
    myJson = request.get_json()
    myText = "enter <@stock_00017 status> to check portfolio value. enter <@stock_00017 buy/sell TICKER #OFSHARES> to make a trade."

    # split() with no argument tolerates repeated, leading and trailing
    # whitespace; split(" ") produced empty tokens for such messages.
    body = myJson['message'].split()

    if len(body) == 3:
        order, sym, value = body
        utils.order(order, sym, value, g)
        myText = "Nice! You put in an order to " + order.lower() + " " + value + " shares of " + sym.upper()
    elif body and body[0].lower() == "status":
        # Only reached for non-order messages: a three-token message always
        # ends up with the order confirmation as its reply, so looking up
        # the portfolio value first would be wasted work.
        myText = str(utils.get_portfolio_val(g))

    dat = jsonify(sendmms=False,
                  showauthurl=False,
                  authstate=None,
                  text=myText,
                  speech=myText,
                  status="OK",
                  webhookreply=None,
                  images=[{"imageurl": None, "alttext": None}])
    # insert json responses here
    resp = make_response(dat, 200, {"Content-Type": "application/json"})
    return resp
def twilio():
    """Handle an incoming SMS webhook and answer with a TwiML message.

    The message text is read from the ``Body`` form field and split on
    whitespace:

    * exactly three tokens are treated as ``<buy/sell> TICKER #OFSHARES``;
      the order is passed to ``utils.order`` and a confirmation is sent;
    * otherwise, a first token of ``status`` (any case) replies with
      ``utils.get_portfolio_val(g)``;
    * anything else replies with the usage hint.

    Returns:
        The TwiML response serialised with ``str()``.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so "buy  AAPL 5 " still yields three tokens
    # (split(' ') produced empty tokens and rejected such messages).
    body = request.form['Body'].split()
    resp = twiml.Response()
    myText = "unknown command. Please enter: <buy/sell TICKER #OFSHARES> to make a trade, or enter <status> to see the current portfolio value."

    if len(body) != 3:
        # A blank message yields no tokens at all, hence the `body and`.
        if body and body[0].lower() == "status":
            myText = str(utils.get_portfolio_val(g))
        resp.message(myText)
        return str(resp)

    order, sym, value = body
    myText = "Nice! You put in an order to " + order.lower() + " " + value + " shares of " + sym.upper()
    resp.message(myText)
    utils.order(order, sym, value, g)
    return str(resp)
def visit_character(self, node, children) -> Any:
    """Pass the label of a single-character node through ``order``.

    The label is taken from the top node of the first child's graph. The
    quoted-space label ``'" "'`` is replaced by a plain space before the
    call. ``node`` is not used.
    """
    graph = children[0]
    top = graph['top']
    label = graph['nodes'][top]['label']
    return order(' ' if label == '" "' else label)
def buy():
    """Place the order described by the ``order``/``sym``/``val`` form fields.

    Returns ``"not valid"`` when ``utils.valid_order`` rejects the fields,
    otherwise the ``str()`` of whatever ``utils.order`` returns.
    """
    order = request.form['order']
    sym = request.form['sym']
    val = request.form['val']

    # Guard clause: bail out before touching utils.order.
    if not utils.valid_order(order, sym, val):
        return "not valid"

    return str(utils.order(order, sym, val, g))
def solve(G, s):
    """
    Solve one instance with the ILP solver, cross-running brute force on
    small inputs (n <= 10) for comparison output only.

    Args:
        G: networkx.Graph
        s: stress_budget
    Returns:
        D: Dictionary mapping for student to breakout room r e.g. {0:2, 1:0, 2:1, 3:2}
        k: Number of breakout rooms
    """
    n = G.order()
    small_input = n <= 10

    # Pairwise lookup tables, populated for i < j only.
    happiness = {i: {} for i in range(n)}
    stress = {i: {} for i in range(n)}
    for i in range(n):
        for j in range(i + 1, n):
            happiness[i][j] = G.get_edge_data(i, j)['happiness']
            stress[i][j] = G.get_edge_data(i, j)['stress']

    print("Input Size:", n)

    if small_input:
        # brute force approach
        began = time.perf_counter()
        bf_arr, bf_val = bruteforce.bruteforce(happiness, stress,
                                               len(happiness), s)
        bf_val = round(bf_val, 3)
        elapsed = time.perf_counter() - began
        print("Brute Force Approach Time:", elapsed)

    # ILP
    began = time.perf_counter()
    answer, rooms, best_k = lp.lp_solve(happiness, stress, s, n)
    elapsed = time.perf_counter() - began
    times.append(elapsed)  # module-level list of solver timings
    print("Gurobi Approach Time: ", elapsed)
    print("Gurobi Answer: ", answer)
    print("Gurobi Rooms:", order(rooms)[0])

    if small_input:
        # The equality check against the ILP answer is intentionally not
        # asserted; both results are only printed.
        print("Brutef Rooms:", bf_arr)
        print("Brutef Answer:", bf_val)

    return rooms, order(rooms)[1]
def mergeCollections(nameDict, analysisName, output='', superOnly=True):
    '''
    Merge the enhancer loci of every dataset in nameDict into one set of
    stitched consensus regions and return them as GFF rows.

    For each name in nameDict an enhancer collection is built with
    makeSECollection(nameDict[name]['enhancerFile'], name, superOnly).
    All loci are pooled, stitched, sorted by decreasing length and given
    the IDs 'merged_<analysisName>_<rank>' (rank starts at 1).

    Returns the list of GFF rows when output is empty; otherwise writes the
    rows tab-separated to output and returns output.

    The print calls use the single-argument parenthesised form, which
    behaves identically under Python 2 and Python 3.
    '''
    allLoci = []
    for name in nameDict.keys():
        seCollection = makeSECollection(nameDict[name]['enhancerFile'], name, superOnly)
        if superOnly:
            print("DATASET: %s HAS %s SUPERENHANCERS" % (name, len(seCollection)))
        else:
            print("DATASET: %s HAS %s ENHANCERS" % (name, len(seCollection)))
        allLoci += seCollection.getLoci()

    # NOTE(review): the collapsed source does not show whether this total was
    # printed once or per dataset; it is printed once here - confirm.
    print(len(allLoci))

    # 50 is the window-size argument handed to LocusCollection
    mergedCollection = utils.LocusCollection(allLoci, 50)

    # stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()
    stitchedLoci = stitchedCollection.getLoci()
    print("IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci)))

    # sort by size and provide a unique ID
    sizeList = [locus.len() for locus in stitchedLoci]
    sizeOrder = utils.order(sizeList, decreasing=True)
    orderedLoci = [stitchedLoci[i] for i in sizeOrder]
    for rank, locus in enumerate(orderedLoci, 1):
        locus._ID = 'merged_%s_%s' % (analysisName, str(rank))

    mergedGFF = [
        [locus.chr(), locus.ID(), '', locus.start(), locus.end(), '',
         locus.sense(), '', locus.ID()]
        for locus in orderedLoci
    ]

    if len(output) == 0:
        return mergedGFF

    print("writing merged gff to %s" % (output))
    utils.unParseTable(mergedGFF, output, '\t')
    return output
def mergeCollections(nameDict,analysisName,output='',superOnly=True):
    '''
    Merge the enhancer loci of every dataset in nameDict into one set of
    stitched consensus regions and return them as GFF rows.

    For each name in nameDict an enhancer collection is built with
    makeSECollection(nameDict[name]['enhancerFile'], name, superOnly).
    All loci are pooled, stitched, sorted by decreasing length and given
    the IDs 'merged_<analysisName>_<rank>' (rank starts at 1).

    Returns the list of GFF rows when output is empty; otherwise writes the
    rows tab-separated to output and returns output.

    The print calls use the single-argument parenthesised form, which
    behaves identically under Python 2 and Python 3.
    '''
    allLoci = []
    for name in nameDict.keys():
        seCollection = makeSECollection(nameDict[name]['enhancerFile'], name, superOnly)
        if superOnly:
            print("DATASET: %s HAS %s SUPERENHANCERS" % (name, len(seCollection)))
        else:
            print("DATASET: %s HAS %s ENHANCERS" % (name, len(seCollection)))
        allLoci += seCollection.getLoci()

    # NOTE(review): the collapsed source does not show whether this total was
    # printed once or per dataset; it is printed once here - confirm.
    print(len(allLoci))

    # 50 is the window-size argument handed to LocusCollection
    mergedCollection = utils.LocusCollection(allLoci, 50)

    # stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()
    stitchedLoci = stitchedCollection.getLoci()
    print("IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci)))

    # sort by size and provide a unique ID
    sizeList = [locus.len() for locus in stitchedLoci]
    sizeOrder = utils.order(sizeList, decreasing=True)
    orderedLoci = [stitchedLoci[i] for i in sizeOrder]
    for rank, locus in enumerate(orderedLoci, 1):
        locus._ID = 'merged_%s_%s' % (analysisName, str(rank))

    mergedGFF = [
        [locus.chr(), locus.ID(), '', locus.start(), locus.end(), '',
         locus.sense(), '', locus.ID()]
        for locus in orderedLoci
    ]

    if len(output) == 0:
        return mergedGFF

    print("writing merged gff to %s" % (output))
    utils.unParseTable(mergedGFF, output, '\t')
    return output
def promptio():
    """Handle a chat-bot webhook and answer with a JSON reply.

    The ``message`` field of the posted JSON is split on whitespace:

    * exactly three tokens are treated as ``<buy/sell> TICKER #OFSHARES``;
      the order is passed to ``utils.order`` and a confirmation is sent;
    * otherwise, a first token of ``status`` (any case) replies with
      ``utils.get_portfolio_val(g)``;
    * anything else replies with the usage hint.

    Returns:
        A 200 response whose JSON body carries the reply in both ``text``
        and ``speech``.
    """
    myJson = request.get_json()
    myText = "enter <@stock_00017 status> to check portfolio value. enter <@stock_00017 buy/sell TICKER #OFSHARES> to make a trade."

    # split() with no argument tolerates repeated, leading and trailing
    # whitespace; split(" ") produced empty tokens for such messages.
    body = myJson['message'].split()

    if len(body) == 3:
        order, sym, value = body
        utils.order(order, sym, value, g)
        myText = "Nice! You put in an order to " + order.lower() + " " + value + " shares of " + sym.upper()
    elif body and body[0].lower() == "status":
        # Only reached for non-order messages: a three-token message always
        # ends up with the order confirmation as its reply, so looking up
        # the portfolio value first would be wasted work.
        myText = str(utils.get_portfolio_val(g))

    dat = jsonify(sendmms=False,
                  showauthurl=False,
                  authstate=None,
                  text=myText,
                  speech=myText,
                  status="OK",
                  webhookreply=None,
                  images=[{"imageurl": None, "alttext": None}])
    # insert json responses here
    resp = make_response(dat, 200, {"Content-Type": "application/json"})
    return resp
def __init__(self, conf_file=None, environment=None):
    """Set up logging and load the configuration.

    Args:
        conf_file: Path of a YAML configuration file. A path that does not
            start with '/' is resolved against the directory of
            ``sys.argv[0]``. Only used when ``environment`` is falsy.
        environment: When truthy, handed to ``self.load`` instead of
            reading ``conf_file``.

    Raises:
        ValueError: If neither ``environment`` nor ``conf_file`` is given.
    """
    self.log = utils._setupLogging()
    self.containers = {}

    if environment:
        self.load(environment)
    else:
        if conf_file is None:
            raise ValueError('either conf_file or environment must be given')
        if not conf_file.startswith('/'):
            conf_file = os.path.join(os.path.dirname(sys.argv[0]), conf_file)

        # Context manager so the file handle is closed once parsed.
        with open(conf_file, 'r') as data:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects (and newer PyYAML releases require
            # the Loader argument); consider yaml.safe_load if the file is
            # plain data.
            self.config = yaml.load(data)

    # On load order containers into the proper startup sequence.
    # NOTE(review): the collapsed source does not show whether this ran for
    # both branches or only after reading conf_file; it runs for both here.
    self.start_order = utils.order(self.config['containers'])
def __init__(self, conf_file=None, environment=None):
    """Set up logging and load the configuration.

    Args:
        conf_file: Path of a YAML configuration file. A path that does not
            start with '/' is resolved against the directory of
            ``sys.argv[0]``. Only used when ``environment`` is falsy.
        environment: When truthy, handed to ``self.load`` instead of
            reading ``conf_file``.

    Raises:
        ValueError: If neither ``environment`` nor ``conf_file`` is given.
    """
    self.log = utils.setupLogging()
    self.containers = {}
    self.templates = {}
    self.state = 'live'

    if environment:
        self.load(environment)
    else:
        if conf_file is None:
            raise ValueError('either conf_file or environment must be given')
        # If we didn't get an absolute path to a file, look for it next to
        # the running script (the directory of sys.argv[0]).
        if not conf_file.startswith('/'):
            conf_file = os.path.join(os.path.dirname(sys.argv[0]), conf_file)

        # Context manager so the file handle is closed once parsed.
        with open(conf_file, 'r') as data:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects (and newer PyYAML releases require
            # the Loader argument); consider yaml.safe_load if the file is
            # plain data.
            self.config = yaml.load(data)

    # On load, order templates into the proper startup sequence.
    # NOTE(review): the collapsed source does not show whether this ran for
    # both branches or only after reading conf_file; it runs for both here.
    self.start_order = utils.order(self.config['templates'])
def search():
    """Rank documents for the subreddits named in the posted JSON.

    The ``subreddits`` entry of the request body is validated with
    ``solr.validate_query``; any exception raised there is printed and
    answered with a 400 carrying the exception text. Otherwise the query is
    analysed, matching documents are fetched and analysed, and the output
    of ``utils.order`` is returned as a list under ``result``.
    """
    payload = request.get_json()

    try:
        requested_subs = solr.validate_query(payload["subreddits"])
    except Exception as err:
        traceback.print_exc()
        return jsonify({"message": str(err)}), 400

    query_terms, query_terms_lmtzd = utils.analyze_queries(requested_subs)
    matched_subs = solr.get_docs(query_terms)
    matched_lmtzd = utils.analyze_documents(matched_subs)

    ranked = utils.order(query_terms_lmtzd, matched_lmtzd, matched_subs)
    return jsonify({"result": list(ranked)})
def makeRigerTable(foldTableFile, output=''):
    '''
    Build a RIGER-style construct table from a fold-change table and write it.

    foldTableFile is a tab-separated table with a header row; column 0 is
    the construct, column 1 the gene and column 2 the score. Constructs are
    ranked by decreasing score. Genes that appear on only one row are
    skipped and do not consume a rank. The hairpin weight is always 1.

    When output is empty, the output path is foldTableFile with
    '_log2Ratio.txt' replaced by '_friger.txt'. Returns the output path.
    '''
    # need a table of this format
    rigerTable = [['Construct', 'GeneSymbol', 'NormalizedScore', 'Construct Rank', 'HairpinWeight']]

    # set weight to 1 for now
    foldTable = utils.parseTable(foldTableFile, '\t')
    dataRows = foldTable[1:]  # everything below the header

    # positions into dataRows, highest score first
    constructOrder = utils.order([float(line[2]) for line in dataRows], decreasing=True)

    # make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in dataRows:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        row = dataRows[i]
        geneName = row[1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        rigerTable.append(row[0:3] + [constructRank, 1])
        constructRank += 1

    if len(output) == 0:
        # str.replace works on both Python 2 and 3; the module-level
        # string.replace function was removed in Python 3.
        output = foldTableFile.replace('_log2Ratio.txt', '_friger.txt')

    utils.unParseTable(rigerTable, output, '\t')
    return output
def load(self, filename='envrionment.yml'):
    """Restore state, templates and containers from a YAML environment file.

    Sets ``self.config``, ``self.state`` and ``self.start_order`` and fills
    ``self.templates`` and ``self.containers`` (the latter keyed by template
    name, then by container name).
    """
    self.log.info('Loading environment from: %s', filename)

    with open(filename, 'r') as input_file:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; confirm the environment file is trusted.
        self.config = yaml.load(input_file)

    config = self.config
    self.state = config['state']

    template_specs = config['templates']
    for name in template_specs:
        # TODO fix hardcoded service name and version
        self.templates[name] = template.Template(name, template_specs[name], 'service', '0.1')
        self.containers[name] = {}

    self.start_order = utils.order(template_specs)

    container_specs = config['containers']
    for name in container_specs:
        spec = container_specs[name]
        owner = spec['template']
        # Each container is built with the 'config' section of its template.
        self.containers[owner][name] = Container(name, spec, template_specs[owner]['config'])
def make_T_top_regions(signal_table_path, top=1000):
    '''
    makes a gff and fasta of the top N T regions based off the signal table

    The signal of a region is the mean of max(col2 - col3, 0) and
    max(col4 - col5, 0), keyed by the region id in column 1 (the first row
    of the table is a header). Region ids are parsed as
    "<chrom>(...):<start>-<end>".

    If the table holds fewer than `top` regions, all of them are used
    instead of raising IndexError. The output file names always carry the
    requested `top`.

    Writes the gff under gffFolder and the fasta under fastaFolder and
    returns the fasta path.
    '''
    signal_table = utils.parseTable(signal_table_path, '\t')
    data_rows = signal_table[1:]  # skip the header row

    signal_dict = defaultdict(float)
    for line in data_rows:
        signal = (max(float(line[2]) - float(line[3]), 0) +
                  max(float(line[4]) - float(line[5]), 0)) / 2
        signal_dict[line[1]] = signal

    # Looked up through the dict so that a repeated region id carries the
    # signal of its last occurrence on every row, as before.
    signal_vector = [signal_dict[line[1]] for line in data_rows]
    signal_order = utils.order(signal_vector, decreasing=True)

    t_top_gff_path = '%sCH22_T_UNION_TOP_%s_-0_+0.gff' % (gffFolder, str(top))
    print(t_top_gff_path)

    t_top_gff = []
    # Slicing (rather than range(top)) tolerates tables shorter than `top`.
    for row_index in signal_order[:top]:
        region_id = data_rows[row_index][1]
        chrom = region_id.split('(')[0]
        coords = region_id.split(':')[-1].split('-')
        t_top_gff.append(
            [chrom, region_id, '', coords[0], coords[1], '', '.', '', region_id])

    utils.unParseTable(t_top_gff, t_top_gff_path, '\t')

    t_top_fasta = utils.gffToFasta('HG19', genomeDirectory, t_top_gff)
    t_top_fasta_path = '%sHG19_CH22_T_UNION_TOP_%s_-0_+0.fasta' % (fastaFolder, top)
    utils.unParseTable(t_top_fasta, t_top_fasta_path, '')

    return t_top_fasta_path
def makeRigerTable(foldTableFile,output=''):
    '''
    Build a RIGER-style construct table from a fold-change table and write it.

    foldTableFile is a tab-separated table with a header row; column 0 is
    the construct, column 1 the gene and column 2 the score. Constructs are
    ranked by decreasing score. Genes that appear on only one row are
    skipped and do not consume a rank. The hairpin weight is always 1.

    When output is empty, the output path is foldTableFile with
    '_log2Ratio.txt' replaced by '_friger.txt'. Returns the output path.
    '''
    # need a table of this format
    rigerTable = [['Construct', 'GeneSymbol', 'NormalizedScore', 'Construct Rank', 'HairpinWeight']]

    # set weight to 1 for now
    foldTable = utils.parseTable(foldTableFile, '\t')
    dataRows = foldTable[1:]  # everything below the header

    # positions into dataRows, highest score first
    constructOrder = utils.order([float(line[2]) for line in dataRows], decreasing=True)

    # make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in dataRows:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        row = dataRows[i]
        geneName = row[1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        rigerTable.append(row[0:3] + [constructRank, 1])
        constructRank += 1

    if len(output) == 0:
        # str.replace works on both Python 2 and 3; the module-level
        # string.replace function was removed in Python 3.
        output = foldTableFile.replace('_log2Ratio.txt', '_friger.txt')

    utils.unParseTable(rigerTable, output, '\t')
    return output
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq

    On top of the plain enhancer/gene assignment this variant maps bam
    signal at the +/- 5kb TSS region of every assigned gene (by running
    samtools and bamliquidator_batch through the shell) and picks, per
    enhancer, the associated gene with the highest TSS signal.

    rankByBamFile  -- bam used for the TSS signal
    controlBamFile -- optional control bam; pass '' for none
    genome         -- only used in the name of the intermediate gff
    annotFile      -- annotation handed to utils.makeStartDict and
                      utils.makeTranscriptCollection
    enhancerFile   -- tab separated enhancer table; rows are read as
                      [id, chrom, start, stop, ..., rank, isSuper]
    transcribedFile-- optional table whose column 1 lists the gene ids to
                      consider; all genes of annotFile when empty
    uniqueGenes    -- keep only the first row per gene name in the gene table
    searchWindow   -- flank (bp) used when collecting proximal TSSs
    noFormatTable  -- keep every input column and skip the final re-sort

    Returns (enhancerToGeneTable, enhancerToTopGeneTable,
    geneToEnhancerTable); unless noFormatTable is set, the first two are
    re-sorted by the enhancer rank column.

    Calls sys.exit() when the bamliquidator output cannot be found.
    '''
    startDict = utils.makeStartDict(annotFile)
    # file name without directory and without anything after the first '.'
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    # (not read anywhere else in this function)
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    # gene id -> list of 'chrom:start-stop' enhancer strings, per category
    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the header: the first row whose first cell does not start with '#'
    # NOTE(review): `header` stays unbound if every row starts with '#'.
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer: first nine input columns, the three gene
        # columns, then the last two input columns
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        # NOTE(review): this value is overwritten unconditionally just
        # below; the collapsed source does not show whether it sat inside
        # this else branch, which makes no difference to the result.
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment rows and rows whose first cell starts with 'R'
        # (presumably the 'REGION_ID' header row - confirm)
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow (50kb by default) of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        # distal genes: tss within a fixed 1,000,000 bp flank
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        # built before the cross-list de-duplication below, so a gene may
        # appear in it more than once
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        # NOTE(review): `join` is not defined in this block; presumably
        # `from string import join` (Python 2 only) at module level - confirm.
        if noFormatTable:
            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered
        # table
        # (second-to-last input column is read as the enhancer rank, the
        # last one as the super flag)
        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    #get the chromLists from the various bams here
    # NOTE(review): the bam path is interpolated into a shell command
    # (shell=True); paths containing spaces or shell metacharacters break it.
    # NOTE(review): on Python 3 communicate() returns bytes unless text mode
    # is requested, and bytes.split('\n') raises TypeError - confirm the
    # target interpreter.
    cmd = 'samtools idxstats %s' % (rankByBamFile)
    idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
    idxStats= idxStats.communicate()
    # first column of every idxstats row except the last two split entries
    bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]

    if len(controlBamFile) > 0:
        cmd = 'samtools idxstats %s' % (controlBamFile)
        idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
        idxStats= idxStats.communicate()
        bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
        # keep only chromosomes present in both bams
        bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0]

    #now make sure no genes have a bad chrom
    overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0]

    #now make an enhancer collection of all transcripts
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # The mapper is invoked by bare command name, so it has to be on PATH.
    bamliquidator_path = 'bamliquidator_batch'

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    # the exit status of the command is not inspected; success is judged
    # from the output file below
    os.system(cmd)

    #check for completion
    # (presumably checkOutput polls for the file with the given wait/timeout
    # arguments - confirm against utils)
    if utils.checkOutput(mappedRankByFile,0.2,5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        os.system(cmd)

        #check for completion
        if utils.checkOutput(mappedControlFile,0.2,5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    # signalDict is indexed by refseq id further down; its values come from
    # makeSignalDict, which is defined elsewhere in this file
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order
    # (each gene is keyed by the smallest rank among its enhancers)
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # with uniqueGenes only the first (best ranked) refseq of a gene
        # name gets a row
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            # the last three cells are OVERLAP_GENES, PROXIMAL_GENES,
            # CLOSEST_GENE, so this reads overlap + proximal
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            # NOTE(review): in the formatted row built above index 9 is
            # OVERLAP_GENES, 10 is PROXIMAL_GENES and 11 is CLOSEST_GENE, so
            # this reads proximal + closest rather than overlap + proximal -
            # confirm that is intended.
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                # max() of an empty list raises ValueError; that happens for
                # gene names that never received a row in the gene table
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                # fall back to the only candidate, without a signal value
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        # order the data rows by the rank column (second to last); both
        # tables share row order, so the same permutation is applied to each
        enhancerOrder = utils.order([int(line[-2])
                                     for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    Write a tab-separated table of the stitched loci, longest first, with
    one signal column per bam file.

    Loci on chrY are removed from the list returned by getLoci(). For each
    remaining locus the row holds its id, chromosome, min/max coordinate,
    the integer prefix of its id (1 when the prefix is not an integer) and
    the summed length of the overlapping reference loci. Per bam file the
    signal of a locus is the sum, over overlapping mapped regions, of
    density * region length read from
    <mappedFolder><refName>_<bamFileName>_MAPPED/matrix.txt.
    '''
    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    header = ['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']
    locusTable = [header]

    # strip out any that are in chrY (walk a snapshot while removing in place)
    for candidate in list(loci):
        if candidate.chr() == 'chrY':
            loci.remove(candidate)

    lenOrder = utils.order([locus.len() for locus in loci], decreasing=True)

    for ticker, position in enumerate(lenOrder, 1):
        # progress marker every 1000 loci
        if ticker % 1000 == 0:
            print(ticker)

        locus = loci[position]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = sum(
            refLocus.len()
            for refLocus in referenceCollection.getOverlap(locus, 'both'))

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1

        coords = [int(x) for x in locus.coords()]

        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords),
                           stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)

    for bamFile in bamFileList:
        bamFileName = bamFile.split('/')[-1]
        print('GETTING MAPPING DATA FOR %s' % bamFile)

        # assumes standard convention for naming enriched region gffs
        pathParts = (mappedFolder, refName, bamFileName)
        print('OPENING %s%s_%s_MAPPED/matrix.txt' % pathParts)
        mappedGFF = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % pathParts, '\t')

        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        signalDict = defaultdict(float)
        mappedLoci = []
        for row in mappedGFF[1:]:
            # column 1 looks like '<chrom>(...):<start>-<end>'
            regionString = row[1]
            chrom = regionString.split('(')[0]
            span = regionString.split(':')[-1].split('-')
            start = int(span[0])
            end = int(span[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', row[0]))
            try:
                signalDict[row[0]] = float(row[2]) * (abs(end - start))
            except ValueError:
                # unparsable density: the region keeps the default 0.0
                print('WARNING NO SIGNAL FOR LINE:')
                print(row)

        mappedCollection = utils.LocusCollection(mappedLoci, 500)

        header.append(bamFileName)
        for tableRow in locusTable[1:]:
            lineLocus = utils.Locus(tableRow[1], tableRow[2], tableRow[3], '.')
            hits = mappedCollection.getOverlap(lineLocus, sense='both')
            tableRow.append(sum((signalDict[region.ID()] for region in hits), 0.0))

    utils.unParseTable(locusTable, output, '\t')
def visit_character_set(self, node, children) -> Any:
    """Build the graph for a bracketed character set.

    ``children`` may start with ``'^'`` (negation), followed optionally by
    a literal ``'-'`` or ``']'``. The remaining children are either graph
    dicts, whose top-node label is expanded through ``order`` (labels for
    which ``order`` raises ``TypeError`` are kept as named classes), or
    iterables of code points that are added directly.

    The resulting graph has a "charset" root (prefixed with ``^`` when
    negated) and one leaf per named class, per recognised shorthand
    (``\\S \\D \\W \\w \\d \\s``) and per run of consecutive code points.

    Fixes over the previous version:

    * a code point that follows a gap is no longer dropped (the run state
      used to be reset to ``None`` instead of restarting at that value);
    * shorthand groups are detected with a subset test. ``group in values``
      asked whether the group itself was an *element* of the code point
      set, which is never true, so no shorthand leaf was ever produced.

    ``node`` is not used.
    """
    negated = children[0] == '^'
    if negated:
        children = children[1:]

    classes, values = set(), set()

    # A '-' or ']' in first position stands for itself.
    if children[0] in ['-', ']']:
        values.add(ord(children[0]))
        children = children[1:]

    for child in children:
        if isinstance(child, dict):
            ident = child['top']
            label = child['nodes'][ident]['label']
            try:
                values.update(order(label))
            except TypeError:
                classes.add(label)
        else:
            values.update(child)

    graph = add_node(
        f"{'^' if negated else ''}charset",
        font=Font.ITALIC,
        shape=Shape.TRAPEZIUM,
        color=NEGATED if negated else None,
    )
    source = graph['top']

    def attach(label, **attrs):
        """Add a box leaf labelled ``label`` under the charset root."""
        nonlocal graph
        leaf = add_node(label, shape=Shape.BOX, **attrs)
        graph = merge(graph, leaf)
        graph = add_edge(source, leaf['top'], graph)

    def span_label(first, final):
        """Label for the run first..final (a single value when equal)."""
        if final == first:
            return normal(first)
        return f"{normal(first)}-{normal(final)}"

    for class_ in sorted(classes):
        attach(class_, style=Style.FILLED)

    # Complement groups come first so the largest match is consumed first.
    for group, symbol in [
        (BUT_SPACE, '\\S'),
        (BUT_DIGIT, '\\D'),
        (BUT_WORD, '\\W'),
        (WORD, '\\w'),
        (DIGIT, '\\d'),
        (SPACE, '\\s'),
    ]:
        if values.issuperset(group):
            attach(symbol, style=Style.FILLED)
            values.difference_update(group)

    # Collapse what is left into runs of consecutive code points.
    start, last = None, None
    for value in sorted(values):
        if last is not None and value == last + 1:
            last = value
            continue
        if last is not None:
            attach(span_label(start, last))
        # Start the next run AT this value so it is not lost.
        start, last = value, value
    if last is not None:
        attach(span_label(start, last))

    return graph
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    Write a tab-separated table of the stitched loci, longest first, with
    one signal column per bam file.

    Loci on chrY are removed from the list returned by getLoci(). For each
    remaining locus the row holds its id, chromosome, min/max coordinate,
    the integer prefix of its id (1 when the prefix is not an integer) and
    the summed length of the overlapping reference loci. Per bam file the
    signal of a locus is the sum, over overlapping mapped regions, of
    density * region length read from
    <mappedFolder><refName>_<bamFileName>_MAPPED/matrix.txt.
    '''
    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    header = ['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']
    locusTable = [header]

    # strip out any that are in chrY (walk a snapshot while removing in place)
    for candidate in list(loci):
        if candidate.chr() == 'chrY':
            loci.remove(candidate)

    lenOrder = utils.order([locus.len() for locus in loci], decreasing=True)

    for ticker, position in enumerate(lenOrder, 1):
        # progress marker every 1000 loci
        if ticker % 1000 == 0:
            print(ticker)

        locus = loci[position]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = sum(
            refLocus.len()
            for refLocus in referenceCollection.getOverlap(locus, 'both'))

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1

        coords = [int(x) for x in locus.coords()]

        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords),
                           stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)

    for bamFile in bamFileList:
        bamFileName = bamFile.split('/')[-1]
        print('GETTING MAPPING DATA FOR %s' % bamFile)

        # assumes standard convention for naming enriched region gffs
        pathParts = (mappedFolder, refName, bamFileName)
        print('OPENING %s%s_%s_MAPPED/matrix.txt' % pathParts)
        mappedGFF = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % pathParts, '\t')

        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        signalDict = defaultdict(float)
        mappedLoci = []
        for row in mappedGFF[1:]:
            # column 1 looks like '<chrom>(...):<start>-<end>'
            regionString = row[1]
            chrom = regionString.split('(')[0]
            span = regionString.split(':')[-1].split('-')
            start = int(span[0])
            end = int(span[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', row[0]))
            try:
                signalDict[row[0]] = float(row[2]) * (abs(end - start))
            except ValueError:
                # unparsable density: the region keeps the default 0.0
                print('WARNING NO SIGNAL FOR LINE:')
                print(row)

        mappedCollection = utils.LocusCollection(mappedLoci, 500)

        header.append(bamFileName)
        for tableRow in locusTable[1:]:
            lineLocus = utils.Locus(tableRow[1], tableRow[2], tableRow[3], '.')
            hits = mappedCollection.getOverlap(lineLocus, sense='both')
            tableRow.append(sum((signalDict[region.ID()] for region in hits), 0.0))

    utils.unParseTable(locusTable, output, '\t')
def mapEnhancerToGene(annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq

    annotFile: annotation passed to utils.makeStartDict and utils.makeTranscriptCollection
    enhancerFile: tab delimited enhancer table. columns 1-3 are read as chrom, start, stop
        and the last two columns as integer enhancer rank and integer super status.
        rows whose first field starts with '#' or 'R' are not treated as enhancers
    transcribedFile: optional table whose second column lists the gene IDs to consider.
        when empty every gene in the start dict is used
    uniqueGenes: if true only the first row per gene name is kept in the gene table
    searchWindow: flank passed to utils.makeSearchLocus when looking for proximal genes
    noFormatTable: if true input rows are kept whole and the enhancer table is not re-sorted

    returns (enhancerToGeneTable, geneToEnhancerTable).
    raises ValueError if the enhancer table has no row that is not a '#' comment

    the header row is the first row not starting with '#', the same rule used by
    mapEnhancerToGeneTop, instead of the hard coded rows 0 and 5
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = list(startDict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID, startDict, 0, 0) for geneID in transcribedGenes]

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # locate the header: first row that is not a '#' comment
    header = None
    for line in enhancerTable:
        if line[0][0] != '#':
            header = line
            break
    if header is None:
        raise ValueError('no header row found in enhancer table %s' % (enhancerFile))

    # set up the output tables
    # first by enhancer
    if noFormatTable:
        enhancerToGeneTable = [header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
    else:
        enhancerToGeneTable = [header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

    # next by gene
    geneToEnhancerTable = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER']]

    for line in enhancerTable:
        # skip comment lines and the header (assumed to start with 'R', e.g. REGION_ID)
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])
        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus, 'both')
        overlappingGenes = utils.uniquify([locus.ID() for locus in overlappingLoci])

        # proximalGenes are transcribed genes where the tss is within searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = utils.uniquify([locus.ID() for locus in proximalLoci])

        # distal genes (tss within 1mb) only take part in the closest gene search
        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = utils.uniquify([locus.ID() for locus in distalLoci])

        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # a gene that overlaps is reported as overlapping only, not also as proximal
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the search window, but we'll let that slide here
        overlappingSet = set(overlappingGenes)
        proximalGenes = [refID for refID in proximalGenes if refID not in overlappingSet]

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            # NOTE: '/' floors on python 2 and is true division on python 3
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            # get the ID and convert to name (first gene wins on ties)
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        overlapNames = ','.join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]))
        proximalNames = ','.join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]))
        if noFormatTable:
            newEnhancerLine = list(line) + [overlapNames, proximalNames, closestGene]
        else:
            newEnhancerLine = line[0:9] + [overlapNames, proximalNames, closestGene] + line[-2:]
        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered table
        for geneGroup, refIDList in (('overlapping', overlappingGenes), ('proximal', proximalGenes)):
            overallGeneList += refIDList
            for refID in refIDList:
                geneDict[geneGroup][refID].append(enhancerString)
                rankDict[refID].append(int(line[-2]))
                superDict[refID].append(int(line[-1]))

    # End loop through

    # Make table by gene
    overallGeneList = utils.uniquify(overallGeneList)

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = set()
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]
        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        geneToEnhancerTable.append([geneName, refID, ','.join(proxEnhancers), enhancerRanks, superStatus])

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, geneToEnhancerTable

    enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
    # i indexes the data rows, so shift by one to step over the header
    sortedTable = [enhancerToGeneTable[0]] + [enhancerToGeneTable[i + 1] for i in enhancerOrder]
    return sortedTable, geneToEnhancerTable
def rank_eboxes(nb_all_chip_dataFile, mycn_gff_path, macsFolder, genomeDirectory, window=100):
    '''
    uses the conserved MYCN sites and ranks eboxes within them by average
    background subtracted signal
    searches 100bp (window variable) from mycn summits

    nb_all_chip_dataFile: data table loaded with pipeline_dfci.loadDataTable and
        mapped with pipeline_dfci.map_regions
    mycn_gff_path: gff of conserved mycn regions; only summits overlapping these are kept
    macsFolder: folder holding <name>/<name>_summits.bed for each MYCN dataset
    genomeDirectory: passed to utils.fetchSeq to pull the region sequence
    window: flank in bp added on both sides of every summit

    writes the merged summit gff into gffFolder and the ranked ebox table into
    tableFolder (both are names defined outside this function) and returns the
    path of the ebox table.

    only CANNTG motifs made of A/C/G/T are counted. motifs containing ambiguous
    bases (e.g. N) are skipped since they cannot be assigned to one of the eboxes
    '''
    window = int(window)

    # bring in the conserved mycn region
    print('making gff of nb mycn summits')
    nb_mycn_gff = utils.parseTable(mycn_gff_path, '\t')
    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff, 50)

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    names_list = sorted(name for name in dataDict.keys() if name.count('MYCN') == 1)

    # first makes a gff of all summits +/- window for all nb mycn datasets
    summit_loci = []
    for name in names_list:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder, name, name)
        summit_bed = utils.parseTable(summit_bed_path, '\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0], int(line[1]) - window, int(line[2]) + window, '.', line[3])
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection = utils.LocusCollection(summit_loci, 50)
    summit_merged_collection = summit_collection.stitchCollection()
    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder, window, window)
    utils.unParseTable(summit_gff, summit_gff_path, '\t')

    # this is borrowed from above and maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile, gffList)

    mycnSignalTable = utils.parseTable(summit_signal_path, '\t')

    # making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN', 'KELLY_MYCN', 'NGP_MYCN', 'SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        # background subtracted signal for each dataset
        line_sig = [float(line[chip_col]) - float(line[background_col])
                    for chip_col, background_col in zip(chip_columns, background_columns)]
        region_id = line[1]
        coords = [int(x) for x in region_id.split(':')[-1].split('-')]
        line_length = coords[1] - coords[0]
        mycn_sig_dict[region_id] = numpy.mean(line_sig) * line_length

    # now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    ebox_list = ['CACGTG', 'CAGTTG', 'CAAGTG', 'CAGGTG', 'CAATTG', 'CAAATG', 'CATCTG', 'CAGCTG', 'CATGTG', 'CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []

    # unambiguous bases only: a motif such as CANNTG has no entry in eboxDict
    ebox_pattern = re.compile('CA[ACGT][ACGT]TG')

    for ticker, line in enumerate(summit_gff):
        if ticker % 1000 == 0:
            print(ticker)

        chrom = line[0]
        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0], line[6], line[3], line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory, chrom, start, end, True)

        # every (non overlapping) motif occurrence counts, not just one per region
        for match in ebox_pattern.finditer(sequenceLine.upper()):
            motif = match.group()
            # ebox_list holds one strand of each motif; fold the other strand onto it
            if motif not in eboxDict:
                motif = utils.revComp(motif)
            eboxDict[motif].append(signal)

    # NOTE(review): an ebox with no occurrences gets numpy.mean([]) == nan here,
    # which also makes its position in the ranking below unreliable
    eboxTable = [[ebox, len(eboxDict[ebox]), numpy.mean(eboxDict[ebox])] for ebox in ebox_list]

    # rank by average height (column 2), highest first
    heightOrder = utils.order([line[2] for line in eboxTable], decreasing=True)

    eboxTableOrdered = [['EBOX', 'OCCURENCES', 'AVG_HEIGHT']]
    for x in heightOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)

    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder, window, window)
    utils.unParseTable(eboxTableOrdered, ebox_outfile, '\t')
    return ebox_outfile
def _liquidateBamAtGFF(bamliquidator_path, gffFile, outputFolder, bamFile, label):
    '''
    runs bamliquidator_batch.py for one bam over gffFile, writing into outputFolder

    label ('rankby' or 'control') only changes the progress message.
    the run counts as successful when the command prints anything to stdout;
    otherwise the interpreter exits with status 1
    '''
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (gffFile, outputFolder, bamFile)
    print("Mapping %s bam %s" % (label, bamFile))
    print(cmd)

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    stdout = process.communicate()[0]
    if len(stdout) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFile))
        # non-zero status so a calling shell/pipeline sees the failure
        sys.exit(1)


def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq
    additionally maps bam signal at the +/- 5kb tss regions of the enhancer
    associated genes and reports, per enhancer, the gene with the highest signal

    rankByBamFile: bam mapped with bamliquidator_batch.py to get tss signal
    controlBamFile: optional control bam ('' for none), mapped the same way
    genome: genome name, only used to name the intermediate gff
    annotFile: annotation passed to utils.makeStartDict and utils.makeTranscriptCollection
    enhancerFile: tab delimited enhancer table. columns 1-3 are read as chrom, start, stop
        and the last two columns as integer enhancer rank and integer super status.
        rows whose first field starts with '#' or 'R' are not treated as enhancers
    transcribedFile: optional table whose second column lists the gene IDs to consider
    uniqueGenes: if true only the first row per gene name is kept in the gene table
    searchWindow: flank passed to utils.makeSearchLocus when looking for proximal genes
    noFormatTable: if true input rows are kept whole and the tables are not re-sorted

    returns (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable).
    raises ValueError if the table has no header row or bamliquidator_batch.py is missing.
    exits with status 1 if mapping fails or produces no output
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = list(startDict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID, startDict, 0, 0) for geneID in transcribedGenes]

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the header: first row that is not a '#' comment
    header = None
    for line in enhancerTable:
        if line[0][0] != '#':
            header = line
            break
    if header is None:
        raise ValueError('no header row found in enhancer table %s' % (enhancerFile))

    # set up the output tables
    # first by enhancer
    if noFormatTable:
        enhancerToGeneTable = [header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
    else:
        enhancerToGeneTable = [header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

    # next by gene
    geneToEnhancerTable = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and the header (assumed to start with 'R', e.g. REGION_ID)
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])
        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus, 'both')
        overlappingGenes = utils.uniquify([locus.ID() for locus in overlappingLoci])

        # proximalGenes are transcribed genes where the tss is within searchWindow of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = utils.uniquify([locus.ID() for locus in proximalLoci])

        # distal genes (tss within 1mb) only take part in the closest gene search
        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = utils.uniquify([locus.ID() for locus in distalLoci])

        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # a gene that overlaps is reported as overlapping only, not also as proximal
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the search window, but we'll let that slide here
        overlappingSet = set(overlappingGenes)
        proximalGenes = [refID for refID in proximalGenes if refID not in overlappingSet]

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            # NOTE: '/' floors on python 2 and is true division on python 3
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            # get the ID and convert to name (first gene wins on ties)
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        overlapNames = ','.join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]))
        proximalNames = ','.join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]))
        if noFormatTable:
            newEnhancerLine = list(line) + [overlapNames, proximalNames, closestGene]
        else:
            newEnhancerLine = line[0:9] + [overlapNames, proximalNames, closestGene] + line[-2:]
        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered table
        for geneGroup, refIDList in (('overlapping', overlappingGenes), ('proximal', proximalGenes)):
            overallGeneList += refIDList
            for refID in refIDList:
                geneDict[geneGroup][refID].append(enhancerString)
                rankDict[refID].append(int(line[-2]))
                superDict[refID].append(int(line[-1]))

    # End loop through

    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(annotFile, 5000, 5000, 500, overallGeneList)
    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName, gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidator_batch.py script on cluster, otherwise fall back
    # to a local copy, otherwise fail.
    # NOTE(review): the fallback is a relative path, so os.path.isfile looks in the
    # current working directory, not on PATH as the error message suggests
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName, gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder, enhancerName, gffRootName, bamName)
    _liquidateBamAtGFF(bamliquidator_path, enhancerGeneGFFFile, mappedRankByFolder, rankByBamFile, 'rankby')

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName, gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder, enhancerName, gffRootName, controlName)
        _liquidateBamAtGFF(bamliquidator_path, enhancerGeneGFFFile, mappedControlFolder, controlBamFile, 'control')

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = set()

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]
        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        geneToEnhancerTable.append([geneName, refID, ','.join(proxEnhancers), enhancerRanks, superStatus, enhancerSignal])

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + ['TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:
        if noFormatTable:
            # last three columns are overlap, proximal, closest
            geneList = line[-3].split(',') + line[-2].split(',')
        else:
            # NOTE(review): rows are built as line[0:9] + [overlap, proximal, closest] + ...,
            # so for inputs with 9+ columns these are the proximal and closest columns,
            # not overlap and proximal as in the noFormatTable branch - confirm intent
            geneList = line[10].split(',') + line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])

        maxGene = 'NONE'
        maxSig = 'NONE'
        if len(geneList) > 0:
            try:
                # max() raises ValueError for a gene name with no recorded tss signal
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                # a top signal of exactly zero is reported as no top gene
                if sigVector[maxIndex] != 0.0:
                    maxGene = geneList[maxIndex]
                    maxSig = sigVector[maxIndex]
            except ValueError:
                # with a single candidate still report it, just without a signal
                if len(geneList) == 1:
                    maxGene = geneList[0]

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable

    enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
    sortedTable = [enhancerToGeneTable[0]]
    sortedTopGeneTable = [enhancerToTopGeneTable[0]]
    for i in enhancerOrder:
        # i indexes the data rows, so shift by one to step over the header
        sortedTable.append(enhancerToGeneTable[(i + 1)])
        sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

    return sortedTable, sortedTopGeneTable, geneToEnhancerTable