Example #1
0
def twilio():
    body = request.form['Body'].split(' ')
    resp = twiml.Response()
    myText = "unknown command. Please enter: <buy/sell TICKER #OFSHARES> to make a trade, or enter <status> to see the current portfolio value."
    if len(body) != 3:
        if body[0].lower() == "status":
            myText = str(utils.get_portfolio_val(g))
        resp.message(myText)
        return str(resp)
    order, sym, value = body
    myText = "Nice! You put in an order to " + order.lower() + " " + value + " shares of " + sym.upper()
    resp.message(myText)
    utils.order(order, sym, value, g)
    return str(resp)
Example #2
0
def promptio():
    myJson = request.get_json()
    myText = "enter <@stock_00017 status> to check portfolio value. enter <@stock_00017 buy/sell TICKER #OFSHARES> to make a trade."
    body = myJson['message'].split(" ")
    if body[0].lower() == "status":
        myText = str(utils.get_portfolio_val(g))
    if len(body) == 3:
        order, sym, value = body
        utils.order(order, sym, value, g)
        myText = "Nice! You put in an order to " + order.lower() + " " + value + " shares of " + sym.upper()
    dat = jsonify(sendmms=False, showauthurl=False, authstate=None,
                  text=myText, speech=myText, status="OK", webhookreply=None,
                  images=[{"imageurl": None, "alttext": None}])  # insert json responses here
    resp = make_response(dat, 200, {"Content-Type": "application/json"})
    return resp
Example #3
0
def twilio():
    body = request.form['Body'].split(' ')
    resp = twiml.Response()
    myText = "unknown command. Please enter: <buy/sell TICKER #OFSHARES> to make a trade, or enter <status> to see the current portfolio value."
    if len(body) != 3:
        if body[0].lower() == "status":
            myText = str(utils.get_portfolio_val(g))
        resp.message(myText)
        return str(resp)
    order, sym, value = body
    myText = "Nice! You put in an order to " + order.lower(
    ) + " " + value + " shares of " + sym.upper()
    resp.message(myText)
    utils.order(order, sym, value, g)
    return str(resp)
Example #4
0
    def visit_character(self, node, children) -> Any:
        ident = children[0]['top']
        char = children[0]['nodes'][ident]['label']
        if char == '" "':
            char = ' '

        return order(char)
Example #5
0
def buy():
    order = request.form['order']
    sym = request.form['sym']
    val = request.form['val']
    if utils.valid_order(order, sym, val):
        r = utils.order(order, sym, val, g)
        return str(r)
    return "not valid"
Example #6
0
def buy():
    order = request.form['order']
    sym = request.form['sym']
    val = request.form['val']
    if utils.valid_order(order, sym, val):
        r = utils.order(order, sym, val, g)
        return str(r)
    return "not valid"
Example #7
0
def solve(G, s):
    """
    Args:
        G: networkx.Graph
        s: stress_budget
    Returns:
        D: Dictionary mapping for student to breakout room r e.g. {0:2, 1:0, 2:1, 3:2}
        k: Number of breakout rooms
    """
    n = G.order()
    happiness = {}
    stress = {}
    for i in range(n):
        happiness[i] = {}
        stress[i] = {}

    for i in range(n):
        for j in range(i + 1, n):
            happiness[i][j] = G.get_edge_data(i, j)['happiness']
            stress[i][j] = G.get_edge_data(i, j)['stress']

    print("Input Size:", n)
    if n <= 10:  #brute force approach
        start_time = time.perf_counter()
        bf_arr, bf_val = bruteforce.bruteforce(happiness, stress,
                                               len(list(happiness.keys())), s)
        bf_val = round(bf_val, 3)
        end_time = time.perf_counter()
        print("Brute Force Approach Time:", end_time - start_time)

    #ILP
    start_time = time.perf_counter()
    answer, rooms, best_k = lp.lp_solve(happiness, stress, s, n)
    end_time = time.perf_counter()
    times.append(end_time - start_time)
    print("Gurobi Approach Time:     ", end_time - start_time)
    print("Gurobi Answer:            ", answer)
    #print("Gurobi Rooms (raw):       ", rooms, best_k)
    print("Gurobi Rooms:", order(rooms)[0])
    if n <= 10:
        #assert bf_val == answer, "Incorrect computation"
        print("Brutef Rooms:", bf_arr)
        print("Brutef Answer:", bf_val)
    return rooms, order(rooms)[1]
Example #8
0
def mergeCollections(nameDict, analysisName, output='', superOnly=True):
    '''
    merges them collections
    '''

    allLoci = []
    namesList = nameDict.keys()
    for name in namesList:

        seCollection = makeSECollection(nameDict[name]['enhancerFile'], name,
                                        superOnly)
        if superOnly:
            print "DATASET: %s HAS %s SUPERENHANCERS" % (name,
                                                         len(seCollection))
        else:
            print "DATASET: %s HAS %s ENHANCERS" % (name, len(seCollection))
        allLoci += seCollection.getLoci()

    print len(allLoci)

    mergedCollection = utils.LocusCollection(allLoci, 50)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()

    stitchedLoci = stitchedCollection.getLoci()
    print "IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci))
    #sort by size and provide a unique ID

    sizeList = [locus.len() for locus in stitchedLoci]

    sizeOrder = utils.order(sizeList, decreasing=True)

    orderedLoci = [stitchedLoci[i] for i in sizeOrder]

    for i in range(len(orderedLoci)):
        orderedLoci[i]._ID = 'merged_%s_%s' % (analysisName, str(i + 1))

    mergedGFF = []
    for locus in orderedLoci:
        newLine = [
            locus.chr(),
            locus.ID(), '',
            locus.start(),
            locus.end(), '',
            locus.sense(), '',
            locus.ID()
        ]
        mergedGFF.append(newLine)

    if len(output) == 0:
        return mergedGFF
    else:
        print "writing merged gff to %s" % (output)
        utils.unParseTable(mergedGFF, output, '\t')
        return output
Example #9
0
def mergeCollections(nameDict,analysisName,output='',superOnly=True):

    '''
    merges them collections
    '''

    allLoci = []
    namesList = nameDict.keys()
    for name in namesList:
        
        seCollection =makeSECollection(nameDict[name]['enhancerFile'],name,superOnly)
        if superOnly:
            print "DATASET: %s HAS %s SUPERENHANCERS" % (name,len(seCollection))
        else:
            print "DATASET: %s HAS %s ENHANCERS" % (name,len(seCollection))
        allLoci += seCollection.getLoci()

    print len(allLoci)


    mergedCollection = utils.LocusCollection(allLoci,50)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()

    stitchedLoci = stitchedCollection.getLoci()
    print "IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci))
    #sort by size and provide a unique ID

    sizeList = [locus.len() for locus in stitchedLoci]

    sizeOrder = utils.order(sizeList,decreasing=True)
    
    orderedLoci = [stitchedLoci[i] for i in sizeOrder]

    for i in range(len(orderedLoci)):
        orderedLoci[i]._ID = 'merged_%s_%s' % (analysisName,str(i+1))

    mergedGFF = []
    for locus in orderedLoci:
        newLine = [locus.chr(),locus.ID(),'',locus.start(),locus.end(),'',locus.sense(),'',locus.ID()]
        mergedGFF.append(newLine)


    if len(output) == 0:
        return mergedGFF
    else:
        print "writing merged gff to %s" % (output)
        utils.unParseTable(mergedGFF,output,'\t')
        return output
Example #10
0
def promptio():
    myJson = request.get_json()
    myText = "enter <@stock_00017 status> to check portfolio value. enter <@stock_00017 buy/sell TICKER #OFSHARES> to make a trade."
    body = myJson['message'].split(" ")
    if body[0].lower() == "status":
        myText = str(utils.get_portfolio_val(g))
    if len(body) == 3:
        order, sym, value = body
        utils.order(order, sym, value, g)
        myText = "Nice! You put in an order to " + order.lower(
        ) + " " + value + " shares of " + sym.upper()
    dat = jsonify(sendmms=False,
                  showauthurl=False,
                  authstate=None,
                  text=myText,
                  speech=myText,
                  status="OK",
                  webhookreply=None,
                  images=[{
                      "imageurl": None,
                      "alttext": None
                  }])  # insert json responses here
    resp = make_response(dat, 200, {"Content-Type": "application/json"})
    return resp
Example #11
0
    def __init__(self, conf_file=None, environment=None):
        self.log = utils._setupLogging()
        self.containers = {}

        if environment:
            self.load(environment)
        else:
            if not conf_file.startswith('/'):
                conf_file = os.path.join(os.path.dirname(sys.argv[0]),
                                         conf_file)

            data = open(conf_file, 'r')
            self.config = yaml.load(data)

        # On load order containers into the proper startup sequence
        self.start_order = utils.order(self.config['containers'])
Example #12
0
  def __init__(self, conf_file=None, environment=None):
    self.log = utils.setupLogging()
    self.containers = {}
    self.templates = {}
    self.state = 'live'

    if environment:
      self.load(environment)      
    else:
      # If we didn't get an absolute path to a file, look for it in the current directory.
      if not conf_file.startswith('/'):
        conf_file = os.path.join(os.path.dirname(sys.argv[0]), conf_file)

      data = open(conf_file, 'r')
      self.config = yaml.load(data)
      
    # On load, order templates into the proper startup sequence      
    self.start_order = utils.order(self.config['templates'])
Example #13
0
def search():
    body = request.get_json()

    query_subs: List[Subreddit] = None
    try:
        query_subs = solr.validate_query(body["subreddits"])
    except Exception as e:
        traceback.print_exc()
        return jsonify({"message": str(e)}), 400
    query_words, query_words_lmtzd = utils.analyze_queries(query_subs)

    doc_subs: List[Subreddit] = None
    doc_subs = solr.get_docs(query_words)
    docs_lmtzd = utils.analyze_documents(doc_subs)

    result = utils.order(query_words_lmtzd, docs_lmtzd, doc_subs)

    return jsonify({"result": list(result)})
Example #14
0
def makeRigerTable(foldTableFile, output=''):
    '''
    blah
    '''

    #need a table of this format
    rigerTable = [[
        'Construct', 'GeneSymbol', 'NormalizedScore', 'Construct Rank',
        'HairpinWeight'
    ]]
    #set weight to 1 for now

    foldTable = utils.parseTable(foldTableFile, '\t')

    constructOrder = utils.order([float(line[2]) for line in foldTable[1:]],
                                 decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in foldTable[1:]:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        rowIndex = i + 1  # accounts for the header
        geneName = foldTable[rowIndex][1]
        if geneCountDict[geneName] == 1:
            print(
                "Gene %s only has one guide RNA. Excluding from FRIGER analysis"
                % (geneName))
            continue

        newLine = foldTable[rowIndex][0:3] + [constructRank, 1]
        rigerTable.append(newLine)
        constructRank += 1

    if len(output) == 0:
        output = string.replace(foldTableFile, '_log2Ratio.txt', '_friger.txt')

    utils.unParseTable(rigerTable, output, '\t')

    return output
Example #15
0
  def load(self, filename='envrionment.yml'):
    self.log.info('Loading environment from: %s', filename)      
    
    with open(filename, 'r') as input_file:
      self.config = yaml.load(input_file)

      self.state = self.config['state']
      
      for tmpl in self.config['templates']:
        # TODO fix hardcoded service name and version
        self.templates[tmpl] = template.Template(tmpl, self.config['templates'][tmpl], 'service', '0.1')
        self.containers[tmpl] = {}

      self.start_order = utils.order(self.config['templates'])
      for container in self.config['containers']:
        tmpl = self.config['containers'][container]['template']
      
        self.containers[tmpl][container] = Container(container, self.config['containers'][container], 
          self.config['templates'][tmpl]['config'])
def make_T_top_regions(signal_table_path, top=1000):
    '''
    makes a gff and fasta of the top N T regions based off the signal table
    '''

    signal_table = utils.parseTable(signal_table_path, '\t')

    signal_dict = defaultdict(float)
    for line in signal_table[1:]:

        signal = (max(float(line[2]) - float(line[3]), 0) +
                  max(float(line[4]) - float(line[5]), 0)) / 2
        signal_dict[line[1]] = signal

    signal_vector = [signal_dict[line[1]] for line in signal_table[1:]]
    signal_order = utils.order(signal_vector, decreasing=True)

    t_top_gff_path = '%sCH22_T_UNION_TOP_%s_-0_+0.gff' % (gffFolder, str(top))
    print(t_top_gff_path)
    t_top_gff = []
    for i in range(top):
        signal_row = signal_order[i] + 1
        line = signal_table[signal_row]
        region_id = line[1]
        chrom = region_id.split('(')[0]
        coords = region_id.split(':')[-1].split('-')
        gff_line = [
            chrom, region_id, '', coords[0], coords[1], '', '.', '', region_id
        ]
        t_top_gff.append(gff_line)

    utils.unParseTable(t_top_gff, t_top_gff_path, '\t')

    t_top_fasta = utils.gffToFasta('HG19', genomeDirectory, t_top_gff)
    t_top_fasta_path = '%sHG19_CH22_T_UNION_TOP_%s_-0_+0.fasta' % (fastaFolder,
                                                                   top)
    utils.unParseTable(t_top_fasta, t_top_fasta_path, '')

    return t_top_fasta_path
Example #17
0
def makeRigerTable(foldTableFile,output=''):

    '''
    blah
    '''

    #need a table of this format
    rigerTable = [['Construct','GeneSymbol','NormalizedScore','Construct Rank','HairpinWeight']]
    #set weight to 1 for now

    foldTable = utils.parseTable(foldTableFile,'\t')

    constructOrder = utils.order([float(line[2]) for line in foldTable[1:]],decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict= defaultdict(int)
    for line in foldTable[1:]:
        geneCountDict[line[1]] +=1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        rowIndex = i+1 # accounts for the header
        geneName = foldTable[rowIndex][1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        newLine = foldTable[rowIndex][0:3] + [constructRank,1]
        rigerTable.append(newLine)
        constructRank += 1

    if len(output) == 0:
        output = string.replace(foldTableFile,'_log2Ratio.txt','_friger.txt')
    
    utils.unParseTable(rigerTable,output,'\t')

    return output
Example #18
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    #get the chromLists from the various bams here
    cmd = 'samtools idxstats %s' % (rankByBamFile)
    idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
    idxStats= idxStats.communicate()
    bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
    
    if len(controlBamFile) > 0:
        cmd = 'samtools idxstats %s' % (controlBamFile)
        idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
        idxStats= idxStats.communicate()
        bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
        bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0]



    #now make sure no genes have a bad chrom 
    overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0]

    
    #now make an enhancer collection of all transcripts    
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch'


    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    os.system(cmd)

    #check for completion
    if utils.checkOutput(mappedRankByFile,0.2,5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        os.system(cmd)

        #check for completion
        if utils.checkOutput(mappedControlFile,0.2,5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Example #19
0
def mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, output, refName):
    '''
    makes a table of factor density in a stitched locus and ranks table by number of loci stitched together
    '''

    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE'
    ]]

    lociLenList = []

    # strip out any that are in chrY
    for locus in list(loci):
        if locus.chr() == 'chrY':
            loci.remove(locus)

    for locus in loci:
        # numLociList.append(int(stitchLocus.ID().split('_')[1]))
        lociLenList.append(locus.len())
        # numOrder = order(numLociList,decreasing=True)
    lenOrder = utils.order(lociLenList, decreasing=True)
    ticker = 0
    for i in lenOrder:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        refOverlappingLoci = referenceCollection.getOverlap(locus, 'both')
        for refLocus in refOverlappingLoci:
            refEnrichSize += refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1
        coords = [int(x) for x in locus.coords()]

        locusTable.append([
            locus.ID(),
            locus.chr(),
            min(coords),
            max(coords), stitchCount, refEnrichSize
        ])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR  %s' % bamFile)
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        print('OPENING %s%s_%s_MAPPED/matrix.txt' %
              (mappedFolder, refName, bamFileName))

        mappedGFF = utils.parseTable(
            '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName),
            '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signalDict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        for i in range(1, len(locusTable)):
            signal = 0.0
            line = locusTable[i]
            lineLocus = utils.Locus(line[1], line[2], line[3], '.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus,
                                                             sense='both')
            for region in overlappingRegions:
                signal += signalDict[region.ID()]
            locusTable[i].append(signal)

    utils.unParseTable(locusTable, output, '\t')
Example #20
0
    def visit_character_set(self, node, children) -> Any:
        negated = children[0] == '^'
        if negated:
            children = children[1:]

        classes, values = set(), set()
        if children[0] in ['-', ']']:
            values.add(ord(children[0]))
            children = children[1:]
        for child in children:
            if isinstance(child, dict):
                ident = child['top']
                label = child['nodes'][ident]['label']
                try:
                    values.update(order(label))
                except TypeError:
                    classes.add(label)
            else:
                values.update(child)

        graph = add_node(
            f"{'^' if negated else ''}charset",
            font=Font.ITALIC,
            shape=Shape.TRAPEZIUM,
            color=NEGATED if negated else None,
        )
        source = graph['top']

        for class_ in sorted(classes):
            child = add_node(class_, shape=Shape.BOX, style=Style.FILLED)
            graph = merge(graph, child)
            graph = add_edge(source, child['top'], graph)

        for group, symbol in [
            (BUT_SPACE, '\\S'),
            (BUT_DIGIT, '\\D'),
            (BUT_WORD, '\\W'),
            (WORD, '\\w'),
            (DIGIT, '\\d'),
            (SPACE, '\\s'),
        ]:
            if group in values:
                child = add_node(symbol, shape=Shape.BOX, style=Style.FILLED)
                graph = merge(graph, child)
                graph = add_edge(source, child['top'], graph)
                values -= group

        start, last = None, None
        for value in sorted(values):
            if last is None:
                start, last = value, value
            elif value == last + 1:
                last = value
            else:
                label = normal(
                    start
                ) if last == start else f"{normal(start)}-{normal(last)}"
                child = add_node(label, shape=Shape.BOX)
                graph = merge(graph, child)
                graph = add_edge(source, child['top'], graph)
                start, last = None, None

        if last is not None:
            label = normal(
                start) if last == start else f"{normal(start)}-{normal(last)}"
            child = add_node(label, shape=Shape.BOX)
            graph = merge(graph, child)
            graph = add_edge(source, child['top'], graph)

        return graph
Example #21
0
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    makes a table of factor density in a stitched locus and ranks table by number of loci stitched together
    '''

    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    lociLenList = []

    # strip out any that are in chrY
    for locus in list(loci):
        if locus.chr() == 'chrY':
            loci.remove(locus)

    for locus in loci:
        # numLociList.append(int(stitchLocus.ID().split('_')[1]))
        lociLenList.append(locus.len())
        # numOrder = order(numLociList,decreasing=True)
    lenOrder = utils.order(lociLenList, decreasing=True)
    ticker = 0
    for i in lenOrder:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        refOverlappingLoci = referenceCollection.getOverlap(locus, 'both')
        for refLocus in refOverlappingLoci:
            refEnrichSize += refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1
        coords = [int(x) for x in locus.coords()]

        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR  %s' % bamFile)
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        print('OPENING %s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName))

        mappedGFF = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName), '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signalDict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        for i in range(1, len(locusTable)):
            signal = 0.0
            line = locusTable[i]
            lineLocus = utils.Locus(line[1], line[2], line[3], '.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus, sense='both')
            for region in overlappingRegions:
                signal += signalDict[region.ID()]
            locusTable[i].append(signal)

    utils.unParseTable(locusTable, output, '\t')
Example #22
0
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False):
    
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    #internal parameter for debugging
    byRefseq = False


    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile,'\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)


    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0))


    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci,50)

    

    geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)}

    #dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict= defaultdict(list)

    #list of all genes that appear in this analysis
    overallGeneList = []

    if noFormatTable:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']]

        
    else:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]]

        #next by gene
        geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']]

    #next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]

        


    for line in enhancerTable:
        if line[0][0] =='#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1],line[2],line[3])
        
        enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus         
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both')           
        overlappingGenes =[]
        for overlapLocus in overlappingLoci:                
            overlappingGenes.append(overlapLocus.ID())

        #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both')           
        proximalGenes =[]
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())


        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both')           
        distalGenes =[]
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

            
            
        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        #these checks make sure each gene list is unique.
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)


        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3]))/2

            #get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            #get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        #Now grab all overlapping and proximal genes for the gene ordered table

        overallGeneList +=overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))
            
        overallGeneList+=proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))



    #End loop through
    
    #Make table by gene
    overallGeneList = utils.uniquify(overallGeneList)  

    #use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])
        
    usedNames = []
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:

            continue
        else:
            usedNames.append(geneName)
        
        proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID]
        
        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]],',')
    
        newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus]
        geneToEnhancerTable.append(newLine)

    #resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable,geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i+1)])

        return sortedTable,geneToEnhancerTable
Example #23
0
def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100):

    '''
    uses the  conserved MYCN sites and ranks eboxes within them
    by average background subtracted signal
    searches 100bp (window variable)  from mycn summits
    '''
    
    window = int(window)

    #bring in the conserved mycn region
    print('making gff of nb mycn summits')
    nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t')

    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50)

    dataDict =pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    names_list = [name for name in dataDict.keys() if name.count('MYCN') == 1]
    names_list.sort()

    summit_loci = []
    #first makes a gff of all summits +/- 100bp for all nb mycn datasets
    for name in names_list:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        summit_bed = utils.parseTable(summit_bed_path,'\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3])
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection =utils.LocusCollection(summit_loci,50)
    summit_merged_collection = summit_collection.stitchCollection()
    
    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window)
    utils.unParseTable(summit_gff,summit_gff_path,'\t')

    #this is borrowed from above and maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList)


    mycnSignalTable = utils.parseTable(summit_signal_path,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]
    
    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        region_id = line[1]
        coords = [int(x) for x in line[1].split(':')[-1].split('-')]
        line_length = coords[1]-coords[0]
        mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length

    #now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []
    ticker = 0
    for line in summit_gff:
        if ticker % 1000 == 0:
            print(ticker)
        ticker+=1

        chrom = line[0]
        sense = '.'

        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True)
        
        motifVector = []
        matches = re.finditer('CA..TG',str.upper(sequenceLine))
        if matches:
            for match in matches:
                motifVector.append(match.group())
        
        #count only 1 of each motif type per line
        #motifVector = utils.uniquify(motifVector)
        for motif in motifVector:
            if ebox_list.count(motif) > 0:
                eboxDict[motif].append(signal)
            else:
                eboxDict[utils.revComp(motif)].append(signal)


    eboxTable =[]
    eboxTableOrdered =[['EBOX','OCCURENCES','AVG_HEIGHT']]
    for ebox in eboxDict.keys():
        newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])]
        eboxTable.append(newLine)


    occurenceOrder = utils.order([line[2] for line in eboxTable],decreasing=True)
    
    for x in occurenceOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)
    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window)
    utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t')
    return ebox_outfile
Example #24
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)

    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Example #25
0
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False):
    
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    #internal parameter for debugging
    byRefseq = False


    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile,'\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)


    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0))


    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci,50)

    

    geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)}

    #dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict= defaultdict(list)

    #list of all genes that appear in this analysis
    overallGeneList = []

    if noFormatTable:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']]

        
    else:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]]

        #next by gene
        geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']]

    #next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]

        


    for line in enhancerTable:
        if line[0][0] =='#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1],line[2],line[3])
        
        enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus         
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both')           
        overlappingGenes =[]
        for overlapLocus in overlappingLoci:                
            overlappingGenes.append(overlapLocus.ID())

        #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both')           
        proximalGenes =[]
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())


        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both')           
        distalGenes =[]
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

            
            
        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        #these checks make sure each gene list is unique.
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)


        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3]))/2

            #get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            #get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        #Now grab all overlapping and proximal genes for the gene ordered table

        overallGeneList +=overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))
            
        overallGeneList+=proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))



    #End loop through
    
    #Make table by gene
    overallGeneList = utils.uniquify(overallGeneList)  

    #use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])
        
    usedNames = []
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:

            continue
        else:
            usedNames.append(geneName)
        
        proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID]
        
        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]],',')
    
        newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus]
        geneToEnhancerTable.append(newLine)

    #resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable,geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i+1)])

        return sortedTable,geneToEnhancerTable