def computeROC(positivef, negativef):
    """Compute the ROC curve and its AUC from two score files.

    Both files are loaded with mData.r2Col. Every key found in positivef is
    treated as a positive sample; every key that only appears after
    negativef has been merged in is treated as a negative sample. Samples
    are ranked from highest to lowest value and the curve is traced one
    sample at a time.

    Parameters:
        positivef: file handed to mData.r2Col holding the positive samples.
        negativef: file handed to mData.r2Col holding the negative samples.

    Returns:
        (AUC, x, y) where x holds the false positive rates and y the true
        positive rates, both starting at 0.0, and AUC is the trapezoidal
        area under those points.

    Raises:
        ZeroDivisionError: if exactly one of the two classes is empty, since
            the corresponding rate is undefined.
    """
    allData = mData.r2Col(positivef)
    posSamples = set(allData.keys())
    allData = mData.r2Col(negativef, appendData=allData)
    negSamples = set(allData.keys()) - posSamples
    nPos = len(posSamples)
    nNeg = len(negSamples)

    # Fail with an explanation instead of a bare "float division by zero".
    # With no samples at all the loop below never runs and the degenerate
    # curve (0.0, [0.0], [0.0]) is returned.
    if (nPos == 0) != (nNeg == 0):
        raise ZeroDivisionError(
            "computeROC needs at least one positive and one negative sample "
            "(got %d positive, %d negative)" % (nPos, nNeg))

    # Rank by descending value. A key-based sort with reverse=True is stable,
    # so tied samples keep their dictionary order exactly as the old
    # cmp-based sort did.
    # NOTE(review): values are compared exactly as mData.r2Col returns them;
    # if those are strings the ranking is lexicographic, not numeric - confirm.
    ranked = sorted(allData.keys(), key=lambda s: allData[s], reverse=True)

    x = [0.0]
    y = [0.0]
    TP = 0
    FP = 0
    # NOTE(review): tied values are stepped one sample at a time, so the
    # curve (and AUC) for tied scores depends on the order of the ties.
    for sample in ranked:
        if sample in posSamples:
            TP += 1
        else:
            FP += 1
        x.append(float(FP) / nNeg)
        y.append(float(TP) / nPos)

    # Trapezoidal rule over consecutive curve points.
    AUC = 0.0
    for i in range(len(x) - 1):
        AUC += (x[i + 1] - x[i]) * ((y[i + 1] + y[i]) / 2)
    return (AUC, x, y)
def computeROC(positivef, negativef):
    """Compute the ROC curve and its AUC from two score files.

    Both files are loaded with mData.r2Col. Every key found in positivef is
    treated as a positive sample; every key that only appears after
    negativef has been merged in is treated as a negative sample. Samples
    are ranked from highest to lowest value and the curve is traced one
    sample at a time.

    Parameters:
        positivef: file handed to mData.r2Col holding the positive samples.
        negativef: file handed to mData.r2Col holding the negative samples.

    Returns:
        (AUC, x, y) where x holds the false positive rates and y the true
        positive rates, both starting at 0.0, and AUC is the trapezoidal
        area under those points.

    Raises:
        ZeroDivisionError: if exactly one of the two classes is empty, since
            the corresponding rate is undefined.
    """
    allData = mData.r2Col(positivef)
    posSamples = set(allData.keys())
    allData = mData.r2Col(negativef, appendData=allData)
    negSamples = set(allData.keys()) - posSamples
    nPos = len(posSamples)
    nNeg = len(negSamples)

    # Fail with an explanation instead of a bare "float division by zero".
    # With no samples at all the loop below never runs and the degenerate
    # curve (0.0, [0.0], [0.0]) is returned.
    if (nPos == 0) != (nNeg == 0):
        raise ZeroDivisionError(
            "computeROC needs at least one positive and one negative sample "
            "(got %d positive, %d negative)" % (nPos, nNeg))

    # Rank by descending value. A key-based sort with reverse=True is stable,
    # so tied samples keep their dictionary order exactly as the old
    # cmp-based sort did.
    # NOTE(review): values are compared exactly as mData.r2Col returns them;
    # if those are strings the ranking is lexicographic, not numeric - confirm.
    ranked = sorted(allData.keys(), key=lambda s: allData[s], reverse=True)

    x = [0.0]
    y = [0.0]
    TP = 0
    FP = 0
    # NOTE(review): tied values are stepped one sample at a time, so the
    # curve (and AUC) for tied scores depends on the order of the ties.
    for sample in ranked:
        if sample in posSamples:
            TP += 1
        else:
            FP += 1
        x.append(float(FP) / nNeg)
        y.append(float(TP) / nPos)

    # Trapezoidal rule over consecutive curve points.
    AUC = 0.0
    for i in range(len(x) - 1):
        AUC += (x[i + 1] - x[i]) * ((y[i + 1] + y[i]) / 2)
    return (AUC, x, y)
def rSIF(inf, typef=None, reverse=False):
    """Read a tab-delimited .sif file into node and interaction dictionaries.

    Each non-blank line must provide at least three tab-separated fields:
    source node, interaction, target node. Whitespace around the tabs is
    discarded. Fields after the third are ignored.

    Parameters:
        inf: path of the .sif file to read.
        typef: optional node-type file, loaded with
            mData.r2Col(typef, delim=" = ", header=True). Nodes that are not
            listed there (or all nodes when typef is None) get the type
            "concept".
        reverse: when true, interactions are stored keyed by target and then
            source instead of source and then target.

    Returns:
        (inNodes, inInteractions) where inNodes maps node -> type and
        inInteractions maps A -> {B: interaction}. Repeated A/B pairs have
        their interactions joined with ";" in file order.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    nodeMap = dict()
    if typef is not None:
        nodeMap = mData.r2Col(typef, delim=" = ", header=True)

    inNodes = dict()         # Dictionary with (A : type)
    inInteractions = dict()  # Dictionary with (A : (B : interaction))

    # The context manager closes the file even when a malformed line raises.
    with open(inf, "r") as f:
        for line in f:
            if line.isspace():
                continue
            pline = re.split(r"\s*\t\s*", line.rstrip("\r\n"))
            source, interaction, target = pline[0], pline[1], pline[2]

            # Register both endpoints; the first sighting fixes the type.
            for node in (source, target):
                if node not in inNodes:
                    inNodes[node] = nodeMap[node] if node in nodeMap else "concept"

            # Only the direction of the stored edge depends on `reverse`.
            if reverse:
                source, target = target, source
            if source not in inInteractions:
                inInteractions[source] = dict()
            if target not in inInteractions[source]:
                inInteractions[source][target] = interaction
            else:
                inInteractions[source][target] += ";" + interaction
    return (inNodes, inInteractions)
def rSIF(inf, typef=None, reverse=False):
    """Read a tab-delimited .sif file into node and interaction dictionaries.

    Each non-blank line must provide at least three tab-separated fields:
    source node, interaction, target node. Whitespace around the tabs is
    discarded. Fields after the third are ignored.

    Parameters:
        inf: path of the .sif file to read.
        typef: optional node-type file, loaded with
            mData.r2Col(typef, delim=" = ", header=True). Nodes that are not
            listed there (or all nodes when typef is None) get the type
            "concept".
        reverse: when true, interactions are stored keyed by target and then
            source instead of source and then target.

    Returns:
        (inNodes, inInteractions) where inNodes maps node -> type and
        inInteractions maps A -> {B: interaction}. Repeated A/B pairs have
        their interactions joined with ";" in file order.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    nodeMap = dict()
    if typef is not None:
        nodeMap = mData.r2Col(typef, delim=" = ", header=True)

    inNodes = dict()         # Dictionary with (A : type)
    inInteractions = dict()  # Dictionary with (A : (B : interaction))

    # The context manager closes the file even when a malformed line raises.
    with open(inf, "r") as f:
        for line in f:
            if line.isspace():
                continue
            pline = re.split(r"\s*\t\s*", line.rstrip("\r\n"))
            source, interaction, target = pline[0], pline[1], pline[2]

            # Register both endpoints; the first sighting fixes the type.
            for node in (source, target):
                if node not in inNodes:
                    inNodes[node] = nodeMap[node] if node in nodeMap else "concept"

            # Only the direction of the stored edge depends on `reverse`.
            if reverse:
                source, target = target, source
            if source not in inInteractions:
                inInteractions[source] = dict()
            if target not in inInteractions[source]:
                inInteractions[source][target] = interaction
            else:
                inInteractions[source][target] += ";" + interaction
    return (inNodes, inInteractions)
minCol = rgb(255, 0, 0) log("Color: meth\n") elif circleFiles[i].startswith("mut."): maxCol = rgb(0, 0, 0) minCol = rgb(255, 255, 255) log("Color: mut\n") circleColors.append( (minCol, zerCol, maxCol) ) if sampleFile == None: samples = list(set(cols) | set(samples)) if featureFile == None: features = list(set(rows) | set(features)) ## read centerFile centerData = None if centerFile != None: centerData = mData.r2Col(centerFile, header = True) ## sort if orderFeature != None: if len(orderFiles) > 0: orderData = [] orderColors = [] for i in range(len(orderFiles)): orderData.append(mData.rCRSData(orderFiles[i])) minCol = rgb(255, 255, 255) zerCol = rgb(255, 255, 255) maxCol = rgb(0, 0, 0) orderColors.append( (minCol, zerCol, maxCol) ) else: orderData = circleData samples.sort(lambda x, y: scmp(x, y, orderFeature, orderData))
def cli_routine(outputDir, circleFiles, orderFiles, sampleFile, featureFile, orderFeature, centerFile, colorscaleFile, printLabel, verbose, cohortMinMax=False, purpleHack = True):
    """Routine for program execution via command-line.

    Loads the ring datasets, the optional sample/feature lists, the optional
    color scale and the optional center values, optionally orders the
    samples (and draws a cohort image), then draws one circle image per
    feature.

    Parameters:
        outputDir: directory prefix for the generated "<feature>.png" files
            and for "Cohort.png".
        circleFiles: sequence of data files, each loaded with
            mData.rCRSData(..., retFeatures=True); one entry is appended to
            circleData and circleColorsPalette per file.
        orderFiles: sequence of data files loaded with mData.rCRSData and
            used for sample ordering and the cohort image. When empty,
            circleData is used for ordering and no cohort image is drawn.
        sampleFile: sample list read with mData.rList, or None to use the
            union of the samples found in circleFiles.
        featureFile: feature list read with mData.rList, or None to use the
            union of the features found in circleFiles.
        orderFeature: feature handed to scmp for ordering the samples; None
            skips the ordering/cohort section entirely.
        centerFile: file read with mData.r2Col(header=True) whose values
            color the center per feature, or None.
        colorscaleFile: color scale table (format described in the body), or
            None to use the default palette.
        printLabel: when true, the feature name is used as the image label.
        verbose: accepted but not referenced inside this routine.
        cohortMinMax: when true, min/max lists come from
            getCohortMinMaxValues; otherwise they start as None.
        purpleHack: forwarded as purple0Hack to getColor and
            drawCircleImageForFeature.

    Returns:
        None. Results are emitted through plotCircle,
        drawCircleImageForFeature and log.
    """
    # I've tried not to touch this method as much as possible.
    # I don't want to break the way it was working for Sam Ng.
    # chrisw

    ## execute
    samples = []
    features = []
    if sampleFile != None:
        samples = mData.rList(sampleFile)
    if featureFile != None:
        features = mData.rList(featureFile)
    # end section for getting lists of samples and features

    ## read circleFiles
    # circleData is a list of dict[col][row]=score from each circleFile
    circleData = []

    # circleColorsPalette is a list of (minColor),(zeroColor),(maxColor)
    circleColorsPalette = []

    ## read colorscaleFile
    # the format is as follows - header compulsory:
    # min/max color coding color1 color2 color 3
    # -2,2 rgb 155,155,155 255,255,255 0,0,0,
    # - rgb 155,0,155 255,0,255 0,0,0,
    # the "color format" is intended to support more color format, as I have
    # seen the html-colors in the code.
    # Michael ([email protected])
    colorscaleData = None
    if colorscaleFile != None:
        if cohortMinMax:
            log("WARNING: The -k option overrides -m")
        colorscaleData = mData.retRows(colorscaleFile,aslist=True)
        # line is only used in error messages; it is incremented before each
        # row is checked, so the first row is reported as line 2
        # (presumably because line 1 is the compulsory header - confirm that
        # retRows drops it).
        line=1
        for cs in colorscaleData:
            line = line + 1
            # NOTE(review): the checks below rely on log(..., die = True) not
            # returning - confirm against log's implementation.
            if len(cs) != 5:
                log("ERROR: color scale needs five fields: datapoints, colorcoding(rgb) and three colors\n", die = True)
            # Field 0: "min,max" is parsed into a list of floats in place;
            # anything that does not parse (e.g. "-") is left as the raw string.
            try:
                cs[0] = [float(x) for x in cs[0].split(",")]
            except ValueError:
                pass
            # NOTE(review): when parsing failed cs[0] is still a string, so
            # any 2-character string passes this length test - confirm
            # whether that is intended.
            if len(cs[0]) != 2 and cs[0] != "-":
                # Python 2 print statement: echoes the offending value to
                # stdout before the error is logged.
                print cs[0]
                log("ERROR: Two data points or dash needed for color scale\n", die = True)
            # Fields 2-4: the three colors, replaced in place by rgb objects.
            if cs[1].lower() == "rgb":
                try:
                    cs[2] = rgb(*[float(x) for x in cs[2].split(",")])
                    cs[3] = rgb(*[float(x) for x in cs[3].split(",")])
                    cs[4] = rgb(*[float(x) for x in cs[4].split(",")])
                except TypeError:
                    # presumably raised by rgb() on a wrong argument count
                    log("ERROR: RGB needs three values on line " + str(line) + "\n", die = True)
                except ValueError:
                    # float() could not parse one of the components
                    log("ERROR: RGB color not correctly defined on line " + str(line) + "\n", die=True)
            else:
                log("ERROR: Unknown color coding on line " + str(line) + ": " + str(cs[1]) + "\n", die=True)

    for i in xrange(len(circleFiles)):
        # get data, samples, and features from each circleFile
        # data is a dict[col][row]=score
        # cols is a list of sample names
        # rows is a list of feature names
        (data, cols, rows) = mData.rCRSData(circleFiles[i], retFeatures=True)
        circleData.append(data)
        # default palette, used when the color scale has no row for this file
        minCol = lightBlueRGB
        zerCol = whiteRGB
        maxCol = redRGB
        # color scale rows are matched to circleFiles by position
        if colorscaleFile != None and i<len(colorscaleData):
            #get colors from specified colorscaleFile
            minCol = colorscaleData[i][2]
            zerCol = colorscaleData[i][3]
            maxCol = colorscaleData[i][4]

        # special cases for -meth and -mut
        # if circleFiles[i].endswith("meth"):
        #     maxCol = blueRGB
        #     minCol = redRGB
        #     log("Color: meth\n")
        # elif circleFiles[i].endswith("mut"):
        #     maxCol = blackRGB
        #     minCol = whiteRGB
        #     log("Color: mut\n")
        circleColorsPalette.append((minCol, zerCol, maxCol))

        # if no sampleFile/featureFile, default to using samples/features from circleFiles
        # (accumulated as a set union across all circleFiles)
        if sampleFile == None:
            samples = list(set(cols) | set(samples))
        if featureFile == None:
            features = list(set(rows) | set(features))
    # end section for reading circleFiles

    ## read centerFile
    centerData = None
    if centerFile != None:
        centerData = mData.r2Col(centerFile, header=True)

    ## sort
    if orderFeature != None:
        if len(orderFiles) > 0:
            orderData = []
            orderColors = []
            for i in xrange(len(orderFiles)):
                orderData.append(mData.rCRSData(orderFiles[i]))
                # every order ring uses the same white-to-black palette
                minCol = whiteRGB
                zerCol = whiteRGB
                maxCol = blackRGB
                orderColors.append((minCol, zerCol, maxCol))
        else:
            orderData = circleData

        # sort samples based on sample score in orderData
        # priority of sorting determined by orderFiles parameter
        # (Python 2 cmp-style sort: the lambda is the comparison function)
        samples.sort(lambda x, y: scmp(x, y, orderFeature, orderData))

        ## cohort png
        # cgi will probably not use orderFiles
        # This section has to stay under the orderFeature guard: orderData
        # and orderColors are only bound above.
        if len(orderFiles) > 0:
            imgFile = "%s/Cohort.png" % (outputDir)
            label = "Cohort"
            centerCol = whiteRGB.tohex()
            cohortCircleCols = []
            for i in xrange(len(orderData)):
                ringCols = []
                ringVals = []
                # first pass: collect this ring's values to derive its range,
                # preferring orderFeature and falling back to the "*" entry
                for sample in samples:
                    if sample in orderData[i]:
                        if orderFeature in orderData[i][sample]:
                            ringVals.append(orderData[i][sample][orderFeature])
                        elif "*" in orderData[i][sample]:
                            ringVals.append(orderData[i][sample]["*"])
                # the -0.01 / 0.01 seeds keep the range non-empty and
                # spanning zero even when ringVals is empty
                minVal = min([-0.01] + mData.floatList(ringVals))
                maxVal = max([0.01] + mData.floatList(ringVals))
                # second pass: one color per sample; grey when no value exists
                for sample in samples:
                    if sample in orderData[i]:
                        if orderFeature in orderData[i][sample]:
                            ringCols.append(getColor(orderData[i][sample][orderFeature], minVal, maxVal, minColor=orderColors[i][0], zeroColor=orderColors[i][1], maxColor=orderColors[i][2]))
                        elif "*" in orderData[i][sample]:
                            ringCols.append(getColor(orderData[i][sample]["*"], minVal, maxVal, minColor=orderColors[i][0], zeroColor=orderColors[i][1], maxColor=orderColors[i][2]))
                        else:
                            ringCols.append(greyRGB.tohex())
                    else:
                        ringCols.append(greyRGB.tohex())
                cohortCircleCols.append(ringCols)
            plotCircle(imgFile, label=label, centerCol=centerCol, circleCols=cohortCircleCols, innerRadTotal=0.2, outerRadTotal=0.5, width=5)
    # end section for sample ordering

    ## plot images
    if centerData != None:
        # range of the center values, seeded like the ring ranges above
        centerDataFloatList = mData.floatList(centerData.values())
        centerDataMinVal = min([-0.01] + centerDataFloatList)
        centerDataMaxVal = max([0.01] + centerDataFloatList)

    # get min/max values for datasets
    if cohortMinMax:
        (minValList, maxValList) = getCohortMinMaxValues(features, samples, circleData)
    else:
        (minValList, maxValList) = (None, None)

    # a color scale file gets the final say on the min/max lists
    if colorscaleData != None:
        (minValList, maxValList) = getColorScaleMinMaxValues(minValList, maxValList, len(circleData), colorscaleData)

    for feature in features:
        log("Drawing %s\n" % (feature))
        # center color stays None unless centerData has this feature
        centerColHex = None
        if centerData != None:
            if feature in centerData:
                centerColHex = getColor(centerData[feature], centerDataMinVal, centerDataMaxVal, minColor=lightBlueRGB, zeroColor=whiteRGB, purple0Hack=purpleHack)

        # "/" and ":" in the feature name are replaced by "_" in the file name
        imgFile = "%s/%s.png" % (outputDir, re.sub("[/:]", "_", feature))

        label = ""
        if printLabel:
            label = feature

        image_width = 5.0
        drawCircleImageForFeature(feature, samples, label, imgFile, circleData, circleColorsPalette, width=image_width, centerColHex=centerColHex, minValList=minValList, maxValList=maxValList, purple0Hack=purpleHack)

    # report the final sample order, one log line per sample
    for sample in samples:
        log("ordered samples: %s\n" % (sample))
if os.path.exists("%s/img" % (feature)): customImage = True ## identify nets with feature sifFile = None for i in os.listdir("%s" % (feature+"/")): if i.endswith(netExtension): sifFile = feature+"/"+i break (allNodes, allInteractions) = mPathway.rSIF(sifFile) idMap = dict() nodeMap = dict() for i, node in enumerate(allNodes.keys()): idMap[node] = i+1 nodeMap[i+1] = node labelMap = mData.r2Col("LABEL.NA", delim = " = ", header = True) typeMap = mData.r2Col("TYPE.NA", delim = " = ", header = True) scoreMap = mData.r2Col("%s_SCORE.NA" % (feature), delim = " = ", header = True) nodes = nodeMap.keys() nodes.sort() ## create graphml structure graphmlContent = """<graphml>\\ <key id="name" for="node" attr.name="name" attr.type="string"/>\\ <key id="label" for="node" attr.name="label" attr.type="string"/>\\ <key id="type" for="node" attr.name="type" attr.type="string"/>\\ <key id="color" for="node" attr.name="color" attr.type="string"/>\\ <key id="size" for="node" attr.name="size" attr.type="double"/>\\ <key id="score" for="node" attr.name="score" attr.type="double"/>\\ <key id="image" for="node" attr.name="image" attr.type="string"/>\\ <key id="interaction" for="edge" attr.name="interaction" attr.type="string"/>\\
def cli_routine(outputDir,
                circleFiles,
                orderFiles,
                sampleFile,
                featureFile,
                orderFeature,
                centerFile,
                colorscaleFile,
                printLabel,
                verbose,
                cohortMinMax=False,
                purpleHack=True):
    """Routine for program execution via command-line.

    Loads the ring datasets, the optional sample/feature lists, the optional
    color scale and the optional center values, optionally orders the
    samples (and draws a cohort image), then draws one circle image per
    feature.

    Parameters:
        outputDir: directory prefix for the generated "<feature>.png" files
            and for "Cohort.png".
        circleFiles: sequence of data files, each loaded with
            mData.rCRSData(..., retFeatures=True); one entry is appended to
            circleData and circleColorsPalette per file.
        orderFiles: sequence of data files loaded with mData.rCRSData and
            used for sample ordering and the cohort image. When empty,
            circleData is used for ordering and no cohort image is drawn.
        sampleFile: sample list read with mData.rList, or None to use the
            union of the samples found in circleFiles.
        featureFile: feature list read with mData.rList, or None to use the
            union of the features found in circleFiles.
        orderFeature: feature handed to scmp for ordering the samples; None
            skips the ordering/cohort section entirely.
        centerFile: file read with mData.r2Col(header=True) whose values
            color the center per feature, or None.
        colorscaleFile: color scale table (format described in the body), or
            None to use the default palette.
        printLabel: when true, the feature name is used as the image label.
        verbose: accepted but not referenced inside this routine.
        cohortMinMax: when true, min/max lists come from
            getCohortMinMaxValues; otherwise they start as None.
        purpleHack: forwarded as purple0Hack to getColor and
            drawCircleImageForFeature.

    Returns:
        None. Results are emitted through plotCircle,
        drawCircleImageForFeature and log.
    """
    # I've tried not to touch this method as much as possible.
    # I don't want to break the way it was working for Sam Ng.
    # chrisw

    ## execute
    samples = []
    features = []
    if sampleFile != None:
        samples = mData.rList(sampleFile)
    if featureFile != None:
        features = mData.rList(featureFile)
    # end section for getting lists of samples and features

    ## read circleFiles
    # circleData is a list of dict[col][row]=score from each circleFile
    circleData = []

    # circleColorsPalette is a list of (minColor),(zeroColor),(maxColor)
    circleColorsPalette = []

    ## read colorscaleFile
    # the format is as follows - header compulsory:
    # min/max color coding color1 color2 color 3
    # -2,2 rgb 155,155,155 255,255,255 0,0,0,
    # - rgb 155,0,155 255,0,255 0,0,0,
    # the "color format" is intended to support more color format, as I have
    # seen the html-colors in the code.
    # Michael ([email protected])
    colorscaleData = None
    if colorscaleFile != None:
        if cohortMinMax:
            log("WARNING: The -k option overrides -m")
        colorscaleData = mData.retRows(colorscaleFile, aslist=True)
        # line is only used in error messages; it is incremented before each
        # row is checked, so the first row is reported as line 2
        # (presumably because line 1 is the compulsory header - confirm that
        # retRows drops it).
        line = 1
        for cs in colorscaleData:
            line = line + 1
            # NOTE(review): the checks below rely on log(..., die=True) not
            # returning - confirm against log's implementation.
            if len(cs) != 5:
                log("ERROR: color scale needs five fields: datapoints, colorcoding(rgb) and three colors\n",
                    die=True)
            # Field 0: "min,max" is parsed into a list of floats in place;
            # anything that does not parse (e.g. "-") is left as the raw string.
            try:
                cs[0] = [float(x) for x in cs[0].split(",")]
            except ValueError:
                pass
            # NOTE(review): when parsing failed cs[0] is still a string, so
            # any 2-character string passes this length test - confirm
            # whether that is intended.
            if len(cs[0]) != 2 and cs[0] != "-":
                # Python 2 print statement: echoes the offending value to
                # stdout before the error is logged.
                print cs[0]
                log("ERROR: Two data points or dash needed for color scale\n",
                    die=True)
            # Fields 2-4: the three colors, replaced in place by rgb objects.
            if cs[1].lower() == "rgb":
                try:
                    cs[2] = rgb(*[float(x) for x in cs[2].split(",")])
                    cs[3] = rgb(*[float(x) for x in cs[3].split(",")])
                    cs[4] = rgb(*[float(x) for x in cs[4].split(",")])
                except TypeError:
                    # presumably raised by rgb() on a wrong argument count
                    log("ERROR: RGB needs three values on line " + str(line) +
                        "\n",
                        die=True)
                except ValueError:
                    # float() could not parse one of the components
                    log("ERROR: RGB color not correctly defined on line " +
                        str(line) + "\n",
                        die=True)
            else:
                log("ERROR: Unknown color coding on line " + str(line) + ": " +
                    str(cs[1]) + "\n",
                    die=True)

    for i in xrange(len(circleFiles)):
        # get data, samples, and features from each circleFile
        # data is a dict[col][row]=score
        # cols is a list of sample names
        # rows is a list of feature names
        (data, cols, rows) = mData.rCRSData(circleFiles[i], retFeatures=True)
        circleData.append(data)
        # default palette, used when the color scale has no row for this file
        minCol = lightBlueRGB
        zerCol = whiteRGB
        maxCol = redRGB
        # color scale rows are matched to circleFiles by position
        if colorscaleFile != None and i < len(colorscaleData):
            #get colors from specified colorscaleFile
            minCol = colorscaleData[i][2]
            zerCol = colorscaleData[i][3]
            maxCol = colorscaleData[i][4]

        # special cases for -meth and -mut
        # if circleFiles[i].endswith("meth"):
        #     maxCol = blueRGB
        #     minCol = redRGB
        #     log("Color: meth\n")
        # elif circleFiles[i].endswith("mut"):
        #     maxCol = blackRGB
        #     minCol = whiteRGB
        #     log("Color: mut\n")
        circleColorsPalette.append((minCol, zerCol, maxCol))

        # if no sampleFile/featureFile, default to using samples/features from circleFiles
        # (accumulated as a set union across all circleFiles)
        if sampleFile == None:
            samples = list(set(cols) | set(samples))
        if featureFile == None:
            features = list(set(rows) | set(features))
    # end section for reading circleFiles

    ## read centerFile
    centerData = None
    if centerFile != None:
        centerData = mData.r2Col(centerFile, header=True)

    ## sort
    if orderFeature != None:
        if len(orderFiles) > 0:
            orderData = []
            orderColors = []
            for i in xrange(len(orderFiles)):
                orderData.append(mData.rCRSData(orderFiles[i]))
                # every order ring uses the same white-to-black palette
                minCol = whiteRGB
                zerCol = whiteRGB
                maxCol = blackRGB
                orderColors.append((minCol, zerCol, maxCol))
        else:
            orderData = circleData

        # sort samples based on sample score in orderData
        # priority of sorting determined by orderFiles parameter
        # (Python 2 cmp-style sort: the lambda is the comparison function)
        samples.sort(lambda x, y: scmp(x, y, orderFeature, orderData))

        ## cohort png
        # cgi will probably not use orderFiles
        # This section has to stay under the orderFeature guard: orderData
        # and orderColors are only bound above.
        if len(orderFiles) > 0:
            imgFile = "%s/Cohort.png" % (outputDir)
            label = "Cohort"
            centerCol = whiteRGB.tohex()
            cohortCircleCols = []
            for i in xrange(len(orderData)):
                ringCols = []
                ringVals = []
                # first pass: collect this ring's values to derive its range,
                # preferring orderFeature and falling back to the "*" entry
                for sample in samples:
                    if sample in orderData[i]:
                        if orderFeature in orderData[i][sample]:
                            ringVals.append(orderData[i][sample][orderFeature])
                        elif "*" in orderData[i][sample]:
                            ringVals.append(orderData[i][sample]["*"])
                # the -0.01 / 0.01 seeds keep the range non-empty and
                # spanning zero even when ringVals is empty
                minVal = min([-0.01] + mData.floatList(ringVals))
                maxVal = max([0.01] + mData.floatList(ringVals))
                # second pass: one color per sample; grey when no value exists
                for sample in samples:
                    if sample in orderData[i]:
                        if orderFeature in orderData[i][sample]:
                            ringCols.append(
                                getColor(orderData[i][sample][orderFeature],
                                         minVal,
                                         maxVal,
                                         minColor=orderColors[i][0],
                                         zeroColor=orderColors[i][1],
                                         maxColor=orderColors[i][2]))
                        elif "*" in orderData[i][sample]:
                            ringCols.append(
                                getColor(orderData[i][sample]["*"],
                                         minVal,
                                         maxVal,
                                         minColor=orderColors[i][0],
                                         zeroColor=orderColors[i][1],
                                         maxColor=orderColors[i][2]))
                        else:
                            ringCols.append(greyRGB.tohex())
                    else:
                        ringCols.append(greyRGB.tohex())
                cohortCircleCols.append(ringCols)
            plotCircle(imgFile,
                       label=label,
                       centerCol=centerCol,
                       circleCols=cohortCircleCols,
                       innerRadTotal=0.2,
                       outerRadTotal=0.5,
                       width=5)
    # end section for sample ordering

    ## plot images
    if centerData != None:
        # range of the center values, seeded like the ring ranges above
        centerDataFloatList = mData.floatList(centerData.values())
        centerDataMinVal = min([-0.01] + centerDataFloatList)
        centerDataMaxVal = max([0.01] + centerDataFloatList)

    # get min/max values for datasets
    if cohortMinMax:
        (minValList, maxValList) = getCohortMinMaxValues(features, samples,
                                                         circleData)
    else:
        (minValList, maxValList) = (None, None)

    # a color scale file gets the final say on the min/max lists
    if colorscaleData != None:
        (minValList, maxValList) = getColorScaleMinMaxValues(minValList,
                                                             maxValList,
                                                             len(circleData),
                                                             colorscaleData)

    for feature in features:
        log("Drawing %s\n" % (feature))
        # center color stays None unless centerData has this feature
        centerColHex = None
        if centerData != None:
            if feature in centerData:
                centerColHex = getColor(centerData[feature],
                                        centerDataMinVal,
                                        centerDataMaxVal,
                                        minColor=lightBlueRGB,
                                        zeroColor=whiteRGB,
                                        purple0Hack=purpleHack)

        # "/" and ":" in the feature name are replaced by "_" in the file name
        imgFile = "%s/%s.png" % (outputDir, re.sub("[/:]", "_", feature))

        label = ""
        if printLabel:
            label = feature

        image_width = 5.0
        drawCircleImageForFeature(feature,
                                  samples,
                                  label,
                                  imgFile,
                                  circleData,
                                  circleColorsPalette,
                                  width=image_width,
                                  centerColHex=centerColHex,
                                  minValList=minValList,
                                  maxValList=maxValList,
                                  purple0Hack=purpleHack)

    # report the final sample order, one log line per sample
    for sample in samples:
        log("ordered samples: %s\n" % (sample))
if os.path.exists("%s/img" % (feature)): customImage = True ## identify nets with feature sifFile = None for i in os.listdir("%s" % (feature + "/")): if i.endswith(netExtension): sifFile = feature + "/" + i break (allNodes, allInteractions) = mPathway.rSIF(sifFile) idMap = dict() nodeMap = dict() for i, node in enumerate(allNodes.keys()): idMap[node] = i + 1 nodeMap[i + 1] = node labelMap = mData.r2Col("LABEL.NA", delim=" = ", header=True) typeMap = mData.r2Col("TYPE.NA", delim=" = ", header=True) scoreMap = mData.r2Col("%s_SCORE.NA" % (feature), delim=" = ", header=True) nodes = nodeMap.keys() nodes.sort() ## create graphml structure graphmlContent = """<graphml>\\ <key id="name" for="node" attr.name="name" attr.type="string"/>\\ <key id="label" for="node" attr.name="label" attr.type="string"/>\\ <key id="type" for="node" attr.name="type" attr.type="string"/>\\ <key id="color" for="node" attr.name="color" attr.type="string"/>\\ <key id="size" for="node" attr.name="size" attr.type="double"/>\\ <key id="score" for="node" attr.name="score" attr.type="double"/>\\ <key id="image" for="node" attr.name="image" attr.type="string"/>\\ <key id="interaction" for="edge" attr.name="interaction" attr.type="string"/>\\
log("ERROR: incorrect number of arguments", die = True) featureFile = args[0] pathwayFile = args[1] scoreFile = args[2] global verbose for o, a in opts: if o == "-q": verbose = False ## execute featureList = mData.rList(featureFile) (gNodes, gInteractions) = mPathway.rPathway(pathwayFile) scoreMap = {} scoreMap[sessionName] = mData.r2Col(scoreFile) ## find connected connectList = set() for source in featureList: if source not in gNodes: continue for target in featureList: if target not in gNodes: continue if source == target: continue paths = mPathway.shortestPath(source, target, gInteractions, maxDistance = maxDistance) if len(paths) == 0: log("%s /> %s\n" % (source, target)) else: