def main(argv):
    """Command-line entry point: parse options, resolve file names, run the sieve.

    Parses ``argv`` with getopt, derives default input/output file names from
    the analysis name and folder, loads the relations and data files through
    ``stats``, and then either tags outliers (default) or splits them into a
    separate file (``--oldway``). The log collected in ``logList`` is written
    to the info file.

    Args:
        argv: list of command-line arguments, without the program name.

    Exits via ``sys.exit`` when no options are given, when help is requested,
    when option parsing fails, or when the variance cannot be read from the
    variance file.
    """
    version = "v0.17"
    analysisName = ""
    analysisFolder = ""
    varianceSeed = 0.001
    FDRLimit = 0.01
    varianceSeedProvided = False
    removeDuplicateUpper = False
    tags = "!out"
    outlierTag = "out"
    logicOperatorsAsWords = False
    dataFile = ""
    relationsFile = ""
    newRelFile = ""
    removedRelFile = ""
    defaultDataFile = "data"
    defaultRelationsFile = "rels"
    defaultTaggedRelFile = "tagged"
    defaultNewRelFile = "cleaned"
    defaultRemovedRelFile = "outliers"
    defaultOutputInfo = "infoFile"
    infoFile = ""
    varFile = ""
    defaultTableExtension = ".tsv"
    defaultTextExtension = ".txt"
    verbose = True
    # oldWay: instead of tagging outliers, write them to a separate relations file
    oldWay = False
    modeUsed = mode.onePerHigher
    logList = [["SanXoTSieve " + version],
               ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]

    try:
        opts, args = getopt.getopt(
            argv,
            "a:p:v:d:r:n:L:V:f:ubDhH",
            ["analysis=", "folder=", "varianceseed=", "datafile=", "relfile=",
             "newrelfile=", "outlierrelfile=", "infofile=", "varfile=",
             "fdrlimit=", "one-to-one", "no-verbose", "randomise",
             "removeduplicateupper", "help", "advanced-help", "tags=",
             "outliertag=", "oldway", "word-operators"])
    except getopt.GetoptError:
        logList.append(["Error while getting parameters."])
        # NOTE(review): infoFile is still "" here, so this relies on
        # stats.saveFile coping with an empty path -- confirm.
        stats.saveFile(infoFile, logList, "INFO FILE")
        sys.exit(2)

    if len(opts) == 0:
        printHelp(version)
        sys.exit()

    for opt, arg in opts:
        if opt in ("-a", "--analysis"):
            analysisName = arg
        elif opt in ("-p", "--place", "--folder"):
            analysisFolder = arg
        elif opt in ("-v", "--var", "--varianceseed"):
            varianceSeed = float(arg)
            varianceSeedProvided = True
        elif opt in ("-d", "--datafile"):
            dataFile = arg
        elif opt in ("-r", "--relfile", "--relationsfile"):
            relationsFile = arg
        elif opt in ("-n", "--newrelfile"):
            # Previously this assigned removedRelFile, so -n never renamed
            # the new (tagged/cleaned) relations file.
            newRelFile = arg
        elif opt == "--outlierrelfile":
            # Declared in the getopt long options but formerly unhandled.
            removedRelFile = arg
        elif opt in ("-L", "--infofile"):
            infoFile = arg
        elif opt in ("-V", "--varfile"):
            varFile = arg
        elif opt in ("-u", "--one-to-one"):
            modeUsed = mode.onlyOne
        elif opt in ("-b", "--no-verbose"):
            verbose = False
        elif opt == "--oldway":
            oldWay = True
        elif opt in ("-f", "--fdrlimit"):
            FDRLimit = float(arg)
        elif opt in ("-D", "--removeduplicateupper"):
            removeDuplicateUpper = True
        elif opt == "--tags":
            # The default "!out" filter is always kept; user tags are ANDed.
            if arg.strip().lower() != "!out":
                tags = "!out&(" + arg + ")"
        elif opt == "--word-operators":
            logicOperatorsAsWords = True
        elif opt == "--outliertag":
            # Previously the argument was ignored and "out" was always used.
            outlierTag = arg
        elif opt in ("-h", "--help"):
            printHelp(version)
            sys.exit()
        elif opt in ("-H", "--advanced-help"):
            printHelp(version, advanced=True)
            sys.exit()

    # REGION: FILE NAMES SETUP

    if len(analysisName) == 0:
        if len(dataFile) > 0:
            analysisName = os.path.splitext(os.path.basename(dataFile))[0]
        else:
            # NOTE(review): defaultAnalysisName is not defined in this
            # function; confirm it exists at module level.
            analysisName = defaultAnalysisName

    if len(os.path.dirname(analysisName)) > 0:
        analysisNameFirstPart = os.path.dirname(analysisName)
        analysisName = os.path.basename(analysisName)
        if len(analysisFolder) == 0:
            analysisFolder = analysisNameFirstPart

    if len(dataFile) > 0 and len(analysisFolder) == 0:
        if len(os.path.dirname(dataFile)) > 0:
            analysisFolder = os.path.dirname(dataFile)

    # The info file name is resolved before any error path below writes the
    # log, so that the log is not saved to an empty path.
    if len(infoFile) == 0:
        infoFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultOutputInfo + defaultTextExtension)

    # input
    if len(dataFile) == 0:
        dataFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultDataFile + defaultTableExtension)

    if len(os.path.dirname(dataFile)) == 0 and len(analysisFolder) > 0:
        dataFile = os.path.join(analysisFolder, dataFile)

    if len(os.path.dirname(varFile)) == 0 and len(os.path.basename(varFile)) > 0:
        varFile = os.path.join(analysisFolder, varFile)

    # An explicit -v seed takes precedence over the variance file.
    if len(varFile) > 0 and not varianceSeedProvided:
        varianceSeed, varianceOk = stats.extractVarianceFromVarFile(
            varFile, verbose=verbose, defaultSeed=varianceSeed)
        if not varianceOk:
            logList.append(["Variance not found in text file."])
            stats.saveFile(infoFile, logList, "INFO FILE")
            sys.exit()

    if len(relationsFile) == 0:
        relationsFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultRelationsFile + defaultTableExtension)

    if len(os.path.dirname(relationsFile)) == 0:
        relationsFile = os.path.join(analysisFolder, relationsFile)

    # output
    if len(newRelFile) == 0:
        if oldWay:
            # suffix: "cleaned"
            newRelFile = os.path.join(
                analysisFolder,
                analysisName + "_" + defaultNewRelFile + defaultTableExtension)
        else:
            # suffix: "tagged"
            newRelFile = os.path.join(
                analysisFolder,
                analysisName + "_" + defaultTaggedRelFile + defaultTableExtension)

    if len(removedRelFile) == 0:
        removedRelFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultRemovedRelFile + defaultTableExtension)

    if len(os.path.dirname(newRelFile)) == 0:
        newRelFile = os.path.join(analysisFolder, newRelFile)

    if len(os.path.dirname(removedRelFile)) == 0:
        removedRelFile = os.path.join(analysisFolder, removedRelFile)

    logList.append(["Variance seed = " + str(varianceSeed)])
    logList.append(["Input data file: " + dataFile])
    logList.append(["Input relations file: " + relationsFile])
    if oldWay:
        logList.append(["Output relations file without outliers: " + newRelFile])
        logList.append(["Output relations file with outliers only: " + removedRelFile])
        logList.append(["Removing duplicate higher level elements: " + str(removeDuplicateUpper)])
        logList.append(["OldWay option activated: outliers are removed instead of tagged"])
    else:
        logList.append(["Relations file tagging outliers: " + newRelFile])
        logList.append(["Tags to filter relations: " + tags])
        logList.append(["Tag used for outliers: " + outlierTag])

    # END REGION: FILE NAMES SETUP

    relations = stats.loadRelationsFile(relationsFile)
    data = stats.loadInputDataFile(dataFile)

    # NOTE(review): logResults is not used after this point in this function.
    if oldWay:
        # only for backward compatibility. Note that tags are not supported
        newRelations, removedRelations, logResults = \
            getRelationsWithoutOutliers(
                data, relations, varianceSeed,
                FDRLimit=FDRLimit,
                modeUsed=modeUsed,
                removeDuplicateUpper=removeDuplicateUpper)
    else:
        newRelations, removedRelations, logResults = \
            tagRelationsWithoutOutliers(
                data, relations, varianceSeed,
                FDRLimit=FDRLimit,
                modeUsed=modeUsed,
                removeDuplicateUpper=removeDuplicateUpper,
                tags=tags,
                outlierTag=outlierTag,
                logicOperatorsAsWords=logicOperatorsAsWords)

    if oldWay:
        stats.saveFile(newRelFile, newRelations, "idsup\tidinf")
    else:
        stats.saveFile(newRelFile, newRelations, "idsup\tidinf\ttags")

    stats.saveFile(infoFile, logList, "INFO FILE")

    # The outliers-only file is written only in the old (removal) mode.
    if oldWay:
        stats.saveFile(removedRelFile, removedRelations, "idsup\tidinf")
def processFiles(tagList, dataFileList, verbose=False, separator="_",
                 dataFile="", relsFile="", relsHeader="idsup\tidinf",
                 dataHeader="idinf\tX'inf\tVinf"):
    """Merge several data files into one data file plus a relations file.

    For each existing file in ``dataFileList`` the rows are loaded with
    ``stats.loadInputDataFile``; every row id (first column, stripped) is
    suffixed with ``separator`` and the tag at the same index of ``tagList``.
    A relation row ``[oldId, newId]`` is written to ``relsFile`` and a data
    row ``[newId, row[1], row[2]]`` to ``dataFile``.

    Args:
        tagList: tags, indexed in parallel with ``dataFileList``.
        dataFileList: paths of the input data files.
        verbose: when true, print per-file progress and timing.
        separator: string placed between the original id and the tag.
        dataFile: path of the merged data file to write.
        relsFile: path of the relations file to write.
        relsHeader: header line for ``relsFile``; skipped when empty.
        dataHeader: header line for ``dataFile``; skipped when empty.

    Returns:
        A list of error messages, one per input file that does not exist.
    """
    message = []
    # Timestamp taken at the end of the previous iteration; None before it.
    previousEnd = None

    # "with" guarantees both writers are closed even if loading or writing
    # raises midway.
    with open(relsFile, "w") as relsWriter, open(dataFile, "w") as dataWriter:
        if len(relsHeader) > 0:
            relsWriter.write(relsHeader + "\n")
        if len(dataHeader) > 0:
            dataWriter.write(dataHeader + "\n")

        for i, inputFile in enumerate(dataFileList):
            if os.path.isfile(inputFile):
                dataList = stats.loadInputDataFile(inputFile)

                for row in dataList:
                    oldId = row[0].strip()
                    newId = oldId + separator + tagList[i]
                    stats.saveRow(relsWriter, [oldId, newId])
                    stats.saveRow(dataWriter, [newId, row[1], row[2]])

                if verbose:
                    if previousEnd is not None:
                        # Difference of datetimes, so the value stays
                        # correct when the run crosses midnight.
                        elapsed = (datetime.now() - previousEnd).total_seconds()
                        print()
                        print("Total time: %f" % elapsed)
                    # The file name is passed as a format argument so that a
                    # "%" inside it cannot break the formatting.
                    print("Reading file #%i: %s" % (i, inputFile))
                    print("dataList: %i" % len(dataList))

                # Drop the loaded rows before the next file is read.
                dataList = []
                gc.collect()
            else:
                msg = "Error: looks like input file %s does not exist." % inputFile
                message.append(msg)
                print()
                print(msg)
                print()

            gc.collect()
            previousEnd = datetime.now()

    # from v0.03, no duplicate removal is performed

    return message
def calibrate(inputRawData=None, inputRelations=None, rawDataFile="",
              relationsFile="", kSeed=1, varianceSeed=0.001, medianSide=100,
              maxIterations=0, verbose=False, showGraph=False,
              showSumSQ=False, forceParameters=False, alphaSeed=1.0,
              showRank=True, useCooperativity=False, graphFileVRank="",
              graphFileVValue="", graphDataFile="", graphDPI=None):
    """Obtain k, variance and alpha, draw both graphs and calibrate the data.

    Raw data and relations are taken from the arguments or, when those are
    None, loaded from ``rawDataFile`` / ``relationsFile`` through ``stats``.
    Unless ``forceParameters`` is set, k and variance (and alpha when
    ``useCooperativity`` is set) come from ``getKandVariance``; otherwise the
    seeds are used as given. ``showGraphTool`` is then called twice (rank
    graph and value graph) and the calibrated table is built with
    ``idXVcal``.

    Returns:
        A tuple ``(idXV, variance, k, alpha, extraLog)``. When the raw data
        is missing, or the relations are missing while ``forceParameters``
        is false, the first four items are None and ``extraLog`` holds the
        error message.
    """
    extraLog = [[]]

    if verbose:
        print()
        print("loading input raw data file")
        extraLog.append(["loading input raw data file"])

    # "is None" instead of "== None": equality against array-like inputs
    # would be evaluated element-wise.
    if inputRawData is None:
        if len(rawDataFile) == 0:
            print("Error: no input raw data")
            extraLog.append(["Error: no input raw data"])
            # Nothing can be calibrated without raw data; bail out the same
            # way the missing-relations case below does instead of carrying
            # None into the later steps.
            return None, None, None, None, extraLog
        inputRawData = stats.loadInputDataFile(rawDataFile)

    if verbose:
        print("loading relations file")
        extraLog.append(["loading relations file"])

    if inputRelations is None:
        if len(relationsFile) > 0:
            inputRelations = stats.loadRelationsFile(relationsFile)
        elif not forceParameters:
            # With forced parameters the relations stay None and execution
            # continues; otherwise they are required.
            print("Error: no relations file")
            extraLog.append(["Error: no relations file"])
            return None, None, None, None, extraLog

    #### calculate k and variance ####
    alpha = 1.0
    if not forceParameters:
        if verbose:
            print("calculating K and variance")
            extraLog.append(["calculating K and variance"])

        # NOTE(review): verbose and showSumSQ are hard-coded to True here
        # ("just to see it" in the original), so the caller's values for
        # those two parameters are not forwarded -- confirm this is intended.
        result = getKandVariance(inputRawData, inputRelations,
                                 kSeed=kSeed,
                                 varianceSeed=varianceSeed,
                                 maxIterations=maxIterations,
                                 verbose=True,
                                 showSumSQ=True,
                                 medianSide=medianSide,
                                 alphaSeed=alphaSeed,
                                 useCooperativity=useCooperativity)
        k = result[0]
        variance = result[1]
        if useCooperativity:
            alpha = result[2]
    else:
        k = kSeed
        variance = varianceSeed
        alpha = alphaSeed

    # save VRank graph
    showGraphTool(inputRawData, inputRelations, k, variance, alpha, medianSide,
                  showRank=True,
                  graphFile=graphFileVRank,
                  graphData=graphDataFile,
                  dpi=graphDPI,
                  showGraph=showGraph)

    # save VValue graph
    showGraphTool(inputRawData, inputRelations, k, variance, alpha, medianSide,
                  showRank=False,
                  graphFile=graphFileVValue,
                  dpi=graphDPI,
                  showGraph=showGraph)

    # get calibrated idXV
    idXV = idXVcal(inputRawData, k, alpha)

    return idXV, variance, k, alpha, extraLog
def calibrate(inputRawData=None, inputRelations=None, rawDataFile="",
              relationsFile="", kSeed=1, varianceSeed=0.001, medianSide=100,
              maxIterations=0, verbose=False, showGraph=False,
              showSumSQ=False, forceParameters=False, alphaSeed=1.0,
              showRank=True, useCooperativity=False, graphFileVRank="",
              graphFileVValue="", graphDataFile="", graphDPI=None):
    """Obtain k, variance and alpha, draw both graphs and calibrate the data.

    Raw data and relations are taken from the arguments or, when those are
    None, loaded from ``rawDataFile`` / ``relationsFile`` through ``stats``.
    Unless ``forceParameters`` is set, k and variance (and alpha when
    ``useCooperativity`` is set) come from ``getKandVariance``; otherwise the
    seeds are used as given. ``showGraphTool`` is then called twice (rank
    graph and value graph) and the calibrated table is built with
    ``idXVcal``.

    Returns:
        A tuple ``(idXV, variance, k, alpha, extraLog)``. When the raw data
        is missing, or the relations are missing while ``forceParameters``
        is false, the first four items are None and ``extraLog`` holds the
        error message.
    """
    extraLog = [[]]

    if verbose:
        print()
        print("loading input raw data file")
        extraLog.append(["loading input raw data file"])

    # "is None" instead of "== None": equality against array-like inputs
    # would be evaluated element-wise.
    if inputRawData is None:
        if len(rawDataFile) == 0:
            print("Error: no input raw data")
            extraLog.append(["Error: no input raw data"])
            # Nothing can be calibrated without raw data; bail out the same
            # way the missing-relations case below does instead of carrying
            # None into the later steps.
            return None, None, None, None, extraLog
        inputRawData = stats.loadInputDataFile(rawDataFile)

    if verbose:
        print("loading relations file")
        extraLog.append(["loading relations file"])

    if inputRelations is None:
        if len(relationsFile) > 0:
            inputRelations = stats.loadRelationsFile(relationsFile)
        elif not forceParameters:
            # With forced parameters the relations stay None and execution
            # continues; otherwise they are required.
            print("Error: no relations file")
            extraLog.append(["Error: no relations file"])
            return None, None, None, None, extraLog

    #### calculate k and variance ####
    alpha = 1.0
    if not forceParameters:
        if verbose:
            print("calculating K and variance")
            extraLog.append(["calculating K and variance"])

        # NOTE(review): verbose and showSumSQ are hard-coded to True here
        # ("just to see it" in the original), so the caller's values for
        # those two parameters are not forwarded -- confirm this is intended.
        result = getKandVariance(inputRawData, inputRelations,
                                 kSeed=kSeed,
                                 varianceSeed=varianceSeed,
                                 maxIterations=maxIterations,
                                 verbose=True,
                                 showSumSQ=True,
                                 medianSide=medianSide,
                                 alphaSeed=alphaSeed,
                                 useCooperativity=useCooperativity)
        k = result[0]
        variance = result[1]
        if useCooperativity:
            alpha = result[2]
    else:
        k = kSeed
        variance = varianceSeed
        alpha = alphaSeed

    # save VRank graph
    showGraphTool(inputRawData, inputRelations, k, variance, alpha, medianSide,
                  showRank=True,
                  graphFile=graphFileVRank,
                  graphData=graphDataFile,
                  dpi=graphDPI,
                  showGraph=showGraph)

    # save VValue graph
    showGraphTool(inputRawData, inputRelations, k, variance, alpha, medianSide,
                  showRank=False,
                  graphFile=graphFileVValue,
                  dpi=graphDPI,
                  showGraph=showGraph)

    # get calibrated idXV
    idXV = idXVcal(inputRawData, k, alpha)

    return idXV, variance, k, alpha, extraLog
def main(argv):
    """Command-line entry point: parse options, resolve file names, run the sieve.

    Parses ``argv`` with getopt, derives default input/output file names from
    the analysis name and folder, loads the relations and data files through
    ``stats``, and then either tags outliers (default) or splits them into a
    separate file (``--oldway``). The log collected in ``logList`` is written
    to the info file.

    Args:
        argv: list of command-line arguments, without the program name.

    Exits via ``sys.exit`` when no options are given, when help is requested,
    when option parsing fails, or when the variance cannot be read from the
    variance file.
    """
    version = "v0.17"
    analysisName = ""
    analysisFolder = ""
    varianceSeed = 0.001
    FDRLimit = 0.01
    varianceSeedProvided = False
    removeDuplicateUpper = False
    tags = "!out"
    outlierTag = "out"
    logicOperatorsAsWords = False
    dataFile = ""
    relationsFile = ""
    newRelFile = ""
    removedRelFile = ""
    defaultDataFile = "data"
    defaultRelationsFile = "rels"
    defaultTaggedRelFile = "tagged"
    defaultNewRelFile = "cleaned"
    defaultRemovedRelFile = "outliers"
    defaultOutputInfo = "infoFile"
    infoFile = ""
    varFile = ""
    defaultTableExtension = ".tsv"
    defaultTextExtension = ".txt"
    verbose = True
    # oldWay: instead of tagging outliers, write them to a separate relations file
    oldWay = False
    modeUsed = mode.onePerHigher
    logList = [["SanXoTSieve " + version],
               ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]

    try:
        opts, args = getopt.getopt(
            argv,
            "a:p:v:d:r:n:L:V:f:ubDhH",
            ["analysis=", "folder=", "varianceseed=", "datafile=", "relfile=",
             "newrelfile=", "outlierrelfile=", "infofile=", "varfile=",
             "fdrlimit=", "one-to-one", "no-verbose", "randomise",
             "removeduplicateupper", "help", "advanced-help", "tags=",
             "outliertag=", "oldway", "word-operators"])
    except getopt.GetoptError:
        logList.append(["Error while getting parameters."])
        # NOTE(review): infoFile is still "" here, so this relies on
        # stats.saveFile coping with an empty path -- confirm.
        stats.saveFile(infoFile, logList, "INFO FILE")
        sys.exit(2)

    if len(opts) == 0:
        printHelp(version)
        sys.exit()

    for opt, arg in opts:
        if opt in ("-a", "--analysis"):
            analysisName = arg
        elif opt in ("-p", "--place", "--folder"):
            analysisFolder = arg
        elif opt in ("-v", "--var", "--varianceseed"):
            varianceSeed = float(arg)
            varianceSeedProvided = True
        elif opt in ("-d", "--datafile"):
            dataFile = arg
        elif opt in ("-r", "--relfile", "--relationsfile"):
            relationsFile = arg
        elif opt in ("-n", "--newrelfile"):
            # Previously this assigned removedRelFile, so -n never renamed
            # the new (tagged/cleaned) relations file.
            newRelFile = arg
        elif opt == "--outlierrelfile":
            # Declared in the getopt long options but formerly unhandled.
            removedRelFile = arg
        elif opt in ("-L", "--infofile"):
            infoFile = arg
        elif opt in ("-V", "--varfile"):
            varFile = arg
        elif opt in ("-u", "--one-to-one"):
            modeUsed = mode.onlyOne
        elif opt in ("-b", "--no-verbose"):
            verbose = False
        elif opt == "--oldway":
            oldWay = True
        elif opt in ("-f", "--fdrlimit"):
            FDRLimit = float(arg)
        elif opt in ("-D", "--removeduplicateupper"):
            removeDuplicateUpper = True
        elif opt == "--tags":
            # The default "!out" filter is always kept; user tags are ANDed.
            if arg.strip().lower() != "!out":
                tags = "!out&(" + arg + ")"
        elif opt == "--word-operators":
            logicOperatorsAsWords = True
        elif opt == "--outliertag":
            # Previously the argument was ignored and "out" was always used.
            outlierTag = arg
        elif opt in ("-h", "--help"):
            printHelp(version)
            sys.exit()
        elif opt in ("-H", "--advanced-help"):
            printHelp(version, advanced=True)
            sys.exit()

    # REGION: FILE NAMES SETUP

    if len(analysisName) == 0:
        if len(dataFile) > 0:
            analysisName = os.path.splitext(os.path.basename(dataFile))[0]
        else:
            # NOTE(review): defaultAnalysisName is not defined in this
            # function; confirm it exists at module level.
            analysisName = defaultAnalysisName

    if len(os.path.dirname(analysisName)) > 0:
        analysisNameFirstPart = os.path.dirname(analysisName)
        analysisName = os.path.basename(analysisName)
        if len(analysisFolder) == 0:
            analysisFolder = analysisNameFirstPart

    if len(dataFile) > 0 and len(analysisFolder) == 0:
        if len(os.path.dirname(dataFile)) > 0:
            analysisFolder = os.path.dirname(dataFile)

    # The info file name is resolved before any error path below writes the
    # log, so that the log is not saved to an empty path.
    if len(infoFile) == 0:
        infoFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultOutputInfo + defaultTextExtension)

    # input
    if len(dataFile) == 0:
        dataFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultDataFile + defaultTableExtension)

    if len(os.path.dirname(dataFile)) == 0 and len(analysisFolder) > 0:
        dataFile = os.path.join(analysisFolder, dataFile)

    if len(os.path.dirname(varFile)) == 0 and len(os.path.basename(varFile)) > 0:
        varFile = os.path.join(analysisFolder, varFile)

    # An explicit -v seed takes precedence over the variance file.
    if len(varFile) > 0 and not varianceSeedProvided:
        varianceSeed, varianceOk = stats.extractVarianceFromVarFile(
            varFile, verbose=verbose, defaultSeed=varianceSeed)
        if not varianceOk:
            logList.append(["Variance not found in text file."])
            stats.saveFile(infoFile, logList, "INFO FILE")
            sys.exit()

    if len(relationsFile) == 0:
        relationsFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultRelationsFile + defaultTableExtension)

    if len(os.path.dirname(relationsFile)) == 0:
        relationsFile = os.path.join(analysisFolder, relationsFile)

    # output
    if len(newRelFile) == 0:
        if oldWay:
            # suffix: "cleaned"
            newRelFile = os.path.join(
                analysisFolder,
                analysisName + "_" + defaultNewRelFile + defaultTableExtension)
        else:
            # suffix: "tagged"
            newRelFile = os.path.join(
                analysisFolder,
                analysisName + "_" + defaultTaggedRelFile + defaultTableExtension)

    if len(removedRelFile) == 0:
        removedRelFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultRemovedRelFile + defaultTableExtension)

    if len(os.path.dirname(newRelFile)) == 0:
        newRelFile = os.path.join(analysisFolder, newRelFile)

    if len(os.path.dirname(removedRelFile)) == 0:
        removedRelFile = os.path.join(analysisFolder, removedRelFile)

    logList.append(["Variance seed = " + str(varianceSeed)])
    logList.append(["Input data file: " + dataFile])
    logList.append(["Input relations file: " + relationsFile])
    if oldWay:
        logList.append(["Output relations file without outliers: " + newRelFile])
        logList.append(["Output relations file with outliers only: " + removedRelFile])
        logList.append(["Removing duplicate higher level elements: " + str(removeDuplicateUpper)])
        logList.append(["OldWay option activated: outliers are removed instead of tagged"])
    else:
        logList.append(["Relations file tagging outliers: " + newRelFile])
        logList.append(["Tags to filter relations: " + tags])
        logList.append(["Tag used for outliers: " + outlierTag])

    # END REGION: FILE NAMES SETUP

    relations = stats.loadRelationsFile(relationsFile)
    data = stats.loadInputDataFile(dataFile)

    # NOTE(review): logResults is not used after this point in this function.
    if oldWay:
        # only for backward compatibility. Note that tags are not supported
        newRelations, removedRelations, logResults = \
            getRelationsWithoutOutliers(
                data, relations, varianceSeed,
                FDRLimit=FDRLimit,
                modeUsed=modeUsed,
                removeDuplicateUpper=removeDuplicateUpper)
    else:
        newRelations, removedRelations, logResults = \
            tagRelationsWithoutOutliers(
                data, relations, varianceSeed,
                FDRLimit=FDRLimit,
                modeUsed=modeUsed,
                removeDuplicateUpper=removeDuplicateUpper,
                tags=tags,
                outlierTag=outlierTag,
                logicOperatorsAsWords=logicOperatorsAsWords)

    if oldWay:
        stats.saveFile(newRelFile, newRelations, "idsup\tidinf")
    else:
        stats.saveFile(newRelFile, newRelations, "idsup\tidinf\ttags")

    stats.saveFile(infoFile, logList, "INFO FILE")

    # The outliers-only file is written only in the old (removal) mode.
    if oldWay:
        stats.saveFile(removedRelFile, removedRelations, "idsup\tidinf")