Esempio n. 1
0
def main(argv):
	"""Command-line entry point of SanXoTSieve.

	Parses ``argv`` with getopt, resolves the input/output file names from
	the analysis name and folder, loads the relations and data files and then
	either tags outlier relations (default) or, with ``--oldway``, splits the
	relations into a "cleaned" file and an "outliers" file. The run log is
	saved to the info file.

	Args:
		argv: command-line arguments, without the program name.

	The function terminates the process through ``sys.exit`` when no options
	are given, when help is requested, when the options cannot be parsed
	(status 2) or when the variance cannot be read from the variance file.
	"""

	version = "v0.17"
	analysisName = ""
	analysisFolder = ""
	varianceSeed = 0.001
	FDRLimit = 0.01
	varianceSeedProvided = False
	removeDuplicateUpper = False
	tags = "!out"
	outlierTag = "out"
	logicOperatorsAsWords = False
	dataFile = ""
	relationsFile = ""
	newRelFile = ""
	removedRelFile = ""
	infoFile = ""
	varFile = ""
	defaultDataFile = "data"
	defaultRelationsFile = "rels"
	defaultTaggedRelFile = "tagged"
	defaultNewRelFile = "cleaned"
	defaultRemovedRelFile = "outliers"
	defaultOutputInfo = "infoFile"
	defaultTableExtension = ".tsv"
	defaultTextExtension = ".txt"
	verbose = True
	oldWay = False # instead of tagging outliers, separating relations files, the old way
	modeUsed = mode.onePerHigher
	logList = [["SanXoTSieve " + version], ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]

	try:
		opts, _ = getopt.getopt(argv, "a:p:v:d:r:n:L:V:f:ubDhH", ["analysis=", "folder=", "varianceseed=", "datafile=", "relfile=", "newrelfile=", "outlierrelfile=", "infofile=", "varfile=", "fdrlimit=", "one-to-one", "no-verbose", "randomise", "removeduplicateupper", "help", "advanced-help", "tags=", "outliertag=", "oldway", "word-operators"])
	except getopt.GetoptError as error:
		# No info file name is known at this point (it is still empty), so the
		# problem is reported on stderr instead of being saved to a file.
		sys.stderr.write("Error while getting parameters: " + str(error) + "\n")
		sys.exit(2)

	if not opts:
		printHelp(version)
		sys.exit()

	for opt, arg in opts:
		if opt in ("-a", "--analysis"):
			analysisName = arg
		elif opt in ("-p", "--place", "--folder"):
			analysisFolder = arg
		elif opt in ("-v", "--var", "--varianceseed"):
			varianceSeed = float(arg)
			varianceSeedProvided = True
		elif opt in ("-d", "--datafile"):
			dataFile = arg
		elif opt in ("-r", "--relfile", "--relationsfile"):
			relationsFile = arg
		elif opt in ("-n", "--newrelfile"):
			# names the tagged/cleaned relations file
			newRelFile = arg
		elif opt == "--outlierrelfile":
			# names the file holding only the outliers (written with --oldway)
			removedRelFile = arg
		elif opt in ("-L", "--infofile"):
			infoFile = arg
		elif opt in ("-V", "--varfile"):
			varFile = arg
		elif opt in ("-u", "--one-to-one"):
			modeUsed = mode.onlyOne
		elif opt in ("-b", "--no-verbose"):
			verbose = False
		elif opt == "--oldway":
			oldWay = True
		elif opt in ("-f", "--fdrlimit"):
			FDRLimit = float(arg)
		elif opt in ("-D", "--removeduplicateupper"):
			removeDuplicateUpper = True
		elif opt == "--tags":
			# the user filter is always combined with the exclusion of "out"
			if arg.strip().lower() != "!out":
				tags = "!out&(" + arg + ")"
		elif opt == "--word-operators":
			logicOperatorsAsWords = True
		elif opt == "--outliertag":
			# NOTE(review): the default tag filter still excludes the literal
			# "out" tag; confirm whether it should follow a custom outlier tag.
			outlierTag = arg
		elif opt in ("-h", "--help"):
			printHelp(version)
			sys.exit()
		elif opt in ("-H", "--advanced-help"):
			printHelp(version, advanced = True)
			sys.exit()

	# REGION: FILE NAMES SETUP

	if not analysisName:
		if dataFile:
			analysisName = os.path.splitext(os.path.basename(dataFile))[0]
		else:
			# NOTE(review): defaultAnalysisName is not defined in this
			# function; presumably a module-level name - confirm.
			analysisName = defaultAnalysisName

	# an analysis name given with a path also provides the default folder
	analysisNameFolder = os.path.dirname(analysisName)
	if analysisNameFolder:
		analysisName = os.path.basename(analysisName)
		if not analysisFolder:
			analysisFolder = analysisNameFolder

	if dataFile and not analysisFolder:
		analysisFolder = os.path.dirname(dataFile)

	# The info file name is resolved before anything that may need to save
	# the log (such as the variance file check below).
	if not infoFile:
		infoFile = os.path.join(analysisFolder, analysisName + "_" + defaultOutputInfo + defaultTextExtension)

	# input
	if not dataFile:
		dataFile = os.path.join(analysisFolder, analysisName + "_" + defaultDataFile + defaultTableExtension)

	if not os.path.dirname(dataFile) and analysisFolder:
		dataFile = os.path.join(analysisFolder, dataFile)

	if os.path.basename(varFile) and not os.path.dirname(varFile):
		varFile = os.path.join(analysisFolder, varFile)

	# an explicit variance seed takes precedence over the variance file
	if varFile and not varianceSeedProvided:
		varianceSeed, varianceOk = stats.extractVarianceFromVarFile(varFile, verbose = verbose, defaultSeed = varianceSeed)
		if not varianceOk:
			logList.append(["Variance not found in text file."])
			stats.saveFile(infoFile, logList, "INFO FILE")
			sys.exit()

	if not relationsFile:
		relationsFile = os.path.join(analysisFolder, analysisName + "_" + defaultRelationsFile + defaultTableExtension)

	if not os.path.dirname(relationsFile):
		relationsFile = os.path.join(analysisFolder, relationsFile)

	# output
	if not newRelFile:
		# suffix: "cleaned" for the old way, "tagged" otherwise
		newRelSuffix = defaultNewRelFile if oldWay else defaultTaggedRelFile
		newRelFile = os.path.join(analysisFolder, analysisName + "_" + newRelSuffix + defaultTableExtension)

	if not removedRelFile:
		removedRelFile = os.path.join(analysisFolder, analysisName + "_" + defaultRemovedRelFile + defaultTableExtension)

	if not os.path.dirname(newRelFile):
		newRelFile = os.path.join(analysisFolder, newRelFile)

	if not os.path.dirname(removedRelFile):
		removedRelFile = os.path.join(analysisFolder, removedRelFile)

	logList.append(["Variance seed = " + str(varianceSeed)])
	logList.append(["Input data file: " + dataFile])
	logList.append(["Input relations file: " + relationsFile])
	if oldWay:
		logList.append(["Output relations file without outliers: " + newRelFile])
		logList.append(["Output relations file with outliers only: " + removedRelFile])
		logList.append(["Removing duplicate higher level elements: " + str(removeDuplicateUpper)])
		logList.append(["OldWay option activated: outliers are removed instead of tagged"])
	else:
		logList.append(["Relations file tagging outliers: " + newRelFile])
		logList.append(["Tags to filter relations: " + tags])
		logList.append(["Tag used for outliers: " + outlierTag])

	# END REGION: FILE NAMES SETUP

	relations = stats.loadRelationsFile(relationsFile)
	data = stats.loadInputDataFile(dataFile)

	# NOTE(review): logResults is returned by both calls but never added to
	# logList - confirm whether it should be saved to the info file.
	if oldWay:
		# only for backward compatibility. Note that tags are not supported
		newRelations, removedRelations, logResults = \
								getRelationsWithoutOutliers(data,
										relations,
										varianceSeed,
										FDRLimit = FDRLimit,
										modeUsed = modeUsed,
										removeDuplicateUpper = removeDuplicateUpper)
		stats.saveFile(newRelFile, newRelations, "idsup\tidinf")
	else:
		newRelations, removedRelations, logResults = \
								tagRelationsWithoutOutliers(data,
										relations,
										varianceSeed,
										FDRLimit = FDRLimit,
										modeUsed = modeUsed,
										removeDuplicateUpper = removeDuplicateUpper,
										tags = tags,
										outlierTag = outlierTag,
										logicOperatorsAsWords = logicOperatorsAsWords)
		stats.saveFile(newRelFile, newRelations, "idsup\tidinf\ttags")

	stats.saveFile(infoFile, logList, "INFO FILE")

	# the separate outliers file only exists in the old way
	if oldWay:
		stats.saveFile(removedRelFile, removedRelations, "idsup\tidinf")
Esempio n. 2
0
def processFiles(tagList, dataFileList, verbose = False, separator = "_", dataFile = "", relsFile = "", relsHeader = "idsup\tidinf", dataHeader = "idinf\tX'inf\tVinf"):
	"""Merge several data files into one data file plus one relations file.

	Every row of each existing input file is written twice: to the relations
	file as ``[oldId, newId]`` and to the data file as
	``[newId, row[1], row[2]]``, where ``oldId`` is the stripped first column
	and ``newId`` is ``oldId + separator + tagList[i]`` for the i-th input.

	Args:
		tagList: tags, indexed in parallel with dataFileList.
		dataFileList: paths of the input data files.
		verbose: when True, print progress and timing information.
		separator: text placed between the original id and the tag.
		dataFile: path of the merged data file to write.
		relsFile: path of the relations file to write.
		relsHeader: header line of the relations file (skipped when empty).
		dataHeader: header line of the data file (skipped when empty).

	Returns:
		A list with one error message per input file that does not exist.
	"""

	message = []
	previousEnd = None # moment at which the previous input file was finished

	# "with" guarantees both writers are closed even if a file fails to load
	with open(relsFile, "w") as relsWriter, open(dataFile, "w") as dataWriter:

		if relsHeader:
			relsWriter.write(relsHeader + "\n")
		if dataHeader:
			dataWriter.write(dataHeader + "\n")

		for i, inputFile in enumerate(dataFileList):

			if os.path.isfile(inputFile):
				dataList = stats.loadInputDataFile(inputFile)
				for row in dataList:
					oldId = row[0].strip()
					newId = oldId + separator + tagList[i]

					stats.saveRow(relsWriter, [oldId, newId])
					stats.saveRow(dataWriter, [newId, row[1], row[2]])

				if verbose:
					if previousEnd is not None:
						# datetime subtraction stays correct across midnight
						print()
						print("Total time: %f" % (datetime.now() - previousEnd).total_seconds())
					# the name is passed as an argument so that a "%" in the
					# path cannot be taken as a format specifier
					print("Reading file #%i: %s" % (i, inputFile))
					print("dataList: %i" % len(dataList))

				# drop the loaded rows before reading the next file
				dataList = []

			else:
				msg = "Error: looks like input file %s does not exist." % inputFile
				message.append(msg)
				print()
				print(msg)
				print()

			gc.collect()
			previousEnd = datetime.now()

	# from v0.03, no duplicate removal is performed

	return message
Esempio n. 3
0
def calibrate(inputRawData = None,
					inputRelations = None,
					rawDataFile = "",
					relationsFile = "",
					kSeed = 1,
					varianceSeed = 0.001,
					medianSide = 100,
					maxIterations = 0,
					verbose = False,
					showGraph = False,
					showSumSQ = False,
					forceParameters = False,
					alphaSeed = 1.0,
					showRank = True,
					useCooperativity = False,
					graphFileVRank = "",
					graphFileVValue = "",
					graphDataFile = "",
					graphDPI = None):
	"""Calibrate raw data and return the calibrated idXV list.

	The raw data and the relations are taken from the arguments or, when
	those are None, loaded from rawDataFile / relationsFile. Unless
	forceParameters is set, k and the variance (and alpha, when
	useCooperativity is set) are obtained from getKandVariance; otherwise the
	seeds are used unchanged. Both graphs are then produced with
	showGraphTool and the data are calibrated with idXVcal.

	Args:
		inputRawData: raw data already in memory, or None to load rawDataFile.
		inputRelations: relations already in memory, or None to load
			relationsFile.
		rawDataFile: path of the raw data file (used if inputRawData is None).
		relationsFile: path of the relations file (used if inputRelations is
			None).
		kSeed, varianceSeed, alphaSeed: starting values passed to
			getKandVariance, or the final values when forceParameters is set.
		medianSide, maxIterations, useCooperativity: passed to
			getKandVariance (medianSide also to showGraphTool).
		verbose: print and log the progress messages of this function.
		showGraph, graphFileVRank, graphFileVValue, graphDataFile, graphDPI:
			passed to showGraphTool.
		forceParameters: skip the fitting and use the seeds as they are.
		showSumSQ, showRank: accepted but not used by this function.

	Returns:
		The tuple (idXV, variance, k, alpha, extraLog). When the raw data or
		the relations are missing, the first four items are None and extraLog
		holds the error message.
	"""

	extraLog = []
	extraLog.append([])
	if verbose:
		print()
		print("loading input raw data file")
		extraLog.append(["loading input raw data file"])

	# "is None" rather than "== None": equality is ambiguous for array-like data
	if inputRawData is None:
		if len(rawDataFile) == 0:
			print("Error: no input raw data")
			extraLog.append(["Error: no input raw data"])
			# nothing can be calibrated without data; fail the same way as
			# the missing relations case below
			return None, None, None, None, extraLog

		inputRawData = stats.loadInputDataFile(rawDataFile)

	if verbose:
		print("loading relations file")
		extraLog.append(["loading relations file"])

	if inputRelations is None:
		if len(relationsFile) == 0:
			# relations are only compulsory when the parameters are fitted
			if not forceParameters:
				print("Error: no relations file")
				extraLog.append(["Error: no relations file"])
				return None, None, None, None, extraLog
		else:
			inputRelations = stats.loadRelationsFile(relationsFile)

	#### calculate k and variance ####

	alpha = 1.0
	if not forceParameters:
		if verbose:
			print("calculating K and variance")
			extraLog.append(["calculating K and variance"])

		# *** just to see it
		# NOTE(review): verbose and showSumSQ are hard-coded to True here, so
		# the corresponding arguments of calibrate are ignored - confirm.
		result = getKandVariance(inputRawData, inputRelations, kSeed = kSeed, varianceSeed = varianceSeed, maxIterations = maxIterations, verbose = True, showSumSQ = True, medianSide = medianSide, alphaSeed = alphaSeed, useCooperativity = useCooperativity)

		k = result[0]
		variance = result[1]
		if useCooperativity:
			alpha = result[2]
	else:
		k = kSeed
		variance = varianceSeed
		alpha = alphaSeed

	# save VRank graph
	showGraphTool(inputRawData, inputRelations, k, variance, alpha, medianSide, showRank = True, graphFile = graphFileVRank, graphData = graphDataFile, dpi = graphDPI, showGraph = showGraph)
	# save VValue graph
	showGraphTool(inputRawData, inputRelations, k, variance, alpha, medianSide, showRank = False, graphFile = graphFileVValue, dpi = graphDPI, showGraph = showGraph)

	# get calibrated idXV

	idXV = idXVcal(inputRawData, k, alpha)

	return idXV, variance, k, alpha, extraLog
Esempio n. 4
0
def calibrate(inputRawData=None,
              inputRelations=None,
              rawDataFile="",
              relationsFile="",
              kSeed=1,
              varianceSeed=0.001,
              medianSide=100,
              maxIterations=0,
              verbose=False,
              showGraph=False,
              showSumSQ=False,
              forceParameters=False,
              alphaSeed=1.0,
              showRank=True,
              useCooperativity=False,
              graphFileVRank="",
              graphFileVValue="",
              graphDataFile="",
              graphDPI=None):
    """Calibrate raw data and return the calibrated idXV list.

    The raw data and the relations are taken from the arguments or, when
    those are None, loaded from rawDataFile / relationsFile. Unless
    forceParameters is set, k and the variance (and alpha, when
    useCooperativity is set) are obtained from getKandVariance; otherwise the
    seeds are used unchanged. Both graphs are then produced with
    showGraphTool and the data are calibrated with idXVcal.

    Args:
        inputRawData: raw data already in memory, or None to load rawDataFile.
        inputRelations: relations already in memory, or None to load
            relationsFile.
        rawDataFile: path of the raw data file (used if inputRawData is None).
        relationsFile: path of the relations file (used if inputRelations is
            None).
        kSeed, varianceSeed, alphaSeed: starting values passed to
            getKandVariance, or the final values when forceParameters is set.
        medianSide, maxIterations, useCooperativity: passed to
            getKandVariance (medianSide also to showGraphTool).
        verbose: print and log the progress messages of this function.
        showGraph, graphFileVRank, graphFileVValue, graphDataFile, graphDPI:
            passed to showGraphTool.
        forceParameters: skip the fitting and use the seeds as they are.
        showSumSQ, showRank: accepted but not used by this function.

    Returns:
        The tuple (idXV, variance, k, alpha, extraLog). When the raw data or
        the relations are missing, the first four items are None and extraLog
        holds the error message.
    """

    extraLog = []
    extraLog.append([])
    if verbose:
        print()
        print("loading input raw data file")
        extraLog.append(["loading input raw data file"])

    # "is None" rather than "== None": equality is ambiguous for array-like data
    if inputRawData is None:
        if len(rawDataFile) == 0:
            print("Error: no input raw data")
            extraLog.append(["Error: no input raw data"])
            # nothing can be calibrated without data; fail the same way as
            # the missing relations case below
            return None, None, None, None, extraLog

        inputRawData = stats.loadInputDataFile(rawDataFile)

    if verbose:
        print("loading relations file")
        extraLog.append(["loading relations file"])

    if inputRelations is None:
        if len(relationsFile) == 0:
            # relations are only compulsory when the parameters are fitted
            if not forceParameters:
                print("Error: no relations file")
                extraLog.append(["Error: no relations file"])
                return None, None, None, None, extraLog
        else:
            inputRelations = stats.loadRelationsFile(relationsFile)

    #### calculate k and variance ####

    alpha = 1.0
    if not forceParameters:
        if verbose:
            print("calculating K and variance")
            extraLog.append(["calculating K and variance"])

        # *** just to see it
        # NOTE(review): verbose and showSumSQ are hard-coded to True here, so
        # the corresponding arguments of calibrate are ignored - confirm.
        result = getKandVariance(inputRawData,
                                 inputRelations,
                                 kSeed=kSeed,
                                 varianceSeed=varianceSeed,
                                 maxIterations=maxIterations,
                                 verbose=True,
                                 showSumSQ=True,
                                 medianSide=medianSide,
                                 alphaSeed=alphaSeed,
                                 useCooperativity=useCooperativity)

        k = result[0]
        variance = result[1]
        if useCooperativity:
            alpha = result[2]
    else:
        k = kSeed
        variance = varianceSeed
        alpha = alphaSeed

    # save VRank graph
    showGraphTool(inputRawData,
                  inputRelations,
                  k,
                  variance,
                  alpha,
                  medianSide,
                  showRank=True,
                  graphFile=graphFileVRank,
                  graphData=graphDataFile,
                  dpi=graphDPI,
                  showGraph=showGraph)
    # save VValue graph
    showGraphTool(inputRawData,
                  inputRelations,
                  k,
                  variance,
                  alpha,
                  medianSide,
                  showRank=False,
                  graphFile=graphFileVValue,
                  dpi=graphDPI,
                  showGraph=showGraph)

    # get calibrated idXV

    idXV = idXVcal(inputRawData, k, alpha)

    return idXV, variance, k, alpha, extraLog
Esempio n. 5
0
def main(argv):
    """Command-line entry point of SanXoTSieve.

    Parses ``argv`` with getopt, resolves the input/output file names from
    the analysis name and folder, loads the relations and data files and then
    either tags outlier relations (default) or, with ``--oldway``, splits the
    relations into a "cleaned" file and an "outliers" file. The run log is
    saved to the info file.

    Args:
        argv: command-line arguments, without the program name.

    The function terminates the process through ``sys.exit`` when no options
    are given, when help is requested, when the options cannot be parsed
    (status 2) or when the variance cannot be read from the variance file.
    """

    version = "v0.17"
    analysisName = ""
    analysisFolder = ""
    varianceSeed = 0.001
    FDRLimit = 0.01
    varianceSeedProvided = False
    removeDuplicateUpper = False
    tags = "!out"
    outlierTag = "out"
    logicOperatorsAsWords = False
    dataFile = ""
    relationsFile = ""
    newRelFile = ""
    removedRelFile = ""
    infoFile = ""
    varFile = ""
    defaultDataFile = "data"
    defaultRelationsFile = "rels"
    defaultTaggedRelFile = "tagged"
    defaultNewRelFile = "cleaned"
    defaultRemovedRelFile = "outliers"
    defaultOutputInfo = "infoFile"
    defaultTableExtension = ".tsv"
    defaultTextExtension = ".txt"
    verbose = True
    oldWay = False  # instead of tagging outliers, separating relations files, the old way
    modeUsed = mode.onePerHigher
    logList = [["SanXoTSieve " + version],
               ["Start: " + strftime("%Y-%m-%d %H:%M:%S")]]

    try:
        opts, _ = getopt.getopt(argv, "a:p:v:d:r:n:L:V:f:ubDhH", [
            "analysis=", "folder=", "varianceseed=", "datafile=", "relfile=",
            "newrelfile=", "outlierrelfile=", "infofile=", "varfile=",
            "fdrlimit=", "one-to-one", "no-verbose", "randomise",
            "removeduplicateupper", "help", "advanced-help", "tags=",
            "outliertag=", "oldway", "word-operators"
        ])
    except getopt.GetoptError as error:
        # No info file name is known at this point (it is still empty), so the
        # problem is reported on stderr instead of being saved to a file.
        sys.stderr.write("Error while getting parameters: " + str(error) + "\n")
        sys.exit(2)

    if not opts:
        printHelp(version)
        sys.exit()

    for opt, arg in opts:
        if opt in ("-a", "--analysis"):
            analysisName = arg
        elif opt in ("-p", "--place", "--folder"):
            analysisFolder = arg
        elif opt in ("-v", "--var", "--varianceseed"):
            varianceSeed = float(arg)
            varianceSeedProvided = True
        elif opt in ("-d", "--datafile"):
            dataFile = arg
        elif opt in ("-r", "--relfile", "--relationsfile"):
            relationsFile = arg
        elif opt in ("-n", "--newrelfile"):
            # names the tagged/cleaned relations file
            newRelFile = arg
        elif opt == "--outlierrelfile":
            # names the file holding only the outliers (written with --oldway)
            removedRelFile = arg
        elif opt in ("-L", "--infofile"):
            infoFile = arg
        elif opt in ("-V", "--varfile"):
            varFile = arg
        elif opt in ("-u", "--one-to-one"):
            modeUsed = mode.onlyOne
        elif opt in ("-b", "--no-verbose"):
            verbose = False
        elif opt == "--oldway":
            oldWay = True
        elif opt in ("-f", "--fdrlimit"):
            FDRLimit = float(arg)
        elif opt in ("-D", "--removeduplicateupper"):
            removeDuplicateUpper = True
        elif opt == "--tags":
            # the user filter is always combined with the exclusion of "out"
            if arg.strip().lower() != "!out":
                tags = "!out&(" + arg + ")"
        elif opt == "--word-operators":
            logicOperatorsAsWords = True
        elif opt == "--outliertag":
            # NOTE(review): the default tag filter still excludes the literal
            # "out" tag; confirm whether it should follow a custom outlier tag.
            outlierTag = arg
        elif opt in ("-h", "--help"):
            printHelp(version)
            sys.exit()
        elif opt in ("-H", "--advanced-help"):
            printHelp(version, advanced=True)
            sys.exit()

    # REGION: FILE NAMES SETUP

    if not analysisName:
        if dataFile:
            analysisName = os.path.splitext(os.path.basename(dataFile))[0]
        else:
            # NOTE(review): defaultAnalysisName is not defined in this
            # function; presumably a module-level name - confirm.
            analysisName = defaultAnalysisName

    # an analysis name given with a path also provides the default folder
    analysisNameFolder = os.path.dirname(analysisName)
    if analysisNameFolder:
        analysisName = os.path.basename(analysisName)
        if not analysisFolder:
            analysisFolder = analysisNameFolder

    if dataFile and not analysisFolder:
        analysisFolder = os.path.dirname(dataFile)

    # The info file name is resolved before anything that may need to save
    # the log (such as the variance file check below).
    if not infoFile:
        infoFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultOutputInfo + defaultTextExtension)

    # input
    if not dataFile:
        dataFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultDataFile + defaultTableExtension)

    if not os.path.dirname(dataFile) and analysisFolder:
        dataFile = os.path.join(analysisFolder, dataFile)

    if os.path.basename(varFile) and not os.path.dirname(varFile):
        varFile = os.path.join(analysisFolder, varFile)

    # an explicit variance seed takes precedence over the variance file
    if varFile and not varianceSeedProvided:
        varianceSeed, varianceOk = stats.extractVarianceFromVarFile(
            varFile, verbose=verbose, defaultSeed=varianceSeed)
        if not varianceOk:
            logList.append(["Variance not found in text file."])
            stats.saveFile(infoFile, logList, "INFO FILE")
            sys.exit()

    if not relationsFile:
        relationsFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultRelationsFile + defaultTableExtension)

    if not os.path.dirname(relationsFile):
        relationsFile = os.path.join(analysisFolder, relationsFile)

    # output
    if not newRelFile:
        # suffix: "cleaned" for the old way, "tagged" otherwise
        newRelSuffix = defaultNewRelFile if oldWay else defaultTaggedRelFile
        newRelFile = os.path.join(
            analysisFolder,
            analysisName + "_" + newRelSuffix + defaultTableExtension)

    if not removedRelFile:
        removedRelFile = os.path.join(
            analysisFolder,
            analysisName + "_" + defaultRemovedRelFile + defaultTableExtension)

    if not os.path.dirname(newRelFile):
        newRelFile = os.path.join(analysisFolder, newRelFile)

    if not os.path.dirname(removedRelFile):
        removedRelFile = os.path.join(analysisFolder, removedRelFile)

    logList.append(["Variance seed = " + str(varianceSeed)])
    logList.append(["Input data file: " + dataFile])
    logList.append(["Input relations file: " + relationsFile])
    if oldWay:
        logList.append(
            ["Output relations file without outliers: " + newRelFile])
        logList.append(
            ["Output relations file with outliers only: " + removedRelFile])
        logList.append([
            "Removing duplicate higher level elements: " +
            str(removeDuplicateUpper)
        ])
        logList.append([
            "OldWay option activated: outliers are removed instead of tagged"
        ])
    else:
        logList.append(["Relations file tagging outliers: " + newRelFile])
        logList.append(["Tags to filter relations: " + tags])
        logList.append(["Tag used for outliers: " + outlierTag])

    # END REGION: FILE NAMES SETUP

    relations = stats.loadRelationsFile(relationsFile)
    data = stats.loadInputDataFile(dataFile)

    # NOTE(review): logResults is returned by both calls but never added to
    # logList - confirm whether it should be saved to the info file.
    if oldWay:
        # only for backward compatibility. Note that tags are not supported
        newRelations, removedRelations, logResults = \
            getRelationsWithoutOutliers(data,
                                        relations,
                                        varianceSeed,
                                        FDRLimit=FDRLimit,
                                        modeUsed=modeUsed,
                                        removeDuplicateUpper=removeDuplicateUpper)
        stats.saveFile(newRelFile, newRelations, "idsup\tidinf")
    else:
        newRelations, removedRelations, logResults = \
            tagRelationsWithoutOutliers(data,
                                        relations,
                                        varianceSeed,
                                        FDRLimit=FDRLimit,
                                        modeUsed=modeUsed,
                                        removeDuplicateUpper=removeDuplicateUpper,
                                        tags=tags,
                                        outlierTag=outlierTag,
                                        logicOperatorsAsWords=logicOperatorsAsWords)
        stats.saveFile(newRelFile, newRelations, "idsup\tidinf\ttags")

    stats.saveFile(infoFile, logList, "INFO FILE")

    # the separate outliers file only exists in the old way
    if oldWay:
        stats.saveFile(removedRelFile, removedRelations, "idsup\tidinf")