def drawGraphForDatasets(datasets, fileName, item, trainBlockSizes, metric, ylim = []): plt.subplot(111) plt.figure(item) plt.title(metric) plt.xlabel("% of dataset") plt.ylabel("score: %s" % metric) plt.grid() for load in datasets: # load it lazy data, label, desc = load() # for test - make dataset smaller initialReduceBlockSize = np.arange(0.5, 0.7, 0.1) trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage) data = trainDataBlocks[0][0] label = trainLabelBlocks[0][0] analyze(data, label) maxItemsInDataset = len(label) testSetPercentage = 0.02 trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = factory.splitDatasetInBlocks(data, np.array(label), trainBlockSizes, testSetPercentage) x = list() y = list() for i in range(len(trainDataBlocks)): trainData = trainDataBlocks[i] trainLabel = trainLabelBlocks[i] # testData = testDataBlocks[i] # testLabel = testLabelBlocks[i] numInstances = np.shape(trainData[0]) score = calcScore(metric, trainData[0], trainLabel[0]) xPercentage = (numInstances[0] * 100) / maxItemsInDataset x.append(xPercentage) #y.append(float("%.4f" % score)) y.append(score) #print "x:%s, y:%s" % (numInstances[0], score) print "------------------------" print y print np.mean(y) print "------------------------" plt.plot(x, y, label=desc) plt.legend(loc="best") if len(ylim) > 0: plt.ylim(ylim) plt.savefig("performance/output/%s_%s.png" % (fileName, metric), dpi=320)
def loadData(experiment): if experiment.has_key("size"): size = experiment["size"] else: size = 0 data, label, description, reduce = experiment["dataset"]() if size > 0: initialReduceBlockSize = np.arange(size, size + 0.2, 0.1) testSetPercentage = 0.2 trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks( data, np.array(label), initialReduceBlockSize, testSetPercentage) data = trainDataBlocks[0][0] label = trainLabelBlocks[0][0] # if required (cancer datasets) perform binary encoding if experiment['binary_encode']: print "perform binary encode" analyze(data, label, "before encode") # encode features (one-hot-encoder / dummy coding) enc = OneHotEncoder() enc.fit(data) data = enc.transform(data).toarray() analyze(data, label, "after encode") return data, label, description, reduce
def loadData(experiment): if experiment.has_key("size"): size = experiment["size"] else: size = 0 data, label, description, reduce = experiment["dataset"]() if size > 0: initialReduceBlockSize = np.arange(size, size+0.2, 0.1) testSetPercentage = 0.2 trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage) data = trainDataBlocks[0][0] label = trainLabelBlocks[0][0] # if required (cancer datasets) perform binary encoding if experiment['binary_encode']: print "perform binary encode" analyze(data, label, "before encode") # encode features (one-hot-encoder / dummy coding) enc = OneHotEncoder() enc.fit(data) data = enc.transform(data).toarray() analyze(data, label, "after encode") return data, label, description, reduce
def getLearningCurve(data, label):
    """Build learning-curve data: training sizes plus mean train/test ROC-AUC.

    Splits (data, label) via the module-level trainBlockSizes / testSetPercentage
    settings and averages rocAuc over the blocks of each split.
    Returns (x, yTrain, yTest).
    """
    trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(
        data, np.array(label), trainBlockSizes, testSetPercentage)
    x = []
    yTrain = []
    yTest = []
    for trainData, trainLabel, testData, testLabel in zip(
            trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks):
        trainScores = []
        testScores = []
        for blockData, blockLabel, tData, tLabel in zip(trainData, trainLabel, testData, testLabel):
            trainS, testS = rocAuc(blockData, blockLabel, tData, tLabel)
            trainScores.append(trainS)
            testScores.append(testS)
        yTrain.append(np.mean(trainScores))
        yTest.append(np.mean(testScores))
        # sparse blocks expose `indices`: take rows of the first block;
        # otherwise fall back to the second dimension of the block list
        if hasattr(trainData[0], "indices"):
            x.append(np.shape(trainData[0])[0])
        else:
            x.append(np.shape(trainData)[1])
    return x, yTrain, yTest
def getXYForDataLabel(data, label):
    """Return (x, y): percentage of the dataset used vs. mean rocAuc score.

    Uses the module-level trainBlockSizes / testSetPercentage settings.
    """
    maxItemsInDataset = len(label)
    trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(
        data, np.array(label), trainBlockSizes, testSetPercentage)
    x = []
    y = []
    for trainData, trainLabel in zip(trainDataBlocks, trainLabelBlocks):
        scores = [rocAuc(blockData, blockLabel)
                  for blockData, blockLabel in zip(trainData, trainLabel)]
        numInstances = np.shape(trainData[0])
        x.append((numInstances[0] * 100) / maxItemsInDataset)
        y.append(np.mean(scores))
    return x, y
# Estimate intrinsic dimensionality (mle) before and after one-hot encoding.
estimatedDimension = mle(data)
print "estimated dimension without binary encoding: %s" % estimatedDimension
# one-hot encode the features, then re-estimate on the encoded data
enc = OneHotEncoder()
enc.fit(data)
data = enc.transform(data).toarray()
analyze(data, label, "after binary encode")
estimatedDimensionWithBinaryEncode = mle(data)
print "estimated dimension with binary encoding: %s" % estimatedDimensionWithBinaryEncode
# NOTE(review): the triple-quoted string below turns the rest of this section
# into printed text instead of executed code (a quick way to disable it). Its
# closing quotes are not visible in this chunk -- confirm it is terminated
# further down the file before editing.
print '''
data, label, desc, size = df.loadFirstPlistaDataset()
trainBlockSizes = np.arange(0.001, 0.005, 0.001)
testSetPercentage = 0.2
trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = df.splitDatasetInBlocks(
    data, np.array(label), trainBlockSizes, testSetPercentage)
# estimate for different sizes of the data set
x = list()
yDuration = list()
yDimension = list()
for i in range(len(trainDataBlocks)):
    estimatedDimension = list()
    duration = list()
    shape = np.shape(trainDataBlocks[i][0])
    x.append(shape[0])
    for j in range(len(trainDataBlocks[i])):
        print j
        data = trainDataBlocks[i][j]
def getThirdPlistaData():
    """Load the third plista dataset and return a small (first-block) training slice."""
    # sic: the upstream loader name carries the "Thrid" typo
    data, label, _, _ = df.loadThridPlistaDataset()
    reduceBlockSizes = np.arange(0.01, 0.3, 0.1)
    split = df.splitDatasetInBlocks(
        data, np.array(label), reduceBlockSizes, 0.1)
    trainDataBlocks, trainLabelBlocks = split[0], split[1]
    return trainDataBlocks[0][0], trainLabelBlocks[0][0], _, _
import numpy as np
import time
from sklearn import linear_model
import data_factory as df
import matplotlib.pyplot as plt
import dr

# Benchmark script: prepare a reduced slice of the first plista dataset and
# time dimensionality-reduction algorithms over a range of target dimensions.
plt.figure()
plt.xlabel('dimensions')
plt.ylabel('duration (seconds)')
# NOTE(review): title says "ROC Curves" but the axes are dimensions/duration --
# looks like a copy-paste leftover; confirm before changing the output.
plt.title('ROC Curves')
data, label, desc, size = df.loadFirstPlistaDataset()
# keep only the first ~10% block as the working data
initialReduceBlockSize = np.arange(0.1, 0.7, 0.1)
trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = df.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, 0.1)
data = trainDataBlocks[0][0]
label = trainLabelBlocks[0][0]
x = list()
sparse_y = list()
dense_y = list()
# all DR algorithms except t-SNE
algos = dr.getAllAlgosExlude(["tsne"])
algodurations = {}
dimension_range = np.arange(50, 250, 50)
for i in dimension_range:
    # dimensions
    x.append(i)
    # (loop body continues beyond this chunk)
def getThirdPlistaData():
    """Return the first reduced training block of the third plista dataset."""
    # upstream loader name contains the "Thrid" typo; keep the call as-is
    data, label, _, _ = df.loadThridPlistaDataset()
    blocks = df.splitDatasetInBlocks(
        data, np.array(label), np.arange(0.01, 0.3, 0.1), 0.1)
    # blocks = (trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks)
    return blocks[0][0][0], blocks[1][0][0], _, _
            # tail of a per-block scoring loop -- the enclosing def and the
            # code that fills `scores` are above this chunk (not visible here)
            numInstances = np.shape(trainData[0])
            # percentage of the full dataset this block covers
            # (Python 2 integer division floors the value for int operands)
            xPercentage = (numInstances[0] * 100) / maxItemsInDataset
            x.append(xPercentage)
            y.append(np.mean(scores))
    return x, y


# Script: one figure per small dataset, plotting <metric> for each.
# NOTE(review): testSetPercentage and metric are presumably defined earlier in
# this file -- confirm against the full source.
dataSets = data_factory.getSmallDatasets()
for i in range(len(dataSets)):
    load = dataSets[i]
    data, label, desc, size = load()
    if size > 0:
        # shrink the dataset to roughly its first `size` fraction
        initialReduceBlockSize = np.arange(size, size+0.2, 0.1)
        trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage)
        data = trainDataBlocks[0][0]
        label = trainLabelBlocks[0][0]
    print np.shape(data)
    print np.shape(label)
    analyze(data, label)
    plt.subplot(111)
    plt.figure(i)
    plt.title("%s - %s" % (desc, metric))
    plt.xlabel("% of dataset")
    plt.ylabel(metric)
    plt.grid()