def drawGraphForDatasets(datasets, fileName, item, trainBlockSizes, metric, ylim = []): plt.subplot(111) plt.figure(item) plt.title(metric) plt.xlabel("% of dataset") plt.ylabel("score: %s" % metric) plt.grid() for load in datasets: # load it lazy data, label, desc = load() # for test - make dataset smaller initialReduceBlockSize = np.arange(0.5, 0.7, 0.1) trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage) data = trainDataBlocks[0][0] label = trainLabelBlocks[0][0] analyze(data, label) maxItemsInDataset = len(label) testSetPercentage = 0.02 trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = factory.splitDatasetInBlocks(data, np.array(label), trainBlockSizes, testSetPercentage) x = list() y = list() for i in range(len(trainDataBlocks)): trainData = trainDataBlocks[i] trainLabel = trainLabelBlocks[i] # testData = testDataBlocks[i] # testLabel = testLabelBlocks[i] numInstances = np.shape(trainData[0]) score = calcScore(metric, trainData[0], trainLabel[0]) xPercentage = (numInstances[0] * 100) / maxItemsInDataset x.append(xPercentage) #y.append(float("%.4f" % score)) y.append(score) #print "x:%s, y:%s" % (numInstances[0], score) print "------------------------" print y print np.mean(y) print "------------------------" plt.plot(x, y, label=desc) plt.legend(loc="best") if len(ylim) > 0: plt.ylim(ylim) plt.savefig("performance/output/%s_%s.png" % (fileName, metric), dpi=320)
def loadData(experiment): if experiment.has_key("size"): size = experiment["size"] else: size = 0 data, label, description, reduce = experiment["dataset"]() if size > 0: initialReduceBlockSize = np.arange(size, size + 0.2, 0.1) testSetPercentage = 0.2 trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks( data, np.array(label), initialReduceBlockSize, testSetPercentage) data = trainDataBlocks[0][0] label = trainLabelBlocks[0][0] # if required (cancer datasets) perform binary encoding if experiment['binary_encode']: print "perform binary encode" analyze(data, label, "before encode") # encode features (one-hot-encoder / dummy coding) enc = OneHotEncoder() enc.fit(data) data = enc.transform(data).toarray() analyze(data, label, "after encode") return data, label, description, reduce
def loadData(experiment): if experiment.has_key("size"): size = experiment["size"] else: size = 0 data, label, description, reduce = experiment["dataset"]() if size > 0: initialReduceBlockSize = np.arange(size, size+0.2, 0.1) testSetPercentage = 0.2 trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage) data = trainDataBlocks[0][0] label = trainLabelBlocks[0][0] # if required (cancer datasets) perform binary encoding if experiment['binary_encode']: print "perform binary encode" analyze(data, label, "before encode") # encode features (one-hot-encoder / dummy coding) enc = OneHotEncoder() enc.fit(data) data = enc.transform(data).toarray() analyze(data, label, "after encode") return data, label, description, reduce
def getLearningCurve(data, label):
    """Build learning-curve data: training sizes plus mean train/test ROC-AUC.

    Splits (data, label) via the module-level trainBlockSizes / testSetPercentage
    settings and averages rocAuc over the blocks of each split.
    Returns (x, yTrain, yTest).
    """
    trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(
        data, np.array(label), trainBlockSizes, testSetPercentage)
    x = []
    yTrain = []
    yTest = []
    for trainData, trainLabel, testData, testLabel in zip(
            trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks):
        trainScores = []
        testScores = []
        for blockData, blockLabel, tData, tLabel in zip(trainData, trainLabel, testData, testLabel):
            trainS, testS = rocAuc(blockData, blockLabel, tData, tLabel)
            trainScores.append(trainS)
            testScores.append(testS)
        yTrain.append(np.mean(trainScores))
        yTest.append(np.mean(testScores))
        # sparse blocks expose `indices`: take rows of the first block;
        # otherwise fall back to the second dimension of the block list
        if hasattr(trainData[0], "indices"):
            x.append(np.shape(trainData[0])[0])
        else:
            x.append(np.shape(trainData)[1])
    return x, yTrain, yTest
def getXYForDataLabel(data, label):
    """Return (x, y): percentage of the dataset used vs. mean rocAuc score.

    Uses the module-level trainBlockSizes / testSetPercentage settings.
    """
    maxItemsInDataset = len(label)
    trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(
        data, np.array(label), trainBlockSizes, testSetPercentage)
    x = []
    y = []
    for trainData, trainLabel in zip(trainDataBlocks, trainLabelBlocks):
        scores = [rocAuc(blockData, blockLabel)
                  for blockData, blockLabel in zip(trainData, trainLabel)]
        numInstances = np.shape(trainData[0])
        x.append((numInstances[0] * 100) / maxItemsInDataset)
        y.append(np.mean(scores))
    return x, y
# Estimate intrinsic dimensionality (mle) before and after one-hot encoding.
estimatedDimension = mle(data)
print "estimated dimension without binary encoding: %s" % estimatedDimension
# one-hot encode the features, then re-estimate on the encoded data
enc = OneHotEncoder()
enc.fit(data)
data = enc.transform(data).toarray()
analyze(data, label, "after binary encode")
estimatedDimensionWithBinaryEncode = mle(data)
print "estimated dimension with binary encoding: %s" % estimatedDimensionWithBinaryEncode
# NOTE(review): the triple-quoted string below turns the rest of this section
# into printed text instead of executed code (a quick way to disable it). Its
# closing quotes are not visible in this chunk -- confirm it is terminated
# further down the file before editing.
print '''
data, label, desc, size = df.loadFirstPlistaDataset()
trainBlockSizes = np.arange(0.001, 0.005, 0.001)
testSetPercentage = 0.2
trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = df.splitDatasetInBlocks(
    data, np.array(label), trainBlockSizes, testSetPercentage)
# estimate for different sizes of the data set
x = list()
yDuration = list()
yDimension = list()
for i in range(len(trainDataBlocks)):
    estimatedDimension = list()
    duration = list()
    shape = np.shape(trainDataBlocks[i][0])
    x.append(shape[0])
    for j in range(len(trainDataBlocks[i])):
        print j
        data = trainDataBlocks[i][j]
def getThirdPlistaData():
    """Load the third plista dataset and return a small (first-block) training slice."""
    # sic: the upstream loader name carries the "Thrid" typo
    data, label, _, _ = df.loadThridPlistaDataset()
    reduceBlockSizes = np.arange(0.01, 0.3, 0.1)
    split = df.splitDatasetInBlocks(
        data, np.array(label), reduceBlockSizes, 0.1)
    trainDataBlocks, trainLabelBlocks = split[0], split[1]
    return trainDataBlocks[0][0], trainLabelBlocks[0][0], _, _
import numpy as np
import time
from sklearn import linear_model
import data_factory as df
import matplotlib.pyplot as plt
import dr

# Benchmark script: prepare a reduced slice of the first plista dataset and
# time dimensionality-reduction algorithms over a range of target dimensions.
plt.figure()
plt.xlabel('dimensions')
plt.ylabel('duration (seconds)')
# NOTE(review): title says "ROC Curves" but the axes are dimensions/duration --
# looks like a copy-paste leftover; confirm before changing the output.
plt.title('ROC Curves')
data, label, desc, size = df.loadFirstPlistaDataset()
# keep only the first ~10% block as the working data
initialReduceBlockSize = np.arange(0.1, 0.7, 0.1)
trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = df.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, 0.1)
data = trainDataBlocks[0][0]
label = trainLabelBlocks[0][0]
x = list()
sparse_y = list()
dense_y = list()
# all DR algorithms except t-SNE
algos = dr.getAllAlgosExlude(["tsne"])
algodurations = {}
dimension_range = np.arange(50, 250, 50)
for i in dimension_range:
    # dimensions
    x.append(i)
    # (loop body continues beyond this chunk)
def getThirdPlistaData():
    """Return the first reduced training block of the third plista dataset."""
    # upstream loader name contains the "Thrid" typo; keep the call as-is
    data, label, _, _ = df.loadThridPlistaDataset()
    blocks = df.splitDatasetInBlocks(
        data, np.array(label), np.arange(0.01, 0.3, 0.1), 0.1)
    # blocks = (trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks)
    return blocks[0][0][0], blocks[1][0][0], _, _
            # tail of a per-block scoring loop -- the enclosing def and the
            # code that fills `scores` are above this chunk (not visible here)
            numInstances = np.shape(trainData[0])
            # percentage of the full dataset this block covers
            # (Python 2 integer division floors the value for int operands)
            xPercentage = (numInstances[0] * 100) / maxItemsInDataset
            x.append(xPercentage)
            y.append(np.mean(scores))
    return x, y


# Script: one figure per small dataset, plotting <metric> for each.
# NOTE(review): testSetPercentage and metric are presumably defined earlier in
# this file -- confirm against the full source.
dataSets = data_factory.getSmallDatasets()
for i in range(len(dataSets)):
    load = dataSets[i]
    data, label, desc, size = load()
    if size > 0:
        # shrink the dataset to roughly its first `size` fraction
        initialReduceBlockSize = np.arange(size, size+0.2, 0.1)
        trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage)
        data = trainDataBlocks[0][0]
        label = trainLabelBlocks[0][0]
    print np.shape(data)
    print np.shape(label)
    analyze(data, label)
    plt.subplot(111)
    plt.figure(i)
    plt.title("%s - %s" % (desc, metric))
    plt.xlabel("% of dataset")
    plt.ylabel(metric)
    plt.grid()