Example #1
	def loaddata_thread(self,fn):
		start = time.time()
		self.dataset = loaddata.load_data(fn)
		for i in self.dataset:
			self.init_data_Text.insert(INSERT,str(i))
			self.init_data_Text.insert(INSERT,'\n')
		end = time.time()
		self.log_data_Text.insert(INSERT,'Loading finished! '+str(len(self.dataset))+' items\tElapsed: '+str(round(end-start,2))+'s\n')
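A minimal sketch of how a loader like this is usually started without blocking the Tkinter main loop (the click_load name, the tkFiledialog dialog, and the threading call are assumptions, not part of the original class):

	def click_load(self):
		import threading
		fn = tkFiledialog.askopenfilename()
		# Run the potentially slow load on a worker thread so the window stays responsive.
		threading.Thread(target=self.loaddata_thread, args=(fn,), daemon=True).start()

Strictly speaking, Tk widgets should only be updated from the main thread, so a production version would hand results back through a queue instead of inserting into the Text widgets directly.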
Example #2
	def click1(self):
		fn = tkFiledialog.askopenfilename()    # choose the data file
		# fnlist = os.walk( fn )                  # list the directory tree
		# print(fn)
		self.dataset = loaddata.load_data(fn)
		print(self.dataset)
		# self.init_data_Text.insert(INSERT,'aojifoajeifjoaeijfoa')
	# 	for i in self.dataset:
	# 		for j in i:
	# 			self.init_data_Text.insert(INSERT,j)
	# 		self.init_data_Text.insert(INSERT,'\n')
		for i in self.dataset:
			self.init_data_Text.insert(INSERT,i)
			self.init_data_Text.insert(INSERT,'\n')
Example #3
def train():
    best_acc = 0.0
    saver = tf.train.Saver()
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        writer = tf.summary.FileWriter('logs', sess.graph)
        sess.run(init)
        c = []
        X_train, y_train = loaddata.load_data()
        total_batch = int(X_train.shape[0] / config.batch_size)
        for i in range(config.training_iters):
            avg_cost = 0
            for batch in range(total_batch):
                batch_x = X_train[batch * config.batch_size:(batch + 1) *
                                  config.batch_size, :]
                batch_y = y_train[batch * config.batch_size:(batch + 1) *
                                  config.batch_size, :]
                _, co = sess.run([model.optimizer, model.cost],
                                 feed_dict={
                                     model.x: batch_x,
                                     model.y: batch_y,
                                     model.keep_prob: 0.5
                                 })

                avg_cost += co

                accuet, out = sess.run([model.accuracy, model.softmax],
                                       feed_dict={
                                           model.x: batch_x,
                                           model.y: batch_y,
                                           model.keep_prob: 1.0
                                       })
                print("train accuracy=" + "{:.6f}".format(accuet))
            #print(out)
            c.append(avg_cost)
            if (i + 1) % config.display_step == 0:
                print("Iter " + str(i + 1) + ", Training Loss= " +
                      "{:.6f}".format(avg_cost))
            # if i>13:
            #     if accuet>best_acc:
            #         best_acc=accuet
        saver.save(sess=sess, save_path="./ckpt/test-model.ckpt")

        for variable in tf.trainable_variables():
            print(variable)

        print("Optimization Finished!")
        writer.close()
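A companion sketch (an assumption, not taken from the original project) of how the checkpoint saved above is typically restored for evaluation in the same TF1 style:

def evaluate():
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Restore the variables written by train() and run the accuracy op once.
        saver.restore(sess, "./ckpt/test-model.ckpt")
        X_eval, y_eval = loaddata.load_data()
        acc = sess.run(model.accuracy,
                       feed_dict={model.x: X_eval,
                                  model.y: y_eval,
                                  model.keep_prob: 1.0})
        print("eval accuracy = {:.6f}".format(acc))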
Example #4
def test_model():
    # some declared variables
    inputImageShape = (224, 224, 3)
    num_of_output_classes = 2
    _, testX, _, testY = load_data()
    model = cnn_model_structure(input_shape=inputImageShape, num_classes=num_of_output_classes)
    weights = ''
    for w in glob.glob('models\\*.h5'):
        weights = w
    model.load_weights(weights)
    eval = model.predict(testX)
    out_class=np.array([np.argmax(out) for out in eval])
    ref_class = np.array([np.argmax(out) for out in testY])
    print(out_class)
    print(ref_class)
    print('Acc = '+str((1-float(np.count_nonzero(ref_class-out_class))/float(len(ref_class)))*100))
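The accuracy expression above can be written more directly with numpy; a small equivalent check using the same arrays:

    # Fraction of matching class indices, expressed as a percentage.
    acc = float(np.mean(out_class == ref_class)) * 100
    print('Acc = ' + str(acc))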
Example #5
def train_model():
    # some declared variables
    randomSeed = 42
    networkInitialize = glorot_normal()
    inputImageShape = (224, 224, 3)
    epoch = 200
    batchSize = 32
    num_of_output_classes = 2
    random.seed(randomSeed)
    learningRate = 0.01

    trainX, testX, trainY, testY = load_data()
    # augmentation process
    augmentation = ImageDataGenerator(rotation_range=30,
                                     width_shift_range=0.1,
                                     height_shift_range=0.1,
                                     shear_range=0.2,
                                     zoom_range=0.2,
                                     horizontal_flip=True,
                                     fill_mode="nearest")

    checkpoint = ModelCheckpoint(
        'models\\model-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5',
        verbose=1,
        monitor='val_acc',
        save_best_only=True,
        mode='auto')
    csv_logger = CSVLogger('report\\log_' + str(learningRate) + '.csv',
                           append=False,
                           separator=';')
    # training
    # compile the model
    model = cnn_model_structure(input_shape=inputImageShape,
                                num_classes=num_of_output_classes)
    model.compile(loss='categorical_crossentropy',
                  optimizer='Adam',
                  metrics=['accuracy'])
    # print(model.summary())
    model = model.fit_generator(augmentation.flow(trainX,
                                                  trainY,
                                                  batch_size=batchSize),
                                validation_data=(testX, testY),
                                steps_per_epoch=len(trainX),
                                epochs=epoch,
                                callbacks=[csv_logger, checkpoint])
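In Keras, steps_per_epoch is conventionally the number of batches per epoch rather than the number of samples; a hedged variant of the call above (same variables, not the author's original settings):

    # Variant: steps_per_epoch expressed as batches per epoch.
    history = model.fit_generator(augmentation.flow(trainX, trainY, batch_size=batchSize),
                                  validation_data=(testX, testY),
                                  steps_per_epoch=len(trainX) // batchSize,
                                  epochs=epoch,
                                  callbacks=[csv_logger, checkpoint])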
Example #6
def prepareData():
    #Load data
    all_data = loaddata.load_data()
    data = all_data[1]

    #Stem attributes
    data['search_term'] = data['search_term'].map(lambda x:stem(x))
    data['product_title'] = data['product_title'].map(lambda x:stem(x))
    data['product_description'] = data['product_description'].map(lambda x:stem(x))
    print('start brand info!')
    data['brand'] = data['brand'].map(lambda x:stem(x))
    print('finish brand info!')
    data['bullet1'] = data['bullet1'].map(lambda x:stem(x))
    data['bullet2'] = data['bullet2'].map(lambda x:stem(x))
    data['bullet3'] = data['bullet3'].map(lambda x:stem(x))
    data['bullet4'] = data['bullet4'].map(lambda x:stem(x))
    data['material'] = data['material'].map(lambda x:stem(x))

    data['product_info'] = data['search_term']+"\t"+data['product_title'] +"\t"+data['product_description']

    # Calculate length
    data['len_of_query'] = data['search_term'].map(lambda x:len(x.split())).astype(np.int64)
    data['len_of_title'] = data['product_title'].map(lambda x:len(x.split())).astype(np.int64)
    data['len_of_description'] = data['product_description'].map(lambda x:len(x.split())).astype(np.int64)
    data['len_of_brand'] = data['brand'].map(lambda x:len(x.split())).astype(np.int64)
    data['len_of_b1'] = data['bullet1'].map(lambda x:len(x.split())).astype(np.int64)
    data['len_of_b2'] = data['bullet2'].map(lambda x:len(x.split())).astype(np.int64)
    data['len_of_b3'] = data['bullet3'].map(lambda x:len(x.split())).astype(np.int64)
    data['len_of_b4'] = data['bullet4'].map(lambda x:len(x.split())).astype(np.int64)

    # Search and Query
    data['search_term'] = data['product_info'].map(lambda x:seg_words(x.split('\t')[0],x.split('\t')[1]))
    data['attr'] = data['search_term']+"\t"+data['brand']
    data['bullets'] = data['search_term']+"\t"+data['bullet1']+"\t"+data['bullet2']+"\t"+data['bullet3']+"\t"+data['bullet4']

    data.to_csv('features.csv', sep='\t', encoding='ISO-8859-1')
    return all_data
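The column-by-column stemming above can also be written as a single loop; a minimal equivalent sketch over the same columns:

    text_columns = ['search_term', 'product_title', 'product_description', 'brand',
                    'bullet1', 'bullet2', 'bullet3', 'bullet4', 'material']
    for column in text_columns:
        # Apply the same stemmer to every free-text column.
        data[column] = data[column].map(stem)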
Example #7
import sklearn.neighbors
import matplotlib.pyplot as plt
import numpy
from scipy import cluster
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
import pydotplus

## Load the loaddata module, which contains the helpers for importing the data
import loaddata
# Load into `data` the columns we will use for the clustering
# Load into `alldata` the complete data with all of its columns, so that
# we can retrieve the results later on

FILE = "../../Datos/DATATHON_2015_Processed.csv"

data = loaddata.load_data(FILE)
alldata = numpy.asarray(loaddata.load_data(FILE))
datastr = numpy.asarray(loaddata.load_all_data(FILE))

## 1. Normalization of the data
# http://scikit-learn.org/stable/modules/preprocessing.html
min_max_scaler = preprocessing.MinMaxScaler()
data = min_max_scaler.fit_transform(data)

# 2. Compute the similarity matrix
dist = sklearn.neighbors.DistanceMetric.get_metric('euclidean')
matsim = dist.pairwise(data)

# 3. Building the Dendrogram
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
clusters = cluster.hierarchy.linkage(matsim, method='ward')
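A short follow-up sketch (an assumption about the usual next step, not part of the original script) that draws and cuts the dendrogram built from the linkage matrix above; the threshold value is only illustrative:

# Plot the dendrogram and assign flat cluster labels by cutting at a chosen distance.
cut = 3
cluster.hierarchy.dendrogram(clusters, color_threshold=cut)
labels = cluster.hierarchy.fcluster(clusters, cut, criterion='distance')
plt.show()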
Example #8
def trainandsave():

    # Load the data
    root_path = "./data/cifar10"
    data_folder = "train"
    batch_size = 64
    data_type = "train"
    trainloader = load_data(root_path, data_folder, batch_size, data_type)

    # Network structure
    # The input is 32*32*3 = 3072 dimensions, the hidden layers have 1500 and 200 units, and the output has 10 dimensions (10 classes)

    # net = Batch_Net(32*32, 1500, 200, 10)

    # net = vgg16_bn()

    net = LeNet()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("device = ", device)
    net.to(device=device)

    # Optimizer, learning rate 0.001
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    # Loss function; a custom loss could be used, here we use cross-entropy
    celoss = nn.CrossEntropyLoss()

    # Training loop
    print("trainloader = ", trainloader)
    writer = SummaryWriter('runs/train2')  # logging
    for epoch in range(250):  # each epoch is one full pass over the training set

        running_loss = 0.0  # accumulator so we can report the loss
        # iterate over the trainloader built above
        for i, data in enumerate(trainloader, 0):
            # enumerate returns both the index and the batch
            # get the inputs
            # each batch holds the images and their labels, unpacked into inputs and labels
            inputs, labels = data

            #print("inputs = ", inputs)
            #print("labels = ", labels)
            # wrap them in Variable (legacy PyTorch API)
            inputs, labels = Variable(inputs), Variable(labels)
            # zero the gradients; otherwise backprop accumulates them across iterations
            optimizer.zero_grad()

            # for a fully-connected net the 32*32 images would need to be flattened first

            # forward + backward + optimize
            inputs = inputs.to(device)
            labels = labels.to(device)
            # feed the batch through the CNN
            outputs = net(inputs)

            loss = celoss(outputs, labels)  # compute the loss
            loss.backward()  # backpropagate to get the gradients
            optimizer.step()  # update the parameters with those gradients
            #running_loss += loss.data[0]       # accumulate the loss
            running_loss += loss.item()  # accumulate the loss
            # print the training loss every 200 mini-batches
            if (i + 1) % 200 == 0:
                localtime = time.asctime(time.localtime(time.time()))
                writer.add_scalar('running_loss',
                                  running_loss / 200,
                                  global_step=((epoch * 600) + (i + 1)))
                print(localtime, '[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1,
                       running_loss / 200))  # divide by 200 to get the average loss over these batches
                running_loss = 0.0  # reset the accumulator for the next 200 batches
        # save the parameters every 50 epochs
        if (epoch + 1) % 50 == 0:
            save_name = "net_params" + str(epoch + 1) + ".pkl"
            torch.save(net.state_dict(), save_name)

    print('Finished Training')
    # save the trained network
    torch.save(net, 'net.pkl')  # save the whole network, both structure and parameters
Example #9
def diagnosticPlot(name, values):
    args = values['generation-args'][1]
    errors = []
    for key in ['responsewindow']:
        label, value = args[key]
        value = parsematlab.parse(value)
        if isinstance(value, str):
            errors.append(label + '\n    ' + value.replace('\n', '\n    '))
        args[key] = value
    if len(errors) > 0:
        Error('\n\n'.join(errors))
        return
    response_window = args['responsewindow']
    fnames = values['flist'][1]
    removeanomalies = args['removeanomalies'][1]
    weightfile = values['weightfile'][1]
    data = []
    type = []
    samplingrate = None
    try:
        for fname in fnames:
            result = loaddata.load_data(fname, response_window, None,
                removeanomalies = removeanomalies)
            if isinstance(result, str):
                Error(result)
                return
            if samplingrate == None:
                samplingrate = result[2]
            if samplingrate != result[2]:
                Error('Not all data files have the same sampling rate.')
                return
            data.append(result[0])
            type.append(result[1])
        if len(data) == 0 or len(type) == 0:
            Error('You must select some data to plot.')
            return
        try:
            data = np.concatenate(data)
        except ValueError:
            Error('Not all data files have the same number of channels.')
            return
        type = np.concatenate(type)
        if weightfile:
            weights = loaddata.load_weights(weightfile)
            if isinstance(weights, str):
                Error(weights)
                return
            classifier = np.zeros(data.shape[1:])
            classifier[:weights.shape[0], :weights.shape[1]] = weights
            classifier_max = max(abs(classifier.max()), abs(classifier.min()))
        else:
            classifier = None
        if isinstance(classifier, str):
            Error(classifier)
            return
        num_plots = 3 if classifier is None else 4
        signed_r = np.zeros(data.shape[1:])
        for row in range(signed_r.shape[0]):
            for col in range(signed_r.shape[1]):
                signed_r[row, col] = stats.linregress(
                    data[:, row, col], type
                )[2]
        signed_r_max = max(abs(signed_r.max()), abs(signed_r.min()))
        x = np.arange(data.shape[1]) * 1000 / samplingrate
        target = data[type.nonzero()[0]].mean(axis = 0)
        nontarget = data[(~type).nonzero()[0]].mean(axis = 0)
        vmin, vmax = ylim = [min(target.min(), nontarget.min()),
            max(target.max(), nontarget.max())]
        fig = pylab.figure()
        fig.subplots_adjust(bottom = 0.06, top = 0.93, hspace = 0.45)
        master_ax = ax = pylab.subplot(num_plots, 1, 1)
        pylab.title('Target', fontsize = 'medium')
        pylab.imshow(target.transpose(), interpolation = 'nearest',
            cmap = 'PRGn', aspect = 'auto', vmin = vmin, vmax = vmax,
            origin = 'lower', extent = (
                0,
                data.shape[1] * 1000 / samplingrate,
                -0.5,
                data.shape[2] - 0.5
            )
        )
        pylab.xticks(fontsize = 'small')
        pylab.yticks(range(data.shape[2]),
            [str(i) for i in range(1, data.shape[2] + 1)],
            fontsize = 'small')
        pylab.axes(pylab.colorbar().ax)
        pylab.yticks(fontsize = 'small')
        ax = pylab.subplot(num_plots, 1, 2, sharex = master_ax,
            sharey = master_ax)
        pylab.title('Non-Target', fontsize = 'medium')
        pylab.imshow(nontarget.transpose(), interpolation = 'nearest',
            cmap = 'PRGn', aspect = 'auto', vmin = vmin, vmax = vmax,
            origin = 'lower', extent = (
                0,
                data.shape[1] * 1000 / samplingrate,
                -0.5,
                data.shape[2] - 0.5
            )
        )
        pylab.xticks(fontsize = 'small')
        pylab.yticks(range(data.shape[2]),
            [str(i) for i in range(1, data.shape[2] + 1)],
            fontsize = 'small')
        pylab.axes(pylab.colorbar().ax)
        pylab.yticks(fontsize = 'small')
        ax = pylab.subplot(num_plots, 1, 3, sharex = master_ax,
            sharey = master_ax)
        pylab.title('Correlation Coefficient', fontsize = 'medium')
        pylab.imshow(signed_r.transpose(), interpolation = 'nearest',
            cmap = 'PRGn', aspect = 'auto', vmin = -signed_r_max,
            vmax = signed_r_max, origin = 'lower', extent = (
                0,
                data.shape[1] * 1000 / samplingrate,
                -0.5,
                data.shape[2] - 0.5
            )
        )
        pylab.xticks(fontsize = 'small')
        pylab.yticks(range(data.shape[2]),
            [str(i) for i in range(1, data.shape[2] + 1)],
            fontsize = 'small')
        pylab.axes(pylab.colorbar().ax)
        pylab.yticks(fontsize = 'small')

        if classifier is None:
            return

        ax = pylab.subplot(num_plots, 1, 4, sharex = master_ax,
            sharey = master_ax)
        pylab.title('Classifier Weights', fontsize = 'medium')
        pylab.imshow(classifier.transpose(), interpolation = 'nearest',
            cmap = 'PRGn', aspect = 'auto', vmin = -classifier_max,
            vmax = classifier_max, origin = 'lower', extent = (
                0,
                data.shape[1] * 1000 / samplingrate,
                -0.5,
                data.shape[2] - 0.5
            )
        )
        pylab.xticks(fontsize = 'small')
        pylab.yticks(range(data.shape[2]),
            [str(i) for i in range(1, data.shape[2] + 1)],
            fontsize = 'small')
        pylab.axes(pylab.colorbar().ax)
        pylab.yticks(fontsize = 'small')
    except MemoryError:
        Error('Could not fit all the selected data in memory.\n' + \
            'Try loading fewer data files.')
        return
Example #10
def testWeights(name, values):
    flistwidget, fnames = values['flist']
    weightfile = values['weightfile'][1]
    if not weightfile:
        Error('You must first generate weights or select a file from which ' + \
            'to load the weights.')
        return
    errors = []
    label, value = values['test-args'][1]['matrixshape']
    matrixshape = parsematlab.parse(value.lower().replace('x', ' '))
    if isinstance(matrixshape, str):
        errors.append(label + '\n    ' + value.replace('\n', '\n    '))
    if np.isscalar(matrixshape):
        matrixshape = [matrixshape]
    label, value = values['test-args'][1]['repetitions']
    repetitions = parsematlab.parse(value)
    if isinstance(repetitions, str):
        errors.append(label + '\n    ' + value.replace('\n', '\n    '))
    if len(errors) > 0:
        Error('\n\n'.join(errors))
        return
    classifier = loaddata.load_weights(weightfile)
    if isinstance(classifier, str):
        Error(classifier)
        return
    removeanomalies = values['generation-args'][1]['removeanomalies'][1]
    data = []
    type = []
    samplingrate = None
    try:
        for fname in fnames:
            result = loaddata.load_data(fname, [0, classifier.shape[0]],
                None, True, removeanomalies = removeanomalies)
            if isinstance(result, str):
                Error(result)
                return
            if samplingrate == None:
                samplingrate = result[2]
            if samplingrate != result[2]:
                Error('Not all data files have the same sampling rate.')
                return
            data.append(result[0])
            type.append(result[1])
        if len(data) == 0 or len(type) == 0:
            Error('You must select some data upon which to test the weights.')
            return
        try:
            data = np.concatenate(data)
        except ValueError:
            Error('Not all data files have the same number of channels.')
            return
        type = np.concatenate(type)
        result = testweights.test_weights(data, type, classifier,
            matrixshape, repetitions)
        if isinstance(result, str):
            Error(result)
            return
        score, correctness = result
        message = '\n'.join(fnames)
        message += '\n\n%s\n\nExpected accuracy for a %s matrix:\n\n' % \
            (
                weightfile,
                'x'.join(str(i) for i in matrixshape)
            )
        for i in range(len(repetitions)):
            if repetitions[i] != 1:
                message += '%i repetitions: %0.1f%%\n' % \
                    (repetitions[i], correctness[i] * 100)
            else:
                message += '1 repetition: %0.1f%%\n' % (correctness[i] * 100)
        message += '\nTarget STDEV: %f\nNontarget STDEV: %f\n' % score
        Info(message)
    except MemoryError:
        Error('Could not fit all the selected data in memory.\n' + \
            'Try loading fewer data files.')
        return
Example #11
def trainandsave():

    # Load the data
    root_path = "./data/cifar10"
    data_folder = "train"
    batch_size = 32
    data_type = "train"
    trainloader = load_data(root_path, data_folder, batch_size, data_type)

    # Load the custom model; the default number of classes is 10
    net = vgg19_bn()

    if torch.cuda.is_available():
        device = torch.device('cuda')
        net = net.cuda()
    else:
        device = torch.device('cpu')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("device = ", device)
    net.to(device=device)

    # Optimizer
    optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)

    # Loss function; a custom loss could be used, here we use cross-entropy
    celoss = nn.CrossEntropyLoss()

    # Training loop
    print("trainloader = ", trainloader)
    for epoch in range(300):  # each epoch is one full pass over the training set

        running_loss = 0.0  # accumulator so we can report the loss
        # iterate over the trainloader built above
        for i, data in enumerate(trainloader, 0):
            # enumerate returns both the index and the batch
            # get the inputs
            # each batch holds the images and their labels, unpacked into inputs and labels
            inputs, labels = data

            #print("inputs = ", inputs)
            #print("labels = ", labels)
            # wrap them in Variable (legacy PyTorch API)
            inputs, labels = Variable(inputs), Variable(labels)
            # zero the gradients; otherwise backprop accumulates them across iterations
            optimizer.zero_grad()
            # forward + backward + optimize
            inputs = inputs.to(device)
            labels = labels.to(device)
            # feed the batch through the CNN
            outputs = net(inputs)

            loss = celoss(outputs, labels)  # compute the loss
            loss.backward()  # backpropagate to get the gradients
            optimizer.step()  # update the parameters with those gradients
            #running_loss += loss.data[0]       # accumulate the loss
            running_loss += loss.item()  # accumulate the loss
            # print the training loss every 200 mini-batches
            if (i + 1) % 200 == 0:
                localtime = time.asctime(time.localtime(time.time()))
                print(localtime, '[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1,
                       running_loss / 200))  # divide by 200 to get the average loss over these batches
                running_loss = 0.0  # reset the accumulator for the next 200 batches
        # save the parameters every 50 epochs
        if (epoch + 1) % 50 == 0:
            save_name = "default_net_params_ep" + str(epoch) + ".pkl"
            torch.save(net.state_dict(), save_name)

    print('Finished Training')
    # save the trained network
    torch.save(net, 'pretrain_default_net.pkl')  # save the whole network, both structure and parameters
Example #12
if args.gpu:
    # check for cuda availability
    if torch.cuda.is_available():
        print('CUDA is available, use cuda mode')
        architecture = 'cuda'
    else:
        print('Cuda is not available on this system, fallback to cpu mode')
        architecture = 'cpu'
else:
    print('Use cpu mode')
    architecture = 'cpu'

print("...")
print("Import training, test and validation set")
print("...")
dataloader_train, dataloader_test, dataloader_validation, class_to_idx_traing = loaddata.load_data('flowers')
print("...")
print("Building and training model")
model = network.build_and_train_model(args.model_init, args.hidden_layers, args.epochs, args.dropout, args.lr, architecture, dataloader_train, dataloader_validation, dataloader_test)

print("...")
print("The model looks like:")
print(model)

# run against test data
def test_model(model, testloader, architecture):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
Example #13
import keras
import matplotlib.pyplot as plt
import numpy as np
from keras.models import load_model
from loaddata import load_data, load_one
import math

# returns a compiled model
# identical to the previous one
model = load_model('9100.h5')

ximages, yvalues = load_data()


def getimage(index):
    xi = ximages[index]
    i = np.reshape(xi, (xi.shape[0], xi.shape[1]))
    return i


def imshow(index):
    plt.figure()
    plt.imshow(getimage(index), cmap='gray')
    plt.show(block=False)


def imrange(start=0, count=2):
    import math
    s = count + 5
    plt.figure()
    fig0, plots = plt.subplots(count,
Example #14
from keras.models import Sequential
import matplotlib.pyplot as plt
from matplotlib import style
import time
import warnings
import numpy as np
from numpy import newaxis
import csv
import pandas as pd
import ast
from numpy import diff
from statistics import mean
import loaddata

#Loading Stock Data from saved CSV.
X_train, y_train, X_test, y_test = loaddata.load_data('./stock/ebay.csv', 55,
                                                      True)
#stocks = AMZON , APPL , citigroup , dowjones , ebay , GOOG , KO , TATA , test


class RecurrentNeuralNetwork:
    def __init__(self, xs, ys, rl, eo, lr):
        #initial input
        self.x = np.zeros(xs)
        #input size
        self.xs = xs
        #expected output
        self.y = np.zeros(ys)
        #output size
        self.ys = ys
        #weight matrix for interpreting results from LSTM cell
        self.w = np.random.random((ys, ys))
Example #15
from loaddata import load_data
import csv
from matplotlib import pyplot
import pandas as pd

dataset = load_data('capture20110811.pcap.netflow.labeled')

dataset_botnet = dataset[dataset['Label'] == 'Botnet']
ip_to_analyse = dataset_botnet.iloc[0]['src_ip']

all_data_ip_to_analyse = dataset[dataset['src_ip'] == ip_to_analyse]
botnet_data_ip_to_analyze = dataset_botnet[dataset_botnet['src_ip'] ==
                                           ip_to_analyse]

features = ['Prot', 'Packets']


def get_markov_chain(feature, data):
    transition_counts = {}

    # print(data.size)
    # print(ngram_length)

    old_state = None
    for state in data[feature]:
        # print(state)
        # ngram = str(data[feature][i])
        # for j in range(i, i+ngram_length):
        #     ngram += '{}, '.format(data.at[j, data[feature]])

        if old_state is not None:
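get_markov_chain is cut off above; a self-contained sketch of the kind of transition counting it appears to set up (the dictionary layout and the normalisation step are assumptions, not the original implementation):

def markov_transition_probabilities(states):
    # Count state-to-state transitions, then normalise each row to probabilities.
    counts = {}
    previous = None
    for state in states:
        if previous is not None:
            counts.setdefault(previous, {})
            counts[previous][state] = counts[previous].get(state, 0) + 1
        previous = state
    return {
        src: {dst: n / sum(dsts.values()) for dst, n in dsts.items()}
        for src, dsts in counts.items()
    }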
Example #16
def main():
    best_score = 0

    # Load all stations from the csv file
    skip = False
    stations_data = load_data("data/StationsNationaal.csv", skip)

    # Prompt user for necessary input
    user_choices = user_interface(stations_data)

    # Load the connections data from the csv file (Heel Nederland or Noord- en
    # Zuid Holland), skipping a station if necessary
    data_list = load_data(user_choices["data"], user_choices["skip"])

    for _ in range(user_choices["attempts"]):
        # Load all stations as objects into a dictionary
        stations_objects = load_stations(data_list, stations_data)

        # Load all connections into a dictionary
        connection_objects = load_connections(
            data_list, stations_objects, user_choices["change_connections"])

        # Generate a random solution with chosen heuristics
        solution = random_solution(stations_objects, connection_objects,
                                   user_choices)

        # Run the heuristic of "cutting" trains, if the user chose this option
        if user_choices["cut_connections"]:
            solution = cut(solution)

        # Run the heuristic of "pasting" trains, if the user chose this option
        if user_choices["paste_connections"]:
            solution = paste(solution, user_choices["max_minutes"])

        # Delete empty trains from solution
        solution = delete_trains(solution)

        # Calculate the K of a solution with the given function
        score = calculate(solution)

        # Set score to new values
        if score > best_score:
            best_solution = solution
            best_score = score

    # Open outputfile
    f = open("output.csv", "w")
    f.write("random:\ntrein, lijnvoering\n")

    counter = 0

    # Write random solution to outputfile
    for train in best_solution["trains"]:
        counter += 1
        f.write(f'trein_{counter}, "{train}"\n')
    f.write(f"SCORE:{best_score}\n\n")
    f.close()

    # If simulated annealing is chosen
    if user_choices["sim_annealing"] == True:
        best_solution = simulated_annealing(solution, stations_objects,
                                            user_choices)

        # If heuristic cut is chosen
        if user_choices["SA_cut_connections"]:
            best_solution = cut(best_solution)

        best_solution = delete_trains(best_solution)
        better_score = calculate(best_solution)

        # Open outputfile
        f = open("output.csv", "a+")
        f.write("simulated annealing:\ntrein, lijnvoering\n")
        counter = 0

        # Write simulated annealing solution to outputfile
        for train in best_solution["trains"]:
            counter += 1
            f.write(f'trein_{counter}, "{train}"\n')
        f.write(f"SCORE:{better_score}\n\n")
        f.close()

    # Draw the map
    if user_choices["data"] == "data/ConnectiesHolland.csv":
        draw_train_holland(best_solution, stations_objects)
    else:
        draw_train(best_solution, stations_objects)
Example #17
# -*- coding: utf-8 -*-

# 1. Load data
import loaddata
data, names = loaddata.load_data("iquitos-train.csv")
import numpy

#1. Data normalization
#http://scikit-learn.org/stable/modules/preprocessing.html
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

rows = 206
cols = 15
print(cols)

datanorm = min_max_scaler.fit_transform(data)

#2. Principal Component Analysis
from sklearn.decomposition import PCA
estimator = PCA(n_components=2)
X_pca = estimator.fit_transform(datanorm)

import matplotlib.pyplot as plt
plt.plot(X_pca[:, 0], X_pca[:, 1], 'x')

#3. Hierarchical Clustering
# 3.1. Compute the similarity matrix
import sklearn.neighbors
import numpy
dist = sklearn.neighbors.DistanceMetric.get_metric('euclidean')
Example #18
def trainandsave():

    # Load the data
    root_path = "./data/cifar10"
    data_folder = "train"
    batch_size = 32
    data_type = "train"
    trainloader = load_data(root_path, data_folder, batch_size, data_type)

    # Load a pretrained model
    net = torchvision.models.vgg19_bn(pretrained=True)

    # Replace the final output layer: 1000 classes -> 10 classes
    net.classifier._modules['6'] = nn.Sequential(nn.Linear(4096, 10),
                                                 nn.Softmax(dim=1))
    # Freeze the feature layers and train only the classifier layers
    param_group = []
    learning_rate = 1e-3
    for name, parameters in net.named_parameters():
        if not name.__contains__('classifier'):
            parameters.requires_grad = False
        else:
            param_group += [{'params': parameters, 'lr': learning_rate}]

    if torch.cuda.is_available():
        device = torch.device('cuda')
        net = net.cuda()
    else:
        device = torch.device('cpu')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("device = ", device)
    net.to(device=device)

    # Optimizer
    optimizer = optim.SGD(param_group, momentum=0.9)

    # Loss function; a custom loss could be used, here we use cross-entropy
    celoss = nn.CrossEntropyLoss()

    # Training loop
    print("trainloader = ", trainloader)
    for epoch in range(300):  # each epoch is one full pass over the training set

        running_loss = 0.0  # accumulator so we can report the loss
        # iterate over the trainloader built above
        for i, data in enumerate(trainloader, 0):
            # enumerate returns both the index and the batch
            # get the inputs
            # each batch holds the images and their labels, unpacked into inputs and labels
            inputs, labels = data

            #print("inputs = ", inputs)
            #print("labels = ", labels)
            # wrap them in Variable (legacy PyTorch API)
            inputs, labels = Variable(inputs), Variable(labels)
            # zero the gradients; otherwise backprop accumulates them across iterations
            optimizer.zero_grad()
            # forward + backward + optimize
            inputs = inputs.to(device)
            labels = labels.to(device)
            # feed the batch through the CNN
            outputs = net(inputs)

            loss = celoss(outputs, labels)  # compute the loss
            loss.backward()  # backpropagate to get the gradients
            optimizer.step()  # update the parameters with those gradients
            #running_loss += loss.data[0]       # accumulate the loss
            running_loss += loss.item()  # accumulate the loss
            # print the training loss every 200 mini-batches
            if (i + 1) % 200 == 0:
                localtime = time.asctime(time.localtime(time.time()))
                print(localtime, '[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1,
                       running_loss / 200))  # divide by 200 to get the average loss over these batches
                running_loss = 0.0  # reset the accumulator for the next 200 batches
        # save the parameters every 50 epochs
        if (epoch + 1) % 50 == 0:
            save_name = "classifier_net_params_ep" + str(epoch) + ".pkl"
            torch.save(net.state_dict(), save_name)

    print('Finished Training')
    # save the trained network
    torch.save(net, 'pretrain_classifier_net.pkl')  # save the whole network, both structure and parameters
Example #19
import sklearn.neighbors
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
# Loads the data into array 'cases'
import loaddata
#cases = loaddata.load_data()
cases = pd.read_csv('Data/dengue_features_train.csv')

cases = cases.drop(cases.index[[88, 140, 400, 452, 752, 712, 764,
                                495]])  #principal outliers

cases = cases.drop(cases.index[[700, 502, 361, 253, 254, 330, 493]])

cases = cases.fillna(cases.mean())
print cases
cases.to_csv('out.csv', index=False)
cases = loaddata.load_data()

#for i in list:
#    del cases[i]

# Normalization of the data to work with it in clustering
min_max_scaler = preprocessing.MinMaxScaler()
norm_cases = min_max_scaler.fit_transform(cases)
from sklearn.decomposition import PCA
estimator = PCA(n_components=2)
X_pca = estimator.fit_transform(norm_cases)
plt.plot(X_pca[:, 0], X_pca[:, 1], 'x')

# Computing the similarity matrix. Here the distance function that we selected is chosen.
dist = sklearn.neighbors.DistanceMetric.get_metric('euclidean')
matsim = dist.pairwise(norm_cases)
Example #20
tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth=True

import utils

# Configuration settings
squaredIB        = True         # Whether to minimize beta*I(X;T) - I(Y;T) or beta*I(X;T)^2 - I(Y;T)
batchsize        = 256          # Mini batch size
report_every     = 10           # How often to report
patience         = 10           # Early stopping patience -- # epochs to go without improvement on validation data
beta             = 0.05         # IB trade-off parameter


# Load data
import loaddata
data           = loaddata.load_data('MNIST', validation=True)
input_dim      = data['trn_X'].shape[1]
output_dim     = data['trn_Y'].shape[1]


# Build the network
tf.reset_default_graph()

import iblayer
iblayerobj     = iblayer.NoisyIBLayer()

layers = []
layers.append( tf.placeholder(tf.float32, [None,input_dim,], name='X' ) )
layers.append( tf.keras.layers.Dense(128, activation=tf.nn.relu)(layers[-1]) )
layers.append( tf.keras.layers.Dense(128, activation=tf.nn.relu)(layers[-1]) )
layers.append( tf.keras.layers.Dense(10 , activation=None)(layers[-1]) )
Example #21
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 17 12:20:16 2017

@authors: Sergio Alises Mendiola and Raul Gallego de la Sacristana Alises

"""

# -*- coding: utf-8 -*-

# 1. Load data
import loaddata
data = loaddata.load_data("../Data/dengue_features_train_outliers.csv")

#1. Data normalization
#http://scikit-learn.org/stable/modules/preprocessing.html
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
datanorm = min_max_scaler.fit_transform(data)

#2. Principal Component Analysis
from sklearn.decomposition import PCA
estimator = PCA(n_components=2)
X_pca = estimator.fit_transform(datanorm)

import matplotlib.pyplot as plt
plt.plot(X_pca[:, 0], X_pca[:, 1], 'x')

#3. Hierarchical Clustering
# 3.1. Compute the similarity matrix
Example #22
def main():

    years = None
    features_excluded = ['week_start_date']

    _outliers = None

    cities = get_values_of("../data/dengue_features_train.csv", 'city')
    target = ['total_cases']

    all_revelant_features = {}

    for city in cities:
        # Filtering by values of the keys
        _filter = {'city': [city], 'year': years}

        #Load city data
        data = load_data("../data/dengue_features_train.csv",
                         filter_parameters=_filter,
                         excludes_features=features_excluded,
                         outliers=_outliers)

        # Load total cases by city, year and week of year
        data_labels = load_data("../data/dengue_labels_train.csv",
                                filter_parameters=_filter)

        # Adapt data for clustering
        data_test_hiech = data.drop(labels=['city', 'year'],
                                    axis=1,
                                    inplace=False)

        # Outliers will be deleted
        elements, outliers, cut = clustering.hierarchical_clustering(
            data=data_test_hiech)

        n_element = count_elements(elements)
        n_outliers = count_elements(outliers)
        total = n_element + n_outliers

        print 'Analysis in: %s' % (city)

        total_outliers = []
        while (outliers != None):
            total_outliers += outliers
            data_test_hiech.drop(outliers, axis=0, inplace=True)
            elements, outliers, cut = clustering.hierarchical_clustering(
                data_test_hiech, cut=cut, first_total=total)

        if total_outliers:
            print 'Auto-detected Outliers:'
            print total_outliers

        # Join data
        data_without_outliers = data
        data_without_outliers.drop(total_outliers, axis=0, inplace=True)

        merge_data = pd.merge(data_without_outliers,
                              data_labels,
                              on=['city', 'year', 'weekofyear'],
                              how='outer')
        merge_data.drop(labels=['city', 'year'], axis=1, inplace=True)
        merge_data.dropna(inplace=True)

        # Features clustering
        data_for_features = merge_data.drop(labels=target, axis=1)
        clustering.hierarchical_clustering_features(data_for_features)

        # Cross-validation to select features
        feature_selected, max_deph = cros.cross_validation(
            merge_data, algorithm='DecisionTreeRegressor')

        # Regressor to select the relevant features
        relevant_features = reg.tree_regressor(merge_data, max_deph,
                                               feature_selected, target, city)

        all_revelant_features[city] = relevant_features

        # For each city, one KNN model
        # Cross-validation to select features
        n_neighbors, X, y = cros.cross_validation(merge_data,
                                                  algorithm='KNN',
                                                  features=relevant_features,
                                                  target=target,
                                                  verbose=True)

        #---------------------------------------------

        # prediction
        data_Test = load_data("../data/dengue_features_test.csv",
                              filter_parameters=_filter,
                              excludes_features=features_excluded,
                              outliers=_outliers)

        #data_Test.dropna(inplace = True)
        test = data_Test[relevant_features]
        test.interpolate(method='linear', inplace=True)

        knn = neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
        prediction = knn.fit(X, y).predict(test)

        # show prediction
        print "\nPREDICTION:"
        xx = np.stack(i for i in range(len(prediction)))
        plt.plot(xx, prediction, c='g', label='prediction')
        plt.axis('tight')
        plt.legend()
        plt.title("KNeighborsRegressor (k = %i, weights = '%s')" %
                  (n_neighbors, 'distance'))

        plt.show()

        # write the results in a csv file
        submission_data = load_data("../data/submission_format.csv",
                                    filter_parameters=_filter)
        final_data = []

        for i in range(len(prediction)):
            row = []

            row.append(submission_data.iloc[i]['city'])
            row.append(submission_data.iloc[i]['year'])
            row.append(submission_data.iloc[i]['weekofyear'])
            row.append(int(prediction[i]))

            final_data.append(row)

        col = ["city", "year", "weekofyear", "total_cases"]
        df = pd.DataFrame(final_data, columns=col)
        df.to_csv('../data/predictions_for_' + city + '.csv',
                  index=False,
                  sep=',',
                  encoding='utf-8')

        #---------------------------------------------

    print '\n\t [ SELECTED FEATURES ]'
    for key, value in all_revelant_features.iteritems():
        print 'City: %s, %2d features: \n\t %s' % (key, len(value), str(value))
Example #23
theta = 30
alpha1 = 25
alpha2 = 15
beta1 = 20
beta2 = 25

from loaddata import load_data
check_node_lst, START, END = load_data("附件1:数据集1-终稿.xlsx")
Example #24
def testmodel():

    # Choose the compute device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load the data
    root_path = "./data/cifar10_save"
    data_folder = "test"
    batch_size = 8
    data_type = "test"
    testloader = load_data(root_path, data_folder, batch_size, data_type)

    # Load the model
    model_path = "net.pkl"
    net = torch.load(model_path)
    # Switch to evaluation mode; this matters for layers that behave differently
    # during training and testing, such as Batch Normalization and Dropout
    net.eval()

    # Load the label names
    label_name_path = "./data/cifar-10-batches-py/batches.meta"
    labelnames = unpickle(label_name_path)
    labelnames = labelnames[b'label_names']
    label_names = []
    for l in labelnames:
        label_names.append(l.decode("utf-8"))
    # number of predictions made for each class
    pred_num = dict()
    # total number of test samples per class
    test_num = dict()
    # number of correct predictions per class
    correct_num = dict()
    for label in label_names:
        test_num[label] = 0
        pred_num[label] = 0
        correct_num[label] = 0

    with torch.no_grad():
        for n_iter, (image, label) in enumerate(testloader):
            print("iteration: {}\ttotal {} iterations".format(
                n_iter + 1, len(testloader)))

            image = image.to(device)
            label = label.to(device)

            output = net(image)
            _, pred = output.topk(5, 1, largest=True, sorted=True)

            label = label.view(label.size(0), -1).expand_as(pred)
            correct = pred.eq(label).float()

            labels = label.cpu().numpy()
            # test_num[l] += 1
            for p in pred.cpu().numpy():
                pred_num[label_names[p[0]]] += 1
            for i, c in enumerate(correct.cpu().numpy()):
                l = labels[i][0]
                label_name = label_names[l]
                test_num[label_name] += 1
                if int(c[0]) == 1:
                    correct_num[label_name] += 1

    print("=====")
    print(" label name  recall  precision")
    for i in range(10):
        label_name = label_names[i]
        print(
            label_name, " {:.1f}%  {:.1f}%".format(
                float(correct_num[label_name]) / float(test_num[label_name]) *
                100,
                float(correct_num[label_name]) / float(pred_num[label_name]) *
                100))
Example #25
Display the Dendrogram of the data.

@author: Ruth Rodríguez-Manzaneque López, Diego Andérica Richard y Laura Jaime Villamayor

"""

import matplotlib.pyplot as plt
import numpy
from scipy import cluster
from sklearn import preprocessing
import sklearn.neighbors

import loaddata

# 0. Load Data.
records, names = loaddata.load_data("../Data/dengue_features_train.csv")

# 1. Normalization of the data.
min_max_scaler = preprocessing.MinMaxScaler()
records = min_max_scaler.fit_transform(records)

# 2. Compute the similarity matrix.
dist = sklearn.neighbors.DistanceMetric.get_metric('chebyshev')
matsim = dist.pairwise(records)
avSim = numpy.average(matsim)
# The Average Distance is used.
print "%s\t%6.2f" % ('Average Distance', avSim)

# 3. Build the Dendrogram with the 'complete' method and cut it at level 6.
clusters = cluster.hierarchy.linkage(matsim, method='complete')
cut_level = 6
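The snippet stops before the dendrogram is actually drawn and cut; a minimal sketch of the usual continuation with scipy (an assumption, not the original code):

# Draw the dendrogram and assign flat cluster labels by cutting at cut_level.
cluster.hierarchy.dendrogram(clusters, color_threshold=cut_level)
labels = cluster.hierarchy.fcluster(clusters, cut_level, criterion='distance')
plt.show()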
Example #26
                        cfg['beta_max'],
                        cfg['beta_npoints'],
                        endpoint=True)[::-1]
run_methods = cfg['methods'].split(',')  # only run specified method

if not os.path.exists(savedir):
    print("Making directory", savedir)
    os.makedirs(savedir)

import loaddata, iblayer, utils
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
tfconfig = tf.ConfigProto()
tfconfig.gpu_options.allow_growth = True

data = loaddata.load_data(cfg['runtype'], validation=True)
input_dim = data['trn_X'].shape[1]
output_dim = data['trn_Y'].shape[1]


def train(sess, method, beta, cfg, data, net, savedir):
    # sess         : TensorFlow session
    # method       : 'ce' (cross-entropy only), 'nlIB' (nonlinear IB), or 'VIB' (variational IB)
    # beta         : beta value
    # cfg          : configuration dictionary
    # data         : data object
    # net          : neural network object
    # savedir      : directory where to save results

    def calcstats(epoch, do_print=False):
        lobj = net.iblayerobj
Example #27
def generateFeatureWeights(name, values):
    args = values['generation-args'][1]
    errors = []
    for key in args:
        if key in ('removeanomalies', 'classificationmethod'):
            continue
        label, value = args[key]
        value = parsematlab.parse(value)
        if isinstance(value, str):
            errors.append(label + '\n    ' + value.replace('\n', '\n    '))
        args[key] = value
    if len(errors) > 0:
        Error('\n\n'.join(errors))
        return
    response_window = args['responsewindow']
    decimation_frequency = args['decimationfrequency']
    max_model_features = args['maxmodelfeatures']
    penter = args['penter']
    premove = args['premove']
    random_sample_percent = args['randompercent']
    channelset = args['channelset'] - 1
    fnames = values['flist'][1]
    weightwidget = values['weightfile'][0]
    removeanomalies = args['removeanomalies'][1]
    classificationmethod = args['classificationmethod'][1]
    data = []
    type = []
    samplingrate = None
    channels = None
    try:
        for fname in fnames:
            result = loaddata.load_data(fname, response_window, None,
                removeanomalies = removeanomalies)
            if isinstance(result, str):
                Error(result)
                return
            if samplingrate == None:
                samplingrate = result[2]
            if samplingrate != result[2]:
                Error('Not all data files have the same sampling rate.')
                return
            if channels == None:
                channels = result[0].shape[2]
            if channels != result[0].shape[2]:
                Error('Not all data files have the same number of channels.')
                return
            try:
                data.append(result[0][:, :, channelset])
            except IndexError:
                Error('"Channel Set" is not a subset of the available ' + \
                    'channels.')
                return
            type.append(result[1])
        if len(data) == 0 or len(type) == 0:
            Error('You must select some data from which to generate ' + \
                'the weights.')
            return
        data = np.concatenate(data)
        type = np.concatenate(type)
        randomindices = np.arange(data.shape[0], dtype = int)
        np.random.shuffle(randomindices)
        randomindices = randomindices[:data.shape[0] * random_sample_percent // 100]
        randomindices.sort()
        data = data[randomindices]
        type = type[randomindices]
        result = swlda.swlda(data, type, samplingrate, response_window,
            decimation_frequency, max_model_features, penter, premove)
        if isinstance(result, str):
            Error(result)
            return
        channels, weights = result
        channels = channelset[channels - 1] + 1 # Convert to zero-based for
            # indexing, and then back to one-based for human readability.
        prm = exportToPRM(channels, weights, response_window[1])
        try:
            fname = SaveAs(filetypes = [('Parameter Files', '.prm')],
                defaultextension = 'prm')
            if fname:
                prmfile = open(fname, 'wb')
                prmfile.write(prm)
                prmfile.close()
                weightwidget.setContents(fname)
        except:
            Error('Could not write PRM file.')
            return
    except MemoryError:
        Error('Could not fit all the selected data in memory.\n' + \
            'Try loading fewer data files.')
        return
Example #28
import matplotlib.pyplot as plt
import numpy

#http://docs.scipy.org/doc/scipy/reference/cluster.html
from scipy import cluster
from sklearn import preprocessing
import sklearn.neighbors

# 0. Load Data
import loaddata
states, names = loaddata.load_data("iquitos-train_with_name.csv")
features = numpy.transpose(states)

#1. Normalization of the data
#http://scikit-learn.org/stable/modules/preprocessing.html
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
features_norm = min_max_scaler.fit_transform(features)

#1.2. Principal Component Analysis
from sklearn.decomposition import PCA
estimator = PCA(n_components=3)
X_pca = estimator.fit_transform(features_norm)
print("Variance Ratio: ", estimator.explained_variance_ratio_)

import matplotlib.pyplot as plt
fig, ax = plt.subplots()
print(len(names))
for i in range(len(X_pca)):
    print(i)
    plt.text(X_pca[i][0], X_pca[i][1], names[i])
Example #29
from sklearn.svm import SVC
from loaddata import load_data

x_tr, y_tr, x_tst = load_data()

clf = SVC()
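The example ends right after the classifier is constructed; a minimal sketch of how such a classifier is usually trained and applied to the unlabeled test split (the continuation is an assumption, not the original code):

# Fit on the training split and predict labels for the test split.
clf.fit(x_tr, y_tr)
predictions = clf.predict(x_tst)
print(predictions[:10])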


Example #30
# StartPython.py  -*- coding: utf-8 -*-
from loaddata import load_data
from loaddata import load_ip_sequence

dataset = load_data(
    'C:/Users/YI/Desktop/TUD/Cyber data analytics/LAB3/Sampling/capture20110811.pcap.netflow.labeled'
)
ip_data = load_ip_sequence(dataset, '147.32.84.229')


#%%
def top10freq(lst):
    from collections import Counter
    d = {}
    for i in lst:
        if d.get(i):
            d[i] += 1
        else:
            d[i] = 1
    occurence = dict(Counter(d).most_common(10))
    frequency = occurence
    for i in frequency:
        frequency[i] = occurence[i] / len(lst)
    return frequency


#
def reservoir_sampling(stream, k):
    import random
    i = 0
    n = len(stream)
Example #31
               clim=[0.1, 1.1])


def sample_random_position(grid):
    obstacles_grid = grid[0, :, :, 0]
    x_coords, y_coords = np.argwhere(obstacles_grid == 0).T
    position = np.random.randint(x_coords.size)
    return (x_coords[position], y_coords[position])


if __name__ == '__main__':
    args = parser.parse_args()
    env = environments[args.imsize]

    print("Loading data...")
    x_test, _, _, _ = load_data(env['test_data_file'])

    print("Initializing VIN...")
    VIN = create_VIN(
        env['input_image_shape'],
        n_hidden_filters=150,
        n_state_filters=10,
        k=env['k'],
    )
    print("Loading pre-trained VIN parameters...")
    storage.load(VIN, env['pretrained_network_file'])

    plt.figure(figsize=(8, 8))
    gridspec = gridspec.GridSpec(5, 4, height_ratios=[0, 2, 2, 2, 2])
    gridspec.update(wspace=0.1, hspace=0.1)
Example #32
theta = 20
alpha1 = 20
alpha2 = 10
beta1 = 15
beta2 = 20

from loaddata import load_data
check_node_lst, START, END = load_data("附件2:数据集2-终稿.xlsx")
Example #33
def main():

    first = True
    name_file = assign_name()
    prediction_path = '../predictions/' + name_file

    if not os.path.exists(prediction_path):
        os.makedirs(prediction_path)

    years = None
    features_excluded = ['week_start_date']

    _outliers = None

    cities = get_values_of("../data/dengue_features_train.csv", 'city')

    target = 'total_cases'

    all_revelant_features = {}
    all_scores = []

    modes = [  #'dropna', 'interpolate', 'mean',
        ['interpolate', 'mean']
    ]  #, ['interpolate', 'dropna']]

    for mode in modes:

        first = True
        scores_city = {}
        for city in cities:

            # Filtering by values of the keys
            _filter = {'city': [city], 'year': years}

            #Load city data
            data = load_data("../data/dengue_features_train.csv",
                             filter_parameters=_filter,
                             excludes_features=features_excluded,
                             outliers=_outliers)

            # Load total cases by city, year and week of year
            data_labels = load_data("../data/dengue_labels_train.csv",
                                    filter_parameters=_filter)

            data_fill = data_fill_mode(data, mode)
            data_labels_fill = data_fill_mode(data_labels, mode)

            # Adapt data for clustering
            data_test_hiech = data_fill.drop(labels=['city', 'year'],
                                             axis=1,
                                             inplace=False)

            # Outliers will be deleted
            elements, outliers, cut = clustering.hierarchical_clustering(
                data=data_test_hiech, verbose=False)

            n_element = count_elements(elements)
            n_outliers = count_elements(outliers)
            total = n_element + n_outliers

            print 'Analysis in: %s on mode %s' % (city, str(mode))

            total_outliers = []
            while (outliers != None):
                total_outliers += outliers
                data_test_hiech.drop(outliers, axis=0, inplace=True)
                elements, outliers, cut = clustering.hierarchical_clustering(
                    data_test_hiech, cut=cut, first_total=total, verbose=False)

            if total_outliers:
                print 'Auto-detected Outliers:'
                print total_outliers

            # Join data
            data_without_outliers = data_fill
            data_without_outliers.drop(total_outliers, axis=0, inplace=True)

            merge_data = pd.merge(data_without_outliers,
                                  data_labels_fill,
                                  on=['city', 'year', 'weekofyear'],
                                  how='inner')
            first_year = merge_data['year'].min()
            last_year = merge_data['year'].max()
            split_year = int(last_year - round((last_year - first_year) * 0.2))

            # Features clustering
            data_for_features = merge_data.drop(labels=['city', 'total_cases'],
                                                axis=1)

            feature_groups = clustering.hierarchical_clustering_features(
                data_for_features, verbose=False)

            # Cross-validation to select features
            features_selected, max_deph = cros.cross_validation(merge_data,
                                                                feature_groups,
                                                                split_year,
                                                                target=target)

            # Regressor to select the relevant features
            relevant_features = reg.tree_regressor(merge_data,
                                                   split_year,
                                                   max_deph,
                                                   features_selected,
                                                   target,
                                                   city,
                                                   verbose=False)

            all_revelant_features[city] = relevant_features

            all_features = merge_data.columns.tolist()[1:-1]

            data_Test = load_data("../data/dengue_features_test.csv",
                                  filter_parameters=_filter,
                                  excludes_features=features_excluded,
                                  outliers=_outliers)

            # prediction

            prediction_knn, score_knn = predict.knn_prediction(
                merge_data,
                split_year,
                features_selected,
                target,
                data_Test,
                verbose=True)
            print('Score KNN on %s mode is : %.4f' % (mode, score_knn))

            prediction_rf, score_rf = predict.rf_prediction(merge_data,
                                                            split_year,
                                                            all_features,
                                                            target,
                                                            data_Test,
                                                            verbose=True)
            print('Score RandomForest on %s mode is : %.4f' % (mode, score_rf))

            scores_city[city] = [(mode, 'Knn', score_knn),
                                 (mode, 'RF', score_rf)]

            # Load submission data file.
            submission_data = load_data("../data/submission_format.csv",
                                        filter_parameters=_filter)

            # Write the result files to csv.
            col = ["city", "year", "weekofyear", "total_cases"]
            write_result(col, submission_data, prediction_knn, prediction_rf,
                         prediction_path, (name_file + str(mode)), first)
            first = False

        all_scores.append(scores_city)

    print all_scores
    """ 
Example #34
if args.gpu:
    # check for cuda availability
    if torch.cuda.is_available():
        print('CUDA is available, use cuda mode')
        architecture = 'cuda'
    else:
        print('Cuda is not available on this system, fallback to cpu mode')
        architecture = 'cpu'
else:
    print('Use cpu mode')
    architecture = 'cpu'

print("...")
print("Import training, test and validation set")
print("...")
dataloader_train, dataloader_test, dataloader_validation, class_to_idx_traing = loaddata.load_data(
    'flowers')
print("...")
print("Building and training model")
model = network.build_and_train_model(args.model_init, args.hidden_layers,
                                      args.epochs, args.dropout, args.lr,
                                      architecture, dataloader_train,
                                      dataloader_validation, dataloader_test)

print("...")
print("The model looks like:")
print(model)


# run against test data
def test_model(model, testloader, architecture):
    model.eval()
Example #35
                    tmp,medoids_ = totalcost(blogwords,distance.distance,medoids_idx)
                    #print tmp,'-------->',medoids_.keys()
                    if tmp < current_cost :
                        best_choice = list(medoids_idx)
                        best_res = dict(medoids_)
                        current_cost = tmp
                    medoids_idx[idx] = swap_temp
        iter_count += 1
        print current_cost,iter_count
        if best_choice == medoids_idx : break
        if current_cost <= pre_cost :
            pre_cost = current_cost
            medoids = best_res
            medoids_idx = best_choice
        
    
    return current_cost, best_choice, best_res

def print_match(best_medoids, blognames) :
    for medoid in best_medoids :
        print blognames[medoid],'----->',
        for m in best_medoids[medoid] :
            print '(',m,blognames[m],')',
        print
        print '---------' * 20 

if __name__ == '__main__' :
    blogwords, blognames = loaddata.load_data()
    best_cost,best_choice,best_medoids = kmedoids(blogwords,8)
    print_match(best_medoids,blognames)
Example #36
        if network.last_epoch in steps:
            print("Saving pre-trained VIN model...")
            storage.save(network, env['pretrained_network_file'])

            new_step = steps[network.last_epoch]
            session = tensorflow_session()
            network.variables.step.load(new_step, session)
    return on_epoch_end


if __name__ == '__main__':
    args = parser.parse_args()
    env = environments[args.imsize]

    print("Loading train and test data...")
    x_train, s1_train, s2_train, y_train = load_data(env['train_data_file'])
    x_test, s1_test, s2_test, y_test = load_data(env['test_data_file'])

    print("Initializing VIN...")
    network = algorithms.RMSProp(
        create_VIN(
            env['input_image_shape'],
            n_hidden_filters=150,
            n_state_filters=10,
            k=env['k'],
        ),

        verbose=True,
        error=loss_function,
        epoch_end_signal=on_epoch_end_from_steps(env['steps']),
        **env['training_options']
Example #37
    final=open("%seval.csv" % id,'a+')
    final.write("%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s;%s\n" % (votecount,bucketnumber,bsize,workerquality,prop,pcoverage,averagequality,accuracy[0],accuracy[1],accuracy[2],finalvalue[0],finalvalue[1]))
    final.close()
    result.close()

    print "done"

def loop(id,area):
    count=[1,3,5,7]
    wquality=[0.8]
    bsize=[50,100,200]
    proportion=[0.5,0.55,0.6,0.65,0.7,0.75,0.80]
    for k in bsize:
        print k
        for i in count:
            print i
            for j in wquality:
                print j
                for l in proportion:
                    print l
                    processdata(id,i,j,area,k,l)

if __name__ == "__main__":
    id = 54
#    votecount = int(sys.argv[2])
#    workerquality=float(sys.argv[3])
    area=loaddata.load_data(id)
    loop(id,area)