Example #1
def part2():
    """randomly choose 5%, 10%, 20%, 50%, 100% samples to train, and choose 10 sets each time"""
    plt.figure()
    for trainFileName, testFileName, key in [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]:
        attribute, trainset = data_provider(trainFileName)
        testAttribute, testset = data_provider(testFileName)
        m = 4
        avgPoints = []
        maxPoints = []
        minPoints = []
        for rate in (0.05, 0.1, 0.2, 0.5, 1):
            accuracys = []
            for newTrainset in selectSample(trainset, rate):
                root = TreeNode(newTrainset, attribute)
                curTree = DecisionTree(root)
                curTree.createTree(root, m)
                trueSamples = 0
                falseSamples = 0
                for instance in testset:
                    if curTree.predict(root, instance) == instance[-1]:
                        trueSamples += 1
                    else:
                        falseSamples += 1
                accuracys.append(
                    float(trueSamples) / (trueSamples + falseSamples))
            accuracy = float(sum(accuracys)) / len(accuracys)
            avgPoints.append([int(rate * 100), accuracy])
            maxPoints.append([int(rate * 100), max(accuracys)])
            minPoints.append([int(rate * 100), min(accuracys)])

        mapping = {'diabetes': 1, 'heart': 2}
        ax = plt.subplot(1, 2, mapping[key])
        ax.set_xlim(0, 105)
        ax.set_ylim(0.45, 0.9)
        ax.set_ylabel('accuracy')
        ax.set_title(key)
        ax.plot([x[0] for x in avgPoints], [x[1] for x in avgPoints],
                label='average')
        ax.plot([x[0] for x in maxPoints], [x[1] for x in maxPoints],
                label='maximum')
        ax.plot([x[0] for x in minPoints], [x[1] for x in minPoints],
                label='minimum')
        ax.legend()
    plt.xlabel('dataset sample percentage')
    plt.savefig('../part2.pdf')
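
selectSample is not defined in this snippet; a minimal sketch consistent with the docstring (10 random subsets per rate, drawn without replacement) could look like the following. The name, signature, and generator behaviour are assumptions, not the original implementation.

import random

def selectSample(trainset, rate, nSets=10):
    """Hypothetical helper: yield nSets random subsets covering `rate` of trainset."""
    size = max(1, int(round(rate * len(trainset))))
    for _ in range(nSets):
        yield random.sample(trainset, size)  # sampling without replacement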
Example #2
 def __init__(self, offset, days):
     self.offset = offset
     self.days = days
     self.dp = data_provider.data_provider()
     self.left_abs = False
     self.right_abs = False
     self.name = None
Example #4
    def __init__(self,
                 model_name="model",
                 num_epochs=1000,
                 display_step=100,
                 learning_rate=0.1,
                 batch_size=100,
                 denoising=False,
                 retrain_delay=5,
                 graph_update_epochs=1000,
                 new_poll_weight=0.002,
                 masking=0,
                 num_layers=1,
                 num_hidden_1=155,
                 num_hidden_2=128,
                 continue_from_saved=False,
                 content_collab_hybrid='collab',
                 time_decay=1):

        self.data_provider = data_provider.data_provider(
            '../this_that_export_pretty.json')
        self.data_provider.parse()
        # Interactions are fed as binary but using decimal helps with adding 2 interactions together
        self.interaction_dict = {
            'skips': 16,
            'owns': 8,
            'tracks': 4,
            'comment': 2,
            'vote': 1
        }
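        # e.g. 'owns' + 'vote' = 8 + 1 = 9 (binary 01001): any combination of
        # interactions sums to a unique value, so summed interactions stay
        # distinguishable.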
        self.regression = regression.regression()
        self.num_engagements = len(self.interaction_dict)
        self.interactions_counter = 0
        self.retrain_delay = retrain_delay  # number of interactions needed before graph is trained more
        self.graph_update_epochs = graph_update_epochs  # how long model is trained for on new interactions
        self.users = self.data_provider.users  #[:50]
        self.polls = self.data_provider.polls  #[:50]
        self.test_polls = self.data_provider.polls  #[500:]
        self.model_name = model_name
        self.num_epochs = num_epochs  # initial training epochs on the given training data
        self.display_step = display_step  # display training loss every x epochs
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.denoising = denoising  # whether to add noise to the input vectors - might help with accidental interactions
        self.new_poll_weight = new_poll_weight  # How much weight new polls are given in the output layer (gives new polls some initial traction)
        self.masking = masking  # TODO: Add masking to data to make synthetic users with less interactions and see if it helps
        self.num_layers = num_layers
        self.num_hidden_1 = num_hidden_1
        self.num_hidden_2 = num_hidden_2
        self.continue_from_saved = continue_from_saved
        self.train_proportion = 0.85
        self.train, self.test = [], []
        self.test_users = []
        self.time_decay = time_decay
        self.X = tf.placeholder("float", [None, None])
        self.Y = tf.placeholder("float", [None, None])
        self.saver = None
        self.weights2, self.weights1, self.biases2, self.biases1 = {}, {}, {}, {}
        self.setup_graph()
        self.set_initial_training_and_test_data()
Example #5
def utFold():
    attrNum, labels, instances = data_provider('../sonar.arff')
    cv = CrossValidate(10, instances, labels)
    for i in range(10):
        print '>>>>>>>>>>>>>>>>>>> Fold', i, '<<<<<<<<<<<<<<<<<<<<<<'
        train, test = cv.fold(i)
        #print train[1], test[1]
        print train[1].shape[0] + test[1].shape[0]
        print train[0].shape, test[0].shape
Example #6
 def __init__(self,
              offset,
              days,
              min_slope=None,
              max_slope=None,
              slope_type='close'):
     self.days = days
     self.dp = data_provider.data_provider()
     self.slope_type = slope_type
     self.min_slope = min_slope
     self.max_slope = max_slope
Example #7
def modelOutput(trainFile, testFile, modelType):
    """
    output is:
        (naive bayes) variable name | 'class'
        (tan) variable name | name of its parents
    # empty
    followed by:
        predict class | actual class | posterior probability (12 digits after decimal point)
    # empty
    followed by:
        The number of the test-set examples that were correctly classified.
    """

    attributes, labels, instances = data_provider(trainFile)
    if modelType == 'n':
        model = Bayes(attributes, labels, instances)
    elif modelType == 't':
        model = TAN(attributes, labels, instances)
    else:
        import sys
        print >> sys.stderr, 'model type should be [n] or [t] !!!'
        sys.exit()
    attributes, labels, instances = data_provider(testFile)

    # format output part1: attribute name | 'class'
    model.printTree()
    print

    correctClassCnt = 0
    for test in instances:
        result = model.classify(test)
        if result[0] == result[1]:
            correctClassCnt += 1
        # format output part2: predict class | actual class | posterior probability
        print formatOutput(result)
    print

    # format output part3: correctly classified number of test instances
    print correctClassCnt
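
formatOutput is not shown in this example; a minimal sketch matching the docstring's "predicted class | actual class | posterior probability (12 digits)" line, assuming classify returns a (predicted, actual, posterior) triple, could be:

def formatOutput(result):
    # Hypothetical helper: result is assumed to be (predicted, actual, posterior).
    predicted, actual, posterior = result
    return '{} {} {:.12f}'.format(predicted, actual, posterior)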
Example #8
def train(data_folder):

    tf.set_random_seed(1)
    g = tf.Graph()
    with g.as_default():

        # Load dataset.
        audio_frames, ground_truth, _ = data_provider(
            data_folder,
            True,
            'train',
            FLAGS.batch_size,
            seq_length=FLAGS.seq_length)

        # Define model graph.
        with slim.arg_scope([slim.layers.batch_norm, slim.layers.dropout],
                            is_training=True):
            prediction = models.get_model(FLAGS.model)(
                audio_frames, hidden_units=FLAGS.hidden_units)

        for i, name in enumerate(['arousal', 'valence']):  #, 'liking']):
            pred_single = tf.reshape(prediction[:, :, i], (-1, ))
            gt_single = tf.reshape(ground_truth[:, :, i], (-1, ))

            loss = losses.concordance_cc(pred_single, gt_single)
            tf.summary.scalar('losses/{} loss'.format(name), loss)

            mse = tf.reduce_mean(tf.square(pred_single - gt_single))
            tf.summary.scalar('losses/mse {} loss'.format(name), mse)

            tf.losses.add_loss(loss / 2.)

        #print(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        total_loss = tf.losses.get_total_loss()
        tf.summary.scalar('losses/total loss', total_loss)

        optimizer = tf.train.AdamOptimizer(FLAGS.initial_learning_rate,
                                           beta1=0.9,
                                           beta2=0.99)

        with tf.Session(graph=g) as sess:

            train_op = slim.learning.create_train_op(total_loss,
                                                     optimizer,
                                                     summarize_gradients=True)

            logging.set_verbosity(1)
            slim.learning.train(train_op,
                                FLAGS.train_dir,
                                save_summaries_secs=60,
                                save_interval_secs=120)
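
losses.concordance_cc comes from a project-local module that is not shown. The concordance correlation coefficient is commonly turned into a loss as 1 - CCC, with CCC = 2*cov(x, y) / (var(x) + var(y) + (mean_x - mean_y)^2); a TensorFlow 1.x sketch along those lines (an assumption, not necessarily the project's exact implementation) would be:

import tensorflow as tf

def concordance_cc(prediction, ground_truth):
    """Sketch of a 1 - CCC loss for two 1-D tensors of equal length."""
    pred_mean, pred_var = tf.nn.moments(prediction, axes=[0])
    gt_mean, gt_var = tf.nn.moments(ground_truth, axes=[0])
    covariance = tf.reduce_mean((prediction - pred_mean) * (ground_truth - gt_mean))
    ccc = 2. * covariance / (pred_var + gt_var + tf.square(pred_mean - gt_mean))
    return 1. - ccc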
Example #9
def part3():
    plt.figure()
    for trainFileName, testFileName, key in [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]:
        attribute, trainset = data_provider(trainFileName)
        testAttribute, testset = data_provider(testFileName)
        root = TreeNode(trainset, attribute)
        curTree = DecisionTree(root)

        points = []
        for m in (2, 5, 10, 20):
            curTree.createTree(root, m)
            trueSamples = 0
            falseSamples = 0
            for instance in testset:
                if curTree.predict(root, instance) == instance[-1]:
                    trueSamples += 1
                else:
                    falseSamples += 1
            points.append(
                [m, float(trueSamples) / (trueSamples + falseSamples)])

        mapping = {'diabetes': 1, 'heart': 2}
        for x, y in points:
            ax = plt.subplot(2, 1, mapping[key])
            ax.set_xlim(0, 22)
            ax.set_ylim(0.6, 0.8)
            ax.set_ylabel('accuracy')
            ax.set_title(key)
            plt.annotate('%.3f' % y, xy=(x - 0.02, y + 0.02))
            plt.annotate('m=%d' % x, xy=(x - 0.02, y - 0.07))
            ax.plot(x, y, 'o-')

    plt.xlabel('tree number m')
    plt.savefig('../part3.pdf')
Example #10
def utPredict():
    """
    unit test for function [predict]
    testfiles:
        # trainset: diabetes_train.arff
        # testset: diabetes_test.arff
    """
    from data_provider import data_provider
    attribute, dataset = data_provider('../diabetes_train.arff')
    attribute, testset = data_provider('../diabetes_test.arff')
    root = TreeNode(dataset, attribute)
    curTree = DecisionTree(root)
    curTree.createTree(root, 4)
    try:
        assert (curTree.predict(root, testset[0]) == 'positive')
        assert (curTree.predict(root, testset[22]) == 'positive')
        assert (curTree.predict(root, testset[52]) == 'positive')
        assert (curTree.predict(root, testset[3]) == 'negative')
        assert (curTree.predict(root, testset[78]) == 'negative')
        assert (curTree.predict(root, testset[99]) == 'negative')
        print '[predict] TEST PASS'
    except AssertionError:
        print '[predict] TEST FAILED'
Example #11
def utSplitFeature():
    """
    unit test for function [chooseSplitFeature]
    """
    from data_provider import data_provider
    attribute, dataset = data_provider('../test.arff')
    root = TreeNode(dataset, attribute)
    curTree = DecisionTree(root)
    bestFeature = curTree.chooseSplitFeature(root)
    try:
        assert (bestFeature == 0)
        print '[chooseSplitFeature] TEST PASS'
    except AssertionError:
        print '[chooseSplitFeature] TEST FAILED'
Example #12
def utCreateTree():
    """
    unit test for function [createTree]
    examine the tree structure
    compared graph with:
        http://pages.cs.wisc.edu/~yliang/cs760_fall18/homework/hw2/diabetes/m=4.txt
    """
    from data_provider import data_provider
    attribute, dataset = data_provider('../diabetes_train.arff')
    root = TreeNode(dataset, attribute)
    curTree = DecisionTree(root)
    curTree.createTree(root, 4)
    curTree.printTree(root, 0)
    print '---------------- please compare this graph with the url ------------------'
    print 'http://pages.cs.wisc.edu/~yliang/cs760_fall18/homework/hw2/diabetes/m=4.txt'
Example #13
def utEntropy():
    """
    unit test for function [getEntropy]
    """
    from data_provider import data_provider
    attribute, dataset = data_provider('../test.arff')
    root = TreeNode(dataset, attribute)
    curTree = DecisionTree(root)
    try:
        assert ('%.3f' % curTree.getEntropy(root) == '0.940')
        assert ('%.3f' % (curTree.getEntropy(root) -
                          curTree.getEntropy(root, 0)) == '0.152')
        print '[getEntropy] TEST PASS'
    except AssertionError:
        print '[getEntropy] TEST FAILED'
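
The expected values are consistent with the classic 14-instance weather data (9 positive / 5 negative overall; splitting on Humidity gives 3+/4- vs 6+/1-), which test.arff appears to encode. As a quick arithmetic check of the two asserted numbers:

from math import log

def entropy(pos, neg):
    total = float(pos + neg)
    return sum(-c / total * log(c / total, 2) for c in (pos, neg) if c)

assert '%.3f' % entropy(9, 5) == '0.940'
assert '%.3f' % (entropy(9, 5) -
                 (7 / 14.) * entropy(3, 4) -
                 (7 / 14.) * entropy(6, 1)) == '0.152'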
Example #14
def utTreeSplit():
    """
    unit test for function [splitTree]
    """
    from data_provider import data_provider
    attribute, dataset = data_provider('../test.arff')
    root = TreeNode(dataset, attribute)
    curTree = DecisionTree(root)
    children = curTree.splitTree(root)
    try:
        assert ('{} {}'.format(
            children[0],
            children[0].classOutput) == 'Humidity high [4 3] negative')
        assert ('{} {}'.format(
            children[1],
            children[1].classOutput) == 'Humidity normal [1 6] positive')
        print '[splitTree] TEST PASS'
    except AssertionError:
        print '[splitTree] TEST FAILED'
Example #15
    def __init__(self):
        logger.info('\n' + '*' * 100 + '\n' + '******init******\n' + '*' * 100)
        self.dataset = 'mnist'
        self.batchsize = 128
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info('device:' + self.device)
        self.savefile_checkpoint = args.savefile + '/checkpoint'
        self.max_epoch = 100
        self.test_every_k_epoch = 1

        self.best_acc = 0  # best test accuracy
        self.start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        self.train_acc = 0

        self.train_data, self.test_data = data_provider(self.dataset,
                                                        args.data_path,
                                                        self.batchsize,
                                                        download=False)

        self.net = MyLeNet()

        self.criterion = nn.CrossEntropyLoss()
        self.weight_decay = 1e-4
        self.lr_weight = 10.
        self.lr = 0.1
        self.lr_drop = [
            30,
            60,
            80,
        ]

        logger.info('weight decay:' + str(self.weight_decay) + ', lr drop:' +
                    str(self.lr_drop))

        self.optimizer = optim.SGD(self.net.parameters(),
                                   lr=self.lr,
                                   momentum=0.9,
                                   weight_decay=self.weight_decay,
                                   nesterov=True)
Example #16
def nfoldTest(nfold=10):
    attributes, labels, instances = data_provider('../chess-KingRookVKingPawn.arff')
    cv = CrossValidate(nfold, instances, labels)
    accuracy = [{'bayes':0, 'tan':0} for _ in range(nfold)]
    models = {'bayes':Bayes, 'tan':TAN}
    for i in range(nfold):
        train, test = cv.fold(i)
        iTotal = len(test)
        for key in models:
            model = models[key](attributes, labels, train)
            for instance in test:
                result = model.classify(instance)
                print result[0], result[1]
                if result[0] == result[1]:
                    accuracy[i][key] += 1
        for key in accuracy[i]:
            accuracy[i][key] = float(accuracy[i][key]) / iTotal
    print accuracy

    fileout = open('output.txt', 'w+')
    for i in range(len(accuracy)):
        fileout.write('fold{} bayes:{:.16f} tan:{:.16f}\n'.format(i, accuracy[i]['bayes'], accuracy[i]['tan']))
    fileout.close()
Example #17
    def __init__(self):
        logger.info('\n' + '*' * 100 + '\n' + '******init******\n' + '*' * 100)
        self.dataset = 'cifar100' if 'cifar100' in args.savefile else 'cifar10'
        if self.dataset == 'cifar10':
            self.num_classes = 10
        else:
            self.num_classes = 100
        self.batchsize = 128
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info('device:' + self.device)
        self.savefile_checkpoint = args.savefile + '/checkpoint'
        self.max_epoch = 200
        self.test_every_k_epoch = 1

        self.choose_best_acc = False
        self.best_acc = 0  # best test accuracy
        self.start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        self.train_acc = 0
        self.test_acc = 0
        self.train_loss = 0
        arch = np.array([[16, 16, 5], [32, 32, 5], [64, 64, 5]])
        arch[:, 0:2] *= 2
        self.target_arch = np.array([[16, 16, 5], [32, 32, 5], [64, 64, 5]])
        self.target_times = 8
        self.gradual_arch = None
        # # CIFAR 10
        # # para 0.75
        # arch = np.array([[14, 26, 5], [24, 47, 5], [52, 60, 5]])
        # # para 0.5
        # arch = np.array([[12, 18, 5], [27, 36, 5], [65, 60, 5]])
        # # para 0.25
        # arch = np.array([[8, 16, 5], [21, 39, 5], [60, 87, 5]])
        # 6M
        # arch = np.array([[28, 192, 18], [78, 384, 18], [125, 322, 18]])
        # arch = np.array([[19, 181, 18], [42, 384, 18], [142, 380, 18]])
        # # 4.1M
        # arch = np.array([[41, 257, 5], [100, 415, 5], [206, 281, 5]])
        # # 2.5M
        # arch = np.array([[35, 164, 5], [70, 372, 5], [168, 224, 5]])
        # arch = np.array([[30, 116, 5], [62, 298, 5], [164, 392, 5]])
        # # CIFAR 100
        # # para 0.75
        # arch = np.array([[16, 23, 5], [24, 30, 5], [34, 106, 5]])
        # # para 0.5
        # arch = np.array([[10, 16, 5], [16, 33, 5], [44, 120, 5]])
        # # para 0.25
        # arch = np.array([[8, 8, 5], [14, 36, 5], [65, 120, 5]])

        self.train_data, self.test_data = data_provider(
            self.dataset, args.data_path, self.batchsize)
        # self.net = VGG('VGG19', self.num_classes)
        # self.block = PreActBottleneck
        self.block = PreActBlock
        self.net = PreActResNet(self.block, arch, self.num_classes)
        # self.net = MobileNetV2(num_classes=self.num_classes)
        self.para, self.flop = self.net.cost()
        logger.info('Para:' + str(self.para) + ', Flops:' + str(self.flop))
        self.criterion = nn.CrossEntropyLoss()
        self.warmup = 0
        self.weight_decay = 1e-4
        self.lr = 1.
        self.lr_drop = [0, 120, 160, 180]
        self.lr_weight = 10.
        # self.lr = 0.2
        # self.lr_drop = [0, 160, 180]
        # self.lr_weight = 2.
        logger.info('weight decay:' + str(self.weight_decay) + ', lr drop:' +
                    str(self.lr_drop))

        self.stream_epoch = 80
        self.prune_times = 96
        self.base_prune = 0
        self.dimension1 = [0, 1, 2]
        self.dimension2 = [0, 1]
        self.w_para = 0.5
        self.w_flop = 1. - self.w_para
        logger.info('stream epoch:' + str(self.stream_epoch) +
                    ', prune times:' + str(self.prune_times) +
                    ', prune base:' + str(self.base_prune) +
                    ', prune dimensions:' + str(self.dimension1) +
                    str(self.dimension2))
Example #18
	def __init__(self):
		self.data_provider = data_provider.data_provider()
Example #19
if __name__ == '__main__':
    try:
        assert (len(sys.argv) >= 4)
    except AssertionError:
        print >> sys.stderr, "[ERROR] you should provide at least 3 inputs!"
        sys.exit()
    trainFileName = sys.argv[1]
    testFileName = sys.argv[2]
    try:
        m = int(sys.argv[3])
    except:
        print >> sys.stderr, "[ERROR] [m] should be in integer!"
        sys.exit()

    attribute, trainset = data_provider(trainFileName)
    testAttribute, testset = data_provider(testFileName)
    try:
        assert (testAttribute == attribute)
    except AssertionError:
        print >> sys.stderr, "[ERROR] pls check the attributes of test data."
        sys.exit()

    # train
    root = TreeNode(trainset, attribute)
    curTree = DecisionTree(root)
    curTree.createTree(root, m)
    curTree.printTree(root, 0)

    # test
    print '<Predictions for the Test Set Instances>'
Example #20
def Run(input_path, output_path, do_bayes_opt, feature_key, epochs):
    """Runs the model selection and assessment of survivalnet.

    Arguments:
       input_path: str. Path to dataset. The input dataset in this script is
                   expected to be a mat file containing 'Survival' and 'Censored'
                   keys in addition to the feature_key.
       output_path: str. Path to save the model and results.
       do_bayes_opt: bool. Whether to do Bayesian optimization of hyperparams.
       feature_key: str. Key to the input data in the .mat file.
       epochs: int. Number of training epochs.
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # Loading dataset. The model requires a nxp matrix of input data, nx1 array
    # of time to event labels, and nx1 array of censoring status.
    T, C, X, _ = data_provider.data_provider(input_path)
    T = np.asarray(T).astype('float32')
    # C is censoring status where 1 means incomplete follow-up. We change it to
    # Observed status where 1 means death.
    O = 1 - np.asarray(C).astype('int32')
    X = np.asarray(X).astype('float32')

    # Optimization algorithm.
    opt = 'GDLS'

    # Pretraining settings
    # pretrain_config = {'pt_lr':0.01, 'pt_epochs':1000,
    #                    'pt_batchsize':None,'corruption_level':.3}
    pretrain_config = None  # No pre-training

    # The results in the paper are averaged over 20 random assignment of samples
    # to training/validation/testing sets.
    cindex_results = []
    avg_cost = 0
    for i in range(N_SHUFFLES):
        # Sets random generator seed for reproducibility.
        prng = np.random.RandomState(i)
        order = prng.permutation(np.arange(len(X)))
        X = X[order]
        O = O[order]
        T = T[order]

        # Uses the entire dataset for pretraining
        pretrain_set = X

        # 'fold_size' denotes the number of samples used for testing. The same
        # number of samples is used for model selection.
        fold_size = int(20 * len(X) / 100)  # 20% of the dataset.
        train_set = {}
        test_set = {}
        val_set = {}

        # Calculates the risk group for every patient i: patients whose time of
        # death is greater than that of patient i.
        sa = SurvivalAnalysis()
        train_set['X'], train_set['T'], train_set['O'], train_set[
            'A'] = sa.calc_at_risk(X[2 * fold_size:], T[2 * fold_size:],
                                   O[2 * fold_size:])
        test_set['X'], test_set['T'], test_set['O'], test_set[
            'A'] = sa.calc_at_risk(X[:fold_size], T[:fold_size], O[:fold_size])
        val_set['X'], val_set['T'], val_set['O'], val_set[
            'A'] = sa.calc_at_risk(X[fold_size:2 * fold_size],
                                   T[fold_size:2 * fold_size],
                                   O[fold_size:2 * fold_size])

        # Writes data sets for bayesopt cost function's use.
        with file('train_set', 'wb') as f:
            cPickle.dump(train_set, f, protocol=cPickle.HIGHEST_PROTOCOL)
        with file('val_set', 'wb') as f:
            cPickle.dump(val_set, f, protocol=cPickle.HIGHEST_PROTOCOL)

        if do_bayes_opt:
            print '***Model Selection with BayesOpt for shuffle', str(i), '***'
            _, bo_params = BayesOpt.tune()
            n_layers = int(bo_params[0])
            n_hidden = int(bo_params[1])
            do_rate = bo_params[2]
            nonlin = theano.tensor.nnet.relu if bo_params[3] > .5 else np.tanh
            lambda1 = bo_params[4]
            lambda2 = bo_params[5]
        else:
            n_layers = 1
            n_hidden = 100
            do_rate = 0.5
            lambda1 = 0
            lambda2 = 0
            nonlin = np.tanh  # or nonlin = theano.tensor.nnet.relu

        # Prints experiment identifier.
        expID = 'nl{}-hs{}-dor{}_nonlin{}_id{}'.format(str(n_layers),
                                                       str(n_hidden),
                                                       str(do_rate),
                                                       str(nonlin), str(i))

        finetune_config = {'ft_lr': 0.0001, 'ft_epochs': epochs}

        print '*** Model Assessment ***'
        _, train_cindices, _, test_cindices, _, _, model, _ = train(
            pretrain_set,
            train_set,
            test_set,
            pretrain_config,
            finetune_config,
            n_layers,
            n_hidden,
            dropout_rate=do_rate,
            lambda1=lambda1,
            lambda2=lambda2,
            non_lin=nonlin,
            optim=opt,
            verbose=True,
            earlystp=False)
        cindex_results.append(test_cindices[-1])
        avg_cost += test_cindices[-1]
        print expID, ' ', test_cindices[-1], 'average = ', avg_cost / (i + 1)
        print np.mean(cindex_results), np.std(cindex_results)
        with file(os.path.join(output_path, 'final_model'), 'wb') as f:
            cPickle.dump(model, f, protocol=cPickle.HIGHEST_PROTOCOL)

    outputFileName = os.path.join(output_path, 'c_index_list.mat')
    sio.savemat(outputFileName, {'c_index': cindex_results})
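
data_provider.data_provider is not shown in this example; a hedged sketch of a loader matching the docstring (a .mat file with 'Survival' and 'Censored' arrays plus an n x p feature matrix under feature_key) could look like this. The function name and the fourth return value are assumptions.

import scipy.io as sio

def load_survival_mat(path, feature_key):
    """Hypothetical loader for the .mat layout described in the docstring."""
    mat = sio.loadmat(path)
    T = mat['Survival'].ravel()   # time-to-event labels
    C = mat['Censored'].ravel()   # 1 means incomplete follow-up
    X = mat[feature_key]          # n x p feature matrix
    return T, C, X, mat           # the raw dict stands in for the discarded 4th value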
Example #21
lstm_args = {}
lstm_args['embed_num'] = max_idx
lstm_args['vec'] = vec
lstm_args['class_num'] = 2
lstm_args['cuda'] = torch.cuda.is_available()
lstm_args['hidden'] = args.num_hidden
lstm_args['embed_dim'] = EMBEDDING_DIM
lstm_args['dropout'] = args.dropout

# Initialise model
lstm = Model(lstm_args)
lstm = lstm.cuda()

optimizer = torch.optim.Adam(lstm.parameters(), lr=LEARNING_RATE)

threads, labels, features = data_provider(args.dataset_name)

# Extract features which are useful
X = features.to_numpy()[:, 2:]
t = [0, 1, 5, 7, 8, 14, 15]
X = X[:, t]
a = X[:, 2]
X[:, 2] = np.asarray([float(i - min(a)) / (max(a) - min(a)) for i in a])
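# The two lines above min-max scale the third selected feature (column 2) to [0, 1].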

# Split training data and testing data
rng = default_rng(seed=0)
num_tv = int(0.9 * len(threads))
idx = rng.choice(len(threads), size=num_tv, replace=False)

# Data for training and validation
X_tv = X[idx, :]
Example #22
 def __init__(self, offset, days):
     self.offset = offset
     self.days = days
     self.dp = data_provider.data_provider()
Example #23
    def __init__(self):
        logger.info('\n' + '*' * 100 + '\n' + '******init******\n' + '*' * 100)
        self.dataset = 'cifar100' if 'cifar100' in args.savefile else 'cifar10'
        if self.dataset == 'cifar10':
            self.num_classes = 10
        else:
            self.num_classes = 100
        self.batchsize = 256
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info('device:' + self.device)
        self.savefile_checkpoint = args.savefile + '/checkpoint'
        self.max_epoch = 200
        self.test_every_k_epoch = 1

        self.choose_best_acc = False
        self.best_acc = 0  # best test accuracy
        self.start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        self.train_acc = 0
        self.test_acc = 0
        self.train_loss = 0
        # arch = np.array([[38,22], [38, 28], [38, 20]])
        # nif = 38 * 2
        # arch = np.array([[38, 8], [38, 14], [38, 28], [38, 20]])
        # nif = 38*2
        # arch = np.array([[12, 16], [12, 16], [12, 16]])
        arch = np.array([[24, 16], [24, 16], [24, 16]])
        nif = 24
        # arch = np.array([[32, 6], [32, 12], [32, 24], [32, 16]])
        # nif = 32*2
        # self.pruned_arch = np.array(
        # [[23, 128, 5], [56, 183, 5], [114, 296, 5]]
        # )
        # arch = arch.astype(float)
        # arch[:, 0:2] *= 2
        # arch = arch.astype(int)
        self.warm_up = -1
        self.train_data, self.test_data = data_provider(
            self.dataset, args.data_path, self.batchsize)
        # self.net = VGG('VGG19', self.num_classes)
        # self.net = PreActResNet(PreActBottleneck, arch, self.num_classes)
        # self.net = MobileNetV2(arch, num_classes=self.num_classes, small_input=True)
        self.net = densenet(num_init_features=nif,
                            arch_set=arch,
                            num_classes=self.num_classes,
                            small_inputs=True)
        self.para, self.flop = self.net.cost()
        logger.info('Para:' + str(self.para) + ', Flops:' + str(self.flop))
        self.criterion = nn.CrossEntropyLoss()
        self.lr = 1.
        self.weight_decay = 1e-4
        self.lr_drop = [0, 120, 160, 180]
        logger.info('weight decay:' + str(self.weight_decay) + ', lr drop:' +
                    str(self.lr_drop))

        self.stream_epoch = -1
        self.prune_idx = 0
        self.prune_times = 96
        self.base_prune = 0
        self.dimension1 = [0, 1, 2]
        self.dimension2 = [0]
        self.w_para = 0.5
        self.w_flop = 0.5
        logger.info('stream epoch:' + str(self.stream_epoch) +
                    ', prune times:' + str(self.prune_times) +
                    ', prune base:' + str(self.base_prune) +
                    ', prune dimensions:' + str(self.dimension1) +
                    str(self.dimension2))

        self.stream_arch = {
            'arch': [arch],
            'para': [self.para],
            'flop': [self.flop],
            'cost': [0]
        }
        self.prenet = {
            'net':
            densenet(num_init_features=nif,
                     arch_set=arch,
                     num_classes=self.num_classes,
                     small_inputs=True),
            'acc':
            0,
            'cost':
            0,
            'gate_set':
            np.array(self.net.gate_set),
            'para':
            self.para,
            'flop':
            self.flop
        }

        self.bestnet = {
            'net':
            densenet(num_init_features=nif,
                     arch_set=arch,
                     num_classes=self.num_classes,
                     small_inputs=True),
            'acc':
            0,
            'cost':
            0,
            'gate_set':
            np.array(self.net.gate_set),
            'para':
            self.para,
            'flop':
            self.flop
        }

        self.current = {
            'net': self.net,
            'acc': 0,
            'cost': 0,
            'gate_set': self.net.gate_set,
            'para': self.para,
            'flop': self.flop
        }
Example #24
 def __init__(self):
     self.data_provider = data_provider.data_provider()
Example #25
    parse.add_argument('-m',
                       '--model',
                       type=str,
                       default='fc',
                       choices=['fc', 'res'])
    parse.add_argument('-e', '--epoch', type=int, default=300)
    parse.add_argument('-s', '--snapshotstep', type=int, default=10)
    parse.add_argument('-b', '--batchsize', type=int, default=-1)
    parse.add_argument('-v', '--valbatchsize', type=int, default=-1)
    parse.add_argument('-r', '--runid', type=str, default='dnn')
    parse.add_argument('-lr', '--learningrate', type=float, default=1e-3)
    return parse.parse_args()


args = get_args()
dp = data_provider.data_provider()

train_X, train_Y, train_W = *utils.list_to_array_train(
    dp.train[0]), dp.train[1]
train_W = train_W / np.sum(train_W)

val_X, val_Y, val_W = *utils.list_to_array_train(dp.val[0]), dp.val[1]
val_W = val_W / np.sum(val_W)

if args.batchsize > 0:
    cfg.train_batch_size = args.batchsize
if args.valbatchsize > 0:
    cfg.validation_batch_size = args.valbatchsize

net = model.network(checkpoint=args.checkpoint,
                    learning_rate=args.learningrate,
Example #26
    def __init__(self):
        logger.info('\n' + '*' * 100 + '\n' + '******init******\n' + '*' * 100)
        self.dataset = 'imagenet'
        self.num_classes = 1000
        self.batchsize = 128
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info('device:'+self.device)
        self.savefile_checkpoint = args.savefile + '/checkpoint'
        self.max_epoch = 100
        self.test_every_k_epoch = 1

        self.choose_best_acc = False
        self.best_acc = 0  # best test accuracy
        self.start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        self.train_acc = 0
        self.test_acc = 0
        self.train_loss = 0

        self.train_data, self.test_data = data_provider(self.dataset,
                                                        args.data_path,
                                                        self.batchsize,
                                                        n_threads=16)
        # self.net = VGG('VGG19', self.num_classes)
        # arch = np.array([[64, 256, 2], [128, 512, 2], [256, 1024, 2],
        #                  [512, 2048, 2]])
        # arch = np.array([[74, 296, 2], [148, 592, 2], [296, 1184, 2],
        #                  [592, 2368, 2]])
        arch = np.array([[64, 64, 2], [128, 128, 2], [256, 256, 2], [512, 512, 2]])
        # arch = np.array([[77, 77, 2], [154, 154, 2], [307, 307, 2], [614, 614, 2]])
        # self.block = PreActBlock
        self.block = PreActBottleneck
        self.net = PreActResNet(self.block, arch, self.num_classes)
        # self.net = PreActResNet(PreActBlock, [2, 2, 2, 2], self.num_classes)
        # self.net = PreActResNet(PreActBottleneck, [3, 4, 6, 3],
        #                         self.num_classes)
        # self.net = MobileNetV2(num_classes=self.num_classes)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = 1.
        self.weight_decay = 1e-4
        self.lr_drop = [0, 60, 80]
        self.lr_weight = 10.
        logger.info('weight decay:' + str(self.weight_decay) + ', lr drop:' +
                    str(self.lr_drop))

        self.para, self.flop = self.net.cost()
        logger.info('Para:' + str(self.para) + ', Flops:' + str(self.flop))
        logger.info('weight decay:' + str(self.weight_decay) + ', lr drop:' +
                    str(self.lr_drop))

        self.stream_epoch = 40
        self.prune_times = 96
        self.base_prune = 0
        self.dimension1 = [0, 1, 2, 3]
        self.dimension2 = [0, 1]
        self.w_para = 0.5
        self.w_flop = 1. - self.w_para
        logger.info('stream epoch:' + str(self.stream_epoch) +
                    ', prune times:' + str(self.prune_times) +
                    ', prune base:' + str(self.base_prune) +
                    ', prune dimensions:' + str(self.dimension1) +
                    str(self.dimension2))

        self.stream_arch = {
            'arch': [arch],
            'para': [self.para],
            'flop': [self.flop],
            'cost': [0]
        }
        self.prenet = {
            'net': PreActResNet(self.block, arch, self.num_classes),
            'acc': 0,
            'cost': 0,
            'gate_set': np.array(self.net.gate_set),
            'para': self.para,
            'flop': self.flop
        }

        self.bestnet = {
            'net': PreActResNet(self.block, arch, self.num_classes),
            'acc': 0,
            'cost': 0,
            'gate_set': np.array(self.net.gate_set),
            'para': self.para,
            'flop': self.flop
        }

        self.current = {
            'net': self.net,
            'acc': 0,
            'cost': 0,
            'gate_set': self.net.gate_set,
            'para': self.para,
            'flop': self.flop
        }
Example #27
def utStratify():
    attrNum, labels, instances = data_provider('../test.arff')
    cv = CrossValidate(6, instances, labels)
    print cv.mergefolds
    print len(cv.mergefolds)
Example #28
import scipy.io as sio
import survivalnet as sn
import data_provider
import numpy as np

# Integrated models.
# Defines model/dataset pairs.
ModelPaths = ['results/']
Models = ['final_model']
Data = ['./survivalData.csv']

# Loads datasets and performs feature analysis.
for i, Path in enumerate(ModelPaths):

    # Loads normalized data.
    Censored, Survival, Normalized, Symbols = data_provider.data_provider(
        Data[i])

    # Extracts relevant values.
    # Raw = None
    Raw = np.asarray([1, 2, 3]).astype(
        'float32')  # Just to pass something other than None

    # Loads model.
    f = open(Path + Models[i], 'rb')
    Model = pickle.load(f)
    f.close()

    sn.analysis.FeatureAnalysis(Model,
                                Normalized,
                                Raw,
                                Symbols,
Example #29
def main(model_options):
  
  print 'Loading data'
  dp = data_provider()
  dp.load_data(model_options['batch_size'], model_options['word_count_threshold'])
  dp.build_word_vocab()
  dp.group_train_captions_by_length()
  model_options['vocab_size'] = dp.get_word_vocab_size()

  print 'Building model'  
  # This creates the initial parameters as numpy ndarrays.
  generator = caption_generator()
  params = generator.init_params(model_options)
  save_n = {}
  save_n['checkpoint'] = 0
  save_n['prediction'] = 0
  
  # reload a saved checkpoint
  if model_options['reload_checkpoint_path']:
    _, save_n['checkpoint'] = utils.load_params(model_options['reload_checkpoint_path'], params)
    print 'Reloaded checkpoint from', model_options['reload_checkpoint_path']
  
  # This creates Theano shared variables from the parameters.
  # Dict name (string) -> Theano Tensor Shared Variable
  # params and tparams have different copies of the weights.
  tparams = utils.init_tparams(params)
  
  # use_noise is for dropout
  sents, mask, imgs, gt_sents, use_noise, cost = generator.build_model(tparams)
  grads = tensor.grad(cost, wrt=tparams.values())
  
  lr = tensor.scalar(name='lr')
  f_grad_shared, f_update = optimizers[model_options['optimizer']](lr, tparams, grads, sents, mask, imgs, gt_sents, cost)
  
  imgs_to_predict, predicted_indices, predicted_prob = generator.predict(tparams)
  f_pred = theano.function([imgs_to_predict], predicted_indices, name='f_pred')
  f_pred_prob = theano.function([imgs_to_predict], predicted_prob, name='f_pred_prob')
    
  train_iter = dp.train_iterator
  kf_valid = KFold(len(dp.split['val']), n_folds=len(dp.split['val']) / model_options['batch_size'], shuffle=False)
  
  if model_options['use_dropout'] == 1:
    use_noise.set_value(1.)
  else:
    use_noise.set_value(0.)
     
  print 'Optimization'
  
  uidx = 0
  lrate = model_options['lrate']
  # display print time duration
  dp_start = time.time()
  for eidx in xrange(model_options['max_epochs']):
    print 'Epoch ', eidx
    
    for batch_data in train_iter:
      uidx += 1
      
      # preparing the mini batch data
      pd_start = time.time()
      sents, sents_mask, imgs, gt_sents = dp.prepare_train_batch_data(batch_data)
      pd_duration = time.time() - pd_start
      
      if sents is None:
        print 'Minibatch is empty'
        continue
      
      # training on the mini batch
      ud_start = time.time()
      cost = f_grad_shared(sents, sents_mask, imgs, gt_sents)
      f_update(lrate)
      ud_duration = time.time() - ud_start
      
      # Numerical stability check
      if numpy.isnan(cost) or numpy.isinf(cost):
        print 'NaN detected'
      
      if numpy.mod(uidx, model_options['disp_freq']) == 0:
        dp_duration = time.time() - dp_start
        print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'Prepare data ', pd_duration, 'Update data ', ud_duration, '{0}_iter_time {1}'.format(model_options['disp_freq'], dp_duration)
        dp_start = time.time()

      # Log validation loss + checkpoint the model with the best validation log likelihood
      if numpy.mod(uidx, model_options['valid_freq']) == 0:
        scores = validate_and_save_checkpoint(model_options, dp, params, tparams, f_pred, f_pred_prob, kf_valid, save_n)
        print scores
  
  print 'Performing final validation'
  scores = validate_and_save_checkpoint(model_options, dp, params, tparams, f_pred, f_pred_prob, kf_valid, save_n)
  print scores
  print 'Done!!!'
Example #30
                negative + positive)
        return predictClass, actualClass, posteriorProb


def utMutulInformation(tan):
    """
        file: lymph_t_debug_output.txt
        Verbose output.
        Conditional mutual information graph:
        (row, column) = w : the conditional mutual information between the row-th and the column-th attribute is w.
    """
    for items in tan.edges:
        print ' '.join(['{}'.format(item) for item in items])


def utPrims(tan):
    print tan.graph


def utPrintTree(tan):
    tan.printTree()


if __name__ == '__main__':
    from data_provider import data_provider
    attributes, labels, instances = data_provider('../lymph_train.arff')
    tan = TAN(attributes, labels, instances)
    utMutulInformation(tan)
    utPrims(tan)
    utPrintTree(tan)
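
The edge weights described in utMutulInformation's docstring are conditional mutual information values I(Xi; Xj | Y). A minimal count-based sketch of that quantity (plain maximum-likelihood estimates, without the Laplace smoothing a TAN implementation would typically add) is:

from collections import Counter
from math import log

def conditional_mutual_information(xi, xj, y):
    """Sketch: I(Xi; Xj | Y) from three parallel lists of discrete values."""
    n = float(len(y))
    pxy = Counter(zip(xi, xj, y))
    pxiy = Counter(zip(xi, y))
    pxjy = Counter(zip(xj, y))
    py = Counter(y)
    cmi = 0.
    for (a, b, c), cnt in pxy.items():
        p_abc = cnt / n                                # p(xi, xj, y)
        p_ab_given_c = cnt / float(py[c])              # p(xi, xj | y)
        p_a_given_c = pxiy[(a, c)] / float(py[c])      # p(xi | y)
        p_b_given_c = pxjy[(b, c)] / float(py[c])      # p(xj | y)
        cmi += p_abc * log(p_ab_given_c / (p_a_given_c * p_b_given_c), 2)
    return cmi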
Example #31
    def __init__(self):
        logger.info('\n' + '*' * 100 + '\n' + '******init******\n' + '*' * 100)
        self.dataset = 'cifar100' if 'cifar100' in args.savefile else 'cifar10'
        if self.dataset == 'cifar10':
            self.num_classes = 10
        else:
            self.num_classes = 100
        self.batchsize = 128
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info('device:' + self.device)
        self.savefile_checkpoint = args.savefile + '/checkpoint'
        self.max_epoch = 200
        self.test_every_k_epoch = 1

        self.choose_best_acc = False
        self.best_acc = 0  # best test accuracy
        self.start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        self.train_acc = 0
        self.test_acc = 0
        self.train_loss = 0
        rate = 1
        arch = np.array([[16 * rate, 16, 18], [32 * rate, 32, 18],
                         [64 * rate, 64, 18]])
        arch[:, 0:2] *= 2

        self.train_data, self.test_data = data_provider(
            self.dataset, args.data_path, self.batchsize)
        # self.net = VGG('VGG19', self.num_classes)
        # self.block = PreActBottleneck
        self.block = PreActBlock
        # self.net = PreActResNet(self.block, arch, self.num_classes, dp=0.0)
        self.net = PreActResNet(self.block, arch, self.num_classes)
        # self.net = MobileNetV2(num_classes=self.num_classes)
        self.criterion = nn.CrossEntropyLoss()
        self.warmup = 0
        self.weight_decay = 1e-4
        self.lr = 1.
        self.lr_drop = [0, 120, 160, 180]
        self.lr_weight = 10.
        # self.lr = 0.2
        # self.lr_drop = [0, 160, 180]
        # self.lr_weight = 2.
        self.para, self.flop = self.net.cost()
        logger.info('Para:' + str(self.para) + ', Flops:' + str(self.flop))
        logger.info('weight decay:' + str(self.weight_decay) + ', lr drop:' +
                    str(self.lr_drop))

        self.stream_epoch = 80
        self.prune_times = 96
        self.base_prune = 0
        self.dimension1 = [0, 1, 2]
        self.dimension2 = [0, 1]
        self.densityprune = 0.3
        self.eachprune = 0.05
        self.w_para = 0.5
        self.w_flop = 1. - self.w_para
        logger.info('stream epoch:' + str(self.stream_epoch) +
                    ', prune times:' + str(self.prune_times) +
                    ', prune base:' + str(self.base_prune) +
                    ', prune dimensions:' + str(self.dimension1) +
                    str(self.dimension2))

        self.stream_arch = {
            'arch': [arch],
            'para': [self.para],
            'flop': [self.flop],
            'cost': [0]
        }
        self.prenet = {
            'net': PreActResNet(self.block, arch, self.num_classes),
            'acc': 0,
            'cost': 0,
            'gate_set': np.array(self.net.gate_set),
            'para': self.para,
            'flop': self.flop
        }

        self.bestnet = {
            'net': PreActResNet(self.block, arch, self.num_classes),
            'acc': 0,
            'cost': 0,
            'gate_set': np.array(self.net.gate_set),
            'para': self.para,
            'flop': self.flop
        }

        self.current = {
            'net': self.net,
            'acc': 0,
            'cost': 0,
            'gate_set': self.net.gate_set,
            'para': self.para,
            'flop': self.flop
        }