Example #1
def evaluate(inp, client, l, DBlocation):
    inp = inp.split(' ')
    status = client.status()

    # drop a trailing empty token (e.g. when the command ends with a space)
    if len(inp) > 1 and not inp[1]:
        inp.pop()

    if inp[0] == 'p' or inp[0] == 'play':
        try:
            if not status['state'] == 'stop':
                if len(inp) == 1:
                    util.pause(client)
                else:
                    util.play(client, int(inp[1]))
            else:
                if len(inp) == 1:
                    util.play(client, 0)
                else:
                    util.play(client, int(inp[1]))
        except (ValueError, IndexError):
            print('mpd error: bad song index')
    elif inp[0] == 'pause':
        util.pause(client)
    elif inp[0] == 'next' or inp[0] == 'n':
        util.next(client)
    elif inp[0] == 'previous' or inp[0] == 'ps':
        util.previous(client)
    elif inp[0] == 'stop':
        util.stop(client)
    elif inp[0] == 'pl' or inp[0] == 'playlist':
        util.print_playlist(client)
    elif inp[0] == 'update' or inp[0] == 'u':
        util.update(client)
    elif inp[0] == 'clear':
        util.clear(client)
    elif inp[0] == 'random':
        util.mpdrandom(client, inp[1])
    elif inp[0] == 'shuffle':
        util.shuffle(client)
    elif inp[0] == 'consume':
        util.consume(client, inp[1])
    elif inp[0] == 'swap':
        util.swap(client, int(inp[1]) - 1, int(inp[2]) - 1)
    elif inp[0] == 'single':
        util.single(client, inp[1])
    elif inp[0] == 'search' or inp[0] == 's':
        if '-f' in inp or '--filter' in inp:
            l = util.mpdsearch(inp[1], inp, DBlocation, True)
        else:
            l = util.mpdsearch(inp[1], inp, DBlocation, False)
    elif inp[0] == 'a' or inp[0] == 'add':
        if l:
            for line in l:
                client.add(line)
        else:
            print('You have to search first!')
    elif inp[0] == 'q' or inp[0] == 'quit':
        quit()
    return l
Example #2
    def __init__(self,
                 classified_data_list,
                 kernel,
                 svm_constructor=LibSvmClassifier,
                 ensemble_size=3):
        self._classifiers = []

        for _ in range(ensemble_size):
            data = classified_data_list.copy()
            util.shuffle(data)
            util.take_n(data, 10)
            self._classifiers.append(svm_constructor(data, kernel))
Example #3
    def add(self, rect):
        if len(self.pieces) < 7 and self.selected is None:
            self.pieces.insert(randint(0, len(self.pieces)), rect)
        elif len(self.pieces) < 7 and self.selected is not None:
            self.pieces.append(rect)
        else:
            self.extras.append(rect)

        if self.selected is None and self.shuffle >= 1:
            shuffle(self.pieces)
            self.shuffle -= 1
        self.realign()
Example #4
def load_data():
    with open(os.path.join(data_dir, dataset, 'train.txt')) as f:
        train_data = np.loadtxt(f)
    with open(os.path.join(data_dir, dataset, 'val.txt')) as f:
        val_data = np.loadtxt(f)
    with open(os.path.join(data_dir, dataset, 'test.txt')) as f:
        test_data = np.loadtxt(f)
    train_x, train_y = train_data[:,:-1], train_data[:,-1]
    val_x, val_y = val_data[:,:-1], val_data[:,-1]
    test_x, test_y = test_data[:,:-1], test_data[:,-1]

    train_x, train_y = util.shuffle(train_x, train_y)
    val_x, val_y = util.shuffle(val_x, val_y)
    return train_x, train_y, val_x, val_y, test_x, test_y
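Most of these snippets call a project-local util.shuffle rather than random.shuffle; the two-argument form above shuffles parallel arrays in unison. A minimal sketch of such a helper, assuming it returns shuffled copies (hypothetical; each project's util module differs):

import random

import numpy as np


def shuffle_pair(x, y, to_numpy_array=False):
    # Apply one shared permutation so row i of x still matches row i of y.
    idx = list(range(len(x)))
    random.shuffle(idx)
    x_s = [x[i] for i in idx]
    y_s = [y[i] for i in idx]
    if to_numpy_array:
        return np.asarray(x_s), np.asarray(y_s)
    return x_s, y_s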
Example #5
def classify(sess, pos_data, neg_data, pos_label, neg_label, pos_data_test, neg_data_test, pos_label_test, neg_label_test, **kwargs):
    net = multi_classifier(sess, **kwargs)

    for i in range(kwargs['epoch']):
        train_neg_data, train_neg_label = util.shuffle(neg_data, neg_label)
        trainData = np.concatenate((pos_data, train_neg_data))
        trainLabel = np.concatenate((pos_label, train_neg_label))
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        net.train_epoch(sess, trainData, trainLabel, **kwargs)
        # print(kwargs['trainNum'], "train", net.train_epoch(sess, trainData, trainLabel, **kwargs))
        # print(kwargs['trainNum'], "test", net.test(sess, testData, testLabel))
        # print(net.inference(sess, testData))
    print(kwargs['trainNum'], "test_pos", net.test(sess, pos_data_test, pos_label_test))
    print(kwargs['trainNum'], "test_neg", net.test(sess, neg_data_test, neg_label_test))
    net.save_model(sess, **kwargs)
Example #6
def knn(trainData, trainLabel, testData, testLabel, **kwargs):
    print(kwargs)
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)
    acc_list = []
    ret = []
    acc_max = 0
    for i in range(10):
        trainData_shuffle, trainLabel_shuffle = util.shuffle(
            trainData, trainLabel)
        neigh = KNeighborsClassifier(n_neighbors=kwargs['n_neighbors'],
                                     weights=kwargs['weights'],
                                     p=kwargs['p'])
        if kwargs['PCA']:
            pca = PCA(n_components=kwargs['n_components'])
            trainData_shuffle = pca.fit_transform(trainData_shuffle)
            neigh.fit(trainData_shuffle, trainLabel_shuffle)
            testData_PCA = pca.transform(testData)
            acc_i = neigh.score(testData_PCA, testLabel)
            if acc_i > acc_max:
                acc_max = acc_i
                ret = neigh.predict(testData_PCA)
        else:
            neigh.fit(trainData_shuffle, trainLabel_shuffle)
            acc_i = neigh.score(testData, testLabel)
            if acc_i > acc_max:
                acc_max = acc_i
                ret = neigh.predict(testData)
        print("%d acc: " % i, acc_i)
        acc_list.append(acc_i)
    acc = np.mean(np.array(acc_list))
    print("KNN accuracy: ", acc)
    return ret, acc_max
Example #7
def calc_cross_validated_beta(x_full, y_full, lam, step_size, iterations,
        weight_step, k, use_nll, plot_nll):
    #shuffle x_full and y_full so we can crossvalidate
    feature_count = len(x_full[0])
    x_full, y_full = util.shuffle(x_full, y_full, to_numpy_array = True)
    beta_all = np.zeros(shape=(k, feature_count))
    validation_error_rates = np.empty(shape=k)
    nll = [None]*k
    for i in xrange(k):
        x_train, x_test = extract_fold(x_full, i, k)
        y_train, y_test = extract_fold(y_full, i, k)
        #This alters beta_all and possibly nll
        nll[i], beta_all[i] = run_batch_gradient_descent(x_train, y_train, lam,
                step_size, iterations, weight_step, use_nll = use_nll)
        test_labels_calc = calc_labels(x_test, beta_all[i])
        validation_error_rates[i] = calc_error_rate(test_labels_calc, y_test)
        print 'cross-validation error rate', validation_error_rates[i]
        training_labels = calc_labels(x_train, beta_all[i])
        print 'training error rate', calc_error_rate(training_labels, y_train),
        full_labels = calc_labels(x_full, beta_all[i])
        print 'full', calc_error_rate(full_labels, y_full)
        if plot_nll and use_nll:
            plot_nll_data(nll[i], 'derp')
    #Take the average beta among all betas calculated during cross-validation
    #beta = np.sum(beta_all, axis=0)/float(len(beta_all))
    if use_nll:
        for i in xrange(len(validation_error_rates)):
            print i, nll[i][-1], validation_error_rates[i]
    print 'avg error rate', np.mean(validation_error_rates)
    return beta_all
Example #8
def like(job, session=None):
    count = 0
    try:
        insta = Insta()
        insta.login(username=job.i_user.username,
                    password=job.i_user.get_password())
        time.sleep(1)

        # get the user's tags and shuffle them
        tag_names = [str(tag) for tag in job.i_user.tags]
        tags = shuffle(tag_names)
        for tag in tags:
            insta.search(tag)
            count += insta.like_tag(tag)
            time.sleep(5)

    except Exception as e:
        job.error = '{}: {}'.format(type(e), e)

    job.count = count
    job.finish()

    # new run for jobs
    new_job = schedule_next_job(job, rando_hour())
    session.add(new_job)
    session.commit()

    insta.driver.quit()
    return job
Example #9
def calc_cross_validated_beta(x_full, y_full, lam, step_size, iterations,
                              weight_step, k, use_nll, plot_nll):
    #shuffle x_full and y_full so we can crossvalidate
    feature_count = len(x_full[0])
    x_full, y_full = util.shuffle(x_full, y_full, to_numpy_array=True)
    beta_all = np.zeros(shape=(k, feature_count))
    validation_error_rates = np.empty(shape=k)
    nll = [None] * k
    for i in xrange(k):
        x_train, x_test = extract_fold(x_full, i, k)
        y_train, y_test = extract_fold(y_full, i, k)
        #This alters beta_all and possibly nll
        nll[i], beta_all[i] = run_batch_gradient_descent(x_train,
                                                         y_train,
                                                         lam,
                                                         step_size,
                                                         iterations,
                                                         weight_step,
                                                         use_nll=use_nll)
        test_labels_calc = calc_labels(x_test, beta_all[i])
        validation_error_rates[i] = calc_error_rate(test_labels_calc, y_test)
        print 'cross-validation error rate', validation_error_rates[i]
        training_labels = calc_labels(x_train, beta_all[i])
        print 'training error rate', calc_error_rate(training_labels, y_train),
        full_labels = calc_labels(x_full, beta_all[i])
        print 'full', calc_error_rate(full_labels, y_full)
        if plot_nll and use_nll:
            plot_nll_data(nll[i], 'derp')
    #Take the average beta among all betas calculated during cross-validation
    #beta = np.sum(beta_all, axis=0)/float(len(beta_all))
    if use_nll:
        for i in xrange(len(validation_error_rates)):
            print i, nll[i][-1], validation_error_rates[i]
    print 'avg error rate', np.mean(validation_error_rates)
    return beta_all
Example #10
def bayes(trainData, trainLabel, testData, testLabel, **kwargs):
    print(kwargs)
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)
    acc_list = []
    acc_max = 0
    ret = []
    for i in range(10):
        trainData_shuffle, trainLabel_shuffle = util.shuffle(
            trainData, trainLabel)
        clf = GaussianNB()
        if kwargs['PCA']:
            pca = PCA(n_components=kwargs['n_components'])
            trainData_shuffle = pca.fit_transform(trainData_shuffle)
            clf.fit(trainData_shuffle, trainLabel_shuffle)
            testData_PCA = pca.transform(testData)
            acc_i = clf.score(testData_PCA, testLabel)
            if acc_i > acc_max:
                acc_max = acc_i
                ret = clf.predict(testData_PCA)
        else:
            clf.fit(trainData_shuffle, trainLabel_shuffle)
            acc_i = clf.score(testData, testLabel)
            if acc_i > acc_max:
                acc_max = acc_i
                ret = clf.predict(testData)
        print("%d acc: " % i, acc_i)
        acc_list.append(acc_i)
    acc = np.mean(np.array(acc_list))
    print("Naive Bayes accuracy: ", acc)
    return ret, acc_max
Example #11
def train(train_imgs, model, sess):
	n_train = len(train_imgs)
	for epoch in range(EPOCH_NUM):
		imgs = util.shuffle(train_imgs)
		loss = 0.
		step = 0
		for start in range(0, n_train, BATCH_SIZE):
			end = min(start + BATCH_SIZE, n_train)
			batch = imgs[start:end]
			o, l, _ = sess.run([model.outs, model.losses, model.optim],
				feed_dict={
					model.inputs: util.get_input_imgs(batch),
					model.labels: batch
				})
			loss += l*(end-start)

			if epoch%2==0 and step==0: # save outputs of 1st batch
				out_imgs = util.arrays2imgs(o[:5])
				for i, img in enumerate(out_imgs):
					cv2.imwrite(OUT_IMG_DIR_INT + 'e%d_%d.jpg' % (epoch, i), img)

			if step%100==0: 
				print('.. Step %5d, loss: %.5f' % (step, l))
			step += 1

		print('Epoch %3d >> avg_loss: %.5f' % (epoch, loss/n_train))

	return model
Example #12
def testRBM(opts):
    """show how to use RBM to do classification"""
    # read data
    data = np.load(opts.feature)
    label = np.load(opts.label)

    # set the nodes of hidden layers
    nHid = 1000

    # shuffle data and label
    [data, label] = util.shuffle(data, label)

    # decide how many samples to be used as training set
    percent = float(opts.trainPercent)
    nCase = data.shape[0]

    nTrain = int(nCase * percent)
    nTest = nCase - nTrain

    # split data and label into  train dataset and test dataset
    trainData = data[0:nTrain, :]
    trainLabel = label[0:nTrain, :]
    example = data[nTrain:, :]
    testLabel = label[nTrain:, :]

    p = {"maxEpoch": opts.maxEpoch}

    m = rbmFit.rbmFit(trainData,
                      nHid,
                      trainLabel,
                      isSaveModel=True,
                      name=opts.model,
                      **p)

    [trainR, F1] = rbmPredict.rbmPredict(m, trainData)
    [testR, F2] = rbmPredict.rbmPredict(m, example)

    trainK = 0
    for x in range(nTrain):
        if trainLabel[x] != trainR[x]:
            trainK = trainK + 1

    testK = 0
    for x in range(nTest):
        if testLabel[x] != testR[x]:
            testK = testK + 1

    print "---------------------------------------"
    print "train classification rate : %f " % (1 - trainK * 1.0 / nTrain)
    print "test  classification rate : %f " % (1 - testK * 1.0 / nTest)
    print "---------------------------------------"

    if opts.isSaveResult:
        result = shelve.open(opts.resultName)
        result["nHid"] = nHid
        result["maxEpoch"] = opts.maxEpoch
        result["trainAcc"] = 1 - trainK * 1.0 / nTrain
        result["testAcc"] = 1 - testK * 1.0 / nTest
        result.close()
Example #13
    def __init__(self):
        self.gamemode = 'matchmake'

        # players -> {name: {'hand':list of id of item_card in hand, 'scored_accident': list of id of got accident_card}}
        self.players = {}

        self.accident_map = daily_accidents

        self.item_map = daily_items

        self.accidents = deque(shuffle(self.accident_map))
        self.items = deque(shuffle(self.item_map))

        self.onfield_accident = None
        self.onfield_item = []
        self.now_use_item_num = None
        self.item_pool = []
        self.accident_pool = []
Example #14
def run_decision_trees(x_train, y_train, x_test, y_test, args):

    #Allow validation without external testing data
    if x_test is None or np.array_equal(x_test, x_train):
        if args['shuffle']:
            x_train, y_train = util.shuffle(x_train,
                                            y_train,
                                            to_numpy_array=True)
        print 'no test data found'
        if args['validate'] > 0:
            validation_size = args['validate']
            crashes = x_train[validation_size:]
            labels = y_train[validation_size:]
            crashes_validate = x_train[:validation_size]
            labels_validate = y_train[:validation_size]
        elif args['validate'] == 0:
            crashes = x_train
            labels = y_train
            crashes_validate = crashes
            labels_validate = labels
        else:
            #Check the training set error rate - useful for debugging
            validation_size = args['validate']
            crashes = x_train[-validation_size:]
            labels = y_train[-validation_size:]
            crashes_validate = crashes
            labels_validate = labels

        x_test = crashes_validate
    else:
        if args['shuffle']:
            x_train, y_train = util.shuffle(x_train,
                                            y_train,
                                            to_numpy_array=True)
        crashes = x_train
        labels = y_train
        crashes_validate = x_test
        labels_validate = y_test

    if args['tree_size'] == 0:
        args['tree_size'] = len(crashes)

    decision_trees.do_stuff(crashes, labels, crashes_validate, labels_validate,
                            x_test, args)
Example #15
    def run(self):
        for _ in range(10):
            util.shuffle(self.classified_data_list)
            for i in range(0, self.folds):
                self.train_test_pairs[i] = [
                    _split_train_test(d, i, self.folds)
                    for d in self.classified_data_list
                ]

            for i in range(0, self.folds):
                classifier = self._get_classifier(
                    [train_set for (train_set, _) in self.train_test_pairs[i]])
                test_set = [(self.train_test_pairs[i][j][1], j)
                            for j in range(0, self.num_class)]

                for (rows, gt) in test_set:
                    for d in rows:
                        class_of_data = classifier.classify(d)
                        self.confusions[class_of_data][gt] += 1
Example #16
def run_logistic_regression(x_train, y_train, x_test, y_test, args):
    print 'data loaded'
    if args['method'] == 'logistic-plot':
        logistic_regression.plot_batch_gradient_descent(
            x_train,
            y_train,
            lam=args['lambda'],
            step_size=args['step_size'],
            iterations=args['iterations'],
            weight_step=False)
    elif args['method'] == 'logistic':
        x_train = logistic_regression.standardize_data(x_train)
        x_test = logistic_regression.standardize_data(x_test)

        if args['beta_file'] is None:
            beta = logistic_regression.calc_cross_validated_beta(
                x_train,
                y_train,
                lam=args['lambda'],
                step_size=args['step_size'],
                iterations=args['iterations'],
                weight_step=False,
                k=args['k'],
                use_nll=args['use_nll'],
                plot_nll=args['plot_nll'])
        else:
            inputfile = open(args['beta_file'], 'rb')
            beta = cPickle.load(inputfile)

        #Save beta
        if args['store_beta']:
            beta_dumpfile = open(
                'beta{0}{1}.pkl'.format(datetime.now().hour,
                                        datetime.now().minute), 'wb')
            cPickle.dump(beta, beta_dumpfile)

        beta = np.sum(beta, axis=0) / float(len(beta))
        training_labels = logistic_regression.calc_labels(x_train, beta)
        training_error = logistic_regression.calc_error_rate(
            training_labels, y_train)
        print 'training error rate', training_error

        testing_labels = logistic_regression.calc_labels(x_test, beta)
        testing_error = logistic_regression.calc_error_rate(
            testing_labels, y_test)
        print 'testing error rate', testing_error
        logistic_regression.write_labels(x_test, beta)

    elif args['method'] == 'logistic-sklearn':
        x_train = logistic_regression.standardize_data(x_train)
        x_test = logistic_regression.standardize_data(x_test)
        x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array=True)
        logistic = LogisticRegression()
        logistic.fit(x_train, y_train)
        print logistic.score(x_test, y_test)
Example #17
	def get(self,id):
		questionnaire = db.get_questionnaire(id)
		self.req_parser = reqparse.RequestParser()
		self.req_parser.add_argument('shuffle', default=0, type=int, required=False)
		self.req_parser.add_argument('size', default=len(questionnaire['questions']), type=int, required=False)
		self.args = self.req_parser.parse_args()
		if self.args['shuffle'] != 0:
			ordered_questions = questionnaire['questions']
			quizz_questions = util.shuffle(ordered_questions, self.args['size'])
			questionnaire['questions'] = quizz_questions
		return questionnaire
Example #18
def testRBM(opts) :
    """show how to use RBM to do classification"""
    # read data
    data  = np.load(opts.feature)
    label = np.load(opts.label)

    # set the nodes of hidden layers
    nHid = 1000

    # shuffle data and label
    [data, label] = util.shuffle(data, label)

    # decide how many samples to be used as training set
    percent = float(opts.trainPercent)
    nCase   = data.shape[0]

    nTrain = int(nCase * percent)
    nTest = nCase - nTrain

    # split data and label into  train dataset and test dataset
    trainData  = data[0:nTrain, :]
    trainLabel = label[0:nTrain, :]
    example   = data[nTrain:, :]
    testLabel  = label[nTrain:, :]

    p = {"maxEpoch" : opts.maxEpoch}

    m = rbmFit.rbmFit(trainData, nHid, trainLabel, isSaveModel=True, name=opts.model, **p)
    
    [trainR, F1] = rbmPredict.rbmPredict(m, trainData)
    [testR, F2] = rbmPredict.rbmPredict(m, example)
	
    trainK = 0
    for x in range(nTrain) :
        if trainLabel[x] != trainR[x] :
            trainK = trainK + 1

    testK = 0
    for x in range(nTest) :
        if testLabel[x] != testR[x] :
            testK = testK+1

    print "---------------------------------------"
    print "train classification rate : %f " % (1-trainK*1.0/nTrain)
    print "test  classification rate : %f " % (1-testK*1.0/nTest)
    print "---------------------------------------"

    if opts.isSaveResult :
        result = shelve.open(opts.resultName)
        result["nHid"]     = nHid
        result["maxEpoch"] = opts.maxEpoch
        result["trainAcc"] = 1-trainK*1.0/nTrain
        result["testAcc"]  = 1-testK*1.0/nTest
        result.close()
Example #19
def run_decision_trees(x_train, y_train, x_test, y_test, args):


    #Allow validation without external testing data
    if x_test is None or np.array_equal(x_test,x_train):
        if args['shuffle']:
            x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array = True)
        print 'no test data found'
        if args['validate']>0:
            validation_size = args['validate']
            crashes = x_train[validation_size:]
            labels = y_train[validation_size:]
            crashes_validate = x_train[:validation_size]
            labels_validate = y_train[:validation_size]
        elif args['validate']==0:
            crashes = x_train
            labels = y_train
            crashes_validate = crashes
            labels_validate = labels
        else:
            #Check the training set error rate - useful for debugging 
            validation_size = args['validate']
            crashes = x_train[-validation_size:]
            labels = y_train[-validation_size:]
            crashes_validate = crashes
            labels_validate = labels

        x_test = crashes_validate
    else:
        if args['shuffle']:
            x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array = True)
        crashes = x_train
        labels = y_train
        crashes_validate = x_test
        labels_validate = y_test

    if args['tree_size']==0:
        args['tree_size'] = len(crashes)

    decision_trees.do_stuff(crashes, labels, crashes_validate,
            labels_validate, x_test, args)
Example #20
def train_novel_classifier(sess, trainData, trainLabel, testData, testLabel, **kwargs):
    net = multi_classifier(sess, test=False, **kwargs)

    for i in range(kwargs['epoch']):
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        # net.train_epoch(sess, trainData, trainLabel, **kwargs)
        print(i, "train", net.train_epoch(sess, trainData, trainLabel, **kwargs))
        print(i, "test", net.test(sess, testData, testLabel))
        # print(net.inference(sess, testData))
    # print(kwargs['trainNum'], "test_pos", net.test(sess, pos_data_test, pos_label_test))
    # print(kwargs['trainNum'], "test_neg", net.test(sess, neg_data_test, neg_label_test))
    net.save_model(sess, **kwargs)
Example #21
    def run(self):
        while True:
            # listen for packets
            packet, addr = self.incoming.recvfrom(UDP_MTU)

            # if the src addr of the last packet is the same as the addr of the
            # next hop, then this packet is a response, otherwise a mix fragment
            if addr == self.next_addr:
                self.handle_response(packet)
            else:
                if self.mix_addr is None:
                    self.mix_addr = addr
                self.handle_mix_fragment(packet)

            # send out requests
            if len(ChannelMid.requests) >= STORE_LIMIT:
                # mix packets before sending
                shuffle(ChannelMid.requests)
                # send STORE_LIMIT packets
                for _ in range(STORE_LIMIT):
                    # use bound socket to send packets
                    packet = ChannelMid.requests.pop()

                    enc_packet = self.request_link_encryptor.encrypt(packet)

                    print(self, "Data/Init", "->", len(enc_packet))
                    self.incoming.sendto(enc_packet, self.next_addr)

            # send out responses
            if len(ChannelMid.responses) >= STORE_LIMIT:
                # mix packets before sending
                shuffle(ChannelMid.responses)
                # send STORE_LIMIT packets
                for _ in range(STORE_LIMIT):
                    packet = ChannelMid.responses.pop()

                    enc_packet = self.response_link_encryptor.encrypt(packet)

                    print(self, "Data/Init", "<-", len(enc_packet))
                    self.incoming.sendto(enc_packet, self.mix_addr)
Example #22
    def test_shuffle(self):
        print util.shuffle('ae')
        print util.shuffle('ate')
        f = util.shuffle('aest')
        print len(f), f

        f = util.shuffle('etaelehoyr')
        print len(f)
Example #23
def prepare_data(df):
    df = u.shuffle(df, 999)
    X, Y = u.xy(df)

    scaler = preprocessing.MinMaxScaler()
    scaler.fit(X)
    df_tr, df_te = u.split(df, 0.75)

    X_tr, Y_tr = u.xy(df_tr)
    X_te, Y_te = u.xy(df_te)

    X_te_norm = scaler.transform(X_te)
    X_tr_norm = scaler.transform(X_tr)
    return X_tr_norm, Y_tr, X_te_norm, Y_te
Example #24
    def __init__(self):
        alldata, alltargets = [], []
        with open('./data/review_data') as f:
            alldata = pickle.load(f)
        with open('./data/review_targets') as f:
            alltargets = pickle.load(f)

        p = range(len(alldata))
        random.seed(0)
        random.shuffle(p)
        shuffle = lambda l: [l[p[i]] for i in range(len(p))]
        alldata = shuffle(alldata)
        alltargets = shuffle(alltargets)

        train_size = 0.6
        self.train_data, self.train_target = \
          subset(alldata, alltargets, 0, 0.6)
        self.alltest_data, self.alltest_target = \
          subset(alldata, alltargets, 0.6, 1)

        self.num_classes = 3

        DataGatherer.__init__(self)
Example #25
  def __init__(self):
    alldata, alltargets = [], []
    with open('./data/review_data') as f:
      alldata = pickle.load(f)
    with open('./data/review_targets') as f:
      alltargets = pickle.load(f)

    p = range(len(alldata))
    random.seed(0)
    random.shuffle(p)
    shuffle = lambda l: [l[p[i]] for i in range(len(p))]
    alldata = shuffle(alldata)
    alltargets = shuffle(alltargets)

    train_size = 0.6
    self.train_data, self.train_target = \
      subset(alldata, alltargets, 0, 0.6)
    self.alltest_data, self.alltest_target = \
      subset(alldata, alltargets, 0.6, 1)

    self.num_classes = 3

    DataGatherer.__init__(self)
Example #26
def train_base_classifier(sess, trainData, trainLabel, trainIndex, **kwargs):
    for i, indexlist in enumerate(trainIndex):
        pos_data = trainData[indexlist]
        pos_label = [1] * len(indexlist)
        templist = list(np.arange(0, len(trainData)))
        for l in indexlist:
            templist.remove(l)
        neg_data = trainData[templist]
        neg_label = [0] * len(templist)
        data = np.concatenate((pos_data, neg_data))
        label = np.concatenate((pos_label, neg_label))
        data, label = util.shuffle(data, label)
        classify(sess, data, label, None, None, trainNum=i, test=False, **kwargs)
        tf.get_variable_scope().reuse_variables()
Example #27
def svm(trainData, trainLabel, testData, testLabel, **kwargs):
    print(kwargs)
    #neigh = KNeighborsClassifier(n_neighbors=kwargs['n_neighbors'], weights=kwargs['weights'], p=kwargs['p'])
    #trainData = util.normalization(trainData.reshape((trainData.shape[0], trainData.shape[1] * trainData.shape[2] * trainData.shape[3])))
    #testData = util.normalization(testData.reshape((testData.shape[0], testData.shape[1] * testData.shape[2] * testData.shape[3])))
    linearSVC_clf = LinearSVC()
    acc_list = []
    acc_max = 0
    ret = []
    shuffleTimes = 1
    for i in range(shuffleTimes):
        print(i+1, '/', shuffleTimes)
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        linearSVC_clf.fit(trainData, trainLabel)
        acc_i = linearSVC_clf.score(testData, testLabel)
        acc_list.append(acc_i)
        print("LinearSVC accuracy: ", np.mean(np.array(acc_list)))
        if acc_i > acc_max:
            acc_max = acc_i
            ret = linearSVC_clf.predict(testData)
    acc = np.mean(np.array(acc_list))
    print("LinearSVC accuracy: ", acc)

    return ret, acc_max

    # Note: everything below is unreachable because of the return above.
    SVC_clf = SVC()
    acc_list = []
    shuffleTimes = 50
    for i in range(shuffleTimes):
        print(i+1, '/', shuffleTimes)
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        SVC_clf.fit(trainData, trainLabel)
        acc_list.append(SVC_clf.score(testData, testLabel))
        print("SVC accuracy: ", np.mean(np.array(acc_list)))
    acc = np.mean(np.array(acc_list))
    print("SVC accuracy: ", acc)
Example #28
	def next_mixed(self):

		sample = next(self)
		B = self.arg.batch_size // self.N

		for i in range(self.N):
			select = torch.randperm(self.arg.batch_size)[:B]
			self.mixed[i*B:(i+1)*B].copy_(sample[i][select])
			self.label[i*B:(i+1)*B].fill_(i)

		shuffle = util.shuffle(self.arg.batch_size)

		self.mixed = self.mixed[shuffle]
		self.label = self.label[shuffle]

		return self.mixed, self.label
Example #29
    def __close_register(self):
        self.gamemode = 'ongoing'
        self.player_order = deque(shuffle(self.players),
                                  maxlen=len(self.players))
        self.turn_player = self.player_order[0]

        msg = 'Registration is closed; the turn order looks like this:\n'
        for idx, name in enumerate(self.player_order):
            ordermsg = str(idx + 1) + ': ' + name + '\n'
            msg += ordermsg
        msg += 'Once ready, the player whose turn it is should type `/turn`\n'

        deal_item_msg = '\n'.join(
            [self.__deal_items(player) for player in self.player_order])
        msg += deal_item_msg

        return msg
Example #30
def fineTune(sess, trainData, trainLabel, testData, testLabel, **kwargs):
    net = network.Network(sess, model="default", **kwargs)
    maxAcc = 0
    retInf = []
    for i in range(20):
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        print(network.train_epoch(net, sess, trainData, trainLabel, **kwargs))
        acc = net.test(sess, testData, testLabel)[1]
        print(acc)
        if acc > maxAcc:
            maxAcc = acc
            retInf = net.inference(sess, testData)[0]
    ##for i in range(0, 100):
    ##    print(net.train(sess, trainData[0:10], trainLabel[0:10], args['keep_prob'])[0:2])
    print(maxAcc)

    return retInf, maxAcc
Example #31
def prepare_data(df):
    df = u.shuffle(df, 999)
    df_train, df_test = u.split(df, 0.75)

    X_train, Y_train = u.xy(df_train)
    X_test, Y_test = u.xy(df_test)

    X_train = preprocessing.maxabs_scale(X_train)
    X_test = preprocessing.maxabs_scale(X_test)

    ones = np.ones((X_train.shape[0], 1))
    X_train = np.hstack((X_train, ones))

    ones = np.ones((X_test.shape[0], 1))
    X_test = np.hstack((X_test, ones))

    return X_train, Y_train, X_test, Y_test
Example #32
def run_logistic_regression(x_train, y_train, x_test, y_test, args):
    print 'data loaded'
    if args['method'] == 'logistic-plot':
        logistic_regression.plot_batch_gradient_descent(x_train, y_train,
                lam=args['lambda'], step_size = args['step_size'],
                iterations = args['iterations'], weight_step = False)
    elif args['method'] == 'logistic':
        x_train = logistic_regression.standardize_data(x_train)
        x_test = logistic_regression.standardize_data(x_test)

        if args['beta_file'] is None:
            beta = logistic_regression.calc_cross_validated_beta(x_train,
                    y_train, lam=args['lambda'], 
                    step_size = args['step_size'],
                    iterations=args['iterations'], weight_step = False,
                    k=args['k'], use_nll = args['use_nll'],
                    plot_nll = args['plot_nll'])
        else:
            inputfile = open(args['beta_file'], 'rb')
            beta = cPickle.load(inputfile)

        #Save beta
        if args['store_beta']:
            beta_dumpfile = open('beta{0}{1}.pkl'.format(
                datetime.now().hour, datetime.now().minute), 'wb')
            cPickle.dump(beta, beta_dumpfile)

        beta = np.sum(beta, axis=0)/float(len(beta))
        training_labels = logistic_regression.calc_labels(x_train, beta)
        training_error = logistic_regression.calc_error_rate(training_labels, y_train)
        print 'training error rate', training_error

        testing_labels = logistic_regression.calc_labels(x_test, beta)
        testing_error = logistic_regression.calc_error_rate(testing_labels, y_test)
        print 'testing error rate', testing_error
        logistic_regression.write_labels(x_test, beta)

    elif args['method'] == 'logistic-sklearn':
        x_train = logistic_regression.standardize_data(x_train)
        x_test = logistic_regression.standardize_data(x_test)
        x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array = True)
        logistic = LogisticRegression()
        logistic.fit(x_train, y_train)
        print logistic.score(x_test, y_test)
Example #33
def linearReg(trainData, trainLabel, testData, testLabel, **kwargs):
    print(kwargs)
    # neigh = KNeighborsClassifier(n_neighbors=kwargs['n_neighbors'], weights=kwargs['weights'], p=kwargs['p'])
    clf = linear_model.LinearRegression()
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)
    acc_list = []

    # Shuffle 10 times, seems useless for linear regression
    for i in range(10):
        trainData_shuffle, trainLabel_shuffle = util.shuffle(trainData, trainLabel)
        clf.fit(trainData_shuffle, trainLabel_shuffle)
        acc_i = clf.score(testData, testLabel)
        print("%d acc: " % i, acc_i)
        acc_list.append(acc_i)

    acc = np.mean(np.array(acc_list))
    print("Linear Regression accuracy: ", acc)
    return acc
Example #34
def decisionTree(trainData, trainLabel, testData, testLabel, **kwargs):
    print(kwargs)
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)

    shuffle_times = 10
    acc_list = []

    for i in range(shuffle_times):
        trainData_shuffle, trainLabel_shuffle = util.shuffle(
            trainData, trainLabel)
        cdt = DecisionTreeClassifier()
        cdt.fit(trainData_shuffle, trainLabel_shuffle)
        now_acc = cdt.score(testData, testLabel)
        acc_list.append(now_acc)
        print("%d acc, %.3f" % (i, now_acc))
    acc = np.mean(np.array(acc_list))
    print("Decision Tree accuracy: ", acc)
    return acc
Example #35
def follow(job, session=None):
    new_follows = []
    count = 0
    try:
        insta = Insta()
        insta.login(username=job.i_user.username,
                    password=job.i_user.get_password())
        time.sleep(1)

        # get the user's tags and shuffle them
        tag_names = [str(tag) for tag in job.i_user.tags]
        tags = shuffle(tag_names)
        for tag in tags:
            insta.search(tag)
            users, finished = insta.follow(tag)
            count += len(users)
            new_follows += users
            if finished is True:
                break
            time.sleep(5)

    except Exception as e:
        job.error = '{}: {}'.format(type(e), e)

    if len(new_follows) > 0:
        for user in new_follows:
            f = Following()
            f.timestamp = time.time()
            f.i_user = job.i_user
            f.other_user = user
            session.add(f)

    session.commit()
    job.count = count
    job.finish()

    # new run for jobs
    new_job = schedule_next_job(job, 1.5 * rando_hour())
    session.add(new_job)
    session.commit()

    insta.driver.quit()
    return job
Example #36
    def train(self):
        log.infov("Training Starts!")
        output_save_step = 1000
        self.session.run(self.global_step.assign(0))  # reset global step

        from data_loader import load_kdd99
        x_train, x_test, y_train, y_test = load_kdd99('kdd_cup.npz',
                                                      self.config.seed)

        n_updates = 0
        with open(self.res_dir + "/step.txt", 'w') as f:
            for e in range(1, 1 + self.config.n_epochs):
                x_train, y_train = shuffle(x_train, y_train)
                n_train = len(x_train)
                max_batches = n_train // self.config.batch_size
                #if n_train % self.config.batch_size != 0: max_batches+=1

                for x_batch, y_batch in tqdm(iter_data(
                        x_train, y_train, size=self.config.batch_size),
                                             total=max_batches):
                    step, summary, loss, step_time = self.run_single_step(
                        x_batch)
                    self.summary_writer.add_summary(summary,
                                                    global_step=n_updates)

                    n_updates += 1
                    #if n_updates % 100 == 0:
                    #    eng, eng_chk = self.session.run([self.model.energy, self.model.energy_check], feed_dict=self.model.get_feed_dict(x_batch))
                    #    print(np.mean(eng), np.mean(eng_chk))

                if e % 10 == 0:
                    accuracy, precision, recall, f_score = self.evaluate(
                        x_train, y_train, x_test, y_test)
                    f.write(self.filepath + ',' + repr(e) + ',' +
                            repr(accuracy) + ',' + repr(precision) + ',' +
                            repr(recall) + ',' + repr(f_score) + '\n')
                    f.flush()

                    # save a model checkpoint (runs every 10 epochs)
                    self.saver.save(self.session,
                                    os.path.join(self.res_dir, 'model'),
                                    global_step=step)
Example #37
def get_shuffled_seeds(num_participants):
    """Get randomized seedings for a tournament with num_participants.

    This is not fully randomized, but instead uses a bucket approach,
    where the final projected placements of the participants are unaffected.
    This is nice for varying who gets matched up in a tourney while still
    preserving the overall benefits of seeding.

    Args:
      num_participants: The number of participants in the tournament.

    Returns:
      A list of seeds to use for the tournament. For a given seed X, the value
      at index X - 1 is their randomized seed to use for the tournament.
    """
    shuffled_buckets = [util.shuffle(x) for x in _get_buckets(num_participants)]

    # Buckets are ordered from last place to first place, so we need to reverse
    # them to get the seeds ordered from first to last.
    return util.flatten(reversed(shuffled_buckets))
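The bucket construction lives elsewhere in the project; below is a hypothetical _get_buckets sketch, assuming a power-of-two bracket, that satisfies the contract the docstring describes (buckets ordered from last place to first, with the top seeds isolated):

def _get_buckets(num_participants):
    # e.g. 8 participants -> [[5, 6, 7, 8], [3, 4], [2], [1]]; shuffling
    # within a bucket can never move a seed out of its projected tier.
    buckets, size = [], num_participants
    while size > 2:
        half = size // 2
        buckets.append(list(range(half + 1, size + 1)))
        size = half
    buckets.extend([[2], [1]])
    return buckets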
Example #38
def testDBN(opts) :
    """show how to use DBN to do classification"""
    # read data
    data  = np.load(opts.feature)
    label = np.load(opts.label)

    # set the nodes of hidden layers
    nHid = [5000, 2000]

    # shuffle data and label
    [data, label] = util.shuffle(data, label)

    # decide how many samples to be used as training set
    percent = float(opts.trainPercent)
    nCase   = data.shape[0]

    nTrain = int(nCase * percent)
    nTest = nCase - nTrain

    # split data and label into  train dataset and test dataset
    trainData  = data[0:nTrain, :]
    trainLabel = label[0:nTrain, :]
    example   = data[nTrain:, :]
    testLabel  = label[nTrain:, :]

    # set parameters
    # layer1
    p1 = {"maxEpoch" : opts.maxEpoch, "modelType" : "BB"}

    # layer2
    p2 = {"maxEpoch" : opts.maxEpoch}

    p = {"layer1" : p1, "layer2" : p2}

    # train the DBN model
    model = DBNFit.DBNFit(trainData, trainLabel, nHid, name=opts.model, isSingleDBN=True, **p)

    # do prediction for training set and testing set
    [trainR, F1] = DBNPredict.DBNPredict(model, trainData, isSingleDBN=True)
    [testR, F2]  = DBNPredict.DBNPredict(model, example, isSingleDBN=True)

    # calculate classification accuracy
    trainK = 0
    for x in range(nTrain) :
        if trainLabel[x] != trainR[x] :
            trainK = trainK+1

    testK = 0
    for x in range(nTest) :
        if testLabel[x] != testR[x] :
            testK = testK+1

    print "---------------------------------------"
    print "train classification rate : %f " % (1 - trainK*1.0/nTrain)
    print "test  classification rate : %f " % (1 - testK*1.0/nTest)
    print "---------------------------------------"

    if opts.isSaveResult :
        result = shelve.open(opts.resultName)
        result["nHid"]     = nHid
        result["maxEpoch"] = opts.maxEpoch
        result["trainPercent"] = opts.trainPercent
        result["trainAcc"] = 1-trainK*1.0/nTrain
        result["testAcc"]  = 1-testK*1.0/nTest
        result["trainLabel"] = trainLabel
        result["trainR"] = trainR
        result["testLabel"] = testLabel
        result["testR"] = testR
        result.close()
Example #39
def main(param=None):
	if not param:
		param = {'lr': 0.0970806646812754,
		    'verbose': 1,
		    'decay': True,
		    # decay on the learning rate if improvement stops
		    'win': 7,
		    # number of words in the context window
		    'nhidden': 200,
		    # number of hidden units
		    'seed': 345,
		    'emb_dimension': 50,
		    # dimension of word embedding
		    'nepochs': 100,
		    # 60 is recommended
		    'savemodel': False}
	print param

	folder = "RelationExtraction"
	if not os.path.exists(folder):
		os.mkdir(folder)
	#load dataset
	pickle_file = 'semeval.pkl'
	with open(pickle_file, 'rb') as f:
	    save = pickle.load(f)
	    train_dataset = save['train_dataset']
	    train_labels = save['train_labels']
	    test_dataset = save['test_dataset']
	    test_labels = save['test_labels']
	    dic=save['dicts']
	    del save  # hint to help gc free up memory  
	    print('Training set', train_dataset.shape, train_labels.shape)
	    print('Test set', test_dataset.shape, test_labels.shape)


	# In[5]:
	train_dataset=[np.array(x,dtype=np.int32) for x in train_dataset]
	train_labels=[np.array(x,dtype=np.int32) for x in train_labels]
	x_test=[np.array(x,dtype=np.int32) for x in test_dataset]
	y_test=[np.array(x,dtype=np.int32) for x in test_labels]

	x_train=train_dataset[0:7200]
	y_train=train_labels[0:7200]
	x_valid=train_dataset[7201:8000]
	y_valid=train_labels[7201:8000]
	
	

	# In[6]:

	#Raw input encoding -''' visualize a few sentences '''
	w2idx,labels2idx = dic['words2idx'], dic['labels2idx']
	idx2w  = dict((v,k) for k,v in w2idx.iteritems())
	idx2la = dict((v,k) for k,v in labels2idx.iteritems())  

	# In[10]:

	vocsize = len(idx2w)
	nclasses = len(idx2la)
	nsentences = len(x_train)

	groundtruth_valid = [map(lambda x: idx2la[x], y) for y in y_valid]
	words_valid = [map(lambda x: idx2w[x], w) for w in x_valid]
	groundtruth_test = [map(lambda x: idx2la[x], y) for y in y_test]
	words_test = [map(lambda x: idx2w[x], w) for w in x_test]

	# instanciate the model
	np.random.seed(param['seed'])
	random.seed(param['seed'])
 
	rnn = GRUTheano(word_dim=param['emb_dimension'], window_context_size=param['win'], vocab_size=vocsize, num_labels=nclasses, hidden_dim=param['nhidden'])
	#rnn = RNNSLU_LSTM(hidden_dim=param['nhidden'], num_labels=nclasses, vocab_size=vocsize, word_dim=param['emb_dimension'], window_context_size=param['win'])

	# train with early stopping on validation set
	best_f1 = -np.inf
	param['clr'] = param['lr']
	for e in xrange(param['nepochs']):

		# shuffle
		shuffle([x_train, y_train], param['seed'])

		param['ce'] = e
		tic = timeit.default_timer()
		
		for i, (x, y) in enumerate(zip(x_train, y_train)):
		    rnn.train(x, y, param['win'], param['clr'])
		    print '[learning] epoch %i >> %2.2f%%' % (
		        e, (i + 1) * 100. / nsentences),
		    print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
		    sys.stdout.flush()

		# evaluation // back into the real world : idx -> words
		predictions_test = [map(lambda x: idx2la[x],
		                    rnn.classify(np.asarray(
		                    contextwin(x, param['win'])).astype('int32')))
		                    for x in x_test]
		predictions_valid = [map(lambda x: idx2la[x],
		                     rnn.classify(np.asarray(
		                     contextwin(x, param['win'])).astype('int32')))
		                     for x in x_valid]

		# evaluation // compute the accuracy using conlleval.pl
		res_test = conlleval(predictions_test,
		                     groundtruth_test,
		                     words_test,
		                     folder + '/current.test.txt',
		                     folder)
		res_valid = conlleval(predictions_valid,
		                      groundtruth_valid,
		                      words_valid,
		                      folder + '/current.valid.txt',
		                      folder)

		if res_valid['f1'] > best_f1:

		    if param['savemodel']:
		        rnn.save(folder)

		    best_rnn = copy.deepcopy(rnn)
		    best_f1 = res_valid['f1']

		    if param['verbose']:
		        print('NEW BEST: epoch', e,
		              'valid F1', res_valid['f1'],
		              'best test F1', res_test['f1'])

		    param['vf1'], param['tf1'] = res_valid['f1'], res_test['f1']
		    param['vp'], param['tp'] = res_valid['p'], res_test['p']
		    param['vr'], param['tr'] = res_valid['r'], res_test['r']
		    param['be'] = e

		    subprocess.call(['mv', folder + '/current.test.txt',
		                    folder + '/best.test.txt'])
		    subprocess.call(['mv', folder + '/current.valid.txt',
		                    folder + '/best.valid.txt'])
		else:
		    if param['verbose']:
		        print ''

		# learning rate decay if no improvement in 10 epochs
		if param['decay'] and abs(param['be']-param['ce']) >= 10:
		    param['clr'] *= 0.5
		    rnn = best_rnn

		if param['clr'] < 1e-5:
		    break

	print('BEST RESULT: epoch', param['be'],
		  'valid F1', param['vf1'],
		  'best test F1', param['tf1'],
		  'with the model', folder)
Example #40
File: w_lopo.py Project: giffy1/MIL
def main(data_dir, active_participant_counter, bag_size, held_out_bag_size, test_bag_size, M, N, K, \
	   clf_name, eta, kernel, cv_method, cv, n_iter, n_jobs, n_trials, verbose, save, description):
	"""
	@param data_dir : The directory in which the data is located. The directory should contain a 
				load_data.py script with a load_data() method, which returns the feature 
				representation of the dataset.
	@param active_participant_counter : Index of the held-out test participant.
	
	@param bag_size : The size of the training bags. Use -1 for sessions.
	@param held_out_bag_size : The size of the training bags from the held-out participant. Use -1 for sessions.
	@param test_bag_size : The size of the test bags. Use -1 for sessions.
	
	@param M : The number of labeled training instances.
	@param N : The number of labeled training bags.
	@param K : The number of labeled training instances from the held-out participant.
	
	@param clf_name : Classifier; one of 'SVM', 'LinearSVC', 'RF', 'SIL', 'LinearSIL', 
				'sMIL', 'sbMIL', 'MIForest' or 'misvm'.
	@param eta_ : If the classifier used is sbMIL, eta is the expected density 
				of positive instances in positive bags, between 0.0 and 1.0.
	@param kernel : If a non-linear SVM-based classifier is used, the kernel 
				can be specified, i.e. 'rbf', 'linear_av', etc.

	@param cv_method : The search method for cross-validation; either 'grid' or 
				'randomized'.			
	@param cv : Number of cross-validation folds.
	@param n_iter : If the cross-validation search method is 'randomized', then 
				n_iter is the number of randomly sampled parameter tuples.
	@param n_jobs : The number of jobs, -1 for full parallelization.
	@param n_trials : Number of trials, in case randomness is introduced in each trial.

	@param verbose : Indicates the level of detail to be displayed during run-time.
	@param save : The path of the file where results are stored.
	@param description : Description of the evaluation to be saved with the results.

	"""
	
	sys.path.insert(0, data_dir)
	from load_data import load_data
	
	dataset = load_data(data_dir)
	X = dataset['data']['X']
	Y = dataset['data']['Y']
	session_start = dataset['data']['sessions']['start']
	session_labels = dataset['data']['sessions']['labels']	
	print data_dir
	print dataset['description']
						
	
	if clf_name == 'RF':
		clf = RandomForestClassifier(n_estimators=185, verbose=(verbose>1))
	elif clf_name == 'SVM':
		clf = SVC(kernel=kernel, verbose=(verbose>1))
	elif clf_name == 'SIL':
		clf = misvm.SIL(kernel=kernel, C=1.0, verbose=(verbose>1))
	elif clf_name == 'MIForest':
		clf = MIForest(n_estimators=50, directory="miforest",  prefix="eating")
	elif clf_name == 'sMIL':
		clf = misvm.sMIL(kernel=kernel, C=1.0, verbose=(verbose>1))
	elif clf_name == 'sbMIL':
		clf = misvm.sbMIL(kernel=kernel, eta=eta, C=1.0, verbose=(verbose>1))
	elif clf_name == 'misvm':
		clf = misvm.MISVM(kernel=kernel, C=1.0, verbose=(verbose>1))
	elif clf_name == 'LinearSIL':
		clf = misvm.LinearSIL(C=1.0)
	elif clf_name == 'LinearSVC':
		clf = LinearSVC(C=1.0)
		
	#class weights are determined by a Farey sequence to make sure that redundant pairs, 
	#i.e. (1,1) = (2,2), (2,3) = (4,6), etc. are not included.
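	# farey() is defined elsewhere in this project; here is a minimal sketch,
	# assuming it returns the Farey sequence of order n as ascending
	# (numerator, denominator) pairs starting from (0, 1):
	def farey(n):
		# standard next-term recurrence: for consecutive terms a/b and c/d,
		# the next term is (k*c - a) / (k*d - b) with k = (n + b) // d
		a, b, c, d = 0, 1, 1, n
		pairs = [(a, b)]
		while c <= n:
			k = (n + b) // d
			a, b, c, d = c, d, k * c - a, k * d - b
			pairs.append((a, b))
		return pairs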
	class_weights = [{1 : i, -1 : j} for (i,j) in farey(25)[1:]] #ignore first value where i=0
	class_weights.extend([{1 : j, -1 : i} for (i,j) in farey(25)[1:]]) #swap i and j, ignore first value
	C_array = np.logspace(-5, 15, 21, base=2).tolist()
	gamma_array = np.logspace(-15, 3, 19, base=2).tolist()
	eta_array = np.linspace(0,1,9).tolist()
	n_estimators_array = [25,50,75,100,125,150]
	param_grid = {}
	
	if clf_name in {'RF', 'MIForest'}:
		param_grid.update({'n_estimators' : n_estimators_array})
	
	if clf_name in {'SIL', 'sMIL', 'sbMIL', 'RF', 'SVM', 'LinearSVC'}:
		param_grid.update({'class_weight' : class_weights})
		
	if clf_name in {'SIL', 'sMIL', 'sbMIL', 'misvm', 'SVM', 'LinearSIL', 'LinearSVC'}:
		param_grid.update({'C' : C_array})
	
	if clf_name in {'SIL', 'sMIL', 'sbMIL', 'misvm', 'SVM'} and kernel == 'rbf':
		param_grid.update({'gamma' : gamma_array})
		
	if clf_name == 'sbMIL':
		param_grid.update({'eta' : eta_array})
	
	data_params = {"Number of Training Bags": N, "Number of Single-Instance Bags" : M, "Test Participant": active_participant_counter}
	cv_params = {"K-Fold": cv, "Method": cv_method, "Parameter Grid" : param_grid, "Number of Iterations": n_iter}
	params = {
		"Bag Size": bag_size, \
		"Data": data_params, \
		"Classifier": str(clf), \
		"Number of Trials": n_trials, \
		"CV": cv_params \
	}	
	results = {
		"Confusion Matrix" : {"Training" : np.zeros((2,2)), "Test" : np.zeros((2,2))}, \
		"Precision": {"Training" : 0.0, "Test" : 0.0}, \
		"Recall": {"Training" : 0.0, "Test" : 0.0}, \
		"F1 Score": {"Training" : 0.0, "Test" : 0.0, "Validation" : 0.0} \
	}
		
	participant_indices = range(len(X))
	n_si_participants = 5
	n_bag_participants = len(X) - n_si_participants - 1
	
	if verbose:
		pprint_header("Train Model for Participant: " + str(active_participant_counter + (active_participant_counter>=13) + 1))
	
	for T in xrange(1,n_trials+1): #allow multiple trials to account for randomness
		pprint_header("Trial: " + str(T))
		
		#indices for participants in training data; skip active participant counter:
		train_indices = participant_indices[:active_participant_counter] + participant_indices[active_participant_counter+1:]
		
		si_participant_indices = train_indices[:n_si_participants]
		bag_participant_indices = train_indices[n_si_participants+1:n_si_participants+n_bag_participants+1]
		
		#single-instance training data:
		X_SI = np.vstack([X[k] for k in si_participant_indices])
		Y_SI = np.hstack([Y[k] for k in si_participant_indices])

		#bag-level training data:
		X_B = np.vstack([X[k] for k in bag_participant_indices])
		Y_B = np.hstack([Y[k] for k in bag_participant_indices])
	
		#test data
		X_test = X[active_participant_counter]
		Y_test = Y[active_participant_counter]

		#convert to bags:
		if clf_name in MIL:
			X_SI = [X_SI[k:k+1, :] for k in xrange(len(X_SI))]
			Y_SI = [max(Y_SI[k:k+1]) for k in xrange(len(Y_SI))]
			
			if bag_size == -1:
				X_B, Y_B, _ = single_instances_to_sessions(X, Y, session_labels, session_start, bag_participant_indices)
			else:
				X_B = [X_B[k:k+bag_size, :] for k in xrange(0, len(X_B), bag_size)]
				Y_B = [max(Y_B[k:k+bag_size]) for k in xrange(0, len(Y_B), bag_size)]
			
			if held_out_bag_size == -1:
				X_T, Y_T, Y_si = single_instances_to_sessions(X, Y, session_labels, session_start, [active_participant_counter])
			else:
				X_T = [X_test[k:k+bag_size, :] for k in xrange(0,len(X_test), held_out_bag_size)]
				Y_si = [Y_test[k:k+bag_size] for k in xrange(0,len(Y_test), held_out_bag_size)]
				Y_T = [max(y_t) for y_t in Y_si]
								
			X_T, Y_T, Y_si = shuffle(X_T, Y_T, Y_si)	

			# convert remaining bags back to test instances
			X_test = []
			Y_test = []
			for i, (x_t, y_si) in enumerate(zip(X_T, Y_si)[K:]):
				for (x,y) in zip(x_t, y_si):
					X_test.append(x)
					Y_test.append(y)
					
			X_test = [np.asarray(X_test)[k:k+test_bag_size, :] for k in xrange(0, len(X_test), test_bag_size)]
			Y_test = [max(Y_test[k:k+test_bag_size]) for k in xrange(0, len(Y_test), test_bag_size)]

		else: # standard supervised learning case
			X_T = X_test[:K]
			Y_T = Y_test[:K]
			X_test = X_test[K:]
			Y_test = Y_test[K:]
			
		if N < 0:
			N=len(X_B)
			
		if M < 0:
			M=len(X_SI)
			
		X_SI, Y_SI = shuffle(X_SI, Y_SI)
		X_B, Y_B = shuffle(X_B, Y_B)
		X_test, Y_test = shuffle(X_test, Y_test)

		#combine into single training data set with mixed bags and single-instances
		X_train = []
		Y_train = []
		if M > 0:
			X_train += X_SI[:M]
			Y_train += Y_SI[:M]
		if K > 0:
			X_train += X_T[:K]
			Y_train += Y_T[:K]
		if N > 0:
			X_train += X_B[:N]
			Y_train += Y_B[:N]

		if test_bag_size > 1:
			cv_iterator = mil_train_test_split(X_SI[:M], X_T[:K] + X_B[:N], cv)
		else:
			cv_iterator = mil_train_test_split(X_SI[:M] + X_T[:K], X_B[:N], cv)
		
		if clf_name in MIL:
			print ("Total number of bags : %d" %len(X_train))
			print ("Feature Dimensionality: %d " %X_train[0].shape[1])
		else:
			print ("Total number of instances : %d" %len(X_train))
			print("Feature Dimensionality %d " %len(X_train[0]))
		
		sys.stdout.flush()
		if cv_method == 'grid':
			gs = GridSearchCV(clf, param_grid, scoring=score, cv=cv_iterator, verbose=verbose, n_jobs = n_jobs)
		elif cv_method == 'randomized':
			#scoring='f1_weighted'
			gs = RandomizedSearchCV(clf, param_distributions=param_grid, scoring=score, cv=cv, n_jobs = n_jobs, n_iter=n_iter, verbose=verbose)
		
		t0 = time()
		gs = gs.fit(X_train, Y_train)
		tf = time()

		print("Time elapsed: %0.2f seconds." %(tf-t0))
		
		print("Best params: ")
		print(gs.best_params_)	
		print("Best F1-score on training data: %0.2f%%" %(100*gs.best_score_))
		results['F1 Score']['Validation'] += gs.best_score_
		
		if clf_name == 'MIForest': #for MIForest, we need to pass in Y as well
			#check training accuracy to start:
			y_pred = 2*np.greater(gs.best_estimator_.predict(X_train, Y_train),0)-1	
		else: #for MIForest, we need to pass in Y as well
			#check training accuracy to start:
			y_pred = 2*np.greater(gs.best_estimator_.predict(X_train),0)-1
		
		conf = confusion_matrix(Y_train, y_pred, [-1,+1])
		print("Confusion matrix on the training data:")
		print(conf)
		results['Confusion Matrix']['Training'] += conf
		
		if clf_name == 'MIForest':
			y_pred = 2*np.greater(gs.best_estimator_.predict(X_test, Y_test),0)-1
		else:
			y_pred = 2*np.greater(gs.best_estimator_.predict(X_test),0)-1
			
		conf = confusion_matrix(Y_test, y_pred, [-1,+1])
		print("Confusion matrix on the test data:")
		print(conf)
		results['Confusion Matrix']['Test'] += conf
		
	pprint_header("Results")
	
	conf = results['Confusion Matrix']['Training']
	avg_precision, avg_recall, avg_fscore = accuracy_precision_recall_fscore(conf)[1][1]
	results['F1 Score']['Training'] = avg_fscore
	results['Precision']['Training'] = avg_precision
	results['Recall']['Training'] = avg_recall	

	print("Average Precision on the training data: %0.2f%%" %(100*avg_precision))
	print("Average Recall on the training data: %0.2f%%" %(100*avg_recall))
	print("Average F1 Score on the training data: %0.2f%%\n" %(100*avg_fscore))	
	
	conf = results['Confusion Matrix']['Test']
	avg_precision, avg_recall, avg_fscore = accuracy_precision_recall_fscore(conf)[1][1]
	results['F1 Score']['Test'] = avg_fscore
	results['Precision']['Test'] = avg_precision
	results['Recall']['Test'] = avg_recall	
	
	print("Average Precision on the test data: %0.2f%%" %(100*avg_precision))
	print("Average Recall on the test data: %0.2f%%" %(100*avg_recall))
	print("Average F1 Score on the test data: %0.2f%%\n" %(100*avg_fscore))
	
	if save != 'none':
		print("Saving results to %s ..." %save)
		
		evaluation = {"Parameters" : params, "Results" : results}
		with open(save, 'wb') as f:
			pickle.dump(evaluation, f)
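The bag-construction idiom above (and throughout these examples) slices consecutive instances into fixed-size bags and labels each bag with the max of its instance labels, so a bag is positive whenever it contains at least one positive instance. A minimal self-contained sketch of that convention; make_bags is an illustrative name, not a function from this project:

import numpy as np

def make_bags(X, Y, bag_size):
    # group consecutive rows of X into bags of bag_size instances;
    # with labels in {-1, +1}, max() marks a bag positive if any
    # of its instances is positive
    X_bags = [X[k:k+bag_size, :] for k in range(0, len(X), bag_size)]
    Y_bags = [max(Y[k:k+bag_size]) for k in range(0, len(Y), bag_size)]
    return X_bags, Y_bags

X = np.random.randn(10, 3)                  # 10 instances, 3 features
Y = [1, -1, -1, -1, 1, -1, -1, -1, -1, -1]  # instance labels
X_bags, Y_bags = make_bags(X, Y, bag_size=4)
print(len(X_bags), Y_bags)                  # 3 bags (the last holds 2 instances), [1, 1, -1]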
Example #41
0
def rbmFit(X, numHid, y, isSaveModel=False, name=None, **kwargs) :
    """
    X              ... data. should be binary, or in [0,1] interpreted as
                   ... probabilities
    numhid         ... number of hidden units
    y              ... List of discrete labels

    nClass          number of classes
    method          CD or SML
    eta             learning rate
    momentum        momentum for smoothness amd to prevent overfitting
                    NOTE: momentum is not recommended with SML
    maxepoch        # of epochs: each is a full pass through train data
    avglast         how many epochs before maxepoch to start averaging
                before. Procedure suggested for faster convergence by
                Kevin Swersky in his MSc thesis

    batchsize       The number of training instances per batch
    verbose         For printing progress

    model.weight         The weights of the connections
    model.biasH         The biases of the hidden layer
    model.biasV         The biases of the visible layer

    model.weightlabel       ... The weights on labels layer
    model.biasLabel       ... The biases on labels layer

    errors          The errors in reconstruction at each epoch
       """

    arg = util.processOptions(kwargs,
                              nClass = np.unique(y).size,
                              method = "CD",
                              eta = 0.1,
                              momentum = 0.5,
                              maxEpoch = 500,
                              avgLast = 0,
                              penalty = 0,
                              batchSize = 100,
                              verbose = True)
    [nClass, method, eta, momentum, maxEpoch, avgLast, penalty, batchSize, verbose] = [
        arg["nClass"],
        arg["method"],
        arg["eta"],
        arg["momentum"],
        arg["maxEpoch"],
        arg["avgLast"],
        arg["penalty"],
        arg["batchSize"],
        arg["verbose"]
    ]

    if verbose :
        print "Processing data ..."

    # from which step, we start to compute the average
#    avgStart = maxEpoch - avgLast

    # for weight decay use
#    oldPenalty = penalty

    # numCases : number of example
    # numDims : the length of each example
    # each row is an example
    [numCases, numDims] = list(X.shape)

    numVis = numDims
    uniqueLabel = np.unique(y)
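    # number of mini-batches; ceiling division, so the last batch may be smaller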
    numBatch = util.ceil(numCases, batchSize)

    y = util.matrixLabel(y)

    # shuffle data and label
    data = copy.deepcopy(X)
    [data, label] = util.shuffle(data, y)

    # init CUDA
    cm.cublas_init()
    cm.CUDAMatrix.init_random(100)
    deviceData = cm.CUDAMatrix(cm.reformat(data))
    deviceLabel = cm.CUDAMatrix(cm.reformat(label))

    # init weights
    weight = cm.CUDAMatrix(0.1*np.random.randn(numVis,numHid))
    biasV = cm.CUDAMatrix(np.zeros((1, numVis)))
    biasH = cm.CUDAMatrix(np.zeros((1, numHid)))
    weightLabel = cm.CUDAMatrix(0.1*np.random.randn(nClass, numHid))
    biasLabel = cm.CUDAMatrix(np.zeros((1,nClass)))

    # init weight update
    weightInc = cm.CUDAMatrix(np.zeros((numVis,numHid)))
    biasVInc = cm.CUDAMatrix(np.zeros((1,numVis)))
    biasHInc = cm.CUDAMatrix(np.zeros((1,numHid)))
    weightLabelInc = cm.CUDAMatrix(np.zeros((nClass, numHid)))
    biasLabelInc = cm.CUDAMatrix(np.zeros((1,nClass)))

    #init temporary storage
    visActP = cm.empty((batchSize, numVis))
    hidActP = cm.empty((batchSize, numHid))
    hidState = cm.empty((batchSize, numHid))

    for epoch in range(maxEpoch) :
        error = []

        for batch in range(numBatch) :
            # train each data batch; the final batch may be smaller, so slice
            # by the fixed batchSize and track the current size separately so
            # that later epochs still see the full batch size
            batchStart = batchSize*batch
            batchEnd = min(batchSize*(batch+1), numCases)
            visTrue = deviceData.get_row_slice(batchStart, batchEnd)
            labelTrue = deviceLabel.get_row_slice(batchStart, batchEnd)
            curBatchSize = visTrue.shape[0]

            # re-allocate the temporaries whenever the batch size changes
            if hidActP.shape[0] != curBatchSize :
                visActP = cm.empty((curBatchSize, numVis))
                hidActP = cm.empty((curBatchSize, numHid))
                hidState = cm.empty((curBatchSize, numHid))

            visActP.assign(visTrue)

            # apply momentum to the parameter increments
            weightInc.mult(momentum)
            biasVInc.mult(momentum)
            biasHInc.mult(momentum)
            weightLabelInc.mult(momentum)
            biasLabelInc.mult(momentum)

            # positive phase
            cm.dot(visActP, weight, target = hidActP)
            hidActP.add_dot(labelTrue, weightLabel)
            hidActP.add_row_vec(biasH)
            hidActP.apply_sigmoid()

            weightInc.add_dot(visActP.T, hidActP)
            biasVInc.add_sums(visActP, axis=0)
            biasHInc.add_sums(hidActP, axis=0)
            weightLabelInc.add_dot(labelTrue.T, hidActP)
            biasLabelInc.add_sums(labelTrue, axis=0)

            hidState.fill_with_rand()
            hidState.less_than(hidActP, target=hidActP)

            if cmp(method, "SML") == 0 :
                if np.logical_and(np.equal(epoch,1), np.equal(batch,1)) :
                    pass # here does not need in practical use
            elif cmp(method, "CD") == 0 :
                pass

            # negative phase
            cm.dot(hidActP, weight.T, target = visActP)
            visActP.add_row_vec(biasV)
            visActP.apply_sigmoid()

            cm.dot(hidActP, weightLabel.T, target = labelTrue)
            labelTrue.add_row_vec(biasLabel)
            labelTrue = util.softmax(labelTrue)

            # another positive phase
            cm.dot(visActP, weight, target = hidActP)
            hidActP.add_dot(labelTrue, weightLabel)
            hidActP.add_row_vec(biasH)
            hidActP.apply_sigmoid()

            weightInc.subtract_dot(visActP.T, hidActP)
            biasVInc.add_sums(visActP, axis=0, mult=-1)
            biasHInc.add_sums(hidActP, axis=0, mult=-1)
            weightLabelInc.subtract_dot(labelTrue.T, hidActP)
            biasLabelInc.add_sums(labelTrue, axis=0, mult=-1)

            # update weights and biases
            weight.add_mult(weightInc, eta/curBatchSize)
            biasV.add_mult(biasVInc, eta/curBatchSize)
            biasH.add_mult(biasHInc, eta/curBatchSize)
            weightLabel.add_mult(weightLabelInc, eta/curBatchSize)
            biasLabel.add_mult(biasLabelInc, eta/curBatchSize)

            # calculate reconstruction error
            visTrue.subtract(visActP)
            error.append(visTrue.euclid_norm()**2)

            # free memory
            visTrue.free_device_memory()
            labelTrue.free_device_memory()

        if verbose :
            print "Epoch %d/%d, reconstruction error is %f " % (epoch+1, maxEpoch, sum(error))

    # save rbm model
    weight.copy_to_host()
    biasV.copy_to_host()
    biasH.copy_to_host()
    weightLabel.copy_to_host()
    biasLabel.copy_to_host()

    model_ = m.rbmModel(weight.numpy_array, biasV.numpy_array, biasH.numpy_array, \
                        weightLabel = weightLabel.numpy_array,\
                        biasLabel = biasLabel.numpy_array, labels = uniqueLabel)

    # free device memory
    deviceData.free_device_memory()
    deviceLabel.free_device_memory()

    weight.free_device_memory()
    biasV.free_device_memory()
    biasH.free_device_memory()
    weightLabel.free_device_memory()
    biasLabel.free_device_memory()

    weightInc.free_device_memory()
    biasVInc.free_device_memory()
    biasHInc.free_device_memory()
    weightLabelInc.free_device_memory()
    biasLabelInc.free_device_memory()

    hidActP.free_device_memory()
    visActP.free_device_memory()
    hidState.free_device_memory()

    cm.shutdown()

    if isSaveModel :
        modelList = []
        modelList.append(model_)
        model = np.array(modelList)
        np.save(name,model)

    return model_
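Going by the docstring and the defaults above, a typical call might look like the following sketch; the data shapes and file name are illustrative only, and a CUDA device with cudamat is required:

import numpy as np

# binary data (or values in [0,1] read as probabilities), one example per row
X = (np.random.rand(1000, 784) > 0.5).astype(np.float64)
y = np.random.randint(0, 10, size=1000)  # discrete class labels

model = rbmFit(X, 500, y,
               method="CD", eta=0.1, momentum=0.5,
               maxEpoch=50, batchSize=100, verbose=True,
               isSaveModel=True, name="rbm_model.npy")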
Example #42
0
if __name__ == '__main__':
  print "Opening data files..."
  X, y = [], []
  with open('./data/dmoz_data', 'rb') as f:
    X = pickle.load(f)
  with open('./data/dmoz_targets', 'rb') as f:
    y = pickle.load(f)

  print y[:20]
  
  print "Shuffling..."
  p = range(len(y))
  random.seed(0)
  random.shuffle(p)
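  # apply the same permutation p to both the label list and the data matrix rows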
  shuffle = lambda l: [l[p[i]] for i in range(len(p))]
  y = shuffle(y)
  X = X[p]

  print "Loading data..."

  labeled_data, labeled_target = \
    subset_matrix(X, y, 0, 0.2)
  unlabeled_data, unlabeled_target = \
    subset_matrix(X, y, 0.2, 0.6)
  validate_data, validate_target = \
    subset_matrix(X, y, 0.6, 0.8)
  test_data, test_target = \
    subset_matrix(X, y, 0.8, 1)

  X_labeled = X_unlabeled = X_validate = X_test = None
  def dump(data, target, fname):
    # body truncated in the source listing; presumably pickles the pair to fname
    with open(fname, 'wb') as f:
      pickle.dump((data, target), f)
Example #43
0
def main(data_dir, data_file, bag_size, active_participant_counter, M, N, seed=None, shuffle_bags = False, shuffle_si = False, K=0, K_max=0, held_out_b=1, shuffle_heldout = True):

#data_dir = '../data/eating_detection_inertial_ubicomp2015/'
#data_dir = '../data/smoking-data/'
#data_file = "data_p0.pickle"

	sys.path.insert(0, data_dir)
	from load_data import load_data
	
	dataset = load_data(data_dir)
	X = dataset['data']['X']
	Y = dataset['data']['Y']
	session_start = dataset['data']['sessions']['start']
	session_labels = dataset['data']['sessions']['labels']	
	
	participant_indices = range(len(X))
	n_si_participants = 5
	n_bag_participants = len(X) - n_si_participants - 1
	
	#indices for participants in training data; skip active participant counter:
	train_indices = participant_indices[:active_participant_counter] + participant_indices[active_participant_counter+1:]	
	
	si_participant_indices = train_indices[:n_si_participants]
	bag_participant_indices = train_indices[n_si_participants:n_si_participants+n_bag_participants+1]
		
	#single-instance training data:
	X_SI = []
	Y_SI = []
	for p in si_participant_indices:
		x = X[p]
		y = Y[p]
		if shuffle_si:
			x, y = shuffle(seed, x, y)
		X_SI.append(x[:M])
		Y_SI.append(y[:M])
		
#	X_SI.append(X[active_participant_counter][:K])
#	Y_SI.append(Y[active_participant_counter][:K])
	
	#bag-level training data:
	X_B = []
	Y_B = []
	for p in bag_participant_indices:
		if bag_size == -1:
			x, y, _ = single_instances_to_sessions(X[p], Y[p], session_labels[p], session_start[p])
		else:
			x = [X[p][k:k+bag_size, :] for k in xrange(0, len(X[p]), bag_size)]
			y = [max(Y[p][k:k+bag_size]) for k in xrange(0, len(Y[p]), bag_size)]
		if shuffle_bags:
			x, y = shuffle(seed, x,y)	
		X_B.append(x[:N])
		Y_B.append(y[:N])
	
	if K_max > 0:
		if held_out_b == -1:
			x, y, si_labels = single_instances_to_sessions(X[active_participant_counter], Y[active_participant_counter], session_labels[active_participant_counter], session_start[active_participant_counter])
		else:
			x = [X[active_participant_counter][k:k+held_out_b, :] for k in xrange(0, len(X[active_participant_counter]), held_out_b)]
			#y = [max(Y[active_participant_counter][k:k+held_out_b]) for k in xrange(0, min(K*held_out_b,len(Y[active_participant_counter])), held_out_b)]
			si_labels = [Y[active_participant_counter][k:k+held_out_b] for k in xrange(0, len(Y[active_participant_counter]), held_out_b)]
			y = [max(y_i) for y_i in si_labels]
		if shuffle_heldout:
			x, y, si_labels = shuffle(seed, x, y, si_labels)	
		print("len(x): %d" %len(x))
		X_B.append(x[:K])
		Y_B.append(y[:K])
		X_test = []
		Y_test = []
		if held_out_b == -1:
			starts = np.cumsum([len(x[l]) for l in range(len(x))])
			K_start = np.argmax(starts >= K_max)
			print("start[K_start]: %d" %starts[K_start])
		else:
			K_start = int(np.ceil(float(K_max) / held_out_b)) # float cast avoids integer floor division
			
		for k in range(K_start, len(x)):
			X_test.extend([x[k][j] for j in range(x[k].shape[0])])
			Y_test.extend([si_labels[k][j] for j in range(si_labels[k].shape[0])])
			
		if held_out_b==-1 and K_max > starts[K_start-1]:
			X_test = X_test[K_max - starts[K_start-1]:]
			Y_test = Y_test[K_max - starts[K_start-1]:]
	else:
		#test data:
		X_test = X[active_participant_counter]
		Y_test = Y[active_participant_counter]
	#X_test, Y_test = shuffle(X_test, Y_test)

	data = {}
	data['training'] = {'instance' : {'X' : X_SI, 'Y' : Y_SI}, 'bag' : {'X' : X_B, 'Y' : Y_B}}
	data['test'] = {'X' : X_test, 'Y' : Y_test}
	
	with open(data_file, 'wb') as f:
		pickle.dump(data, f)
		
	return data
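A hedged usage sketch of this data-preparation entry point; the directory is taken from the commented-out defaults above, and all parameter values are illustrative:

data = main(data_dir='../data/eating_detection_inertial_ubicomp2015/',
            data_file='data_p0.pickle', bag_size=20,
            active_participant_counter=0, M=500, N=50, seed=42,
            shuffle_bags=True, shuffle_si=True,
            K=10, K_max=200, held_out_b=20)

X_SI = data['training']['instance']['X']  # per-participant single instances
X_B = data['training']['bag']['X']        # per-participant bags
X_test = data['test']['X']                # held-out participant's test instances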
Example #44
0
File: lopo.py Project: giffy1/MIL
def main(data_file, clf_str, cv_method, n_iter, n_jobs, verbose, save, description):
	"""
	TODO: Doc string
	"""
	
	with open(data_file, 'rb') as f:
		data = pickle.load(f)
	
	X_SI = data['training']['instance']['X']
	Y_SI = data['training']['instance']['Y']
	X_B = data['training']['bag']['X']
	Y_B = data['training']['bag']['Y']
	X_train = []
	Y_train = []
	X_val = []
	Y_val = []
	X_SI_val = []
	Y_SI_val = []
	X_B_val = []
	Y_B_val = []
	for p in range(len(X_SI)):
		X_train.extend(X_SI[p])
		Y_train.extend(Y_SI[p])
		
		#l = min(100,int(np.ceil(0.5*len(X_SI[p]))))
		x,y = shuffle(None, X_SI[p], Y_SI[p])
		X_val.extend(x)
		Y_val.extend(y)
		
		X_SI_val.append(x)
		Y_SI_val.append(y)
	n_single_instances = len(X_train)
	
	#for class weights:
#	N1 = np.sum(np.greater(Y_train, 0))
#	N0 = np.sum(np.less(Y_train, 0))
		
	for p in range(len(X_B)):
		X_train.extend(X_B[p])
		Y_train.extend(Y_B[p])
		
		l = int(np.ceil(0.25*len(X_B[p])))
		x,y = shuffle(None, X_B[p], Y_B[p])
		X_val.extend(x[:l])
		Y_val.extend(y[:l])

		X_B_val.append(x[:l])
		Y_B_val.append(y[:l])
		
	n_bags = len(X_train) - n_single_instances
	X_test = data['test']['X']
	Y_test = data['test']['Y']
	
	clf_name, clf_params = parse_clf(clf_str)
#	if N0 + N1 == 0:
#		clf_params['class_weight'] = {1 : 0.9, -1 : 0.1}
#	else:
#		clf_params['class_weight'] = {1 : N0/(N0 + N1), -1 : N1/(N0 + N1)}
#	print clf_params['class_weight']
	clf = get_clf_by_name(clf_name, **clf_params)
	param_grid = get_param_grid_by_clf(clf_name, clf_params.get("kernel", "linear"))

	results = {
		"Confusion Matrix" : {"Training" : np.zeros((2,2)), "Test" : np.zeros((2,2))},
		"Precision": {"Training" : 0.0, "Test" : 0.0},
		"Recall": {"Training" : 0.0, "Test" : 0.0},
		"F1 Score": {"Training" : 0.0, "Test" : 0.0, "Validation" : 0.0}
	}
	
	cv_iterator = mil_train_test_split(X_SI_val, X_B_val, Y_SI_val, Y_B_val)
	
	pprint_header("Number of bags : %d    Number of single instances: %d       Number of test instances: %d" %(n_bags, n_single_instances, len(Y_test)))

	if cv_method == 'grid':
		gs = GridSearchCV(clf, param_grid, scoring=score, cv=cv_iterator, verbose=verbose, n_jobs = n_jobs, refit=False)
	elif cv_method == 'randomized':
		gs = RandomizedSearchCV(clf, param_distributions=param_grid, scoring=score, cv=cv_iterator, n_jobs = n_jobs, n_iter=n_iter, verbose=verbose, refit=False)
	
	t0 = time()
	gs = gs.fit(X_val, Y_val)
	tf = time()
	
	print("Best parameters set found on development set:\n")
	print(gs.best_params_)
	print("\nGrid scores on development set:\n")
	for params, mean_score, scores in gs.grid_scores_:
		print("%0.3f (+/-%0.03f) for %r"
		% (mean_score, scores.std() * 2, params))
	
	clf.set_params(**gs.best_params_)
	clf.fit(X_train, Y_train)
	print("\nDetailed classification report:\n")
	print("The model is trained on the full development set.")
	print("The scores are computed on the full evaluation set.\n")
	y_true, y_pred = Y_test, 2*np.greater(clf.predict(X_test),0)-1
	print(classification_report(y_true, y_pred))

	print("\nTime elapsed: %0.2f seconds." %(tf-t0))
	
#	if clf_name == 'MIForest': #for MIForest, we need to pass in Y as well
#		#check training accuracy to start:
#		y_pred = 2*np.greater(gs.best_estimator_.predict(X_train, Y_train),0)-1	
#	else: #for MIForest, we need to pass in Y as well
#		#check training accuracy to start:
#		y_pred = 2*np.greater(gs.best_estimator_.predict(X_train),0)-1
#	
#	conf = confusion_matrix(Y_train, y_pred, [-1,+1])
#	print("Confusion matrix on the training data:")
#	print(conf)
#	results['Confusion Matrix']['Training'] = conf
#	
#	precision, recall, fscore = accuracy_precision_recall_fscore(conf)[1][1]
#	results['F1 Score']['Training'] = fscore
#	results['Precision']['Training'] = precision
#	results['Recall']['Training'] = recall	
	
	
#	if clf_name == 'MIForest':
#		y_pred = 2*np.greater(gs.best_estimator_.predict(X_test, Y_test),0)-1
#	else:
#		y_pred = 2*np.greater(gs.best_estimator_.predict(X_test),0)-1
#		
	conf = confusion_matrix(y_true, y_pred, labels=[-1, +1])
	print("Confusion matrix on the test data:")
	print(conf)
	results['Confusion Matrix']['Test'] = conf
		
	precision, recall, fscore = accuracy_precision_recall_fscore(conf)[1][1]
	results['F1 Score']['Test'] = fscore
	results['Precision']['Test'] = precision
	results['Recall']['Test'] = recall	
	
	print("Precision on the test data: %0.2f%%" %(100*precision))
	print("Recall on the test data: %0.2f%%" %(100*recall))
	print("F1 Score on the test data: %0.2f%%\n" %(100*fscore))
	
	evaluation = {"Description": description, "Results" : results}	
	if save != 'none':
		print("Saving results to %s ..." %save)

		with open(save, 'wb') as f:
			pickle.dump(evaluation, f)
		
	return evaluation