Example #1
def load_data(go_id):
    positive1 = list()
    positive2 = list()
    negative1 = list()
    negative2 = list()
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            hydro = encode_seq_hydro(seq, maxlen=MAXLEN)
            seq = encode_seq_one_hot(seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append(seq)
                positive2.append(hydro)
            else:
                negative1.append(seq)
                negative2.append(hydro)
    shuffle(negative1, negative2, seed=0)
    n = len(positive1)
    data1 = negative1[:n] + positive1
    data2 = negative2[:n] + positive2
    # count only the negatives actually kept so labels align with data1/data2
    labels = [0] * len(negative1[:n]) + [1] * len(positive1)
    shuffle(data1, data2, labels, seed=0)
    data = (
        numpy.array(data1, dtype='float32'),
        numpy.array(data2, dtype='float32'))
    return (
        numpy.array(labels, dtype='float32'),
        data)
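Example #1 and most of the snippets below call a shuffle helper that permutes one or more parallel lists in place with an optional fixed seed (its return value is never used). The helper itself does not appear in this listing; a minimal sketch consistent with these call sites, with the in-place semantics being an assumption, might be:

import random

def shuffle(*lists, **kwargs):
    # Apply the same random permutation to every list, in place.
    seed = kwargs.get('seed')
    order = list(range(len(lists[0])))
    random.Random(seed).shuffle(order)
    for lst in lists:
        lst[:] = [lst[i] for i in order]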
Example #2
def load_data(parent_id, go_id):
    data = list()
    labels = list()
    positive = list()
    negative = list()
    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            if label == 1:
                labels.append(1)
                positive.append(seq)
            else:
                labels.append(0)
                negative.append(seq)
    shuffle(negative, seed=0)
    n = len(positive)
    negative = negative[:n]
    labels = [0] * len(negative) + [1] * len(positive)
    data = negative + positive
    for i in range(len(data)):
        data[i] = encode_seq_one_hot(data[i], maxlen=MAXLEN)
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype='float32')
Example #3
def main():
    start_time = time.time()
    print "Loading all proteins"
    all_prots = load_all_proteins()
    shuffle(all_prots)
    split = 0.8
    train_len = int(len(all_prots) * split)
    # print 'Loading train proteins'
    # train_set = load_train_proteins()
    # all_set = set(all_prots.keys())
    # print len(all_set), len(train_set)
    # unseen = all_set - train_set
    with open(RESULT_ROOT + "train.txt", "w") as f:
        for prot_id, seq, gos in all_prots[:train_len]:
            f.write(prot_id + "\t" + seq + "\t" + gos + "\n")
    with open(RESULT_ROOT + "test.txt", "w") as f:
        for prot_id, seq, gos in all_prots[train_len:]:
            f.write(prot_id + "\t" + seq + "\t" + gos + "\n")

    # print 'Loading unseen proteins'
    # unseen = load_unseen_proteins()
    # print 'Loading all proteins'
    # all_prots = load_all_proteins()
    # with open(DATA_ROOT + 'unseen-gos.txt', 'w') as f:
    #     for prot_id in unseen:
    #         f.write(prot_id)
    #         f.write('\t' + all_prots[prot_id] + '\n')

    end_time = time.time() - start_time
    print "Done in %d seconds" % (end_time,)
Example #4
def load_data(parent_id, go_id):
    data = list()
    labels = list()
    positive = list()
    negative = list()
    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            if label == 1:
                labels.append(1)
                positive.append(seq)
            else:
                labels.append(0)
                negative.append(seq)
    shuffle(negative, seed=0)
    n = len(positive)
    negative = negative[:n]
    labels = [0] * len(negative) + [1] * len(positive)
    data = negative + positive
    for i in range(len(data)):
        data[i] = encode_seq_one_hot(data[i], maxlen=MAXLEN)
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype='float32')
Example #5
def load_data(parent_id, go_id):
    data = list()
    labels = list()
    global nb_classes
    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split('\t')
            seq = line[1][:MAXLEN]
            labs = line[2].split('|')
            data.append(seq)
            for i in range(len(labs)):
                labs[i] = int(labs[i])
                nb_classes = max(nb_classes, labs[i])
            labels.append(labs)
    nb_classes += 1
    for i in range(len(labels)):
        l = [0] * nb_classes
        for x in labels[i]:
            l[x] = 1
        labels[i] = l
    for i in range(len(data)):
        data[i] = encode_seq_one_hot(data[i], maxlen=MAXLEN)
    shuffle(data, labels, seed=0)
    return numpy.array(
        labels, dtype='float32'), numpy.array(data, dtype='float32')
def main():
    start_time = time.time()
    print 'Loading all proteins'
    all_prots = load_all_proteins()
    shuffle(all_prots, seed=0)
    split = 0.8
    train_len = int(len(all_prots) * split)
    # print 'Loading train proteins'
    # train_set = load_train_proteins()
    # all_set = set(all_prots.keys())
    # print len(all_set), len(train_set)
    # unseen = all_set - train_set
    with open(RESULT_ROOT + 'train.txt', 'w') as f:
        for prot_id, seq, gos in all_prots[:train_len]:
            f.write(prot_id + '\t' + seq + '\t' + gos + '\n')
    with open(RESULT_ROOT + 'test.txt', 'w') as f:
        for prot_id, seq, gos in all_prots[train_len:]:
            f.write(prot_id + '\t' + seq + '\t' + gos + '\n')

    # print 'Loading unseen proteins'
    # unseen = load_unseen_proteins()
    # print 'Loading all proteins'
    # all_prots = load_all_proteins()
    # with open(DATA_ROOT + 'unseen-gos.txt', 'w') as f:
    #     for prot_id in unseen:
    #         f.write(prot_id)
    #         f.write('\t' + all_prots[prot_id] + '\n')

    end_time = time.time() - start_time
    print 'Done in %d seconds' % (end_time, )
Example #7
def load_data(go_id):
    positive1 = list()
    positive2 = list()
    negative1 = list()
    negative2 = list()
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            hydro = encode_seq_hydro(seq, maxlen=MAXLEN)
            seq = encode_seq_one_hot(seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append(seq)
                positive2.append(hydro)
            else:
                negative1.append(seq)
                negative2.append(hydro)
    shuffle(negative1, negative2, seed=0)
    n = len(positive1)
    data1 = negative1[:n] + positive1
    data2 = negative2[:n] + positive2
    # count only the negatives actually kept so labels align with data1/data2
    labels = [0] * len(negative1[:n]) + [1] * len(positive1)
    shuffle(data1, data2, labels, seed=0)
    data = (numpy.array(data1,
                        dtype='float32'), numpy.array(data2, dtype='float32'))
    return (numpy.array(labels, dtype='float32'), data)
    def train(self, x_set, y_set):
        """
        Train function, training the model by splitting first the
        train dataset to train and validation, for each epoch we use
        shuffle for the original dataset and split it again. at the end
        of each epoch we use validation function to check accuracy and
        average loss for the specific epoch.
        :param x_set: the complete training dataset.
        :param y_set: the correlated classes.
        """
        loss_sum = 0
        for i in range(EPOCHS):
            x_set, y_set = utils.shuffle(x_set, y_set)
            train_x, train_y, val_x, val_y = utils.split_validation(
                x_set, y_set, VALIDATION_SIZE)
            train_x, train_y = utils.shuffle(train_x, train_y)

            # running of each example from the train dataset.
            for x, y in zip(train_x, train_y):
                x = np.reshape(x, (1, x.shape[0]))
                z1, h1, z2 = self.feedforward(x)
                probs = utils.softmax(self.weights2, h1, self.bias2, CLASSES)
                loss = utils.loss(probs[int(y)])
                loss_sum += loss
                self.backprop(x, y, z1, h1, z2, probs)
            val_loss, acc = self.validation(val_x, val_y)
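The train loop above relies on utils.shuffle and utils.split_validation, neither of which is shown here. A plausible sketch of split_validation, under the assumptions that VALIDATION_SIZE is a fraction of the data and that the set was already shuffled (both are guesses, since the real helper is not in this listing):

def split_validation(x_set, y_set, validation_size):
    # Hold out the trailing fraction of an already-shuffled dataset.
    split = len(x_set) - int(len(x_set) * validation_size)
    return x_set[:split], y_set[:split], x_set[split:], y_set[split:]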
Example #9
def gen_random_annotations():
    go_ids = [pheno for pheno in get_phenos() if pheno.startswith("MP:")]
    shuffle(go_ids)
    groups = get_gene_groups(DATA_ROOT + 'mouse_pheno_annotations_genes.txt')
    with open(DATA_ROOT + 'mouse_pheno_annotations_genes_random.txt', 'w') as f:
        for group in groups:
            shuffle(go_ids)
            f.write(go_ids[0])
            for go_id in go_ids[1:group]:
                f.write('\t' + go_id)
            f.write('\n')
Example #10
File: ner.py Project: rz1993/Enelpy
    def fit(self,
            docs,
            labels,
            batch_size=200,
            epochs=50,
            lstm_dim=200,
            lr=0.001,
            validate=False,
            val_every=100,
            val_docs=None,
            val_labels=None):
        if not self._built:
            self._build_train(lstm_dim, lr=lr)

        self._validate_input(docs, batch_size)
        all_char_ids, all_word_ids = docs  # unpack the (char_ids, word_ids) pair
        n_batches = len(all_char_ids) // batch_size

        if validate:
            self._validate_input(val_docs)
            if not val_labels:
                raise Exception('`val_labels` must be a non-empty list of'
                                ' `label_ids` for cross validation.')
            val_char_ids, val_word_ids = val_docs

        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            run_train = [self.loss, self.train_step]
            run_eval = [self.pred_score]

            _iter = 0
            for epoch in range(epochs):
                # shuffle the aligned id and label arrays together each epoch
                all_char_ids, all_word_ids, labels = utils.shuffle(
                    all_char_ids, all_word_ids, labels)

                for i in range(n_batches):
                    _iter += 1
                    start, end = i*batch_size, (i+1)*batch_size
                    char_ids = all_char_ids[start:end]
                    word_ids = all_word_ids[start:end]
                    label_ids = labels[start:end]

                    loss, _ = sess.run(run_train,
                        feed_dict={self.char_ids: char_ids,
                                   self.word_ids: word_ids,
                                   self.label_ids: label_ids})
                    if validate and _iter % val_every == 0:
                        val_score = sess.run(run_eval,
                            feed_dict={self.char_ids: val_char_ids,
                                       self.word_ids: val_word_ids,
                                       self.label_ids: val_labels})
                        print('Validation accuracy: {0:.4f}'.format(val_score))
Example #11
def train(batch_size, class_nums, growth_rate, weight_decay, depth, cifar10_path, train_epoch, lr):
    inputs = tf.placeholder(tf.float32, [None, 32, 32, 3])
    labels = tf.placeholder(tf.int64, [None])
    train_phase = tf.placeholder(tf.bool)
    learning_rate = tf.placeholder(tf.float32)
    logits = DenseNet(inputs, nums_out=class_nums, growth_rate=growth_rate, train_phase=train_phase, depth=depth)
    pred = softmax(logits)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(pred, axis=1), labels), tf.float32))
    one_hot_label = to_OneHot(labels, class_nums)
    cross_entropy_loss = tf.reduce_mean(-tf.log(tf.reduce_sum(pred * one_hot_label, axis=1) + 1e-10))
    regular = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])
    Opt = tf.train.MomentumOptimizer(learning_rate, momentum=0.9, use_nesterov=True).minimize(cross_entropy_loss + weight_decay * regular)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    path = cifar10_path + "data_batch_"
    valid_path = cifar10_path + "data_batch_5"
    loss_list = []
    train_acc_list = []
    test_acc_list = []
    saver = tf.train.Saver()
    # saver.restore(sess, "./save_para//.\\densenet.ckpt")
    # saver.restore(sess, "./save_para/densenet.ckpt")
    for epoch in range(train_epoch):
        if epoch == train_epoch // 2 or epoch == train_epoch * 3 // 4:
            lr /= 10
        for i in range(1, 6):
            if i != 5:
                data, labels_ = read_cifar_data(path + str(i))
                data, labels_ = shuffle(data, labels_)
            else:
                data, labels_ = read_cifar_data(path + str(i))
                data, labels_ = shuffle(data[:5000], labels_[:5000])
            for j in range(data.shape[0] // batch_size - 1):
                batch_data = data[j * batch_size:j * batch_size + batch_size, :, :, :]
                batch_labels = labels_[j * batch_size:j * batch_size + batch_size]
                [_, loss, acc] = sess.run([Opt, cross_entropy_loss, accuracy], feed_dict={inputs: batch_data, labels: batch_labels, train_phase: True, learning_rate: lr})
                loss_list.append(loss)
                train_acc_list.append(acc)
                if j % 100 == 0:
                    print("Epoch: %d, iter: %d, loss: %f, train_acc: %f"%(epoch, j, loss, acc))
                    np.savetxt("loss.txt", loss_list)
                    np.savetxt("train_acc.txt", train_acc_list)
                    np.savetxt("test_acc.txt", test_acc_list)
            if ((epoch + 1) % 5) == 0:
                vali_acc = validation_acc(inputs, labels, train_phase, accuracy, sess, valid_path)
                test_acc_list.append(vali_acc)
                print("Validation Accuracy: %f"%(vali_acc))
                saver.save(sess, "./save_para/densenet.ckpt")



# if __name__ == "__main__":
#     train(batch_size=64, class_nums=10, growth_rate=12, weight_decay=1e-4, depth=40, train_epoch=5)
def gen_sgd_random_annotations():
    print len(go)
    go_ids = [go_id for go_id in go if 'is_obsolete' not in go[go_id]]
    print len(go_ids)
    print len(go) - len(go_ids)
    shuffle(go_ids)
    groups = get_gene_groups()
    with open('data/sgd_random_annotations.txt', 'w') as f:
        for group in groups:
            shuffle(go_ids)
            f.write(go_ids[0])
            for go_id in go_ids[1:group]:
                f.write('\t' + go_id)
            f.write('\n')
def gen_go_annotations():
    print len(go)
    go_ids = [go_id for go_id in go if 'is_obsolete' not in go[go_id]]
    print len(go_ids)
    print len(go) - len(go_ids)
    shuffle(go_ids)
    with open('data/annotations.txt', 'w') as f:
        for group in range(1, 56):
            for i in range(100):
                shuffle(go_ids)
                f.write(go_ids[0])
                for go_id in go_ids[1:group]:
                    f.write('\t' + go_id)
                f.write('\n')
Example #14
def gen_go_annotations():
    print len(go)
    go_ids = [go_id for go_id in go if 'is_obsolete' not in go[go_id]]
    print len(go_ids)
    print len(go) - len(go_ids)
    shuffle(go_ids)
    with open('data/annotations.txt', 'w') as f:
        for group in range(1, 56):
            for i in range(100):
                shuffle(go_ids)
                f.write(go_ids[0])
                for go_id in go_ids[1:group]:
                    f.write('\t' + go_id)
                f.write('\n')
Example #15
def gtsrb(root=config.GTSRB):
	x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []

	classes = np.arange(0, class_num) # 0-42
	for i in trange(class_num):
		class_name = format(classes[i], '05d')
		prefix = root + '/Images/' + class_name + '/'
		f = open(prefix + 'GT-' + class_name + '.csv')
		reader = csv.reader(f, delimiter=';')
		next(reader, None)

		x, y = [], []
		for row in reader:
			img = cv2.imread(prefix + row[0])
			img = img[np.int(row[4]):np.int(row[6]), np.int(row[3]):np.int(row[5]), :] # np.int() converts the string to int
			# cv2.imshow('img', img)
			# cv2.waitKey(0)
			x.append(img)
			y.append(i)

		x, y = utils.shuffle(np.array(x), np.array(y))
		x, y = x.tolist(), y.tolist()

		split = len(y) // 10
		x_dev += x[:split]
		y_dev += y[:split]
		x_test += x[split:2*split]
		y_test += y[split:2*split]
		x_train += x[2*split:]
		y_train += y[2*split:]
		f.close()

	size = (32, 32)
	x_train = [cv2.resize(x, size) for x in x_train]
	x_dev   = [cv2.resize(x, size) for x in x_dev]
	x_test  = [cv2.resize(x, size) for x in x_test]

	x_train, y_train = np.array(x_train).astype(np.float32), np.array(y_train)
	x_dev, y_dev     = np.array(x_dev).astype(np.float32), np.array(y_dev)
	x_test, y_test   = np.array(x_test).astype(np.float32), np.array(y_test)

	x_train, x_dev, x_test = list(map(utils.data_normalize, [x_train, x_dev, x_test]))

	x_train, y_train = utils.shuffle(x_train, y_train)
	x_dev, y_dev     = utils.shuffle(x_dev, y_dev)
	x_test, y_test   = utils.shuffle(x_test, y_test)

	pickle.dump((x_train, y_train), open(root + '/train.p', 'wb'))
	pickle.dump((x_dev, y_dev), open(root + '/dev.p', 'wb'))
	pickle.dump((x_test, y_test), open(root + '/test.p', 'wb'))  # 'w' for write, 'b' for binary; use 'rb' to read
Example #16
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for x_chunk, y_chunk in iter_data(X, Y, size=self.size*20):
            sort = np.argsort([len(x) for x in x_chunk])
            x_chunk = [x_chunk[idx] for idx in sort]
            y_chunk = [y_chunk[idx] for idx in sort]
            mb_chunks = [[x_chunk[idx:idx+self.size], y_chunk[idx:idx+self.size]] for idx in range(len(x_chunk))[::self.size]]
            mb_chunks = shuffle(mb_chunks)
            for xmb, ymb in mb_chunks:
                xmb = padded(xmb)
                yield self.x_dtype(xmb), self.y_dtype(ymb)  
Example #17
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for x_chunk, y_chunk in iter_data(X, Y, size=self.size*20):
            sort = np.argsort([len(x) for x in x_chunk])
            x_chunk = [x_chunk[idx] for idx in sort]
            y_chunk = [y_chunk[idx] for idx in sort]
            mb_chunks = [[x_chunk[idx:idx+self.size], y_chunk[idx:idx+self.size]] for idx in range(len(x_chunk))[::self.size]]
            mb_chunks = shuffle(mb_chunks)
            for xmb, ymb in mb_chunks:
                xmb = padded(xmb)
                yield self.x_dtype(xmb), self.y_dtype(ymb)  
    def train(self):
        if self.cnn_type == '2d':
            y_ = self.build_network_2d()
        else:
            y_ = self.build_network()
        loss = -tf.reduce_mean(self.Y * tf.log(tf.clip_by_value(y_, 1e-10, 1.0)))
        train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(loss)
        correct = tf.equal(tf.argmax(y_, 1), tf.argmax(self.Y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        train_x, train_y = shuffle(self.train_x, self.train_y)
        train_xc, train_yc, val_xc, val_yc = cross_val(train_x, train_y, self.no_exp)

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY
        with tf.Session(config=sess_config) as sess:
            tf.global_variables_initializer().run()
            for epoch in range(self.num_epochs):
                train_xc, train_yc = shuffle(train_xc, train_yc)
                for i in range(self.num_batches):
                    batch_x = extract_batch_size(train_xc, i, self.batch_size)
                    batch_y = extract_batch_size(train_yc, i, self.batch_size)
                    _, c = sess.run([train_op, loss], feed_dict={self.X: batch_x, self.Y: batch_y,
                                                                 self.is_training: True})
                if (epoch + 1) % self.print_val_each_epoch == 0:
                    print("### Epoch: ", epoch + 1, "|Train loss = ", c,
                          "|Val acc = ", sess.run(accuracy, feed_dict={self.X: val_xc, self.Y: val_yc,
                                                                       self.is_training: False}), " ###")
                # if (epoch + 1) % self.print_test_each_epoch == 0:
                #     print("### 1st After Epoch: ", epoch + 1,
                #           " |Test acc = ", sess.run(accuracy,
                #                                     feed_dict={self.X: self.test_x, self.Y: self.test_y,
                #                                                self.is_training: False}), " ###")
                if (epoch + 1) % self.print_test_each_epoch == 0:
                    test_acc = np.empty(0)
                    for i in range(self.test_x.shape[0] // self.batch_size):
                        batch_x_t = extract_batch_size(self.test_x, i, self.batch_size)
                        batch_y_t = extract_batch_size(self.test_y, i, self.batch_size)
                        test_acc = np.append(test_acc,
                                             sess.run(correct,
                                                      feed_dict={self.X: batch_x_t, self.Y: batch_y_t,
                                                                 self.is_training: False}))
                    # print(test_acc.shape)
                    _test_acc = np.average(test_acc)
                    print("### After Epoch: ", epoch + 1,
                          " |Test acc = ", _test_acc, " ###")
                    if self.print_cm:
                        pred_y = sess.run(tf.argmax(y_, 1), feed_dict={self.X: self.test_x, self.is_training: False})
                        cm = confusion_matrix(np.argmax(self.test_y, 1), pred_y, )
                        print(cm)
Example #19
 def fit(self, X, y):
     # X: [n_samples, input_dim]
     n_samples, input_dim = X.shape
     self.W = np.random.randn(input_dim)
     self.b = 0.0
     # pass dataset for max_iter runs
     for i in range(self.max_iter):
         # shuffle (assuming utils.shuffle returns shuffled copies)
         if self.shuffle:
             X, y = utils.shuffle(X, y)
         for j in range(n_samples):
             score = self.predict(X[j])
             self.W -= self.eta * (score - y[j]) * X[j]
             self.b -= self.eta * (score - y[j]) * 1
def gen_depth_annotations():
    get_go_by_depth('GO:0008150', 1)  # Biological process Ontology
    get_go_by_depth('GO:0005575', 1)  # Cellular component Ontology
    get_go_by_depth('GO:0003674', 1)  # Molecular function Ontology
    with open('data/depth_annotations.txt', 'w') as f:
        for level in go_depth:
            print level
            gos = list(go_depth[level])
            for i in range(100):
                shuffle(gos)
                n = abs(random.randint(2, min(100, len(gos)) - 1))
                f.write(gos[0])
                for go_id in gos[1:n]:
                    f.write('\t' + go_id)
                f.write('\n')
def gen_hp_annotations():
    cls = list()
    with open('data/hp.txt', 'r') as f:
        for line in f:
            items = line.strip().split()
            cls.append(items[0])
    shuffle(cls)
    with open('data/hp_annotations.txt', 'w') as f:
        for group in range(1, 56):
            for i in range(100):
                shuffle(cls)
                f.write(cls[0])
                for hp_id in cls[1:group]:
                    f.write('\t' + hp_id)
                f.write('\n')
Example #22
def gen_hp_annotations():
    cls = list()
    with open('data/hp.txt', 'r') as f:
        for line in f:
            items = line.strip().split()
            cls.append(items[0])
    shuffle(cls)
    with open('data/hp_annotations.txt', 'w') as f:
        for group in range(1, 56):
            for i in range(100):
                shuffle(cls)
                f.write(cls[0])
                for hp_id in cls[1:group]:
                    f.write('\t' + hp_id)
                f.write('\n')
Example #23
def gen_depth_annotations():
    get_go_by_depth('GO:0008150', 1)  # Biological process Ontology
    get_go_by_depth('GO:0005575', 1)  # Cellular component Ontology
    get_go_by_depth('GO:0003674', 1)  # Molecular function Ontology
    with open('data/depth_annotations.txt', 'w') as f:
        for level in go_depth:
            print level
            gos = list(go_depth[level])
            for i in range(100):
                shuffle(gos)
                n = abs(random.randint(2, min(100, len(gos)) - 1))
                f.write(gos[0])
                for go_id in gos[1:n]:
                    f.write('\t' + go_id)
                f.write('\n')
Example #24
 def _get_inits(self, n_inits):
     init = list(range(len(self.distance_matrix)))
     inits = []
     for i in range(n_inits):
         new_init = shuffle(init)
         inits.append(new_init)
     return inits
Example #25
def BPR_train_original(cuda_loader,
                       recommend_model,
                       loss_class,
                       epoch,
                       neg_k=1,
                       w=None):
    Recmodel = recommend_model
    Recmodel.train()
    bpr: utils.BPRLoss = loss_class
    users, posItems, negItems = cuda_loader.get_train_data_at(epoch)
    users, posItems, negItems = utils.shuffle(users, posItems, negItems)
    total_batch = len(users) // world.config['bpr_batch_size'] + 1
    aver_loss = 0.
    for (batch_i, (batch_users, batch_pos, batch_neg)) in enumerate(
            utils.minibatch(users,
                            posItems,
                            negItems,
                            batch_size=world.config['bpr_batch_size'])):
        cri = bpr.stageOne(batch_users, batch_pos, batch_neg)
        aver_loss += cri
        if world.tensorboard:
            w.add_scalar(
                f'BPRLoss/BPR', cri,
                epoch * int(len(users) / world.config['bpr_batch_size']) +
                batch_i)
    aver_loss = aver_loss / total_batch
    return f"[BPR[aver loss{aver_loss:.3e}]"
Example #26
    def prepare_data_for_d(self):
        """generate positive and negative samples for the discriminator"""
        motifs = []
        labels = []
        g_s_args = []
        poss = []
        negs = []
        for i in range(self.graph.n_node):
            if np.random.rand() < config.update_ratio:
                pos = random.sample(
                    self.graph.id2motifs[i],
                    min(len(self.graph.id2motifs[i]), config.n_sample_dis))
                poss.append(pos)
                g_s_args.append((i, len(pos), True))

        negs, _ = self.sampling(g_s_args)
        for pos, neg in zip(poss, negs):
            if len(pos) != 0 and neg is not None:
                motifs.extend(pos)
                labels.extend([1] * len(pos))
                motifs.extend(neg)
                labels.extend([0] * len(neg))

        motifs, labels = utils.shuffle(motifs, labels)
        pickle.dump(
            motifs,
            open(config.cache_filename_prefix + '.motifs_ford.pkl', 'wb'))
        pickle.dump(
            labels,
            open(config.cache_filename_prefix + '.labels_ford.pkl', 'wb'))
        return motifs, labels
Example #27
def main():
    divisions = 100
    ds = Dataset()
    ds.load("Bike-Sharing-Dataset/hour.csv")
    size = ds.get_size()

    X = []
    y = []
    percentages = []
    #Full X and y from dataset
    all_X = ds.get_x()
    all_y = ds.get_y()

    #Shuffle data and split into divisons
    for i in range(1, divisions + 1):
        percentage = (1 / divisions * i)
        percentages.append(percentage)

        all_X, all_y = utils.shuffle(all_X, all_y)
        X.append(all_X[:int(size * percentage)])
        y.append(all_y[:int(size * percentage)])

    X_train, X_test, y_train, y_test = split(X, y)

    scores, featureimportances = all_models(X_train, y_train, X_test, y_test)
    print("scores")
    print(scores)

    plt.scatter(percentages, scores)
    plt.ylabel('Score')
    plt.xlabel('Percentage of Original Dataset')
    plt.title('Percentage of Original Dataset vs Score')
    plt.show()

    plotFI(featureimportances)
Example #28
def main(*args, **kwargs):
    try:
        if len(args) != 3:
            raise Exception("Please provide go_id and number of proteins")
        go_id = args[1]
        positives, negatives = load_data(go_id)
        n = int(args[2])
        shuffle(positives)
        shuffle(negatives)
        with open(DATA_ROOT + go_id + ".small.txt", "w") as f:
            for line in negatives[:n]:
                f.write(line + "\n")
            for line in positives[:n]:
                f.write(line + "\n")
    except Exception, e:
        print e
Example #29
def main(*args, **kwargs):
    try:
        if len(args) != 3:
            raise Exception("Please provide go_id and number of proteins")
        go_id = args[1]
        positives, negatives = load_data(go_id)
        n = int(args[2])
        shuffle(positives)
        shuffle(negatives)
        with open(DATA_ROOT + go_id + '.small.txt', 'w') as f:
            for line in negatives[:n]:
                f.write(line + '\n')
            for line in positives[:n]:
                f.write(line + '\n')
    except Exception, e:
        print e
Example #30
 def gen_aim(self):
     l=utils.copy_list(g.nos)
     #shuffle nos
     lt=utils.shuffle(l)
     #generate answer
     buff=""
     r=random.randint(1,2) # for level 1
     while True:
         n=lt[0]; lt.remove(n); buff+=str(n)
         if len(lt)==0: break
         if g.level>1: r=random.randint(0,2)
         if g.signs[r]=='=':
             n=eval(buff); buff=""; lt.append(n); lt=utils.shuffle(lt)
         else:
             buff=buff+g.signs[r]
     return eval(buff)
Example #31
def BPR_train_original(dataset, recommend_model, loss_class, epoch, neg_k=1, w=None):
    Recmodel = recommend_model
    Recmodel.train()
    bpr: utils.BPRLoss = loss_class
    allusers = list(range(dataset.n_users))
    S, sam_time = utils.UniformSample_original(allusers, dataset)
    print(f"BPR[sample time][{sam_time[0]:.1f}={sam_time[1]:.2f}+{sam_time[2]:.2f}]")
    users = torch.Tensor(S[:, 0]).long()
    posItems = torch.Tensor(S[:, 1]).long()
    negItems = torch.Tensor(S[:, 2]).long()

    users = users.to(world.device)
    posItems = posItems.to(world.device)
    negItems = negItems.to(world.device)
    users, posItems, negItems = utils.shuffle(users, posItems, negItems)
    total_batch = len(users) // world.config['bpr_batch_size'] + 1
    aver_loss = 0.
    for (batch_i,
         (batch_users,
          batch_pos,
          batch_neg)) in enumerate(utils.minibatch(users,
                                                   posItems,
                                                   negItems,
                                                   batch_size=world.config['bpr_batch_size'])):
        cri = bpr.stageOne(batch_users, batch_pos, batch_neg)
        aver_loss += cri
        if world.tensorboard:
            w.add_scalar(f'BPRLoss/BPR', cri, epoch * int(len(users) / world.config['bpr_batch_size']) + batch_i)
    aver_loss = aver_loss / total_batch
    return f"[BPR[aver loss{aver_loss:.3e}]"
def load_data(path):

    # load from csv file
    Data = pd.read_csv(path)
    # split
    img_paths = Data["image_path"]
    X = []
    Y = Data["labelid"]

    # get images
    for path in img_paths:
        img =  cv2.imread(path , -1)  # BGR
        X.append(img)

    # convert into numpy arrays
    X = np.array(X).reshape(-1, H, W, C)
    Y = np.array(Y).reshape(-1, 1)  # 0-based labels (scalar)

    # cast the dataset to keep good precision
    X = X.astype(np.float64)  # float64 gives accurate precision for the mean calculation and subtraction

    # apply preprocessing
    X = dataset_preprocessing(X, 1)

    # shuffle
    X , Y = shuffle(X , Y)

    # demo
    print(Y[:100])
    print("\n\n")
    print(Y[500:700])

    return X , Y
Example #33
def train(x, y, model, optimizer, loss_fn, params):
    model.train()

    x, y = utils.shuffle(x, y)
    total = len(y)
    n_batch = total // params.batch_size
    x_split, y_split = np.array_split(x, n_batch), np.array_split(y, n_batch)
    t = trange(n_batch)
    avg_loss = 0

    for i, (x_bch, y_bch) in enumerate(zip(x_split, y_split)):
        x_bch = torch.from_numpy(x_bch).float().permute(
            0, 3, 1, 2).to(device=params.device)
        y_bch = torch.from_numpy(y_bch).to(device=params.device)

        y_hat_bch = model(x_bch)
        loss = loss_fn(y_hat_bch, y_bch, params)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        t.set_postfix(loss='{:05.3f}'.format(loss.item()))
        t.update()

        avg_loss += loss.item() / n_batch

    return avg_loss
Example #34
def main():
    """Main function."""
    # sorted valid RT dataset exists
    if os.path.isfile(VALID_RT_FILE_NAMES[0]):
        df = pnd.read_csv(VALID_RT_FILE_NAMES[0])
        sorted_rt_triangles = [[row[col] for col in df.columns[:3]]
                               for row in df.to_dict('records')]

    # unsorted valid RT dataset exists
    if os.path.isfile(VALID_RT_FILE_NAMES[1]):
        df = pnd.read_csv(VALID_RT_FILE_NAMES[1])
        unsorted_rt_triangles = [[row[col] for col in df.columns[:3]]
                                 for row in df.to_dict('records')]

    # valid RT dataset doesn't exist
    else:
        rt_triangles = create_valid_rt_points()
        sorted_rt_triangles = []
        unsorted_rt_triangles = []

        for [a, b, c] in rt_triangles:
            [x, y, z] = sorted([a, b, c])
            sorted_rt_triangles.append([x, y, z])

            [x, y, z] = shuffle([a, b, c])
            unsorted_rt_triangles.append([x, y, z])

    write_exp_rt_datasets(sorted_rt_triangles, unsorted_rt_triangles)
Example #35
def train(model_dir, sample_dir):

    model_name = 'model_'

    input_x, input_y, validation_x, validation_y = load_data(dataset_name, batch_size)

    with tf.Session(config=tf.ConfigProto()) as sess:

        sess.run(init)

        ckpt = tf.train.get_checkpoint_state(model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            print("[*] restore model : %s"%ckpt_name)
            saver.restore(sess, os.path.join(model_dir, ckpt_name))
        else:
            print("model does not exits")

        print('============ Start training ==============')

        iteration_number = input_x.shape[0] // batch_size

        for epoch in range(epoch_size):

            start_position = int(epoch % iteration_number)

            if (start_position == 0):
                input_x, input_y = shuffle(input_x, input_y)

            train_X = input_x[start_position*batch_size : (start_position+1)*batch_size]
            train_Y = input_y[start_position*batch_size : (start_position+1)*batch_size]

            if (epoch % 10 == 0):
                _, loss, train_acc = sess.run([train_op, total_loss, train_accuracy], feed_dict={X:train_X, label:train_Y})

                print('Train [%d / %d]: accuracy %.4f, loss %.4f' % (epoch, epoch_size, train_acc, loss))
            else:
                sess.run(train_op, feed_dict={X:train_X, label:train_Y})

            # save model and sample result
            if (epoch % 20 == 0):
                
                # evaluate
                iter_num = validation_x[1:1000].shape[0] // batch_size
                acc = 0.0
                for itr in range(iter_num):
                    samples = sess.run([decoded], feed_dict={X:train_X, label:train_Y})
                    # pos = int(random.random() * 5)
                    pos = itr
                    validate_label = sess.run(argmax_idx, feed_dict={X:validation_x[pos*batch_size:(pos+1)*batch_size]})

                    validate_acc = 1.0 * np.sum(validate_label == validation_y[pos*batch_size:(pos+1)*batch_size]) / batch_size
                    acc += validate_acc

                acc = acc / iter_num
                save_images(np.reshape(samples, (batch_size, 28, 28))[0:100], [10, 10], os.path.join(sample_dir, 'sample_'+str(epoch)+'.png'))
                save_images(np.reshape(train_X, (batch_size, 28, 28))[0:100], [10, 10], os.path.join(sample_dir, 'input_'+str(epoch)+'.png'))
                saver.save(sess, './model/' + model_name + str(epoch) + '.ckpt')
                print("Save model and sample images, val acc: %.4f" % acc)
Example #36
 def next_card(self):
     if self.rest == []:  # ensure no cards are lost!
         fresh = utils.copy_list(self.deck)
         fresh = utils.shuffle(fresh)
         for n in fresh:
             if n not in self.yes: self.rest.append(n)
     n = self.rest.pop()
     return n
Example #37
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for xmb, ymb in iter_data(X, Y, size=self.size):
            xmb = padded(xmb)
            yield self.x_dtype(xmb), self.y_dtype(ymb)
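The iterXY variants in this listing (Examples #16, #17, #37, and similar) pass each minibatch through a padded helper before yielding it; in the chunked variants, sorting each chunk by sequence length keeps similar-length sequences in the same minibatch so less padding is wasted. The helper is not shown; a minimal right-padding sketch, with the zero pad value and batch-major shape being assumptions:

import numpy as np

def padded(seqs, pad_value=0):
    # Right-pad variable-length sequences to the longest one in the batch.
    maxlen = max(len(s) for s in seqs)
    return np.asarray([list(s) + [pad_value] * (maxlen - len(s)) for s in seqs])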
Example #38
def replaceChars(token, subData):
    subProb = subData["count"]
    subMatrix = subData["subs"]
    appliable = {k: subProb[k] for k in subProb.keys() if k in token}
    subCandidates = list(appliable.keys())
    shuffle(subCandidates)
    tokenBitMask = [0 for char in token]
    for sub in subCandidates:
        subProb = appliable[sub]
        subWith = weighted_choice(subMatrix[sub])
        for start in find_all(token, sub):
            if sum(tokenBitMask[start:start + len(sub)]
                   ) == 0 and probability_boolean(subProb):
                token = token[:start] + subWith + token[start + len(sub):]
                tokenBitMask = tokenBitMask[:start] + \
                    [1 for c in subWith] + tokenBitMask[start+len(sub):]
    return token
Example #39
    def iterXY(self, X, Y):

        if self.shuffle:
            X, Y = shuffle(X, Y)

        for xmb, ymb in iter_data(X, Y, size=self.size):
            xmb = padded(xmb)
            yield self.x_dtype(xmb), self.y_dtype(ymb)
Example #40
def test():
    pre_processor = PreProcessor()
    start_time = time.time()
    numberal_data, labels = pre_processor.text2number('./data/aspect/')
    X, y = utils.shuffle(numberal_data, labels)
    X, y = utils.shuffle(X, y)

    X_train, y_train = X[:4200], y[:4200]
    X_test, y_test = X[4200:], y[4200:]

    with open('./data/numberal_data/aspect_data_training.pkl', 'wb') as f:
        print(X_train.shape, y_train.shape)
        pickle.dump({'sample': X_train, 'label': y_train}, f)

    with open('./data/numberal_data/aspect_data_test.pkl', 'wb') as f:
        print(X_test.shape, y_test.shape)
        pickle.dump({'sample': X_test, 'label': y_test}, f)
Example #41
    def iterXY(self, X, Y):

        if self.shuffle:
            X, Y = shuffle(X, Y)

        for xmb, ymb in iter_data(X, Y, size=self.size):
            xmb = self.trXt(xmb)
            ymb = self.trYt(ymb)            
            yield xmb, ymb
Example #42
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for x_chunk, y_chunk in iter_data(X, Y, size=self.size*20):
            sort = np.argsort([len(x) for x in x_chunk])
            x_chunk = [x_chunk[idx] for idx in sort]
            y_chunk = [y_chunk[idx] for idx in sort]
            mb_chunks = [[x_chunk[idx:idx+self.size], y_chunk[idx:idx+self.size]] for idx in range(len(x_chunk))[::self.size]]
            mb_chunks = shuffle(mb_chunks)
            for xmb, ymb in mb_chunks:
                xmb = _padded(xmb, final=self.y_lag)
                if ymb[0].ndim == 2:
                    ymb, padsize = _padded(ymb, return_sizes=True, initial=self.y_lag)
                    yield self.x_dtype(xmb), (self.y_dtype(ymb), padsize.T)
                else:
                    yield self.x_dtype(xmb), self.y_dtype(ymb)
Example #43
def shuffle_split_users(users):
    has_image        = (lambda a: a.get('image_url', None) is not None)
    has_custom_image = (lambda a: has_image(a) and not is_static_profile_image(a['image_url']))
    
    # find all users who have a custom profile image
    a0 = filter(has_custom_image, users)
    
    # find all users who have the default profile image
    a1 = filter(lambda a: not has_custom_image(a), users)
    
    # shuffle them both independently
    a0 = utils.shuffle(a0)
    a1 = utils.shuffle(a1)
    
    # and combine the results s.t. all users with custom profile images precede 
    # all those without custom profile images
    a0.extend(a1)
    
    return a0
Example #44
def main(*args, **kwargs):
    if len(args) < 3:
        raise Exception('Please provide parent id and go id')
    parent_id = args[1]
    go_id = args[2]
    if len(args) == 4:
        level = int(args[3])
        global CUR_LEVEL
        global NEXT_LEVEL
        CUR_LEVEL = 'level_' + str(level) + '/'
        NEXT_LEVEL = 'level_' + str(level + 1) + '/'
    df = load_data(parent_id, go_id)
    go_node = go[go_id]
    for ch_id in go_node['children']:
        ch_set = get_subtree_set(ch_id)
        positives = list()
        negatives = list()
        for i in df.index:
            pos = False
            for g_id in df['gos'][i]:
                if g_id in ch_set:
                    pos = True
                    break
            if pos:
                positives.append(i)
            else:
                negatives.append(i)
        n = min(len(positives), len(negatives))
        if n > 0:
            shuffle(positives)
            shuffle(negatives)
            positives = positives[:n]
            negatives = negatives[:n]
            filename = DATA_ROOT + NEXT_LEVEL + go_id + '/' + ch_id + '.pkl'
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            labels = [0] * n + [1] * n
            index = negatives + positives
            new_df = df.reindex(index)
            new_df['labels'] = pd.Series(labels, index=new_df.index)
            new_df.to_pickle(filename)
Example #45
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        self.loader = Loader(X, self.train_load, self.train_transform, self.size)
        self.proc = Process(target=self.loader.load)
        self.proc.start()

        for ymb in iter_data(Y, size=self.size):
            xmb = self.loader.get()             
            yield xmb, floatX(ymb)
def main(*args, **kwargs):
    if len(args) != 2:
        raise Exception('Please provide function id')
    go_id = args[1]
    paacs = load_data_by_prot_id(go_id)
    data = load_training_data(go_id)
    go_node = go[go_id]
    go_set = get_subtree_set(go_id)
    for ch_id in go_node['children']:
        ch_set = get_subtree_set(ch_id)
        positives = list()
        negatives = list()
        for prot_id, gos in data:
            if prot_id not in paacs:
                continue
            pos = False
            for g_id in gos:
                if g_id in ch_set:
                    pos = True
                    break
            if pos:
                positives.append(prot_id)
            else:
                negatives.append(prot_id)
        n = len(positives)
        shuffle(positives)
        shuffle(negatives)
        negatives = negatives[:n]
        with open(DATA_ROOT + 'level_2/' + go_id + '/' + ch_id + '.txt', 'w') as f:
            for prot_id in negatives:
                f.write('0 ' + prot_id)
                for p in paacs[prot_id]:
                    f.write(' ' + str(p))
                f.write('\n')
            for prot_id in positives:
                f.write('1 ' + prot_id)
                for p in paacs[prot_id]:
                    f.write(' ' + str(p))
                f.write('\n')
Example #47
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for xmb, ymb in iter_data(X, Y, size=self.size):
            xmb = self.x_dtype(xmb)
            shape = list(range(len(xmb.shape)))
            shape[0] = 1
            shape[1] = 0
            shape = tuple(shape)
            xmb = xmb.transpose(*shape)
            yield xmb, self.y_dtype(ymb)
Example #48
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for xmb, ymb in iter_data(X, Y, size=self.size):
            xmb = _padded(xmb, final=self.y_lag)
            if ymb[0].ndim == 2:
                # sequence prediction
                ymb, padsize = _padded(ymb, return_sizes=True, initial=self.y_lag)
                yield self.x_dtype(xmb), (self.y_dtype(ymb), padsize.T)
            else:
                yield self.x_dtype(xmb), self.y_dtype(ymb)
Example #49
def select_proteins(go_id, parent_go_set):
    node = go[go_id]
    pos_go_set = get_subtree_set(go_id)
    neg_go_set = parent_go_set - pos_go_set
    positives = set()
    for g_id in pos_go_set:
        if g_id in go_prot:
            positives |= go_prot[g_id]
    negatives = set()
    for g_id in neg_go_set:
        if g_id in go_prot:
            negatives |= go_prot[g_id]
    negatives = negatives - positives
    positives = list(positives)
    negatives = list(negatives)
    shuffle(positives)
    shuffle(negatives)
    min_len = min(len(positives), len(negatives))
    # with open(RESULT_ROOT + go_id + '.txt', 'w') as f:
    labels = list()
    proteins = list()
    data = list()
    for prot_id in negatives[:min_len]:
        labels.append(0)
        proteins.append(prot_id)
        data.append(fofe[prot_id])
    for prot_id in positives[:min_len]:
        labels.append(1)
        proteins.append(prot_id)
        data.append(fofe[prot_id])
    df = pd.DataFrame({'labels': labels, 'proteins': proteins, 'data': data})
    df.to_pickle(RESULT_ROOT + go_id + '.pkl')
    # numpy.savez(
    #     RESULT_ROOT + go_id + '.npz',
    #     labels=numpy.array(labels),
    #     proteins=numpy.array(proteins),
    #     data=numpy.array(data))
    print 'Finished selection for ' + go_id
Example #50
 def gen_aim(self):
     l = utils.copy_list(g.nos)
     #shuffle nos
     lt = utils.shuffle(l)
     #generate answer
     buff = ""
     r = random.randint(1, 2)  # for level 1
     while True:
         n = lt[0]
         lt.remove(n)
         buff += str(n)
         if len(lt) == 0:
             break
         if g.level > 1:
             r = random.randint(0, 2)
         if g.signs[r] == '=':
             n = eval(buff)
             buff = ""
             lt.append(n)
             lt = utils.shuffle(lt)
         else:
             buff = buff + g.signs[r]
     return eval(buff)
Example #51
def load_data(parent_id, go_id):
    data1 = list()
    data2 = list()
    labels = list()
    positive1 = list()
    negative1 = list()
    positive2 = list()
    negative2 = list()

    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            sq1 = encode_seq_one_hot(seq, maxlen=MAXLEN)
            sq2 = encode_seq(OGAK980101, seq, maxlen=MAXLEN)
            sq3 = encode_seq(MEHP950102, seq, maxlen=MAXLEN)
            sq4 = encode_seq(CROG050101, seq, maxlen=MAXLEN)
            sq5 = encode_seq(TOBD000101, seq, maxlen=MAXLEN)
            sq6 = encode_seq(ALTS910101, seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append([sq1])
                positive2.append(sq1)
            else:
                negative1.append([sq1])
                negative2.append(sq1)
    shuffle(negative1, negative2, seed=0)
    n = min(len(positive1), len(negative1))
    data1 = negative1[:n] + positive1[:n]
    data2 = negative2[:n] + positive2[:n]
    labels = [0.0] * n + [1.0] * n
    # Previous was 30
    shuffle(data1, data2, labels, seed=0)
    data = (
        numpy.array(data1, dtype='float32'),
        numpy.array(data2, dtype='float32'))
    return (numpy.array(labels), data)
def select_proteins(go_id, parent_go_set):
    node = go[go_id]
    pos_go_set = get_subtree_set(go_id)
    neg_go_set = parent_go_set - pos_go_set
    positives = set()
    for g_id in pos_go_set:
        if g_id in go_prot:
            positives |= go_prot[g_id]
    negatives = set()
    for g_id in neg_go_set:
        if g_id in go_prot:
            negatives |= go_prot[g_id]
    negatives = negatives - positives
    positives = list(positives)
    negatives = list(negatives)
    shuffle(positives, seed=10)
    shuffle(negatives, seed=10)
    min_len = min(len(positives), len(negatives))
    with open(RESULT_ROOT + go_id + '.txt', 'w') as f:
        for prot_id in negatives[:min_len]:
            f.write('0 ' + prot_id + ' ' + prot_paac[prot_id] + '\n')
        for prot_id in positives[:min_len]:
            f.write('1 ' + prot_id + ' ' + prot_paac[prot_id] + '\n')
    print 'Finished selection for ' + go_id
Example #53
def load_data(go_id):
    data = list()
    labels = list()
    pos = 1
    positive = list()
    negative = list()
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = encode_seq_one_hot(line[2][:500], maxlen=MAXLEN)

            if label == pos:
                positive.append(seq)
            else:
                negative.append(seq)
    shuffle(negative, seed=0)
    n = len(positive)
    data = negative[:n] + positive
    # count only the negatives actually kept so labels align with data
    labels = [0.0] * len(negative[:n]) + [1.0] * len(positive)
    # Previous was 30
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype="float32")
Example #54
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for x_chunk, y_chunk in iter_data(X, Y, size=self.size*20):
            sort = np.argsort([len(x) for x in x_chunk])
            x_chunk = [x_chunk[idx] for idx in sort]
            y_chunk = [y_chunk[idx] for idx in sort]
            mb_chunks = [[x_chunk[idx:idx+self.size], y_chunk[idx:idx+self.size]] for idx in range(len(x_chunk))[::self.size]]
            py_rng.shuffle(mb_chunks)
            for xmb, ymb in mb_chunks:
                xmb = self.trXt(xmb)
                ymb = self.trYt(ymb)
                yield xmb, ymb
Example #55
    def iterXY(self, X, Y):
        
        if self.shuffle:
            X, Y = shuffle(X, Y)

        for xmb, ymb in iter_data(X, Y, size=self.size):
            xmb = self.x_dtype(xmb)
            shape = list(range(len(xmb.shape)))
            shape[0] = 1
            shape[1] = 0
            shape = tuple(shape)
            xmb = xmb.transpose(*shape)
            ymb = self.y_dtype(ymb)
            if ymb.ndim == 3:
                # sequence prediction! also reorder ymb.
                ymb = ymb.transpose(*shape)
            yield xmb, ymb
Example #56
 def process_user(self, user, categories=None):
     assert user is not None
     
     if categories is None:
         categories = self._categories
     
     retries = 0
     
     while retries < 3:
         try:
             for category in categories:
                 ts = {
                     'user_id' : user.user_id, 
                     'scope'   : 'user'
                 }
                 
                 if category != 'default':
                     if category == 'app':
                         ts['subcategory'] = 'app'
                     else:
                         ts['category']    = category
                 
                 ts['limit'] = 100
                 collage     = self._collages[category]
                 stamp_slice = HTTPTimeSlice().dataImport(ts).exportTimeSlice()
                 stamps      = self.api.getStampCollection(stamp_slice)
                 entities    = map(lambda s: s.entity, stamps)
                 entities    = utils.shuffle(entities)[:30]
                 
                 logs.info("creating collage for user '%s' w/ category '%s' and %d entities" % (user.screen_name, category, len(entities)))
                 images = collage.generate_from_user(user, entities)
                 
                 for image in images:
                     filename = "collage-%s-%s-%sx%s.jpg" % (user.screen_name, category, image.size[0], image.size[1])
                     
                     self.save_image(image, filename)
             
             break
         except Exception, e:
             logs.warn("unexpected error processing user %s: %s" % (str(user), e))
             logs.warn(utils.getFormattedException())
             
             retries += 1
             time.sleep(2 ** retries)
 def __init__(self, data, n_valid, corruptor=None, prng=None):
     """
     Parameters
     ----------
     data : numpy array
         Data matrix array with rows corresponding to data vectors.
     n_valid : integer
         Number of data vectors to use as validation set.
     corruptor : function(Array, RandomState) or None
         Optional function which applies random 'corruption' / augmentation
         to data, for example dequantising pixel values, adding noise,
         applying random affine transformation to image. Applied on
         initialisation and at end of each training epoch.
     prng : RandomState or None
         Seeded pseudo-random number generator - used to shuffle data
         and for corruptor if specified.
     """
     self.data = data
     self.n_valid = n_valid
     self.n_train = data.shape[0] - n_valid
     self.corruptor = corruptor
     if prng is None:
         prng = np.random.RandomState()
     self.prng = prng
     shuffled_data, self.perm = utils.shuffle(self.data, self.prng)
     self.data_valid, self.data_train = utils.split(shuffled_data, n_valid)
     if corruptor is None:
         self.x_valid = th.shared(
             self.data_valid.astype(th.config.floatX), 'x_valid')
         self.x_train = th.shared(
             self.data_train.astype(th.config.floatX), 'x_train')
     else:
         corrupted_data_valid = self.corruptor(self.data_valid, self.prng)
         corrupted_data_train = self.corruptor(self.data_train, self.prng)
         self.x_valid = th.shared(
             corrupted_data_valid.astype(th.config.floatX), 'x_valid')
         self.x_train = th.shared(
             corrupted_data_train.astype(th.config.floatX), 'x_train')
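The corruptor contract described in the docstring above (a function(Array, RandomState) applied on initialisation and after each epoch) admits, for example, a dequantising corruptor; a minimal sketch assuming 8-bit integer pixel data:

import numpy as np

def dequantise(data, rng):
    # Add uniform [0, 1) noise to integer pixel values.
    return data + rng.uniform(size=data.shape)

It could then be passed as the corruptor argument, e.g. Dataset(data, n_valid=1000, corruptor=dequantise); the class name here is a stand-in, since the snippet only shows __init__.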
Example #58
    def _create_collage(
        self,
        user,
        images,
        num_rows=None,
        num_cols=None,
        respect_aspect_ratio=False,
        adaptive_image_resizing=True,
        enable_drop_shadows=False,
        row_major=True,
        shuffle_images=False,
    ):

        # must specify num_cols or num_rows, but not both
        assert (num_cols is not None and num_cols > 0) != (num_rows is not None and num_rows > 0)

        num_images = len(images)
        output = []

        if num_rows is None:
            num_cols = int(num_cols)
            num_rows = int(math.ceil(num_images / num_cols))
        elif num_cols is None:
            num_rows = int(num_rows)
            num_cols = int(math.ceil(num_images / num_rows))

        user_logo_url = "http://static.stamped.com/logos/%s-%s-email-36x36.png" % (
            user.color_primary,
            user.color_secondary,
        )
        try:
            user_logo = utils.getWebImage(user_logo_url)
        except Exception:
            user_logo = None

        user_logo_cache = {}

        def get_user_logo(size):
            if user_logo is None:
                return None

            try:
                return user_logo_cache[size]
            except KeyError:
                logo = user_logo.resize(size, Image.ANTIALIAS)
                user_logo_cache[size] = logo

                return logo

        for size in self._sizes:
            logs.info("[%s] creating %sx%s collage" % (self, size[0], size[1]))

            canvas = Image.new("RGBA", size, (255, 255, 255, 255))
            offsets = []
            indices = []

            if row_major:
                for i in xrange(num_rows):
                    for j in xrange(num_cols):
                        indices.append(len(offsets))
                        offsets.append((i, j))
            else:
                for j in xrange(num_cols):
                    for i in xrange(num_rows):
                        indices.append(len(offsets))
                        offsets.append((i, j))

            if shuffle_images:
                indices = utils.shuffle(indices)

            for index in indices:
                i, j = offsets[index]

                # wrap images around if necessary to fill last row
                index = (i * num_cols + j) % num_images
                image = images[index]

                cell_size, cell_pos, logo_size, logo_pos = self.get_cell_bounds_func(
                    size, num_cols, num_rows, i, j, image
                )

                # adjust cell layout bounds to align to integer grid (helps minimize aliasing)
                cell_size = int(math.ceil(cell_size[0])), int(math.ceil(cell_size[1]))
                cell_pos = int(math.floor(cell_pos[0])), int(math.floor(cell_pos[1]))

                logo_size = int(math.ceil(logo_size[0])), int(math.ceil(logo_size[1]))
                logo_pos = int(math.floor(logo_pos[0])), int(math.floor(logo_pos[1]))

                width = cell_size[0]
                height = cell_size[1]

                if adaptive_image_resizing:
                    if image.size[0] / cell_size[0] < image.size[1] / cell_size[1]:
                        width = cell_size[0]
                        height = (width * image.size[1]) / image.size[0]

                        if not respect_aspect_ratio and height > cell_size[1]:
                            height = int((height + cell_size[1]) * 0.5)
                    else:
                        height = cell_size[1]
                        width = (height * image.size[0]) / image.size[1]

                        if not respect_aspect_ratio and width > cell_size[0]:
                            width = int((width + cell_size[0]) * 0.5)

                cell = image.resize((width, height), Image.ANTIALIAS)
                w = min(width, cell_size[0])
                h = min(height, cell_size[1])
                cell = cell.crop((0, 0, w, h))

                if enable_drop_shadows:
                    self._paste_image_with_drop_shadow(canvas, cell, cell_pos)
                else:
                    canvas.paste(cell, cell_pos)

                # overlay user's stamp logo on top of each entity image
                logo = get_user_logo(logo_size)

                if logo is not None:
                    logo_box = (logo_pos[0], logo_pos[1], logo_pos[0] + logo.size[0], logo_pos[1] + logo.size[1])
                    canvas.paste(logo, logo_box, logo)

            canvas = self._apply_postprocessing(canvas, user)
            output.append(canvas)

        return output
Example #59
import os
import sys

p = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if p not in sys.path:
    sys.path.append(p)

import utils
from models import LR, FM, PNN1, PNN2, FNN, CCPM

train_file = '../../output/fm/train.fm'
test_file = '../../output/fm/test.fm'

input_dim = utils.INPUT_DIM

train_data = utils.read_data(train_file)
# train_data = pkl.load(open('../data/train.yx.pkl', 'rb'))
train_data = utils.shuffle(train_data)
test_data = utils.read_data(test_file)
# test_data = pkl.load(open('../data/test.yx.pkl', 'rb'))
# pkl.dump(train_data, open('../data/train.yx.pkl', 'wb'))
# pkl.dump(test_data, open('../data/test.yx.pkl', 'wb'))

if train_data[1].ndim > 1:
    print('label must be 1-dim')
    exit(0)
print('read finish')
print('train data size:', train_data[0].shape)
print('test data size:', test_data[0].shape)

train_size = train_data[0].shape[0]
test_size = test_data[0].shape[0]
num_feas = len(utils.FIELD_SIZES)