def load_data(go_id):
    positive1 = list()
    positive2 = list()
    negative1 = list()
    negative2 = list()
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            hydro = encode_seq_hydro(seq, maxlen=MAXLEN)
            seq = encode_seq_one_hot(seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append(seq)
                positive2.append(hydro)
            else:
                negative1.append(seq)
                negative2.append(hydro)
    shuffle(negative1, negative2, seed=0)
    n = len(positive1)
    data1 = negative1[:n] + positive1
    data2 = negative2[:n] + positive2
    # label only the negatives that are actually kept above
    labels = [0] * len(negative1[:n]) + [1] * len(positive1)
    shuffle(data1, data2, labels, seed=0)
    data = (
        numpy.array(data1, dtype='float32'),
        numpy.array(data2, dtype='float32'))
    return (
        numpy.array(labels, dtype='float32'),
        data)
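# Note on the `shuffle` helper used throughout this file: most snippets call
# shuffle(list1, list2, ..., seed=...) without using a return value, i.e. an
# in-place shuffle of several parallel lists under one shared permutation.
# That helper is defined elsewhere; the sketch below is only an illustration
# of what such a function might look like (name, signature and implementation
# are assumptions, not the project's actual code).
import random


def shuffle_in_unison(*lists, **kwargs):
    """Shuffle equal-length lists in place, applying the same permutation to each."""
    rng = random.Random(kwargs.get('seed', None))
    perm = list(range(len(lists[0])))
    rng.shuffle(perm)
    for lst in lists:
        lst[:] = [lst[i] for i in perm]

# Hypothetical usage mirroring load_data above:
# shuffle_in_unison(data1, data2, labels, seed=0)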
def load_data(parent_id, go_id):
    data = list()
    labels = list()
    positive = list()
    negative = list()
    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            if label == 1:
                labels.append(1)
                positive.append(seq)
            else:
                labels.append(0)
                negative.append(seq)
    shuffle(negative, seed=0)
    n = len(positive)
    negative = negative[:n]
    labels = [0] * len(negative) + [1] * len(positive)
    data = negative + positive
    for i in range(len(data)):
        data[i] = encode_seq_one_hot(data[i], maxlen=MAXLEN)
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype='float32')
def main():
    start_time = time.time()
    print "Loading all proteins"
    all_prots = load_all_proteins()
    shuffle(all_prots)
    split = 0.8
    train_len = int(len(all_prots) * split)
    # print 'Loading train proteins'
    # train_set = load_train_proteins()
    # all_set = set(all_prots.keys())
    # print len(all_set), len(train_set)
    # unseen = all_set - train_set
    with open(RESULT_ROOT + "train.txt", "w") as f:
        for prot_id, seq, gos in all_prots[:train_len]:
            f.write(prot_id + "\t" + seq + "\t" + gos + "\n")
    with open(RESULT_ROOT + "test.txt", "w") as f:
        for prot_id, seq, gos in all_prots[train_len:]:
            f.write(prot_id + "\t" + seq + "\t" + gos + "\n")
    # print 'Loading unseen proteins'
    # unseen = load_unseen_proteins()
    # print 'Loading all proteins'
    # all_prots = load_all_proteins()
    # with open(DATA_ROOT + 'unseen-gos.txt', 'w') as f:
    #     for prot_id in unseen:
    #         f.write(prot_id)
    #         f.write('\t' + all_prots[prot_id] + '\n')
    end_time = time.time() - start_time
    print "Done in %d seconds" % (end_time,)
def load_data(parent_id, go_id):
    data = list()
    labels = list()
    global nb_classes
    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split('\t')
            seq = line[1][:MAXLEN]
            labs = line[2].split('|')
            data.append(seq)
            for i in range(len(labs)):
                labs[i] = int(labs[i])
                nb_classes = max(nb_classes, labs[i])
            labels.append(labs)
    nb_classes += 1
    for i in range(len(labels)):
        l = [0] * nb_classes
        for x in labels[i]:
            l[x] = 1
        labels[i] = l
    for i in range(len(data)):
        data[i] = encode_seq_one_hot(data[i], maxlen=MAXLEN)
    shuffle(data, labels, seed=0)
    return numpy.array(
        labels, dtype='float32'), numpy.array(data, dtype='float32')
def main():
    start_time = time.time()
    print 'Loading all proteins'
    all_prots = load_all_proteins()
    shuffle(all_prots, seed=0)
    split = 0.8
    train_len = int(len(all_prots) * split)
    # print 'Loading train proteins'
    # train_set = load_train_proteins()
    # all_set = set(all_prots.keys())
    # print len(all_set), len(train_set)
    # unseen = all_set - train_set
    with open(RESULT_ROOT + 'train.txt', 'w') as f:
        for prot_id, seq, gos in all_prots[:train_len]:
            f.write(prot_id + '\t' + seq + '\t' + gos + '\n')
    with open(RESULT_ROOT + 'test.txt', 'w') as f:
        for prot_id, seq, gos in all_prots[train_len:]:
            f.write(prot_id + '\t' + seq + '\t' + gos + '\n')
    # print 'Loading unseen proteins'
    # unseen = load_unseen_proteins()
    # print 'Loading all proteins'
    # all_prots = load_all_proteins()
    # with open(DATA_ROOT + 'unseen-gos.txt', 'w') as f:
    #     for prot_id in unseen:
    #         f.write(prot_id)
    #         f.write('\t' + all_prots[prot_id] + '\n')
    end_time = time.time() - start_time
    print 'Done in %d seconds' % (end_time,)
def train(self, x_set, y_set):
    """
    Train function. The training dataset is first split into train and
    validation sets; at each epoch the original dataset is shuffled and
    split again. At the end of each epoch the validation function reports
    accuracy and average loss for that epoch.
    :param x_set: the complete training dataset.
    :param y_set: the correlated classes.
    """
    loss_sum = 0
    for i in range(EPOCHS):
        x_set, y_set = utils.shuffle(x_set, y_set)
        train_x, train_y, val_x, val_y = utils.split_validation(
            x_set, y_set, VALIDATION_SIZE)
        train_x, train_y = utils.shuffle(train_x, train_y)
        # run over each example from the train dataset
        for x, y in zip(train_x, train_y):
            x = np.reshape(x, (1, x.shape[0]))
            z1, h1, z2 = self.feedforward(x)
            probs = utils.softmax(self.weights2, h1, self.bias2, CLASSES)
            loss = utils.loss(probs[int(y)])
            loss_sum += loss
            self.backprop(x, y, z1, h1, z2, probs)
        val_loss, acc = self.validation(val_x, val_y)
def gen_random_annotations():
    go_ids = [pheno for pheno in get_phenos() if pheno.startswith("MP:")]
    shuffle(go_ids)
    groups = get_gene_groups(DATA_ROOT + 'mouse_pheno_annotations_genes.txt')
    with open(DATA_ROOT + 'mouse_pheno_annotations_genes_random.txt', 'w') as f:
        for group in groups:
            shuffle(go_ids)
            f.write(go_ids[0])
            for go_id in go_ids[1:group]:
                f.write('\t' + go_id)
            f.write('\n')
def fit(self, docs, labels, batch_size=200, epochs=50, lstm_dim=200,
        lr=0.001, validate=False, val_every=100, val_docs=None,
        val_labels=None):
    if not self._built:
        self._build_train(lstm_dim, lr=lr)
    self._validate_input(docs, batch_size)
    # docs is assumed to be a (char_ids, word_ids) pair
    all_char_ids, all_word_ids = docs
    n_batches = len(all_char_ids) // batch_size
    if validate:
        self._validate_input(val_docs)
        if not val_labels:
            raise Exception('`val_labels` must be non-empty list of '
                            '[`label_ids`] for cross validation.')
        val_char_ids, val_word_ids = val_docs
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        run_train = [self.loss, self.train_step]
        run_eval = [self.pred_score]
        _iter = 0
        for epoch in range(epochs):
            utils.shuffle(docs)
            for i in range(n_batches):
                _iter += 1
                start, end = i * batch_size, (i + 1) * batch_size
                char_ids = all_char_ids[start:end]
                word_ids = all_word_ids[start:end]
                label_ids = labels[start:end]
                loss, _ = sess.run(run_train,
                                   feed_dict={self.char_ids: char_ids,
                                              self.word_ids: word_ids,
                                              self.label_ids: label_ids})
                if validate and _iter % val_every == 0:
                    val_score = sess.run(run_eval,
                                         feed_dict={self.char_ids: val_char_ids,
                                                    self.word_ids: val_word_ids,
                                                    self.label_ids: val_labels})
                    print('Validation accuracy: {0:.4f}'.format(val_score))
def train(batch_size, class_nums, growth_rate, weight_decay, depth, cifar10_path, train_epoch, lr): inputs = tf.placeholder(tf.float32, [None, 32, 32, 3]) labels = tf.placeholder(tf.int64, [None]) train_phase = tf.placeholder(tf.bool) learning_rate = tf.placeholder(tf.float32) logits = DenseNet(inputs, nums_out=class_nums, growth_rate=growth_rate, train_phase=train_phase, depth=depth) pred = softmax(logits) accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(pred, axis=1), labels), tf.float32)) one_hot_label = to_OneHot(labels, class_nums) cross_entropy_loss = tf.reduce_mean(-tf.log(tf.reduce_sum(pred * one_hot_label, axis=1) + 1e-10)) regular = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()]) Opt = tf.train.MomentumOptimizer(learning_rate, momentum=0.9, use_nesterov=True).minimize(cross_entropy_loss + weight_decay * regular) sess = tf.Session() sess.run(tf.global_variables_initializer()) path = cifar10_path + "data_batch_" valid_path = cifar10_path + "data_batch_5" loss_list = [] train_acc_list = [] test_acc_list = [] saver = tf.train.Saver() # saver.restore(sess, "./save_para//.\\densenet.ckpt") # saver.restore(sess, "./save_para/densenet.ckpt") for epoch in range(train_epoch): if epoch == train_epoch // 2 or epoch == train_epoch * 3 // 4: lr /= 10 for i in range(1, 6): if i != 5: data, labels_ = read_cifar_data(path + str(i)) data, labels_ = shuffle(data, labels_) else: data, labels_ = read_cifar_data(path + str(i)) data, labels_ = shuffle(data[:5000], labels_[:5000]) for j in range(data.shape[0] // batch_size - 1): batch_data = data[j * batch_size:j * batch_size + batch_size, :, :, :] batch_labels = labels_[j * batch_size:j * batch_size + batch_size] [_, loss, acc] = sess.run([Opt, cross_entropy_loss, accuracy], feed_dict={inputs: batch_data, labels: batch_labels, train_phase: True, learning_rate: lr}) loss_list.append(loss) train_acc_list.append(acc) if j % 100 == 0: print("Epoch: %d, iter: %d, loss: %f, train_acc: %f"%(epoch, j, loss, acc)) np.savetxt("loss.txt", loss_list) np.savetxt("train_acc.txt", train_acc_list) np.savetxt("test_acc.txt", test_acc_list) if ((epoch + 1) % 5) == 0: vali_acc = validation_acc(inputs, labels, train_phase, accuracy, sess, valid_path) test_acc_list.append(vali_acc) print("Validation Accuracy: %f"%(vali_acc)) saver.save(sess, "./save_para/densenet.ckpt") # if __name__ == "__main__": # train(batch_size=64, class_nums=10, growth_rate=12, weight_decay=1e-4, depth=40, train_epoch=5)
def gen_sgd_random_annotations():
    print len(go)
    go_ids = [go_id for go_id in go if 'is_obsolete' not in go[go_id]]
    print len(go_ids)
    print len(go) - len(go_ids)
    shuffle(go_ids)
    groups = get_gene_groups()
    with open('data/sgd_random_annotations.txt', 'w') as f:
        for group in groups:
            shuffle(go_ids)
            f.write(go_ids[0])
            for go_id in go_ids[1:group]:
                f.write('\t' + go_id)
            f.write('\n')
def gen_go_annotations():
    print len(go)
    go_ids = [go_id for go_id in go if 'is_obsolete' not in go[go_id]]
    print len(go_ids)
    print len(go) - len(go_ids)
    shuffle(go_ids)
    with open('data/annotations.txt', 'w') as f:
        for group in range(1, 56):
            for i in range(100):
                shuffle(go_ids)
                f.write(go_ids[0])
                for go_id in go_ids[1:group]:
                    f.write('\t' + go_id)
                f.write('\n')
def gtsrb(root=config.GTSRB): x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], [] classes = np.arange(0, class_num) # 0-42 for i in trange(class_num): class_name = format(classes[i], '05d') prefix = root + '/Images/' + class_name + '/' f = open(prefix + 'GT-' + class_name + '.csv') reader = csv.reader(f, delimiter=';') next(reader, None) x, y = [], [] for row in reader: img = cv2.imread(prefix + row[0]) img = img[np.int(row[4]):np.int(row[6]), np.int(row[3]):np.int(row[5]), :] # np.int()从string转化为int # cv2.imshow('img', img) # cv2.waitKey(0) x.append(img) y.append(i) x, y = utils.shuffle(np.array(x), np.array(y)) x, y = x.tolist(), y.tolist() split = len(y) // 10 x_dev += x[:split] y_dev += y[:split] x_test += x[split:2*split] y_test += y[split:2*split] x_train += x[2*split:] y_train += y[2*split:] f.close() size = (32, 32) x_train = [cv2.resize(x, size) for x in x_train] x_dev = [cv2.resize(x, size) for x in x_dev] x_test = [cv2.resize(x, size) for x in x_test] x_train, y_train = np.array(x_train).astype(np.float32), np.array(y_train) x_dev, y_dev = np.array(x_dev).astype(np.float32), np.array(y_dev) x_test, y_test = np.array(x_test).astype(np.float32), np.array(y_test) x_train, x_dev, x_test = list(map(utils.data_normalize, [x_train, x_dev, x_test])) x_train, y_train = utils.shuffle(x_train, y_train) x_dev, y_dev = utils.shuffle(x_dev, y_dev) x_test, y_test = utils.shuffle(x_test, y_test) pickle.dump((x_train, y_train), open(root + '/train.p', 'wb')) pickle.dump((x_dev, y_dev), open(root + '/dev.p', 'wb')) pickle.dump((x_test, y_test), open(root + '/test.p', 'wb')) # 'w' for write, 'b' for binary; use 'rb' to read
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for x_chunk, y_chunk in iter_data(X, Y, size=self.size * 20):
        sort = np.argsort([len(x) for x in x_chunk])
        x_chunk = [x_chunk[idx] for idx in sort]
        y_chunk = [y_chunk[idx] for idx in sort]
        mb_chunks = [[x_chunk[idx:idx + self.size], y_chunk[idx:idx + self.size]]
                     for idx in range(len(x_chunk))[::self.size]]
        mb_chunks = shuffle(mb_chunks)
        for xmb, ymb in mb_chunks:
            xmb = padded(xmb)
            yield self.x_dtype(xmb), self.y_dtype(ymb)
def train(self): if self.cnn_type == '2d': y_ = self.build_network_2d() else: y_ = self.build_network() loss = -tf.reduce_mean(self.Y * tf.log(tf.clip_by_value(y_, 1e-10, 1.0))) train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(loss) correct = tf.equal(tf.argmax(y_, 1), tf.argmax(self.Y, 1)) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) train_x, train_y = shuffle(self.train_x, self.train_y) train_xc, train_yc, val_xc, val_yc = cross_val(train_x, train_y, self.no_exp) sess_config = tf.ConfigProto() sess_config.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY with tf.Session(config=sess_config) as sess: tf.global_variables_initializer().run() for epoch in range(self.num_epochs): train_xc, train_yc = shuffle(train_xc, train_yc) for i in range(self.num_batches): batch_x = extract_batch_size(train_xc, i, self.batch_size) batch_y = extract_batch_size(train_yc, i, self.batch_size) _, c = sess.run([train_op, loss], feed_dict={self.X: batch_x, self.Y: batch_y, self.is_training: True}) if (epoch + 1) % self.print_val_each_epoch == 0: print("### Epoch: ", epoch + 1, "|Train loss = ", c, "|Val acc = ", sess.run(accuracy, feed_dict={self.X: val_xc, self.Y: val_yc, self.is_training: False}), " ###") # if (epoch + 1) % self.print_test_each_epoch == 0: # print("### 1st After Epoch: ", epoch + 1, # " |Test acc = ", sess.run(accuracy, # feed_dict={self.X: self.test_x, self.Y: self.test_y, # self.is_training: False}), " ###") if (epoch + 1) % self.print_test_each_epoch == 0: test_acc = np.empty(0) for i in range(self.test_x.shape[0] // self.batch_size): batch_x_t = extract_batch_size(self.test_x, i, self.batch_size) batch_y_t = extract_batch_size(self.test_y, i, self.batch_size) test_acc = np.append(test_acc, sess.run(correct, feed_dict={self.X: batch_x_t, self.Y: batch_y_t, self.is_training: False})) # print(test_acc.shape) _test_acc = np.average(test_acc) print("### After Epoch: ", epoch + 1, " |Test acc = ", _test_acc, " ###") if self.print_cm: pred_y = sess.run(tf.argmax(y_, 1), feed_dict={self.X: self.test_x, self.is_training: False}) cm = confusion_matrix(np.argmax(self.test_y, 1), pred_y, ) print(cm)
def fit(self, X, y):
    # X: [n_samples, input_dim]
    n_samples, input_dim = X.shape
    self.W = np.random.randn(input_dim)
    self.b = 0.0
    # pass dataset for max_iter runs
    for i in range(self.max_iter):
        # shuffle
        if self.shuffle:
            utils.shuffle(X, y)
        for j in range(n_samples):
            score = self.predict(X[j])
            self.W -= self.eta * (score - y[j]) * X[j]
            self.b -= self.eta * (score - y[j]) * 1
def gen_depth_annotations():
    get_go_by_depth('GO:0008150', 1)  # Biological process Ontology
    get_go_by_depth('GO:0005575', 1)  # Cellular component Ontology
    get_go_by_depth('GO:0003674', 1)  # Molecular function Ontology
    with open('data/depth_annotations.txt', 'w') as f:
        for level in go_depth:
            print level
            gos = list(go_depth[level])
            for i in range(100):
                shuffle(gos)
                n = abs(random.randint(2, min(100, len(gos)) - 1))
                f.write(gos[0])
                for go_id in gos[1:n]:
                    f.write('\t' + go_id)
                f.write('\n')
def gen_hp_annotations():
    cls = list()
    with open('data/hp.txt', 'r') as f:
        for line in f:
            items = line.strip().split()
            cls.append(items[0])
    shuffle(cls)
    with open('data/hp_annotations.txt', 'w') as f:
        for group in range(1, 56):
            for i in range(100):
                shuffle(cls)
                f.write(cls[0])
                for hp_id in cls[1:group]:
                    f.write('\t' + hp_id)
                f.write('\n')
def _get_inits(self, n_inits):
    init = list(range(len(self.distance_matrix)))
    inits = []
    for i in range(n_inits):
        new_init = shuffle(init)
        inits.append(new_init)
    return inits
def BPR_train_original(cuda_loader, recommend_model, loss_class, epoch, neg_k=1, w=None):
    Recmodel = recommend_model
    Recmodel.train()
    bpr: utils.BPRLoss = loss_class
    users, posItems, negItems = cuda_loader.get_train_data_at(epoch)
    users, posItems, negItems = utils.shuffle(users, posItems, negItems)
    total_batch = len(users) // world.config['bpr_batch_size'] + 1
    aver_loss = 0.
    for (batch_i, (batch_users, batch_pos, batch_neg)) in enumerate(
            utils.minibatch(users, posItems, negItems,
                            batch_size=world.config['bpr_batch_size'])):
        cri = bpr.stageOne(batch_users, batch_pos, batch_neg)
        aver_loss += cri
        if world.tensorboard:
            w.add_scalar(
                f'BPRLoss/BPR', cri,
                epoch * int(len(users) / world.config['bpr_batch_size']) + batch_i)
    aver_loss = aver_loss / total_batch
    return f"[BPR[aver loss{aver_loss:.3e}]"
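# The BPR training loops in this file iterate over utils.shuffle(...) and
# utils.minibatch(...). Those helpers live in the project's utils module and
# are not shown here; the generator below is only a rough sketch of what such
# a minibatch helper could look like (the name, signature and default batch
# size are assumptions inferred from the call sites, not the actual code).
def minibatch_sketch(*tensors, **kwargs):
    """Yield aligned, batch_size-sized slices of several same-length tensors."""
    batch_size = kwargs.get('batch_size', 2048)
    for i in range(0, len(tensors[0]), batch_size):
        if len(tensors) == 1:
            yield tensors[0][i:i + batch_size]
        else:
            yield tuple(x[i:i + batch_size] for x in tensors)

# Hypothetical usage mirroring the loop above:
# for batch_users, batch_pos, batch_neg in minibatch_sketch(
#         users, posItems, negItems, batch_size=world.config['bpr_batch_size']):
#     ...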
def prepare_data_for_d(self):
    """generate positive and negative samples for the discriminator"""
    motifs = []
    labels = []
    g_s_args = []
    poss = []
    negs = []
    for i in range(self.graph.n_node):
        if np.random.rand() < config.update_ratio:
            pos = random.sample(
                self.graph.id2motifs[i],
                min(len(self.graph.id2motifs[i]), config.n_sample_dis))
            poss.append(pos)
            g_s_args.append((i, len(pos), True))
    negs, _ = self.sampling(g_s_args)
    for pos, neg in zip(poss, negs):
        if len(pos) != 0 and neg is not None:
            motifs.extend(pos)
            labels.extend([1] * len(pos))
            motifs.extend(neg)
            labels.extend([0] * len(neg))
    motifs, labels = utils.shuffle(motifs, labels)
    pickle.dump(
        motifs,
        open(config.cache_filename_prefix + '.motifs_ford.pkl', 'wb'))
    pickle.dump(
        labels,
        open(config.cache_filename_prefix + '.labels_ford.pkl', 'wb'))
    return motifs, labels
def main():
    divisions = 100
    ds = Dataset()
    ds.load("Bike-Sharing-Dataset/hour.csv")
    size = ds.get_size()
    X = []
    y = []
    percentages = []
    # Full X and y from dataset
    all_X = ds.get_x()
    all_y = ds.get_y()
    # Shuffle data and split into divisions
    for i in range(1, divisions + 1):
        percentage = (1 / divisions * i)
        percentages.append(percentage)
        all_X, all_y = utils.shuffle(all_X, all_y)
        X.append(all_X[:int(size * percentage)])
        y.append(all_y[:int(size * percentage)])
    X_train, X_test, y_train, y_test = split(X, y)
    scores, featureimportances = all_models(X_train, y_train, X_test, y_test)
    print("scores")
    print(scores)
    plt.scatter(percentages, scores)
    plt.ylabel('Score')
    plt.xlabel('Percentage of Original Dataset')
    plt.title('Percentage of Original Dataset vs Score')
    plt.show()
    plotFI(featureimportances)
def main(*args, **kwargs):
    try:
        if len(args) != 3:
            raise Exception("Please provide go_id and number of proteins")
        go_id = args[1]
        positives, negatives = load_data(go_id)
        n = int(args[2])
        shuffle(positives)
        shuffle(negatives)
        with open(DATA_ROOT + go_id + ".small.txt", "w") as f:
            for line in negatives[:n]:
                f.write(line + "\n")
            for line in positives[:n]:
                f.write(line + "\n")
    except Exception, e:
        print e
def gen_aim(self):
    l = utils.copy_list(g.nos)
    # shuffle nos
    lt = utils.shuffle(l)
    # generate answer
    buff = ""
    r = random.randint(1, 2)  # for level 1
    while True:
        n = lt[0]
        lt.remove(n)
        buff += str(n)
        if len(lt) == 0:
            break
        if g.level > 1:
            r = random.randint(0, 2)
        if g.signs[r] == '=':
            n = eval(buff)
            buff = ""
            lt.append(n)
            lt = utils.shuffle(lt)
        else:
            buff = buff + g.signs[r]
    return eval(buff)
def BPR_train_original(dataset, recommend_model, loss_class, epoch, neg_k=1, w=None):
    Recmodel = recommend_model
    Recmodel.train()
    bpr: utils.BPRLoss = loss_class
    allusers = list(range(dataset.n_users))
    S, sam_time = utils.UniformSample_original(allusers, dataset)
    print(f"BPR[sample time][{sam_time[0]:.1f}={sam_time[1]:.2f}+{sam_time[2]:.2f}]")
    users = torch.Tensor(S[:, 0]).long()
    posItems = torch.Tensor(S[:, 1]).long()
    negItems = torch.Tensor(S[:, 2]).long()
    users = users.to(world.device)
    posItems = posItems.to(world.device)
    negItems = negItems.to(world.device)
    users, posItems, negItems = utils.shuffle(users, posItems, negItems)
    total_batch = len(users) // world.config['bpr_batch_size'] + 1
    aver_loss = 0.
    for (batch_i, (batch_users, batch_pos, batch_neg)) in enumerate(
            utils.minibatch(users, posItems, negItems,
                            batch_size=world.config['bpr_batch_size'])):
        cri = bpr.stageOne(batch_users, batch_pos, batch_neg)
        aver_loss += cri
        if world.tensorboard:
            w.add_scalar(f'BPRLoss/BPR', cri,
                         epoch * int(len(users) / world.config['bpr_batch_size']) + batch_i)
    aver_loss = aver_loss / total_batch
    return f"[BPR[aver loss{aver_loss:.3e}]"
def load_data(path):
    # load from csv file
    Data = pd.read_csv(path)
    # split
    img_paths = Data["image_path"]
    X = []
    Y = Data["labelid"]
    # get images
    for path in img_paths:
        img = cv2.imread(path, -1)  # BGR
        X.append(img)
    # convert into numpy arrays
    X = np.array(X).reshape(-1, H, W, C)
    Y = np.array(Y).reshape(-1, 1)  # 0-based labels (scalar)
    # cast the data set to keep good precision
    X = X.astype(np.float64)  # float64 gives accurate precision for mean calc and subtraction
    # apply preprocessing
    X = dataset_preprocessing(X, 1)
    # shuffle
    X, Y = shuffle(X, Y)
    # demo
    print(Y[:100])
    print("\n\n")
    print(Y[500:700])
    return X, Y
def train(x, y, model, optimizer, loss_fn, params):
    model.train()
    x, y = utils.shuffle(x, y)
    total = len(y)
    n_batch = total // params.batch_size
    x_split, y_split = np.array_split(x, n_batch), np.array_split(y, n_batch)
    t = trange(n_batch)
    avg_loss = 0
    for i, (x_bch, y_bch) in enumerate(zip(x_split, y_split)):
        x_bch = torch.from_numpy(x_bch).float().permute(
            0, 3, 1, 2).to(device=params.device)
        y_bch = torch.from_numpy(y_bch).to(device=params.device)
        y_hat_bch = model(x_bch)
        loss = loss_fn(y_hat_bch, y_bch, params)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        t.set_postfix(loss='{:05.3f}'.format(loss.item()))
        t.update()
        avg_loss += loss.item() / n_batch
    return avg_loss
def main(): """Main function.""" # sorted valid RT dataset exists if os.path.isfile(VALID_RT_FILE_NAMES[0]): df = pnd.read_csv(VALID_RT_FILE_NAMES[0]) sorted_rt_triangles = [[row[col] for col in df.columns[:3]] for row in df.to_dict('records')] # unsorted valid RT dataset exists if os.path.isfile(VALID_RT_FILE_NAMES[1]): df = pnd.read_csv(VALID_RT_FILE_NAMES[1]) unsorted_rt_triangles = [[row[col] for col in df.columns[:3]] for row in df.to_dict('records')] # valid RT dataset doesn't exist else: rt_triangles = create_valid_rt_points() sorted_rt_triangles = [] unsorted_rt_triangles = [] for [a, b, c] in rt_triangles: [x, y, z] = sorted([a, b, c]) sorted_rt_triangles.append([x, y, z]) [x, y, z] = shuffle([a, b, c]) unsorted_rt_triangles.append([x, y, z]) write_exp_rt_datasets(sorted_rt_triangles, unsorted_rt_triangles)
def train(model_dir, sample_dir): model_name = 'model_' input_x, input_y, validation_x, validation_y = load_data(dataset_name, batch_size) with tf.Session(config=tf.ConfigProto()) as sess: sess.run(init) ckpt = tf.train.get_checkpoint_state(model_dir) if ckpt and ckpt.model_checkpoint_path: ckpt_name = os.path.basename(ckpt.model_checkpoint_path) print("[*] restore model : %s"%ckpt_name) saver.restore(sess, os.path.join(model_dir, ckpt_name)) else: print("model does not exits") print('============ Start training ==============') iteration_number = input_x.shape[0] // batch_size for epoch in range(epoch_size): start_position = int(epoch % iteration_number) if (start_position == 0): input_x, input_y = shuffle(input_x, input_y) train_X = input_x[start_position*batch_size : (start_position+1)*batch_size] train_Y = input_y[start_position*batch_size : (start_position+1)*batch_size] if (epoch % 10 == 0): _, loss, train_acc = sess.run([train_op, total_loss, train_accuracy], feed_dict={X:train_X, label:train_Y}) print('Train [%d / %d]: accuracy %.4f, loss %.4f' % (epoch, epoch_size, train_acc, loss)) else: sess.run(train_op, feed_dict={X:train_X, label:train_Y}) # save model and sample result if (epoch % 20 == 0): # evaluate iter_num = validation_x[1:1000].shape[0] // batch_size acc = 0.0 for itr in range(iter_num): samples = sess.run([decoded], feed_dict={X:train_X, label:train_Y}) # pos = int(random.random() * 5) pos = itr validate_label = sess.run(argmax_idx, feed_dict={X:validation_x[pos*batch_size:(pos+1)*batch_size]}) validate_acc = 1.0 * np.sum(validate_label == validation_y[pos*batch_size:(pos+1)*batch_size]) / batch_size acc += validate_acc acc = acc / iter_num save_images(np.reshape(samples, (batch_size, 28, 28))[0:100], [10, 10], os.path.join(sample_dir, 'sample_'+str(epoch)+'.png')) save_images(np.reshape(train_X, (batch_size, 28, 28))[0:100], [10, 10], os.path.join(sample_dir, 'input_'+str(epoch)+'.png')) saver.save(sess, './model/' + model_name + str(epoch) + '.ckpt') print("Save model and sample images, val acc: %.4f" % acc)
def next_card(self):
    if self.rest == []:
        # ensure no cards are lost!
        fresh = utils.copy_list(self.deck)
        fresh = utils.shuffle(fresh)
        for n in fresh:
            if n not in self.yes:
                self.rest.append(n)
    n = self.rest.pop()
    return n
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for xmb, ymb in iter_data(X, Y, size=self.size):
        xmb = padded(xmb)
        yield self.x_dtype(xmb), self.y_dtype(ymb)
def replaceChars(token, subData):
    subProb = subData["count"]
    subMatrix = subData["subs"]
    appliable = {k: subProb[k] for k in subProb.keys() if k in token}
    subCandidates = list(appliable.keys())
    shuffle(subCandidates)
    tokenBitMask = [0 for char in token]
    for sub in subCandidates:
        subProb = appliable[sub]
        subWith = weighted_choice(subMatrix[sub])
        for start in find_all(token, sub):
            if sum(tokenBitMask[start:start + len(sub)]) == 0 and probability_boolean(subProb):
                token = token[:start] + subWith + token[start + len(sub):]
                tokenBitMask = tokenBitMask[:start] + \
                    [1 for c in subWith] + tokenBitMask[start + len(sub):]
    return token
def test():
    pre_processor = PreProcessor()
    start_time = time.time()
    numberal_data, labels = pre_processor.text2number('./data/aspect/')
    X, y = utils.shuffle(numberal_data, labels)
    X, y = utils.shuffle(X, y)
    X_train, y_train = X[:4200], y[:4200]
    X_test, y_test = X[4200:], y[4200:]
    with open('./data/numberal_data/aspect_data_training.pkl', 'wb') as f:
        print(X_train.shape, y_train.shape)
        pickle.dump({'sample': X_train, 'label': y_train}, f)
    with open('./data/numberal_data/aspect_data_test.pkl', 'wb') as f:
        print(X_test.shape, y_test.shape)
        pickle.dump({'sample': X_test, 'label': y_test}, f)
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for xmb, ymb in iter_data(X, Y, size=self.size):
        xmb = self.trXt(xmb)
        ymb = self.trYt(ymb)
        yield xmb, ymb
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for x_chunk, y_chunk in iter_data(X, Y, size=self.size * 20):
        sort = np.argsort([len(x) for x in x_chunk])
        x_chunk = [x_chunk[idx] for idx in sort]
        y_chunk = [y_chunk[idx] for idx in sort]
        mb_chunks = [[x_chunk[idx:idx + self.size], y_chunk[idx:idx + self.size]]
                     for idx in range(len(x_chunk))[::self.size]]
        mb_chunks = shuffle(mb_chunks)
        for xmb, ymb in mb_chunks:
            xmb = _padded(xmb, final=self.y_lag)
            if ymb[0].ndim == 2:
                ymb, padsize = _padded(ymb, return_sizes=True, initial=self.y_lag)
                yield self.x_dtype(xmb), (self.y_dtype(ymb), padsize.T)
            else:
                yield self.x_dtype(xmb), self.y_dtype(ymb)
def shuffle_split_users(users):
    has_image = (lambda a: a.get('image_url', None) is not None)
    has_custom_image = (lambda a: has_image(a) and
                        not is_static_profile_image(a['image_url']))
    # find all users who have a custom profile image
    a0 = filter(has_custom_image, users)
    # find all users who have the default profile image
    a1 = filter(lambda a: not has_custom_image(a), users)
    # shuffle them both independently
    a0 = utils.shuffle(a0)
    a1 = utils.shuffle(a1)
    # and combine the results s.t. all users with custom profile images precede
    # all those without custom profile images
    a0.extend(a1)
    return a0
def main(*args, **kwargs):
    if len(args) < 3:
        raise Exception('Please provide parent id and go id')
    parent_id = args[1]
    go_id = args[2]
    if len(args) == 4:
        level = int(args[3])
        global CUR_LEVEL
        global NEXT_LEVEL
        CUR_LEVEL = 'level_' + str(level) + '/'
        NEXT_LEVEL = 'level_' + str(level + 1) + '/'
    df = load_data(parent_id, go_id)
    go_node = go[go_id]
    for ch_id in go_node['children']:
        ch_set = get_subtree_set(ch_id)
        positives = list()
        negatives = list()
        for i in df.index:
            pos = False
            for g_id in df['gos'][i]:
                if g_id in ch_set:
                    pos = True
                    break
            if pos:
                positives.append(i)
            else:
                negatives.append(i)
        n = min(len(positives), len(negatives))
        if n > 0:
            shuffle(positives)
            shuffle(negatives)
            positives = positives[:n]
            negatives = negatives[:n]
            filename = DATA_ROOT + NEXT_LEVEL + go_id + '/' + ch_id + '.pkl'
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            labels = [0] * n + [1] * n
            index = negatives + positives
            new_df = df.reindex(index)
            new_df['labels'] = pd.Series(labels, index=new_df.index)
            new_df.to_pickle(filename)
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    self.loader = Loader(X, self.train_load, self.train_transform, self.size)
    self.proc = Process(target=self.loader.load)
    self.proc.start()
    for ymb in iter_data(Y, size=self.size):
        xmb = self.loader.get()
        yield xmb, floatX(ymb)
def main(*args, **kwargs):
    if len(args) != 2:
        raise Exception('Please provide function id')
    go_id = args[1]
    paacs = load_data_by_prot_id(go_id)
    data = load_training_data(go_id)
    go_node = go[go_id]
    go_set = get_subtree_set(go_id)
    for ch_id in go_node['children']:
        ch_set = get_subtree_set(ch_id)
        positives = list()
        negatives = list()
        for prot_id, gos in data:
            if prot_id not in paacs:
                continue
            pos = False
            for g_id in gos:
                if g_id in ch_set:
                    pos = True
                    break
            if pos:
                positives.append(prot_id)
            else:
                negatives.append(prot_id)
        n = len(positives)
        shuffle(positives)
        shuffle(negatives)
        negatives = negatives[:n]
        with open(DATA_ROOT + 'level_2/' + go_id + '/' + ch_id + '.txt', 'w') as f:
            for prot_id in negatives:
                f.write('0 ' + prot_id)
                for p in paacs[prot_id]:
                    f.write(' ' + str(p))
                f.write('\n')
            for prot_id in positives:
                f.write('1 ' + prot_id)
                for p in paacs[prot_id]:
                    f.write(' ' + str(p))
                f.write('\n')
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for xmb, ymb in iter_data(X, Y, size=self.size):
        xmb = self.x_dtype(xmb)
        shape = range(len(xmb.shape))
        shape[0] = 1
        shape[1] = 0
        shape = tuple(shape)
        xmb = xmb.transpose(*shape)
        yield xmb, self.y_dtype(ymb)
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for xmb, ymb in iter_data(X, Y, size=self.size):
        xmb = _padded(xmb, final=self.y_lag)
        if ymb[0].ndim == 2:
            # sequence prediction
            ymb, padsize = _padded(ymb, return_sizes=True, initial=self.y_lag)
            yield self.x_dtype(xmb), (self.y_dtype(ymb), padsize.T)
        else:
            yield self.x_dtype(xmb), self.y_dtype(ymb)
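# The sequence iterators above call a `_padded` helper that pads the
# variable-length sequences of a minibatch to a common length and can also
# report the original sizes. It is defined elsewhere; the numpy sketch below
# only illustrates the idea (the name, keyword arguments and padding layout
# are assumptions based on the call sites, not the original implementation).
import numpy as np


def padded_sketch(seqs, return_sizes=False, initial=0, final=0):
    """Stack variable-length sequences into one zero-padded array."""
    sizes = np.array([len(s) for s in seqs])
    max_len = sizes.max() + initial + final
    first = np.asarray(seqs[0])
    out = np.zeros((len(seqs), max_len) + first.shape[1:], dtype=first.dtype)
    for i, s in enumerate(seqs):
        out[i, initial:initial + len(s)] = s
    if return_sizes:
        return out, sizes
    return out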
def select_proteins(go_id, parent_go_set):
    node = go[go_id]
    pos_go_set = get_subtree_set(go_id)
    neg_go_set = parent_go_set - pos_go_set
    positives = set()
    for g_id in pos_go_set:
        if g_id in go_prot:
            positives |= go_prot[g_id]
    negatives = set()
    for g_id in neg_go_set:
        if g_id in go_prot:
            negatives |= go_prot[g_id]
    negatives = negatives - positives
    positives = list(positives)
    negatives = list(negatives)
    shuffle(positives)
    shuffle(negatives)
    min_len = min(len(positives), len(negatives))
    # with open(RESULT_ROOT + go_id + '.txt', 'w') as f:
    labels = list()
    proteins = list()
    data = list()
    for prot_id in negatives[:min_len]:
        labels.append(0)
        proteins.append(prot_id)
        data.append(fofe[prot_id])
    for prot_id in positives[:min_len]:
        labels.append(1)
        proteins.append(prot_id)
        data.append(fofe[prot_id])
    df = pd.DataFrame({'labels': labels, 'proteins': proteins, 'data': data})
    df.to_pickle(RESULT_ROOT + go_id + '.pkl')
    # numpy.savez(
    #     RESULT_ROOT + go_id + '.npz',
    #     labels=numpy.array(labels),
    #     proteins=numpy.array(proteins),
    #     data=numpy.array(data))
    print 'Finished selection for ' + go_id
def load_data(parent_id, go_id):
    data1 = list()
    data2 = list()
    labels = list()
    positive1 = list()
    negative1 = list()
    positive2 = list()
    negative2 = list()
    with open(DATA_ROOT + parent_id + '/' + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = line[2][:MAXLEN]
            sq1 = encode_seq_one_hot(seq, maxlen=MAXLEN)
            sq2 = encode_seq(OGAK980101, seq, maxlen=MAXLEN)
            sq3 = encode_seq(MEHP950102, seq, maxlen=MAXLEN)
            sq4 = encode_seq(CROG050101, seq, maxlen=MAXLEN)
            sq5 = encode_seq(TOBD000101, seq, maxlen=MAXLEN)
            sq6 = encode_seq(ALTS910101, seq, maxlen=MAXLEN)
            if label == 1:
                positive1.append([sq1])
                positive2.append(sq1)
            else:
                negative1.append([sq1])
                negative2.append(sq1)
    shuffle(negative1, negative2, seed=0)
    n = min(len(positive1), len(negative1))
    data1 = negative1[:n] + positive1[:n]
    data2 = negative2[:n] + positive2[:n]
    labels = [0.0] * n + [1.0] * n  # Previous was 30
    shuffle(data1, data2, labels, seed=0)
    data = (
        numpy.array(data1, dtype='float32'),
        numpy.array(data2, dtype='float32'))
    return (numpy.array(labels), data)
def select_proteins(go_id, parent_go_set):
    node = go[go_id]
    pos_go_set = get_subtree_set(go_id)
    neg_go_set = parent_go_set - pos_go_set
    positives = set()
    for g_id in pos_go_set:
        if g_id in go_prot:
            positives |= go_prot[g_id]
    negatives = set()
    for g_id in neg_go_set:
        if g_id in go_prot:
            negatives |= go_prot[g_id]
    negatives = negatives - positives
    positives = list(positives)
    negatives = list(negatives)
    shuffle(positives, seed=10)
    shuffle(negatives, seed=10)
    min_len = min(len(positives), len(negatives))
    with open(RESULT_ROOT + go_id + '.txt', 'w') as f:
        for prot_id in negatives[:min_len]:
            f.write('0 ' + prot_id + ' ' + prot_paac[prot_id] + '\n')
        for prot_id in positives[:min_len]:
            f.write('1 ' + prot_id + ' ' + prot_paac[prot_id] + '\n')
    print 'Finished selection for ' + go_id
def load_data(go_id):
    data = list()
    labels = list()
    pos = 1
    positive = list()
    negative = list()
    with open(DATA_ROOT + go_id + '.txt') as f:
        for line in f:
            line = line.strip().split(' ')
            label = int(line[0])
            seq = encode_seq_one_hot(line[2][:500], maxlen=MAXLEN)
            if label == pos:
                positive.append(seq)
            else:
                negative.append(seq)
    shuffle(negative, seed=0)
    n = len(positive)
    data = negative[:n] + positive
    labels = [0.0] * n + [1.0] * n  # Previous was 30
    shuffle(data, labels, seed=0)
    return numpy.array(labels), numpy.array(data, dtype="float32")
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for x_chunk, y_chunk in iter_data(X, Y, size=self.size * 20):
        sort = np.argsort([len(x) for x in x_chunk])
        x_chunk = [x_chunk[idx] for idx in sort]
        y_chunk = [y_chunk[idx] for idx in sort]
        mb_chunks = [[x_chunk[idx:idx + self.size], y_chunk[idx:idx + self.size]]
                     for idx in range(len(x_chunk))[::self.size]]
        py_rng.shuffle(mb_chunks)
        for xmb, ymb in mb_chunks:
            xmb = self.trXt(xmb)
            ymb = self.trYt(ymb)
            yield xmb, ymb
def iterXY(self, X, Y):
    if self.shuffle:
        X, Y = shuffle(X, Y)
    for xmb, ymb in iter_data(X, Y, size=self.size):
        xmb = self.x_dtype(xmb)
        shape = range(len(xmb.shape))
        shape[0] = 1
        shape[1] = 0
        shape = tuple(shape)
        xmb = xmb.transpose(*shape)
        ymb = self.y_dtype(ymb)
        if ymb.ndim == 3:
            # sequence prediction! also reorder ymb.
            ymb = ymb.transpose(*shape)
        yield xmb, ymb
def process_user(self, user, categories=None): assert user is not None if categories is None: categories = self._categories retries = 0 while retries < 3: try: for category in categories: ts = { 'user_id' : user.user_id, 'scope' : 'user' } if category != 'default': if category == 'app': ts['subcategory'] = 'app' else: ts['category'] = category ts['limit'] = 100 collage = self._collages[category] stamp_slice = HTTPTimeSlice().dataImport(ts).exportTimeSlice() stamps = self.api.getStampCollection(stamp_slice) entities = map(lambda s: s.entity, stamps) entities = utils.shuffle(entities)[:30] logs.info("creating collage for user '%s' w/ category '%s' and %d entities" % (user.screen_name, category, len(entities))) images = collage.generate_from_user(user, entities) for image in images: filename = "collage-%s-%s-%sx%s.jpg" % (user.screen_name, category, image.size[0], image.size[1]) self.save_image(image, filename) break except Exception, e: logs.warn("unexpected error processing user %s: %s" % (str(user), e)) logs.warn(utils.getFormattedException()) retries += 1 time.sleep(2 ** retries)
def __init__(self, data, n_valid, corruptor=None, prng=None): """ Parameters ---------- data : numpy array Data matrix array with rows corresponding to data vectors. n_valid : integer Number of data vectors to use as validation set. corruptor : function(Array, RandomState) or None Optional function which applies random 'corruption' / augmentation to data, for example dequantising pixel values, adding noise, applying random affine transformation to image. Applied on initialisation and at end of each training epoch. prng : RandomState or None Seeded pseudo-random number generator - used to shuffle data and for corruptor if specified. """ self.data = data self.n_valid = n_valid self.n_train = data.shape[0] - n_valid self.corruptor = corruptor if prng is None: prng = np.random.RandomState() self.prng = prng shuffled_data, self.perm = utils.shuffle(self.data, self.prng) self.data_valid, self.data_train = utils.split(shuffled_data, n_valid) if corruptor is None: self.x_valid = th.shared( self.data_valid.astype(th.config.floatX), 'x_valid') self.x_train = th.shared( self.data_train.astype(th.config.floatX), 'x_train') else: corrupted_data_valid = self.corruptor(self.data_valid, self.prng) corrupted_data_train = self.corruptor(self.data_train, self.prng) self.x_valid = th.shared( corrupted_data_valid.astype(th.config.floatX), 'x_valid') self.x_train = th.shared( corrupted_data_train.astype(th.config.floatX), 'x_train')
def _create_collage( self, user, images, num_rows=None, num_cols=None, respect_aspect_ratio=False, adaptive_image_resizing=True, enable_drop_shadows=False, row_major=True, shuffle_images=False, ): # must specify num_cols or num_rows, but not both assert (num_cols is not None and num_cols > 0) != (num_rows is not None and num_rows > 0) num_images = len(images) output = [] if num_rows is None: num_cols = int(num_cols) num_rows = int(math.ceil(num_images / num_cols)) elif num_cols is None: num_rows = int(num_rows) num_cols = int(math.ceil(num_images / num_rows)) user_logo_url = "http://static.stamped.com/logos/%s-%s-email-36x36.png" % ( user.color_primary, user.color_secondary, ) try: user_logo = utils.getWebImage(user_logo_url) except Exception: user_logo = None user_logo_cache = {} def get_user_logo(size): if user_logo is None: return None try: return user_logo_cache[size] except KeyError: logo = user_logo.resize(size, Image.ANTIALIAS) user_logo_cache[size] = logo return logo for size in self._sizes: logs.info("[%s] creating %sx%s collage" % (self, size[0], size[1])) canvas = Image.new("RGBA", size, (255, 255, 255, 255)) offsets = [] indices = [] if row_major: for i in xrange(num_rows): for j in xrange(num_cols): indices.append(len(offsets)) offsets.append((i, j)) else: for j in xrange(num_cols): for i in xrange(num_rows): indices.append(len(offsets)) offsets.append((i, j)) if shuffle_images: indices = utils.shuffle(indices) for index in indices: i, j = offsets[index] # wrap images around if necessary to fill last row index = (i * num_cols + j) % num_images image = images[index] cell_size, cell_pos, logo_size, logo_pos = self.get_cell_bounds_func( size, num_cols, num_rows, i, j, image ) # adjust cell layout bounds to align to integer grid (helps minimize aliasing) cell_size = int(math.ceil(cell_size[0])), int(math.ceil(cell_size[1])) cell_pos = int(math.floor(cell_pos[0])), int(math.floor(cell_pos[1])) logo_size = int(math.ceil(logo_size[0])), int(math.ceil(logo_size[1])) logo_pos = int(math.floor(logo_pos[0])), int(math.floor(logo_pos[1])) width = cell_size[0] height = cell_size[1] if adaptive_image_resizing: if image.size[0] / cell_size[0] < image.size[1] / cell_size[1]: width = cell_size[0] height = (width * image.size[1]) / image.size[0] if not respect_aspect_ratio and height > cell_size[1]: height = int((height + cell_size[1]) * 0.5) else: height = cell_size[1] width = (height * image.size[0]) / image.size[1] if not respect_aspect_ratio and width > cell_size[0]: width = int((width + cell_size[0]) * 0.5) cell = image.resize((width, height), Image.ANTIALIAS) w = min(width, cell_size[0]) h = min(height, cell_size[1]) cell = cell.crop((0, 0, w, h)) if enable_drop_shadows: self._paste_image_with_drop_shadow(canvas, cell, cell_pos) else: canvas.paste(cell, cell_pos) # overlay user's stamp logo on top of each entity image logo = get_user_logo(logo_size) if logo is not None: logo_box = (logo_pos[0], logo_pos[1], logo_pos[0] + logo.size[0], logo_pos[1] + logo.size[1]) canvas.paste(logo, logo_box, logo) canvas = self._apply_postprocessing(canvas, user) output.append(canvas) return output
import os
import sys

p = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if p not in sys.path:
    sys.path.append(p)

import utils
from models import LR, FM, PNN1, PNN2, FNN, CCPM

train_file = '../../output/fm/train.fm'
test_file = '../../output/fm/test.fm'

input_dim = utils.INPUT_DIM

train_data = utils.read_data(train_file)
# train_data = pkl.load(open('../data/train.yx.pkl', 'rb'))
train_data = utils.shuffle(train_data)
test_data = utils.read_data(test_file)
# test_data = pkl.load(open('../data/test.yx.pkl', 'rb'))
# pkl.dump(train_data, open('../data/train.yx.pkl', 'wb'))
# pkl.dump(test_data, open('../data/test.yx.pkl', 'wb'))

if train_data[1].ndim > 1:
    print('label must be 1-dim')
    exit(0)

print('read finish')
print('train data size:', train_data[0].shape)
print('test data size:', test_data[0].shape)

train_size = train_data[0].shape[0]
test_size = test_data[0].shape[0]
num_feas = len(utils.FIELD_SIZES)