def init(self):
    """Read dataset and model settings from the Qt widgets and build the MLP.

    Loads the selected dataset with the chosen preprocessing, splits it into
    train/test sets, parses the hidden-layer sizes typed into the line edit,
    and constructs ``self.MLP``.  Any invalid input writes an error message
    to ``self.textBrowser`` and returns early without touching ``self.MLP``.
    """
    # data
    dataset = self.comboBox.currentText()
    data_process = self.comboBox_2.currentText()
    test_size = float(self.doubleSpinBox.value())
    data, label = Data().load(dataset, data_process)
    if data is None:
        # Data().load signals failure as (None, <error detail>).
        self.textBrowser.setText("[Error]: input error" + str(label))
        return
    self.train_data, self.test_data, self.train_label, self.test_label = train_test_split(
        data, label, test_size=test_size)
    n_inputs = len(data[0])
    # Unique class labels (set order is arbitrary but stable per run).
    self.label = list(set(label))
    self.n_outputs = len(self.label)
    n_hidden_layer = int(self.spinBox.value())
    hidden_layer = self.lineEdit.text().split()
    if not hidden_layer:
        self.textBrowser.setText("[Error]: input error")
        return
    try:
        # Only the first n_hidden_layer entries are used; int() raises
        # ValueError on non-numeric tokens.
        hidden_layer = list(map(int, hidden_layer[:n_hidden_layer]))
    except ValueError as e:
        self.textBrowser.setText("[Error]: input error, " + str(e))
        return
    self.MLP = MLP_Model(n_inputs, hidden_layer, self.label)
    # BUG FIX: success message previously read "init sucesss".
    self.textBrowser.setText("[Success]: init success")
    self.mode = 'init'
def eval_input_fn(params):
    """Input pipeline for evaluation: the WN18 validation examples."""
    source = Data(dataset='WN18', reverse=True)
    return source.get_inputs_and_targets()
def evaluate(self, data, xs, selected):
    """Accuracy of the original model's predictions on the transformed
    inputs, measured against the reference predictions stored in the
    dataset (``pred_val``)."""
    transformed = self.predict(xs, selected)
    reference = Data(data, False)
    original_model = TextModel(data)
    predictions = original_model.predict(transformed)
    predicted_labels = np.argmax(predictions, axis=-1)
    reference_labels = np.argmax(reference.pred_val, axis=-1)
    return np.mean(predicted_labels == reference_labels)
def predict_input_fn(params):
    """Prediction pipeline: the first 10 test samples, batched."""
    batch_size = params["batch_size"]
    source = Data(dataset='WN18', reverse=True)
    examples = source.get_inputs_and_targets()
    # Take out top 10 samples from test data to make the predictions.
    return examples.take(10).batch(batch_size)
def train_input_fn(params):
    """train_input_fn defines the input pipeline used for training."""
    # The per-shard batch size is derived from the input pipeline
    # deployment; see `tf.contrib.tpu.RunConfig` for details.
    source = Data(dataset='WN18', reverse=True)
    examples = source.get_inputs_and_targets(training=True)
    return examples.shuffle(buffer_size=1000).repeat()
def make_data(dataset_name):
    """Assemble train/test data for *dataset_name*.

    Returns ``(x_train, y_train, x_train_raw, x_test, y_test, x_test_raw)``.

    Raises:
        ValueError: for an unsupported dataset name.  (The original code
            fell through the ``if`` and crashed with a ``NameError`` on the
            return statement; the explicit check makes the failure clear.)
    """
    if dataset_name != 'sst':
        raise ValueError('Unsupported dataset: {}'.format(dataset_name))
    dataset = Data(dataset_name)
    texts, labels = load_sst_train()
    # Lower-case the raw texts before the token->int conversion.
    x_train = dataset.lt_to_int([t.lower() for t in texts])
    y_train = labels
    x_train_raw = texts
    x_test = dataset.x_val
    y_test = dataset.y_val
    x_test_raw = dataset.x_val_raw
    return x_train, y_train, x_train_raw, x_test, y_test, x_test_raw
def L2X(args):
    """Run the L2X gumbel feature selector.

    In training mode only builds/trains the selector and returns (None, None).
    Otherwise predicts per-feature importance scores for the validation set
    (optionally also dumping train/val scores to .npy files when
    ``args.train_score`` is set) and returns ``(scores, [elapsed_seconds])``.
    """
    from build_gumbel_selector import Gumbel_Selection, Gumbel_Selection_Char
    # 'agccnn' is character-level; everything else uses the word-level selector.
    if args.data == 'agccnn':
        gumbel_selector = Gumbel_Selection_Char(args.num_feats, args.data,
                                                args.train, args.original,
                                                args.mask)
    else:
        gumbel_selector = Gumbel_Selection(
            args.num_feats,
            args.data,
            args.train,
            args.original,
            args.mask,
        )
    if args.train:
        # Training happens inside the selector constructor; nothing to score.
        return None, None
    else:
        if args.train_score:
            # Dump scores for both splits; Data(..., True) loads the split
            # used for score generation.
            dataset = Data(args.data, True)
            scores_val = gumbel_selector.predict(dataset.x_val)
            np.save(
                '{}/results/scores-val-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask), scores_val)
            scores_train = gumbel_selector.predict(dataset.x_train)
            np.save(
                '{}/results/scores-train-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask), scores_train)
        dataset = Data(args.data, False)
        st = time.time()
        scores = gumbel_selector.predict(dataset.x_val)
        print('Time spent is {}'.format(time.time() - st))
        return scores, [time.time() - st]
def gumbel(args):
    """Apply the gumbel transformer to the validation inputs.

    For each k in 1..num_feats, keeps the k highest-scoring positions
    (scores loaded from the .npy files produced earlier) and transforms the
    inputs accordingly.  Returns ``(changed_xs, [elapsed_seconds])`` where
    ``changed_xs`` is stacked as (num_samples, num_feats, ...); returns
    ``(None, None)`` in training mode.

    NOTE: uses ``xrange`` — this module is Python 2 code.
    """
    from build_gumbel_transformer import Gumbel_Transform, Gumbel_Transform_Char
    if args.data == 'agccnn':
        gumbel_transform = Gumbel_Transform_Char(args.data, args.num_feats,
                                                 args.method, args.train,
                                                 args.original, args.mask)
    else:
        gumbel_transform = Gumbel_Transform(args.data, args.num_feats,
                                            args.max_words, args.method,
                                            args.train, args.original,
                                            args.mask)
    if not args.train:
        dataset = Data(args.data)
        # Score files were written by the corresponding scoring method.
        if args.method == 'L2X':
            scores = np.load(
                '{}/results/scores-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask))
        elif args.method == 'leave_one_out':
            scores = np.load('{}/results/scores-{}.npy'.format(
                args.data, args.method))
        changed_xs = []
        st = time.time()
        for k in xrange(1, args.num_feats + 1):
            selected_index = np.argsort(
                scores, axis=-1)[:, -k:]  # indices of largest k score.
            # One-hot mask over positions: 1.0 at the k selected indices.
            selected = np.zeros(scores.shape)
            selected[np.expand_dims(np.arange(len(scores)), axis=-1),
                     selected_index] = 1.0
            changed_x = gumbel_transform.predict(dataset.x_val, selected)
            changed_xs.append(changed_x)
        changed_xs = np.array(changed_xs)
        # (k, sample, ...) -> (sample, k, ...)
        changed_xs = np.swapaxes(changed_xs, 0, 1)
        return changed_xs, [time.time() - st]
    return None, None
def train_bow(dataset_name):
    """Train a bag-of-words + logistic-regression baseline and pickle it.

    Fits a CountVectorizer (top 20k terms) and a multinomial logistic
    regression on the dataset's raw training texts, prints train/test
    accuracy, and dumps both fitted objects into ``<dataset_name>bow/``.
    """
    data_model = dataset_name + 'bow'
    dataset = Data(dataset_name)
    x_train = dataset.x_train_raw
    y_train = np.argmax(dataset.y_train, axis=1)
    x_test = dataset.x_val_raw
    y_test = np.argmax(dataset.y_val, axis=1)

    print('Fitting transform...')
    vectorizer = CountVectorizer(max_features=20000)
    x_train_bow = vectorizer.fit_transform(x_train)
    x_test_bow = vectorizer.transform(x_test)

    print('Fitting logistic regression...')
    clf = LogisticRegression(random_state=0,
                             solver='lbfgs',
                             multi_class='multinomial')
    clf.fit(x_train_bow, y_train)

    print('Making prediction...')
    pred_test = clf.predict_proba(x_test_bow)
    acc_train = clf.score(x_train_bow, y_train)
    acc_test = clf.score(x_test_bow, y_test)
    print('The training accuracy is {}; the test accuracy is {}.'.format(
        acc_train, acc_test))

    print('Save model to pickle...')
    if data_model not in os.listdir('.'):
        os.mkdir(data_model)
    with open('{}/vectorizer.pkl'.format(data_model), 'wb') as f:
        pkl.dump(vectorizer, f)
    with open('{}/clf.pkl'.format(data_model), 'wb') as f:
        pkl.dump(clf, f)
def create_original_predictions(args):
    """Compute and cache the original model's prediction probabilities.

    Saves validation and training probabilities under ``<data>/data/``,
    prints both accuracies against the one-hot labels, and (for non-char
    models) also saves the embedding matrix.
    """
    # save original validation prediction probabilities.
    dataset = Data(args.data, True)
    model = TextModel(args.data, False)
    pred_val = model.predict(dataset.x_val, verbose=True)
    pred_train = model.predict(dataset.x_train, verbose=True)
    # Create <data>/data/ on first run.
    if 'data' not in os.listdir(args.data):
        os.mkdir('{}/data'.format(args.data))
    np.save('{}/data/pred_val.npy'.format(args.data), pred_val)
    np.save('{}/data/pred_train.npy'.format(args.data), pred_train)
    acc_val = np.mean(
        np.argmax(pred_val, axis=1) == np.argmax(dataset.y_val, axis=1))
    acc_train = np.mean(
        np.argmax(pred_train, axis=1) == np.argmax(dataset.y_train, axis=1))
    print('The validation accuracy is {}.'.format(acc_val))
    print('The training accuracy is {}.'.format(acc_train))
    # 'agccnn' is character-level and has no word-embedding matrix to save.
    if args.data != 'agccnn':
        np.save('{}/data/embedding_matrix.npy'.format(args.data),
                model.emb_weights)
def prepare_data(self, data_fields, wv_size=600):
    """Load, preprocess and vectorise the test tweets (Python 2 code).

    Mirrors the training-time pipeline: read CSV fields, preprocess,
    build a word2vec model, convert to vectors padded to the training
    max length, and cache the vectors to disk.  Stores the results on
    ``self`` (data, test_data, test_df, original_df).
    """
    test_data = Data(self.file_name, self.file_path)
    test_df = test_data.csv_df(data_fields)
    # make a copy of the original tweets for later use
    original_df = test_df.copy()
    # pre-process data (same as how we trained)
    test_data.pre_process(test_df)
    # then convert using word2vec
    model = test_data.build_wordvec(size=wv_size, verbose=False)
    # take a look at the max_len of testing, although we still have to use
    # max_len from train
    max_len_test = test_data.max_len(test_df)
    data = test_data.convert2vec(test_df,
                                 self.max_len_train,
                                 model,
                                 name='test_' + self.file_name)
    test_data.save_vec(data, name='test_' + self.file_name)
    self.data = data
    self.test_data = test_data
    self.test_df = test_df
    self.original_df = original_df
    print ">>>Done preparing data.<<<\n"
# --- experiment setup (script fragment: `args`, `config`, `Data` and
# `Experiment` are defined above this chunk) ---
config.pdata = args.pdata
config.data_opt = args.data_opt
print(args.bk)
dataset = args.dataset
data_dir = "data/%s/" % dataset
torch.backends.cudnn.deterministic = True  # For reproducibility
seed = 20
np.random.seed(seed)
torch.manual_seed(seed)
# BUG FIX: `torch.cuda.is_available` is a function; the bare attribute is
# always truthy, so the CUDA guard never actually checked for CUDA.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
d = Data(data_dir=data_dir,
         reverse=True,
         subset_percentage=args.pdata,
         data_opt=args.data_opt)
experiment = Experiment(num_iterations=args.num_iterations,
                        batch_size=args.batch_size,
                        learning_rate=args.lr,
                        decay_rate=args.dr,
                        ent_vec_dim=args.edim,
                        rel_vec_dim=args.rdim,
                        cuda=args.cuda,
                        input_dropout=args.input_dropout,
                        hidden_dropout1=args.hidden_dropout1,
                        hidden_dropout2=args.hidden_dropout2,
                        label_smoothing=args.label_smoothing,
                        bk=args.bk)
# Checkpoint path: the background-knowledge variant gets its own file.
path = 'model_state.pts'
if args.bk:
    path = 'model_state_sym.pts'
# Entry point: load and augment the data, build a ResNet, train and evaluate.
from load_data import Data
from model import Model
from train import Train

if __name__ == "__main__":
    data = Data()
    data.load()
    data.data_augment()
    data.data_splitting()
    data.print()
    dataset = data.get_dataset()
    testset = data.get_testset()
    # Model(input_shape, 50) — 50 is presumably the number of output
    # classes; confirm against the Model definition.
    models = Model(dataset[0].shape[1:], 50)
    m = models.ResNet()
    m.summary()
    # train = Train(m, dataset, testset, 50, 32, 'adam', 'sparse_categorical_crossentropy')
    train = Train(m, dataset, testset, 100, 200, 'adam',
                  'categorical_crossentropy')
    train.training()
    train.evaluate()
    # tensorboard --logdir logs/scalars --port=7000
# --- tail of a link-prediction evaluation routine (its `def` lies outside
# this chunk).  NOTE(review): `ranks`, `rank`, `hits` and `logger` come from
# the enclosing, not-visible scope; indentation below is reconstructed.
ranks.append(rank + 1)
# hits[i] records whether the true entity ranked within the top i+1.
for hits_level in range(10):
    if rank <= hits_level:
        hits[hits_level].append(1.0)
    else:
        hits[hits_level].append(0.0)
logger.info('Hits @10: {0}'.format(np.mean(hits[9])))
logger.info('Hits @3: {0}'.format(np.mean(hits[2])))
logger.info('Hits @1: {0}'.format(np.mean(hits[0])))
logger.info('Mean rank: {0}'.format(np.mean(ranks)))
logger.info('Mean reciprocal rank: {0}'.format(
    np.mean(1. / np.array(ranks))))


if __name__ == '__main__':
    # Load data
    data = Data(dataset='WN18', reverse=True)
    # Initialise model
    hypER = HyperER(len(data.entities), len(data.relations))
    # Initialise trainer and run
    trainer = Train(hypER, data, num_epoch=100)
    trainer.train_and_eval()
    # trainer.evaluate()
    # trainer.test()
def meta_train(self, train_subset='train'):
    """Meta-train over tasks drawn from the training data.

    Accumulates the chaser/leader meta-loss over minibatches of tasks,
    steps the meta-optimizer every ``num_tasks_per_minibatch`` tasks, and
    checkpoints ``theta`` every ``num_epochs_save`` epochs.

    NOTE(review): `train_data_loader`/`train_data`/`keys` are only bound
    when train_subset == 'train'; any other value crashes below — confirm
    intended.
    """
    if train_subset == 'train':
        train_data_loader = Data('./data', 'train')
        train_data = train_data_loader.get_data()
        keys = train_data_loader.keys
    print('Start to train...')
    for epoch in range(0, self.num_epochs):
        # variables for monitoring
        meta_loss_saved = []
        val_accuracies = []
        train_accuracies = []

        meta_loss = 0  # accumulate the loss of many ensembling networks
        num_meta_updates_count = 0
        meta_loss_avg_print = 0
        #meta_mse_avg_print = 0

        meta_loss_avg_save = []
        #meta_mse_avg_save = []

        task_count = 0  # change to maybe do multiple tasks
        for key in keys:
            padded_data = train_data_loader.pad(train_data[key])
            x_t, y_t, x_v, y_v = train_data_loader.get_train_test(
                padded_data)
            chaser, leader, y_pred = self.get_task_prediction(
                x_t, y_t, x_v, y_v)
            loss_NLL = self.get_meta_loss(chaser, leader)
            # Abort immediately on numerical blow-up.
            if torch.isnan(loss_NLL).item():
                sys.exit('NaN error')
            meta_loss = meta_loss + loss_NLL
            #meta_mse = self.loss(y_pred, y_v)
            task_count = task_count + 1
            if task_count % self.num_tasks_per_minibatch == 0:
                meta_loss = meta_loss / self.num_tasks_per_minibatch
                #meta_mse = meta_mse/self.num_tasks_per_minibatch

                # accumulate into different variables for printing purpose
                meta_loss_avg_print += meta_loss.item()
                #meta_mse_avg_print += meta_mse.item()

                self.op_theta.zero_grad()
                meta_loss.backward()
                self.op_theta.step()

                # Printing losses
                num_meta_updates_count += 1
                if (num_meta_updates_count % self.num_meta_updates_print == 0):
                    meta_loss_avg_save.append(meta_loss_avg_print /
                                              num_meta_updates_count)
                    #meta_mse_avg_save.append(meta_mse_avg_print/num_meta_updates_count)
                    # NOTE(review): the format string reuses index {1}
                    # twice, so the same value prints in both slots —
                    # looks like a leftover from the removed mse column.
                    print('{0:d}, {1:2.4f}, {1:2.4f}'.format(
                        task_count,
                        meta_loss_avg_save[-1]
                        #meta_mse_avg_save[-1]
                    ))
                    num_meta_updates_count = 0
                    meta_loss_avg_print = 0
                    #meta_mse_avg_print = 0
                if (task_count % self.num_tasks_save_loss == 0):
                    meta_loss_saved.append(np.mean(meta_loss_avg_save))
                    meta_loss_avg_save = []
                    #meta_mse_avg_save = []
                    # print('Saving loss...')
                    # val_accs, _ = meta_validation(
                    #     datasubset=val_set,
                    #     num_val_tasks=num_val_tasks,
                    #     return_uncertainty=False)
                    # val_acc = np.mean(val_accs)
                    # val_ci95 = 1.96*np.std(val_accs)/np.sqrt(num_val_tasks)
                    # print('Validation accuracy = {0:2.4f} +/- {1:2.4f}'.format(val_acc, val_ci95))
                    # val_accuracies.append(val_acc)
                    # train_accs, _ = meta_validation(
                    #     datasubset=train_set,
                    #     num_val_tasks=num_val_tasks,
                    #     return_uncertainty=False)
                    # train_acc = np.mean(train_accs)
                    # train_ci95 = 1.96*np.std(train_accs)/np.sqrt(num_val_tasks)
                    # print('Train accuracy = {0:2.4f} +/- {1:2.4f}\n'.format(train_acc, train_ci95))
                    # train_accuracies.append(train_acc)
                # reset meta loss
                meta_loss = 0
            if (task_count >= self.num_tasks_per_epoch):
                break
        if ((epoch + 1) % self.num_epochs_save == 0):
            checkpoint = {
                'theta': self.theta,
                'meta_loss': meta_loss_saved,
                'val_accuracy': val_accuracies,
                'train_accuracy': train_accuracies,
                'op_theta': self.op_theta.state_dict()
            }
            print('SAVING WEIGHTS...')
            checkpoint_filename = ('{0:s}_{1:d}way_{2:d}shot_{3:d}.pt')\
                .format('sine_line',
                        self.num_classes_per_task,
                        self.num_training_samples_per_class,
                        epoch + 1)
            print(checkpoint_filename)
            torch.save(checkpoint,
                       os.path.join(self.dst_folder, checkpoint_filename))
            print(checkpoint['meta_loss'])
        print()
def __init__(self, data, train = False):
    """Build (or train) the text-classification model for *data*.

    In inference mode (train=False) the network is split into an embedding
    sub-model and a prediction sub-model so gradients w.r.t. the embedding
    input can be taken; pretrained weights are loaded from
    ``<data>/models/original.h5``.  In training mode the full model is
    built and trained on the project dataset.
    """
    self.data = data
    if data in ['imdbcnn']:
        filters = 250
        hidden_dims = 250
        self.embedding_dims = 50
        self.maxlen = 400
        self.num_classes = 2
        self.num_words = 20002
        self.type = 'word'
        if not train:
            # Freeze Keras learning phase for deterministic inference.
            K.set_learning_phase(0)
        X_ph = Input(shape=(self.maxlen,), dtype='int32')
        emb_layer = Embedding(self.num_words,
                              self.embedding_dims,
                              input_length=self.maxlen,
                              name='embedding_1')
        emb_out = emb_layer(X_ph)
        if train:
            preds = construct_original_network(emb_out, data)
        else:
            # Separate placeholder for embeddings so the rest of the
            # network can be driven directly from embedding space.
            emb_ph = Input(shape=(self.maxlen, self.embedding_dims),
                           dtype='float32')
            preds = construct_original_network(emb_ph, data)
        if not train:
            model1 = Model(X_ph, emb_out)      # tokens -> embeddings
            model2 = Model(emb_ph, preds)      # embeddings -> predictions
            pred_out = model2(model1(X_ph))
            pred_model = Model(X_ph, pred_out)
            pred_model.compile(loss='categorical_crossentropy',
                               optimizer='adam',
                               metrics=['accuracy'])
            self.pred_model = pred_model
            # Per-class gradients of the prediction w.r.t. the embeddings.
            grads = []
            for c in range(self.num_classes):
                grads.append(tf.gradients(preds[:, c], emb_ph))
            grads = tf.concat(grads, axis=0)
            # [num_classes, batchsize, maxlen, embedding_dims]
            approxs = grads * tf.expand_dims(emb_ph, 0)
            # [num_classes, batchsize, maxlen, embedding_dims]
            self.sess = K.get_session()
            self.grads = grads
            self.approxs = approxs
            self.input_ph = X_ph
            self.emb_out = emb_out
            self.emb_ph = emb_ph
            weights_name = 'original.h5'  # [i for i in os.listdir('imdblstm/models/') if i.startswith('original')][0]
            model1.load_weights('{}/models/{}'.format(data, weights_name),
                                by_name=True)
            model2.load_weights('{}/models/{}'.format(data, weights_name),
                                by_name=True)
            print('Model constructed.')
            # For validating the data: zero out the padding-token embedding.
            emb_weights = emb_layer.get_weights()
            emb_weights[0][0] = np.zeros(50)
            emb_layer.set_weights(emb_weights)
        else:
            pred_model = Model(X_ph, preds)
            pred_model.compile(loss='categorical_crossentropy',
                               optimizer='adam',
                               metrics=['accuracy'])
            self.pred_model = pred_model
            from load_data import Data
            dataset = Data(self.data)
            self.train(dataset)
            print('Training is done.')
def __init__(self, path, batch_size):
    """Set up the batch data generator and its normalised adjacency matrix."""
    generator = Data(path, batch_size)
    self.data_generator = generator
    self.norm_adj = generator.get_adj_mat()
def __init__(self, num_feats, data, train=False, load_original=False, masking=True):
    """Build the character-level L2X gumbel selector for *data*.

    train=True fits the selector (optionally on top of the frozen original
    'crepe' network when load_original=True) and checkpoints the best
    weights; otherwise loads the trained selector and exposes it as
    ``self.pred_model`` mapping inputs to per-position selection logits.
    """
    if data == 'agccnn':
        hidden_dims = 250
        filter_kernels = [7, 7, 3, 3, 3, 3]
        dense_outputs = 1024
        self.charlen = 1014
        self.maxlen = None
        nb_filter = 256
        self.num_classes = 4
        self.vocab, self.reverse_vocab, self.vocab_size, self.vocab_check = create_vocab_set(
        )
        self.embedding_dims = self.vocab_size
    K.set_learning_phase(1 if train else 0)
    # Define what the input shape looks like
    inputs = Input(shape=(self.charlen, self.vocab_size),
                   name='input',
                   dtype='float32')
    logits_T = construct_gumbel_selector(
        inputs, None, None, None, None, 1,
        network_type='agccnn2')  # (?, self.charlen, 1)
    tau = 0.5  # gumbel-softmax temperature
    T = Sample_Concrete(tau, num_feats, self.charlen,
                        masking)(logits_T)  # (?, self.charlen, 1)
    if train:
        batch_size = 40
        if not load_original:
            # NOTE(review): this branch references `maxlen`,
            # `embedding_dims` and `X_ph`, none of which are defined in
            # this scope — it would raise NameError if taken.  Looks like
            # dead code copied from a word-level variant; confirm.
            selected_emb = Multiply()([inputs, T])  # (?, self.charlen, 69)
            Mean = Lambda(lambda x: K.sum(x, axis=1) / float(maxlen),
                          output_shape=lambda x: [x[0], x[2]])
            emb2 = Embedding(self.vocab_size,
                             embedding_dims,
                             input_length=maxlen)(X_ph)
            net = Mean(emb2)
            net = Dense(hidden_dims)(net)
            net = Activation('relu')(net)
            preds = Dense(2, activation='softmax', name='new_dense')(net)
            model = Model(inputs=X_ph, outputs=preds)
        else:
            # Frozen replica of the original crepe character CNN applied
            # to the masked (selected) input.
            selected_emb = Multiply()([inputs, T])  # (?, self.charlen, 69)
            conv = Conv1D(filters=nb_filter,
                          kernel_size=filter_kernels[0],
                          padding='valid',
                          activation='relu',
                          input_shape=(self.charlen, self.vocab_size),
                          trainable=False)(selected_emb)
            conv = MaxPooling1D(pool_size=3, trainable=False)(conv)
            conv1 = Conv1D(filters=nb_filter,
                           kernel_size=filter_kernels[1],
                           padding='valid',
                           activation='relu',
                           trainable=False)(conv)
            conv1 = MaxPooling1D(pool_size=3, trainable=False)(conv1)
            conv2 = Conv1D(filters=nb_filter,
                           kernel_size=filter_kernels[2],
                           padding='valid',
                           activation='relu',
                           trainable=False)(conv1)
            conv3 = Conv1D(filters=nb_filter,
                           kernel_size=filter_kernels[3],
                           padding='valid',
                           activation='relu',
                           trainable=False)(conv2)
            conv4 = Conv1D(filters=nb_filter,
                           kernel_size=filter_kernels[4],
                           padding='valid',
                           activation='relu',
                           trainable=False)(conv3)
            conv5 = Conv1D(filters=nb_filter,
                           kernel_size=filter_kernels[5],
                           padding='valid',
                           activation='relu',
                           trainable=False)(conv4)
            conv5 = MaxPooling1D(pool_size=3)(conv5)
            conv5 = Flatten()(conv5)
            # Two dense layers with dropout of .5
            z = Dropout(0.5)(Dense(dense_outputs,
                                   activation='relu',
                                   trainable=False)(conv5))
            z = Dropout(0.5)(Dense(dense_outputs,
                                   activation='relu',
                                   trainable=False)(z))
            # Output dense layer with softmax activation
            preds = Dense(self.num_classes,
                          activation='softmax',
                          name='output',
                          trainable=False)(z)
            model = Model(inputs=inputs, outputs=preds)
        if masking:
            # Masking mode maximises the original loss (negative xentropy).
            model.compile(
                loss=negative_xentropy,
                optimizer='RMSprop',  #optimizer,
                metrics=['acc'])
        print('Loading original models...')
        if load_original:
            model.load_weights(
                '{}/params/crepe_model_weights-15.h5'.format(data),
                by_name=True)
        if not masking:
            filepath = "{}/models/L2X-{}-{}.hdf5".format(
                data, num_feats,
                'original' if load_original else 'variational')
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
        else:
            filepath = "{}/models/L2X-{}-{}-mask.hdf5".format(
                data, num_feats,
                'original' if load_original else 'variational')
            # NOTE(review): mode='min' on val_acc — presumably because the
            # masking objective is negated; confirm this is intended.
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='min')
        callbacks_list = [checkpoint]
        from load_data import Data
        dataset = Data(data, True)
        # Train against the original model's predictions (distillation),
        # one-hot encoded.
        label_train = np.argmax(dataset.pred_train, axis=1)
        label_val = np.argmax(dataset.pred_val, axis=1)
        label_train = np.eye(self.num_classes)[label_train]
        label_val = np.eye(self.num_classes)[label_val]
        generator_train = mini_batch_generator(dataset.x_train,
                                               label_train,
                                               self.vocab,
                                               self.vocab_size,
                                               self.vocab_check,
                                               self.charlen,
                                               epoch=6,
                                               batch_size=batch_size)
        generator_val = mini_batch_generator(dataset.x_val,
                                             label_val,
                                             self.vocab,
                                             self.vocab_size,
                                             self.vocab_check,
                                             self.charlen,
                                             epoch=6,
                                             batch_size=batch_size)
        model.fit_generator(generator_train,
                            validation_data=generator_val,
                            callbacks=callbacks_list,
                            epochs=5,
                            steps_per_epoch=len(label_train) / batch_size,
                            validation_steps=math.ceil(
                                float(len(label_val)) / batch_size),
                            verbose=True)
    else:
        # Inference: expose the selection logits directly.
        pred_model = Model(inputs, logits_T)
        if not masking:
            pred_model.compile(loss='categorical_crossentropy',
                               optimizer='adam',
                               metrics=['acc'])
            weights_name = "{}/models/L2X-{}-{}.hdf5".format(
                data, num_feats,
                'original' if load_original else 'variational')
        else:
            pred_model.compile(loss=negative_xentropy,
                               optimizer='adam',
                               metrics=['acc'])
            weights_name = "{}/models/L2X-{}-{}-mask.hdf5".format(
                data, num_feats,
                'original' if load_original else 'variational')
        pred_model.load_weights(weights_name, by_name=True)
        self.pred_model = pred_model
# --- CLI options and model/dataset construction (script fragment: `parser`,
# `args.method`'s option, and the imports are defined above this chunk) ---
parser.add_argument('--data', type=str, choices=['imdbcnn'], default='imdbcnn')
parser.add_argument('--num_neighbors', type=int, default=4)
parser.add_argument('--train', action='store_true')
parser.add_argument('--original', action='store_true')
parser.add_argument('--max_order', type=int, default=16)
args = parser.parse_args()
# dict_a is a live view of the args namespace; updates below add extra keys.
dict_a = vars(args)
if args.method == 'train':
    model = TextModel(args.data, train=True)
else:
    print('Loading dataset...')
    dataset = Data(args.data)
    print('Creating model...')
    model = TextModel(args.data)
    dict_a.update({'dataset': dataset, 'model': model})
# Ensure <data>/results/ exists for the score dumps.
if args.data not in os.listdir('./'):
    os.mkdir(args.data)
if 'results' not in os.listdir('./{}'.format(args.data)):
    os.mkdir('{}/results'.format(args.data))
if args.method in ['localshapley', 'connectedshapley']:
    dict_a.update({'regression': False})
    scores = lcshapley(args)
# --- CLI options and experiment launch (script fragment: `parser` and the
# remaining --dataset/--min_occurrences/... options are defined above) ---
parser.add_argument('--model', type=str, default="p2v-l", nargs="?",
                    help='Which model to use: p2v-l or p2v-p')
parser.add_argument('--num_iters', type=int, default=100, nargs="?",
                    help='Number of iterations')
parser.add_argument('--lr', type=float, default=0.1, nargs="?",
                    help='Initial learning rate')
parser.add_argument('--dr', type=float, default=0.98, nargs="?",
                    help='Decay rate')
parser.add_argument('--batch_size', type=int, default=10000, nargs="?",
                    help='Batch size')
parser.add_argument('--num_neg', type=int, default=5, nargs="?",
                    help='Number of negative samples per each positive sample')
parser.add_argument('--dim', type=int, default=200, nargs="?",
                    help='Embeddings dimensionality')
parser.add_argument('--w_reg', type=float, default=0.5, nargs="?",
                    help='Regularization coefficient for W')
parser.add_argument('--c_reg', type=float, default=0.5, nargs="?",
                    help='Regularization coefficient for C')
# NOTE(review): argparse type=bool does not parse "False" — any non-empty
# string is truthy.  Left as-is; flagged for a future fix.
parser.add_argument('--cuda', type=bool, default=True, nargs="?",
                    help='Whether to use cuda (GPU) or not (CPU)')
args = parser.parse_args()
d = Data(data_dir="data/",
         fname=args.dataset,
         min_occurrences=args.min_occurrences,
         window_size=args.window_size,
         subsample=args.subsample,
         t=args.threshold,
         cutoff=args.cutoff)
experiment = Experiment(args.model,
                        num_iterations=args.num_iters,
                        learning_rate=args.lr,
                        batch_size=args.batch_size,
                        corrupt_size=args.num_neg,
                        decay_rate=args.dr,
                        embeddings_dim=args.dim,
                        w_reg=args.w_reg,
                        c_reg=args.c_reg,
                        cuda=args.cuda)
experiment.train_and_eval()
nargs="?", help="Dropout after the second hidden layer.") parser.add_argument("--label_smoothing", type=float, default=0.1, nargs="?", help="Amount of label smoothing.") args = parser.parse_args() dataset = args.dataset data_dir = "data/%s/" % dataset torch.backends.cudnn.deterministic = True seed = 20 np.random.seed(seed) torch.manual_seed(seed) if torch.cuda.is_available: torch.cuda.manual_seed_all(seed) d = Data(data_dir=data_dir, reverse=True, path_dataset=args.path_dataset) experiment = Experiment(num_iterations=args.num_iterations, batch_size=args.batch_size, learning_rate=args.lr, decay_rate=args.dr, ent_vec_dim=args.edim, rel_vec_dim=args.rdim, cuda=args.cuda, input_dropout=args.input_dropout, hidden_dropout1=args.hidden_dropout1, hidden_dropout2=args.hidden_dropout2, label_smoothing=args.label_smoothing) experiment.train_and_eval()
# Algo config num_folds = 7 # File config VERSION = 4 MODEL_NAME = 'lgbm' OUTPUT_FOLDER = 'model_outputs/{}_{}/'.format(MODEL_NAME, VERSION) OUTPUT_FILENAME = OUTPUT_FOLDER + MODEL_NAME if not os.path.exists(OUTPUT_FOLDER): os.makedirs(OUTPUT_FOLDER) if __name__ == "__main__": print('Loading Data...') df = Data().read_data() df_x = df.drop(['target'], axis=1) df_y = df[['target']] features = list(df_x.columns.values) X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, shuffle=False, train_size=0.8) del df gc.collect() folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47) oof = np.zeros(X_train.shape[0]) getVal = np.zeros(X_train.shape[0])
def main():
    """Parse the ComConV CLI options, fix the seeds, and run training."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default="ComConV",
                        help='ComConV')
    parser.add_argument(
        '--dataset',
        type=str,
        default="countries/countries_S3",
        help=
        'FB15k, FB15k-237, WN18, WN18RR, YAGO3-10, countries/countries_S1, ...'
    )
    # NOTE(review): argparse type=bool treats any non-empty string as True;
    # left unchanged, but these flags cannot be disabled from the CLI.
    parser.add_argument('--cuda', type=bool, default=False,
                        help='use cuda or not')
    parser.add_argument('--get_best_results', type=bool, default=True,
                        help='get best results or not')
    parser.add_argument('--get_complex_results', type=bool, default=False,
                        help='get complex results or not')
    parser.add_argument('--num_to_eval', type=int, default=5,
                        help='number to evaluate')
    # learning parameters
    parser.add_argument('--learning_rate', type=float, default=1e-1,
                        help='learning rate')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--num_iterations', type=int, default=1500,
                        help='iterations number')
    parser.add_argument('--optimizer_method', type=str, default="RAdam",
                        help='optimizer method')
    parser.add_argument('--decay_rate', type=float, default=1.0,
                        help='decay rate')
    parser.add_argument('--label_smoothing', type=float, default=0.1,
                        help='label smoothing')
    # convolution parameters
    parser.add_argument('--ent_vec_dim', type=int, default=200,
                        help='entity vector dimension')
    parser.add_argument('--rel_vec_dim', type=int, default=200,
                        help='relation vector dimension')
    parser.add_argument('--input_dropout', type=float, default=0.2,
                        help='input dropout')
    parser.add_argument('--feature_map_dropout', type=float, default=0.2,
                        help='feature map dropout')
    parser.add_argument('--hidden_dropout', type=float, default=0.3,
                        help='hidden dropout')
    parser.add_argument('--filt_h', type=int, default=2,
                        help='filter height')
    parser.add_argument('--filt_w', type=int, default=5,
                        help='filter width')
    parser.add_argument('--in_channels', type=int, default=1,
                        help='in channels')
    parser.add_argument('--out_channels', type=int, default=36,
                        help='out channels')
    args = parser.parse_args()
    dataset = args.dataset
    data_dir = "data/%s/" % dataset
    print(args)
    # Fix the random seeds so every training run is reproducible.
    seed = 777
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    data = Data(data_dir=data_dir, reverse=True)
    run = RunModel(data,
                   modelname=args.model_name,
                   optimizer_method=args.optimizer_method,
                   num_iterations=args.num_iterations,
                   batch_size=args.batch_size,
                   learning_rate=args.learning_rate,
                   decay_rate=args.decay_rate,
                   ent_vec_dim=args.ent_vec_dim,
                   rel_vec_dim=args.rel_vec_dim,
                   cuda=args.cuda,
                   input_dropout=args.input_dropout,
                   hidden_dropout=args.hidden_dropout,
                   feature_map_dropout=args.feature_map_dropout,
                   in_channels=args.in_channels,
                   out_channels=args.out_channels,
                   filt_h=args.filt_h,
                   filt_w=args.filt_w,
                   label_smoothing=args.label_smoothing,
                   num_to_eval=args.num_to_eval,
                   get_best_results=args.get_best_results,
                   get_complex_results=args.get_complex_results,
                   regular_method="",
                   regular_rate=1e-4)
    run.train_and_eval()
def main(config: argparse.Namespace) -> None:
    """Run the review-classification experiments described in the YAML config.

    Loads the yelp/geneea data, optionally dumps per-feature mutual
    information, then repeatedly trains/evaluates each configured task on
    successive dataset chunks (optionally sweeping training-set sizes for
    learning curves) and finally plots the aggregated graphs.
    """
    with open(config.config_file, 'r') as cfg:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input and deprecated — consider yaml.safe_load.
        experiments: dict = yaml.load(cfg)
    print('loading data')
    data = Data(config.yelp_file, config.geneea_file)
    data.print(f'Processing file {config.config_file}')
    print('generating samples')
    datasize: int = data.generate_sample(experiments['config']['chunks'],
                                         LikeTypeEnum.USEFUL)
    stats: DataGraph = DataGraph('', 'number of instances', 'percentage')

    # texts_tokenized = (self._tokenize(row.text) for index, row
    #                    in self.data.iterrows())
    # words_freqs = nltk.FreqDist(w.lower() for tokens in texts_tokenized
    #                             for w in tokens)
    #
    # # TODO statistics
    # # for x in all_words:
    # #     print(all_words[x])
    #
    # # self.print('total number of words:', sum(all_words.values()))
    # # self.print('unique words:', len(all_words))
    # # self.print('words present only once:',
    # #            sum(c for c in all_words.values() if c == 1))
    # # all_words.plot(30)
    #
    # # only the right frequencies
    # self.gram_words = words_freqs.copy()
    # for w, count in words_freqs.items():
    #     if count > 200 or count == 20:  # TODO Measure
    #         del self.gram_words[w]
    #
    # self.gram_words = frozenset(self.gram_words.keys())

    # calculate mutual information of all features if wanted
    # and dump it into text files
    if experiments['config']['mi']:
        for x in FeatureSetEnum:
            # n-gram feature sets are skipped (too many features).
            if x == FeatureSetEnum.BIGRAMS or \
                    x == FeatureSetEnum.TRIGRAMS or \
                    x == FeatureSetEnum.FOURGRAMS:
                continue
            if x == FeatureSetEnum.UNIGRAMS:  # TODO REMOVE
                continue
            # get data
            data.set_statfile(f'mi_{x}')
            data.print(f'Mutual Information of {x}.')
            train = data.get_feature_dict(SampleTypeEnum.TRAIN, {x})
            test = data.get_feature_dict(SampleTypeEnum.TEST, {x})
            instances = train + test
            # get matrix
            matrix_convertor = featurematrixconversion.Preprocessor({})
            vector_instances = matrix_convertor.process(
                instances, SampleTypeEnum.TRAIN)
            # calculate mutual info
            matrix_gen, labels_gen = zip(*vector_instances)
            matrix = sparse.vstack(matrix_gen)
            labels = list(labels_gen)
            mi = mutual_info_classif(matrix, labels)
            # dump data
            for f_name, f_mi in zip(matrix_convertor.all_fs, mi):
                data.print(f'{f_name} {f_mi}')

    data.set_statfile(f'statistics')
    first_run: bool = True
    # One iteration per dataset chunk; prepare_next_dataset() advances.
    while True:
        train_size: int \
            = int(datasize - datasize / experiments['config']['chunks'])
        train_size_log: int = int(ceil(log2(train_size)) + 1)
        data.max_tfidf = experiments['config']['max_tfidf']
        data.max_ngrams = experiments['config']['max_ngrams']
        for ex in experiments['tasks']:
            # convert features to set:
            features: Set[FeatureSetEnum] \
                = {FeatureSetEnum[f] for f in ex['features']}
            train_set = data.get_feature_dict(SampleTypeEnum.TRAIN, features,
                                              ex['extra_data'])
            test_set = data.get_feature_dict(SampleTypeEnum.TEST, features,
                                             ex['extra_data'])
            if first_run:
                unique_features: set = set()
                for inst in train_set:
                    unique_features = unique_features.union(set(
                        inst[0].keys()))
                data.print(
                    f'Number of unique features for {ex["name"]}: {len(unique_features)}'
                )
                unique_features = set()
            l_curves = experiments['config']['l_curves']
            # Learning curves sweep powers of two up to train_size;
            # otherwise a single pass at full size.
            start_size: int = 1 if l_curves \
                else train_size_log - 1
            for t_size in map(lambda x: min(2**x, train_size),
                              range(start_size, train_size_log)):
                if l_curves:
                    train_set_copy = train_set[:t_size]
                    test_set_copy = test_set[:]
                else:
                    train_set_copy = train_set
                    test_set_copy = test_set
                # preprocess data
                for pp in ex['preprocessing']:
                    prep: PreprocessorBase \
                        = getattr(preprocessors, pp).Preprocessor(ex['config'])
                    train_set_copy = prep.process(train_set_copy,
                                                  SampleTypeEnum.TRAIN)
                    test_set_copy = prep.process(test_set_copy,
                                                 SampleTypeEnum.TEST)
                if first_run and hasattr(train_set[0][0], 'keys'):
                    unique_features: set = set()
                    for inst in train_set:
                        unique_features = unique_features.union(
                            set(inst[0].keys()))
                    data.print(
                        f'Number of unique features after preprocessing for {ex["name"]}: {len(unique_features)}'
                    )
                    unique_features = set()
                cls: ClassifierBase \
                    = getattr(classifiers,
                              ex['classificator']).Classifier(ex['config'])
                cls.train(train_set_copy)
                evaluation: dict \
                    = compute_evaluation_scores(cls, test_set_copy,
                                                LikeTypeEnum.USEFUL)
                stats.add_points(len(train_set_copy), ex['name'], evaluation)
                if l_curves:
                    # Also score on the training set for the train curve.
                    evaluation: dict \
                        = compute_evaluation_scores(cls, train_set_copy,
                                                    LikeTypeEnum.USEFUL)
                    stats.add_points(len(train_set_copy),
                                     ex['name'] + '-train', evaluation)
                first_run = False
        if not data.prepare_next_dataset():
            break

    # aggregate results here
    for g in experiments['graphs']:
        stats.name = g['name']
        stats.set_view(g['data'])
        data.plot(stats)
# --- script fragment: begins mid parser.add_argument call (option name is
# above this chunk) and ends in a banner string cut off at the chunk edge ---
                    type=float,
                    default=0.5,
                    help='Dropout rate (1 - keep probability).')
args = parser.parse_args()
for arg in vars(args):
    print('{0} = {1}'.format(arg, getattr(args, arg)))
torch.manual_seed(args.seed)
# training on the first GPU if not on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on device = {}'.format(device))
"""
===========================================================================
Loading data
===========================================================================
"""
data = Data(path=args.data_path, dataset=args.dataset, split=args.split)
print('Loaded {0} dataset with {1} nodes and {2} edges'.format(
    args.dataset, data.n_node, data.n_edge))
feature = data.feature.to(device)
label = data.label.to(device)
train = Dataset(data.idx_train)
val = Dataset(data.idx_val)
test = Dataset(data.idx_test)
train_loader = DataLoader(dataset=train, batch_size=args.batch_size)
val_loader = DataLoader(dataset=val, batch_size=args.batch_size)
test_loader = DataLoader(dataset=test, batch_size=args.batch_size)
sampler = Sampler(data.adj, args.aggregator)
# NOTE(review): the banner below is truncated at the chunk boundary; its
# closing quotes lie outside this view.
"""
===========================================================================
Training
===========================================================================
def main(args):
    """CLI entry point: run the algorithm chosen via -a (perceptron, kNN, or
    decision tree) on the a4a and iris LIBSVM datasets and report
    per-configuration error rates and classification wall-times.

    Args:
        args: parsed argparse namespace; fields read here are
            algorithm, lrate, epochs, k, distance and verbose.

    Raises:
        AssertionError: if no algorithm was specified with -a.
    """
    # Training and held-out test splits for both datasets.
    a4a = Data("a4a", A4A_FEATURES)
    a4a_testing = Data("a4a.t", A4A_FEATURES)
    iris = Data("iris.scale", IRIS_FEATURES)
    iris_testing = Data("iris.t", IRIS_FEATURES)
    if not args.algorithm:
        raise AssertionError("Please specify which ML Algorithm you would like to use with -a. Exiting...")
    if args.algorithm == 'perceptron' or args.algorithm == 'Perceptron':
        # Can specify max_lrate and max_epochs with -l and -e
        if not args.lrate:
            max_lrate = 0.1
        else:
            max_lrate = args.lrate
        if not args.epochs:
            epochs = 1000
        else:
            epochs = args.epochs
        if args.verbose:
            print("Beginning perceptron categorization... \n")
        # Sweep learning rates from 0 to max_lrate in steps of 0.001.
        for lrate in np.arange(0, max_lrate, 0.001):
            # --------------------------------------------------------------------- IRIS ----------------------------------------------------------------- #
            init_w_iris = [1 for _ in range(IRIS_FEATURES)]
            if args.verbose:
                print("\nIRIS:\n")
            """
            Perceptron makes two passes for multi-classification.
            First pass sets datapoints with labels 2 or 3 as -1, and
            classifies a point as either 1, or 2/3.
            Second pass distinguishes between 2 and 3 by comparing them alone.
            This is kind of messy tbh.
            """
            # NOTE(review): this relabels iris.y IN PLACE; on the second
            # lrate iteration all 2/3 labels are already -1, so the
            # second-pass lists come out empty -- confirm intended.
            iris_y_second_pass = []
            iris_x_second_pass = []
            for i in range(len(iris.y)):
                if iris.y[i] == 1:
                    continue
                elif iris.y[i] == 2:
                    iris.y[i] = -1
                    iris_x_second_pass.append(iris.x[i])
                    iris_y_second_pass.append(-1)
                elif iris.y[i] == 3:
                    iris.y[i] = -1
                    iris_x_second_pass.append(iris.x[i])
                    iris_y_second_pass.append(1)
            # p2: class 1 vs {2,3}; p3: class 2 vs class 3.
            p2 = Perceptron(iris_testing.x[0], init_w_iris, bias=1)
            p2.train_weights(iris.x, iris.y, lrate=lrate, epochs=epochs, verbose=args.verbose)
            p3 = Perceptron(iris_testing.x[0], init_w_iris, bias=1)
            p3.train_weights(iris_x_second_pass, iris_y_second_pass, lrate=lrate, epochs=epochs, verbose=args.verbose)
            iris_error = 0
            iris_start_time = time.time()
            # NOTE(review): these branches increment on CORRECT predictions,
            # so despite the name this counts hits, not misses -- confirm.
            for j in range(len(iris_testing.x)):
                p2.set_x(iris_testing.x[j])
                prediction = p2.predict()
                if args.verbose:
                    print(f"Prediction for {iris_testing.x[j]}: {prediction}. Recorded classification is {iris_testing.y[j]}")
                if prediction == 1 and iris_testing.y[j] == 1:
                    iris_error += 1
                elif prediction == -1 and (iris_testing.y[j] == 2 or iris_testing.y[j] == 3):
                    iris_error += 1
            # Second pass: only score the 2-vs-3 perceptron on non-class-1 points.
            for k in range(len(iris_testing.x)):
                if iris_testing.y[k] != 1:
                    p3.set_x(iris_testing.x[k])
                    prediction = p3.predict()
                    if args.verbose:
                        print(f"Prediction for {iris_testing.x[k]}: {prediction}. Recorded classification is {iris_testing.y[k]}")
                    if iris_testing.y[k] == 2 and prediction == -1:
                        iris_error += 1
                    elif iris_testing.y[k] == 3 and prediction == 1:
                        iris_error += 1
            # NOTE(review): the "+ 10" in the denominator is unexplained --
            # looks like an ad-hoc fudge factor; confirm before relying on it.
            iris_error = iris_error / ( len(iris_testing.y) + 10 )
            iris_total_time = time.time() - iris_start_time
            # --------------------------------------------------------------------- A4A ------------------------------------------------------------------ #
            init_w_a4a = [1 for _ in range(A4A_FEATURES)]
            if args.verbose:
                print("\nA4A:\n")
            p = Perceptron(a4a_testing.x[0], init_w_a4a, bias=1)
            p.train_weights(a4a.x, a4a.y, lrate=lrate, epochs=epochs, verbose=args.verbose)
            a4a_error = 0
            a4a_start_time = time.time()
            for i in range(len(a4a_testing.x)):
                p.set_x(a4a_testing.x[i])
                prediction = p.predict()
                #if args.verbose:
                #print(f"Prediction for {a4a_testing.x[i]}: {prediction}. Recorded classification is {a4a_testing.y[i]}")
                # NOTE(review): increments on a match, i.e. this is an
                # accuracy count labelled "error" -- confirm.
                if prediction == a4a_testing.y[i]:
                    a4a_error += 1
            a4a_error = a4a_error / len(a4a_testing.y)
            a4a_total_time = time.time() - a4a_start_time
            if args.verbose:
                print(f"Iris misclassification error: {iris_error}\na4a misclassification error: {a4a_error}\n")
                print(f"Iris classification time: {iris_total_time}\na4a classification time: {a4a_total_time}")
    elif args.algorithm == 'kNN' or args.algorithm == 'knn':
        if args.verbose:
            print("Beginning k-Nearest Neighbors categorization... \n")
        # can specify k and distance with -k and -d
        if not args.k:
            max_k=25
        else:
            max_k = args.k
        if not args.distance:
            distance_metric = 'euclidean'
        else:
            distance_metric = args.distance
        # Sweep neighborhood sizes k = 1 .. max_k-1.
        for k in range(1, max_k):
            # --------------------------------------------------------------------- IRIS ----------------------------------------------------------------- #
            iris_knn = kNN(iris.x, iris.y)
            iris_error = 0
            iris_start_time = time.time()
            for i in range(len(iris_testing.x)):
                y = iris_knn.classify(new_x=iris_testing.x[i], k=k, distance_metric=distance_metric, verbose=args.verbose)
                if args.verbose:
                    print(f"Prediction for {iris_testing.x[i]}: {y}. Recorded classification is {iris_testing.y[i]}")
                # NOTE(review): counts matches, so "error" is really accuracy here too.
                if y == iris_testing.y[i]:
                    iris_error += 1
            iris_error = iris_error / len(iris_testing.y)
            iris_total_time = time.time() - iris_start_time
            # --------------------------------------------------------------------- A4A ------------------------------------------------------------------ #
            a4a_knn = kNN(a4a.x, a4a.y)
            a4a_error = 0
            a4a_start_time = time.time()
            for j in range(len(a4a_testing.x)):
                y = a4a_knn.classify(new_x=a4a_testing.x[j], k=k, distance_metric=distance_metric, verbose=args.verbose)
                if args.verbose:
                    print(f"Prediction for {a4a_testing.x[j]}: {y}. Recorded classification is {a4a_testing.y[j]}")
                if y == a4a_testing.y[j]:
                    a4a_error += 1
            a4a_error = a4a_error / len(a4a_testing.y)
            a4a_total_time = time.time() - a4a_start_time
            if args.verbose:
                print(f"Iris misclassification error: {iris_error}\na4a misclassification error: {a4a_error}\n")
                print(f"Iris classification time: {iris_total_time}\na4a classification time: {a4a_total_time}")
    elif args.algorithm == 'decision' or args.algorithm == 'tree' or args.algorithm == 'decision_tree':
        # Decision-tree branch is a stub: trees are built but never evaluated.
        # --------------------------------------------------------------------- IRIS ----------------------------------------------------------------- #
        iris_dt = DecisionTree(iris.x, iris.y)
        #left_x, left_y, right_x, right_y = iris_dt.split(0, 0)
        # --------------------------------------------------------------------- A4A ------------------------------------------------------------------ #
        a4a_dt = DecisionTree(a4a.x, a4a.y)
        print("This part made optional, therefore not implemented for the sake of time. ")
    return
'--dataset', type=str, default="FB15k-237", nargs="?", help='Which dataset to use: FB15k, FB15k-237, WN18 or WN18RR') args = parser.parse_args() model_name = args.algorithm dataset = args.dataset data_dir = "data/%s/" % dataset torch.backends.cudnn.deterministic = True seed = 42 np.random.seed(seed) torch.manual_seed(seed) if torch.cuda.is_available: torch.cuda.manual_seed_all(seed) d = Data(data_dir=data_dir, reverse=True) experiment = Experiment(model_name, num_iterations=800, batch_size=128, learning_rate=0.001, decay_rate=0.99, ent_vec_dim=200, rel_vec_dim=200, cuda=True, input_dropout=0.2, hidden_dropout=0.3, feature_map_dropout=0.2, in_channels=1, out_channels=32, filt_h=1, filt_w=9,
def __init__(self, num_feats, data, train=False, load_original=False, masking=True):
    """Build an L2X-style Gumbel feature selector coupled to a frozen text
    classifier, then either train the selector or load saved weights and
    expose the selector network as ``self.pred_model``.

    Args:
        num_feats: number of tokens the concrete selector keeps per example.
        data: dataset key, 'imdbcnn' or 'yahoolstm' (picks hyperparameters
            and pretrained-weight filenames; other values are unsupported
            here and would leave the locals below undefined).
        train: if True, fit the masked model; otherwise only load weights.
        load_original: use the original pretrained classifier weights instead
            of the transfer weights (affects loading and checkpoint naming).
        masking: forwarded to Sample_Concrete; presumably controls whether
            the selected features are masked out vs kept -- TODO confirm.
    """
    # Per-dataset hyperparameters and pretrained-weight locations.
    if data == 'imdbcnn':
        num_words = 20002
        maxlen = 400
        embedding_dims = 50
        hidden_dims = 250
        weights_name = "original.h5"
        emb_name = 'embedding_1'
        batch_size = 40
        self.num_classes = 2
        num_epoch = 5
    elif data == 'yahoolstm':
        num_words = 20001
        maxlen = 400
        embedding_dims = 300
        hidden_dims = 250
        weights_name = "original-0-7.hdf5"
        emb_name = 'embedding'
        self.num_classes = 10
        batch_size = 1000
        num_epoch = 1
    # Mean-pool over the sequence axis, normalised by the number of kept
    # features. NOTE(review): Mean and emb_name are unused in the visible
    # code below -- possibly leftovers.
    Mean = Lambda(lambda x: K.sum(x, axis=1) / float(num_feats),
                  output_shape=lambda x: [x[0], x[2]])
    X_ph = Input(shape=(maxlen, ), dtype='int32')
    # Selector network: per-token selection logits from a CNN over the input.
    logits_T = construct_gumbel_selector(X_ph, num_words, embedding_dims,
                                         hidden_dims, maxlen, 1,
                                         network_type='cnn')
    tau = 0.5  # concrete/Gumbel-softmax temperature
    sc_layer = Sample_Concrete(tau, num_feats, maxlen, masking)
    T = sc_layer(logits_T)  # (approximately) k-hot selection mask over tokens
    if train:
        if not load_original:
            # Transfer variant: frozen (trainable=False) CNN classifier
            # stacked on the selector-masked embeddings.
            filters = 250
            kernel_size = 3
            print('transfer constucted')
            emb_layer = Embedding(num_words, embedding_dims,
                                  input_length=maxlen, trainable=False)
            emb2 = emb_layer(X_ph)
            # Zero out embeddings of tokens the selector did not pick.
            selected_emb = Multiply()([emb2, T])
            net = Dropout(0.2, trainable=False)(selected_emb)
            net = Conv1D(filters, kernel_size, padding='valid',
                         activation='relu', strides=1, trainable=False)(net)
            net = Dense(hidden_dims, trainable=False)(net)
            net = GlobalMaxPooling1D()(net)
            net = Dense(hidden_dims, trainable=False)(net)
            net = Dropout(0.2, trainable=False)(net)
            net = Activation('relu', trainable=False)(net)
            net = Dense(self.num_classes, trainable=False)(net)
            preds = Activation('softmax', trainable=False)(net)
            model = Model(inputs=X_ph, outputs=preds)
        else:
            # Original variant: frozen original network on masked embeddings.
            print('original constucted')
            emb_layer = Embedding(num_words, embedding_dims,
                                  input_length=maxlen, trainable=False)
            emb2 = emb_layer(X_ph)
            selected_emb = Multiply()([emb2, T])
            preds = construct_original_network(selected_emb, data,
                                               trainable=False)
            model = Model(inputs=X_ph, outputs=preds)
        # negative_xentropy: the selector is trained AGAINST the classifier
        # (maximize cross-entropy of the masked input).
        model.compile(
            loss=negative_xentropy,
            optimizer='RMSprop',  #optimizer,
            metrics=['acc'])
        if load_original:
            print('Loading original models...')
            model.load_weights('{}/models/{}'.format(data, weights_name),
                               by_name=True)
        else:
            model.load_weights('{}/models/transfer.hdf5'.format(data),
                               by_name=True)
        if data == 'imdbcnn':
            # Zero the padding-token (index 0) embedding row.
            emb_weights = emb_layer.get_weights()
            emb_weights[0][0] = np.zeros(50)
            emb_layer.set_weights(emb_weights)
        from load_data import Data
        # True presumably requests the variant with model predictions
        # attached (pred_train/pred_val) -- TODO confirm against load_data.
        dataset = Data(data, True)
        label_train = np.argmax(dataset.pred_train, axis=1)
        label_val = np.argmax(dataset.pred_val, axis=1)
        label_val = np.eye(self.num_classes)[label_val]
        # NOTE(review): duplicate of the label_train assignment above; train
        # labels stay class indices while val labels are one-hot -- confirm
        # the loss/metric handle both encodings.
        label_train = np.argmax(dataset.pred_train, axis=1)
        filepath = "{}/models/L2X-{}-{}-mask.hdf5".format(
            data, num_feats, 'original' if load_original else 'transfer')
        # mode='min' on val_acc: with the negative-xentropy objective the
        # best selector is the one that makes the masked model WORST, so the
        # lowest validation accuracy is kept -- NOTE(review): confirm intent.
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        callbacks_list = [checkpoint]
        model.fit(dataset.x_train,
                  label_train,
                  validation_data=(dataset.x_val, label_val),
                  callbacks=callbacks_list,
                  epochs=num_epoch,
                  batch_size=batch_size)
    else:
        # Inference path: expose only the selection-logits network and load
        # the checkpoint written by a previous training run.
        pred_model = Model(X_ph, logits_T)
        pred_model.compile(loss=negative_xentropy,
                           optimizer='RMSprop',
                           metrics=['acc'])
        weights_name = "{}/models/L2X-{}-{}-mask.hdf5".format(
            data, num_feats, 'original' if load_original else 'transfer')
        pred_model.load_weights(weights_name, by_name=True)
        self.pred_model = pred_model
help="Entity embedding dimensionality.") parser.add_argument("--rdim", type=int, default=200, nargs="?", help="Relation embedding dimensionality.") parser.add_argument("--cuda", type=bool, default=True, nargs="?", help="Whether to use cuda (GPU) or not (CPU).") parser.add_argument("--input_dropout", type=float, default=0.3, nargs="?", help="Input layer dropout.") parser.add_argument("--hidden_dropout1", type=float, default=0.4, nargs="?", help="Dropout after the first hidden layer.") parser.add_argument("--hidden_dropout2", type=float, default=0.5, nargs="?", help="Dropout after the second hidden layer.") parser.add_argument("--label_smoothing", type=float, default=0.1, nargs="?", help="Amount of label smoothing.") args = parser.parse_args() dataset = args.dataset data_dir = "data/%s/" % dataset torch.backends.cudnn.deterministic = True seed = 20 np.random.seed(seed) torch.manual_seed(seed) if torch.cuda.is_available: torch.cuda.manual_seed_all(seed) d = Data(data_dir=data_dir, reverse=False) experiment = Experiment(num_iterations=args.num_iterations, batch_size=args.batch_size, learning_rate=args.lr, decay_rate=args.dr, ent_vec_dim=args.edim, rel_vec_dim=args.rdim, cuda=args.cuda, input_dropout=args.input_dropout, hidden_dropout1=args.hidden_dropout1, hidden_dropout2=args.hidden_dropout2, label_smoothing=args.label_smoothing) experiment.train_and_eval()
# Pre-extracted tweet tensors and labels saved as numpy archives.
data_file = file_path + 'data/sports-600.npy'
label_file = file_path + 'data/labels.npy'
data = np.load(data_file)
label = np.load(label_file)
# load original tweets
# ---------------------------------------------------------------------------------
# Map each sport hashtag to its (1-based) class id.
sports_dic = {
    'basketball': 1,
    'hockey': 2,
    'baseball': 3,
    'tennis': 4,
    'volleyball': 5
}
sp_data = Data(sports_dic, file_path)
sp_df = sp_data.csv_df(['text'])  # load data
# Remove the label-revealing sport hashtags before preprocessing.
rm_hashtags = ['#' + s for s in sports_dic.keys()]
sp_data.pre_process(sp_df, rm_list=rm_hashtags)  # pre-process data
sp_df.drop(['tokenized'], axis=1, inplace=True)
# ---------------------------------------------------------------------------------
# set up lstm structure
n_classes = 5
hm_epochs = 20
batch_size = 50
# Sequence dimensions come straight from the loaded tensor:
# data is presumably (samples, timesteps, features) -- TODO confirm.
chunk_size = data.shape[2]
n_chunks = data.shape[1]
rnn_size = 300  # height x width