def main():
    train_file = os.path.join('../data', 'train.jsonl')
    df = data.get_train(train_file)
    # phase_1_evaluate_small_random_samples(df)
    train_set, _ = train_test_split(df, train_size=5, random_state=seed)
    # need to look into NEI ratios and balance
    phase_1(train_set)
def main():
    np.random.seed(0)
    inp, labels = get_train(True)
    N = len(labels)
    train_inp, train_labels = inp[:N * 4 // 5], labels[:N * 4 // 5]
    test_inp, test_labels = inp[N * 4 // 5:N], labels[N * 4 // 5:N]

    weights = initialize([2500, 1])
    # train_labels = np.zeros((2000))
    # weights = [np.ones((3,3)), np.ones((3,1))]
    # train_inp = np.ones((2,1,3))

    losses = []
    for i in range(10):
        out, train_pred = feed_forward(weights, train_inp)
        losses.append(get_error(train_labels, train_pred))
        weights = train_logreg(weights, out, train_labels, train_pred)

    print('***TRAIN***')
    evaluate(train_labels, train_pred)

    print('***TEST***')
    _, test_pred = feed_forward(weights, test_inp)
    evaluate(test_labels, test_pred)

    plot(losses, '0 hidden layers')
def main():
    np.random.seed(0)  # 0 42 50
    inp, labels = get_train(True)
    N = len(labels)
    train_inp, train_labels = inp[:N * 4 // 5], labels[:N * 4 // 5]
    test_inp, test_labels = inp[N * 4 // 5:N], labels[N * 4 // 5:N]

    for n_units in [30, 40, 50]:
        weights = initialize([2500, n_units, 1])
        # train_labels = np.zeros((2000))
        # weights = [np.ones((3,3)), np.ones((3,1))]
        # train_inp = np.ones((2,1,3))

        losses = []
        for i in range(150):
            out, train_pred = feed_forward(weights, train_inp)
            losses.append(get_error(train_labels, train_pred))
            weights = backpropagate(weights, out, train_labels, train_pred)

        print('***TRAIN***')
        evaluate(train_labels, train_pred)

        print('***TEST***')
        _, test_pred = feed_forward(weights, test_inp)
        evaluate(test_labels, test_pred)

        plot(losses, f'Hidden Units = {n_units}')
metadata_path_all = glob.glob(sys.argv[1] + "*")
print("shape of metadata_path_all")
print(len(metadata_path_all))

if len(sys.argv) >= 3:
    subset = sys.argv[2]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = 'test'

if subset == "test":
    X, mask, _, num_seq = data.get_test()
elif subset == "train":
    X_train, _, _, _, mask_train, _, num_seq = data.get_train()
elif subset == "train_valid":
    X_train, X_valid, _, _, mask_train, mask_valid, num_seq = data.get_train()
    X = np.concatenate((X_train[:-30], X_valid))
    mask = np.concatenate((mask_train[:-30], mask_valid))
else:
    _, X, _, _, _, mask, num_seq = data.get_train()

for metadata_path in metadata_path_all:
    print("Loading metadata file %s" % metadata_path)
    metadata = np.load(metadata_path)
    config_name = metadata['config_name']
def main():
    sym_y = T.imatrix('target_output')
    sym_mask = T.matrix('mask')
    sym_x = T.tensor3()
    TOL = 1e-5
    num_epochs = config.epochs
    batch_size = config.batch_size

    #### DATA ####
    # print "@@@@TESTING@@@@"
    # l_in = nn.layers.InputLayer(shape=(None, 700, 42))
    # l_dim_a = nn.layers.DimshuffleLayer(
    #     l_in, (0, 2, 1))
    # l_conv_a = nn.layers.Conv1DLayer(
    #     incoming=l_dim_a, num_filters=42, border_mode='same',
    #     filter_size=3, stride=1, nonlinearity=nn.nonlinearities.rectify)
    # l_dim_b = nn.layers.DimshuffleLayer(
    #     l_conv_a, (0, 2, 1))
    # out = nn.layers.get_output(l_dim_b, sym_x)
    # testvar = np.ones((128, 700, 42)).astype('float32')
    # print "@@@@EVAL@@@@"
    # john = out.eval({sym_x: testvar})
    # print("Johns shape")
    # print(john.shape)

    print("Building network ...")
    ##########################DEBUG##########################
    l_in, l_out = config.build_model()
    ##########################DEBUG##########################

    all_layers = nn.layers.get_all_layers(l_out)
    num_params = nn.layers.count_params(l_out)
    print("  number of parameters: %d" % num_params)
    print("  layer output shapes:")
    for layer in all_layers:
        name = layer.__class__.__name__
        print("    %s %s" % (name, nn.layers.get_output_shape(layer)))

    print("Creating cost function")
    # lasagne.layers.get_output produces a variable for the output of the net
    out_train = nn.layers.get_output(l_out, sym_x, deterministic=False)
    # testvar = np.ones((128, 700, 42)).astype('float32')
    # john = out_train.eval({sym_x: testvar})
    # print("@@@@@JOHN@@@@@")
    # print(john.shape)
    # print(john.reshape((-1, num_classes)).shape)

    print("Creating eval function")
    out_eval = nn.layers.get_output(l_out, sym_x, deterministic=True)

    probs_flat = out_train.reshape((-1, num_classes))
    lambda_reg = config.lambda_reg

    all_params = nn.layers.get_all_params(l_out)
    for i, p in enumerate(all_params):
        if p.ndim == 3:
            values = p.get_value()
            if side == 'right':
                values[..., int(values.shape[2] / 2.0 - 0.5):] = 0
                p.set_value(values)
                all_params[i] = p[..., :int(values.shape[2] / 2.0 - 0.5)]
            else:
                values[..., :int(values.shape[2] / 2.0 + 0.5)] = 0
                p.set_value(values)
                all_params[i] = p[..., int(values.shape[2] / 2.0 + 0.5):]

    params = [el for el in all_params if el.name == "W" or el.name == "gamma"]
    reg_term = sum(T.sum(p ** 2) for p in params)
    cost = T.nnet.categorical_crossentropy(T.clip(probs_flat, TOL, 1 - TOL), sym_y.flatten())
    cost = T.sum(cost * sym_mask.flatten()) / T.sum(sym_mask) + lambda_reg * reg_term

    # Retrieve all parameters from the network
    all_params = [el for el in all_params
                  if el.name == "W" or el.name == "gamma" or el.name == "beta"]

    # Setting the weights
    if hasattr(config, 'set_weights'):
        nn.layers.set_all_param_values(l_out, config.set_weights())

    # Compute SGD updates for training
    print("Computing updates ...")
    if hasattr(config, 'learning_rate_schedule'):
        learning_rate_schedule = config.learning_rate_schedule  # Import learning rate schedule
    else:
        learning_rate_schedule = {0: config.learning_rate}
    learning_rate = theano.shared(np.float32(learning_rate_schedule[0]))

    all_grads = T.grad(cost, all_params)
    cut_norm = config.cut_grad
    updates, norm_calc = nn.updates.total_norm_constraint(all_grads, max_norm=cut_norm, return_norm=True)

    if optimizer == "rmsprop":
        updates = nn.updates.rmsprop(updates, all_params, learning_rate)
    elif optimizer == "adadelta":
        updates = nn.updates.adadelta(updates, all_params, learning_rate)
    elif optimizer == "adagrad":
        updates = nn.updates.adagrad(updates, all_params, learning_rate)
    elif optimizer == "nag":
        momentum_schedule = config.momentum_schedule
        momentum = theano.shared(np.float32(momentum_schedule[0]))
        updates = nn.updates.nesterov_momentum(updates, all_params, learning_rate, momentum)
    else:
        sys.exit("please choose either <rmsprop/adagrad/adadelta/nag> in configfile")

    # Theano functions for training and computing cost
    print("config.batch_size %d" % batch_size)
    print("data.num_classes %d" % num_classes)
    if hasattr(config, 'build_model'):
        print("has build model")

    print("Compiling train ...")
    # Use this for training (see deterministic = False above)
    train = theano.function([sym_x, sym_y, sym_mask], [cost, out_train, norm_calc], updates=updates)

    print("Compiling eval ...")
    # use this for eval (deterministic = True + no updates)
    eval = theano.function([sym_x, sym_y, sym_mask], [cost, out_eval])

    # Start timers
    start_time = time.time()
    prev_time = start_time

    all_losses_train = []
    all_accuracy_train = []
    all_losses_eval_train = []
    all_losses_eval_valid = []
    all_losses_eval_test = []
    all_accuracy_eval_train = []
    all_accuracy_eval_valid = []
    all_accuracy_eval_test = []
    all_mean_norm = []

    import data
    X_train, X_valid, y_train, y_valid, mask_train, mask_valid, num_seq_train \
        = data.get_train()
    X_train, X_valid = X_train[..., 21:], X_valid[..., 21:]  # Only train with pssm scores
    print("y shape")
    print(y_valid.shape)
    print("X shape")
    print(X_valid.shape)

    # Binarize the labels: class 5 vs. everything else
    for i in range(y_train.shape[0]):
        for j in range(y_train.shape[1]):
            if y_train[i][j] == 5:
                y_train[i][j] = 1
            else:
                y_train[i][j] = 0
    for i in range(y_valid.shape[0]):
        for j in range(y_valid.shape[1]):
            if y_valid[i][j] == 5:
                y_valid[i][j] = 1
            else:
                y_valid[i][j] = 0

    # Start training
    for epoch in range(num_epochs):
        if (epoch % 10) == 0:
            print("Epoch %d of %d" % (epoch + 1, num_epochs))

        if epoch in learning_rate_schedule:
            lr = np.float32(learning_rate_schedule[epoch])
            print("  setting learning rate to %.7f" % lr)
            learning_rate.set_value(lr)

        if optimizer == "nag":
            if epoch in momentum_schedule:
                mu = np.float32(momentum_schedule[epoch])
                print("  setting momentum to %.7f" % mu)
                momentum.set_value(mu)

        # print "Shuffling data"
        seq_names = np.arange(0, num_seq_train)
        np.random.shuffle(seq_names)
        X_train = X_train[seq_names]
        y_train = y_train[seq_names]
        mask_train = mask_train[seq_names]

        num_batches = num_seq_train // batch_size
        losses = []
        preds = []
        norms = []
        for i in range(num_batches):
            idx = range(i * batch_size, (i + 1) * batch_size)
            x_batch = X_train[idx]
            y_batch = y_train[idx]
            mask_batch = mask_train[idx]
            loss, out, batch_norm = train(x_batch, y_batch, mask_batch)
            # print(batch_norm)
            norms.append(batch_norm)
            preds.append(out)
            losses.append(loss)

            # if ((i+1) % config.write_every_batch == 0) | (i == 0):
            #     if i == 0:
            #         start_place = 0
            #     else:
            #         start_place = i-config.write_every_batch
            #     print "Batch %d of %d" % (i + 1, num_batches)
            #     print "  curbatch training loss: %.5f" % np.mean(losses[start_place:(i+1)])
            #     print "  curbatch training acc: %.5f" % np.mean(accuracy[start_place:(i+1)])

        predictions = np.concatenate(preds, axis=0)
        loss_train = np.mean(losses)
        all_losses_train.append(loss_train)

        acc_train = utils.proteins_acc(predictions, y_train[0:num_batches * batch_size],
                                       mask_train[0:num_batches * batch_size])
        all_accuracy_train.append(acc_train)

        mean_norm = np.mean(norms)
        all_mean_norm.append(mean_norm)

        if 1 == 1:
            print("  average training loss: %.5f" % loss_train)
            print("  average training accuracy: %.5f" % acc_train)
            print("  average norm: %.5f" % mean_norm)

        sets = [
            # ('train', X_train, y_train, mask_train, all_losses_eval_train, all_accuracy_eval_train),
            ('valid', X_valid, y_valid, mask_valid, all_losses_eval_valid, all_accuracy_eval_valid)]
        for subset, X, y, mask, all_losses, all_accuracy in sets:
            print("  validating: %s loss" % subset)
            preds = []
            num_batches = np.size(X, axis=0) // config.batch_size
            for i in range(num_batches):  ## +1 to get the "rest"
                # print(i)
                idx = range(i * batch_size, (i + 1) * batch_size)
                x_batch = X[idx]
                y_batch = y[idx]
                mask_batch = mask[idx]
                loss, out = eval(x_batch, y_batch, mask_batch)
                preds.append(out)
                # acc = utils.proteins_acc(out, y_batch, mask_batch)
                losses.append(loss)
                # accuracy.append(acc)

            predictions = np.concatenate(preds, axis=0)
            # print "  pred"
            # print(predictions.shape)
            # print(predictions.dtype)
            loss_eval = np.mean(losses)
            all_losses.append(loss_eval)

            # acc_eval = np.mean(accuracy)
            acc_eval = utils.proteins_acc(predictions, y, mask)
            all_accuracy.append(acc_eval)

            print("  average evaluation loss (%s): %.5f" % (subset, loss_eval))
            print("  average evaluation accuracy (%s): %.5f" % (subset, acc_eval))

        now = time.time()
        time_since_start = now - start_time
        time_since_prev = now - prev_time
        prev_time = now
        est_time_left = time_since_prev * (num_epochs - epoch)
        eta = datetime.now() + timedelta(seconds=est_time_left)
        eta_str = eta.strftime("%c")
        print("  %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev))
        print("  estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str))
        print()

        if (epoch >= config.start_saving_at) and ((epoch % config.save_every) == 0):
            print("  saving parameters and metadata")
            with open((metadata_path + side + "-%d" % (epoch) + ".pkl"), 'wb') as f:
                pickle.dump({
                    'config_name': config_name,
                    'param_values': nn.layers.get_all_param_values(l_out),
                    'losses_train': all_losses_train,
                    'accuracy_train': all_accuracy_train,
                    'losses_eval_train': all_losses_eval_train,
                    'losses_eval_valid': all_losses_eval_valid,
                    'losses_eval_test': all_losses_eval_test,
                    'accuracy_eval_valid': all_accuracy_eval_valid,
                    'accuracy_eval_train': all_accuracy_eval_train,
                    'accuracy_eval_test': all_accuracy_eval_test,
                    'mean_norm': all_mean_norm,
                    'time_since_start': time_since_start,
                    'i': i,
                }, f, pickle.HIGHEST_PROTOCOL)
            print("  stored in %s" % metadata_path)
            print()
deltrnstim = np.hstack([np.roll(trnstim, d, 0) for d in delays])
delvalstim = np.hstack([np.roll(valstim, d, 0) for d in delays])

# sdeltrnstim = scipy.sparse.csr_matrix(deltrnstim)
# sdelvalstim = scipy.sparse.csr_matrix(delvalstim)

zs = lambda m: (m - m.mean(0)) / m.std(0)
sdeltrnstim = deltrnstim = np.nan_to_num(zs(deltrnstim))
sdelvalstim = delvalstim = np.nan_to_num(zs(delvalstim))

# Select some voxels
ebamask = cortex.get_roi_mask("MLfs", "20121210ML_auto1", roi="EBA")["EBA"] > 0

# Load training, test fMRI data
trndata = data.get_train(masked=ebamask)[:numtime]
valdata = data.get_val(masked=ebamask)

from ridge import _RidgeGridCV
ridge = _RidgeGridCV(alpha_min=1., alpha_max=1000., n_grid_points=5,
                     n_grid_refinements=2, cv=2)
ridge_coefs = ridge.fit(deltrnstim, trndata).coef_.T

Uridge, sridge, VridgeT = np.linalg.svd(ridge_coefs, full_matrices=False)

ranks = [1, 2, 5, 10]
results = []
corr_scores = []
r2_scores = []
        batch,
        NUM_SENTS,
        RETRIEVER,
        SELECTOR,
        oracle_doc_ret=isinstance(RETRIEVER, data.OracleDocRetriever),
    )


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"

    ###########################################################################
    # Setup the datasets/loaders                                              #
    ###########################################################################
    train = data.get_train(TRAIN_PATH)
    train, test = train_test_split(train)
    torch.cuda.empty_cache()

    train_dataset = data.FastDataset(train)
    test_dataset = data.TestDataset(test)

    train_loader = DataLoader(
        train_dataset,
        batch_size=64,
        shuffle=True,
        collate_fn=prepare,
        num_workers=0,  # doesn't work with more than 1
    )
    test_loader = DataLoader(
def main():
    sym_y = T.imatrix('target_output')
    sym_mask = T.matrix('mask')
    sym_x = T.tensor3()
    TOL = 1e-5
    num_epochs = config.epochs
    batch_size = config.batch_size

    #### DATA ####
    # print "@@@@TESTING@@@@"
    # l_in = nn.layers.InputLayer(shape=(None, 700, 42))
    # l_dim_a = nn.layers.DimshuffleLayer(
    #     l_in, (0, 2, 1))
    # l_conv_a = nn.layers.Conv1DLayer(
    #     incoming=l_dim_a, num_filters=42, border_mode='same',
    #     filter_size=3, stride=1, nonlinearity=nn.nonlinearities.rectify)
    # l_dim_b = nn.layers.DimshuffleLayer(
    #     l_conv_a, (0, 2, 1))
    # out = nn.layers.get_output(l_dim_b, sym_x)
    # testvar = np.ones((128, 700, 42)).astype('float32')
    # print "@@@@EVAL@@@@"
    # john = out.eval({sym_x: testvar})
    # print("Johns shape")
    # print(john.shape)

    print("Building network ...")
    ##########################DEBUG##########################
    l_in, l_out = config.build_model()
    ##########################DEBUG##########################

    all_layers = nn.layers.get_all_layers(l_out)
    num_params = nn.layers.count_params(l_out)
    print("  number of parameters: %d" % num_params)
    print("  layer output shapes:")
    for layer in all_layers:
        name = string.ljust(layer.__class__.__name__, 32)
        print("    %s %s" % (name, nn.layers.get_output_shape(layer)))

    print("Creating cost function")
    # lasagne.layers.get_output produces a variable for the output of the net
    out_train = nn.layers.get_output(l_out, sym_x, deterministic=False)
    # testvar = np.ones((128, 700, 42)).astype('float32')
    # john = out_train.eval({sym_x: testvar})
    # print("@@@@@JOHN@@@@@")
    # print(john.shape)
    # print(john.reshape((-1, num_classes)).shape)

    print("Creating eval function")
    out_eval = nn.layers.get_output(l_out, sym_x, deterministic=True)

    probs_flat = out_train.reshape((-1, num_classes))

    lambda_reg = config.lambda_reg
    params = nn.layers.get_all_params(l_out, regularizable=True)
    reg_term = sum(T.sum(p**2) for p in params)
    cost = T.nnet.categorical_crossentropy(T.clip(probs_flat, TOL, 1 - TOL), sym_y.flatten())
    cost = T.sum(cost * sym_mask.flatten()) / T.sum(sym_mask) + lambda_reg * reg_term

    # Retrieve all parameters from the network
    all_params = nn.layers.get_all_params(l_out, trainable=True)

    # Setting the weights
    if hasattr(config, 'set_weights'):
        nn.layers.set_all_param_values(l_out, config.set_weights())

    # Compute SGD updates for training
    print("Computing updates ...")
    if hasattr(config, 'learning_rate_schedule'):
        learning_rate_schedule = config.learning_rate_schedule  # Import learning rate schedule
    else:
        learning_rate_schedule = {0: config.learning_rate}
    learning_rate = theano.shared(np.float32(learning_rate_schedule[0]))

    all_grads = T.grad(cost, all_params)
    cut_norm = config.cut_grad
    updates, norm_calc = nn.updates.total_norm_constraint(all_grads, max_norm=cut_norm, return_norm=True)

    if optimizer == "rmsprop":
        updates = nn.updates.rmsprop(updates, all_params, learning_rate)
    elif optimizer == "adadelta":
        updates = nn.updates.adadelta(updates, all_params, learning_rate)
    elif optimizer == "adagrad":
        updates = nn.updates.adagrad(updates, all_params, learning_rate)
    elif optimizer == "nag":
        momentum_schedule = config.momentum_schedule
        momentum = theano.shared(np.float32(momentum_schedule[0]))
        updates = nn.updates.nesterov_momentum(updates, all_params, learning_rate, momentum)
    else:
        sys.exit("please choose either <rmsprop/adagrad/adadelta/nag> in configfile")

    # Theano functions for training and computing cost
    print "config.batch_size %d" % batch_size
    print "data.num_classes %d" % num_classes
    if hasattr(config, 'build_model'):
        print("has build model")

    print("Compiling train ...")
    # Use this for training (see deterministic = False above)
    train = theano.function([sym_x, sym_y, sym_mask], [cost, out_train, norm_calc], updates=updates)

    print("Compiling eval ...")
    # use this for eval (deterministic = True + no updates)
    eval = theano.function([sym_x, sym_y, sym_mask], [cost, out_eval])

    # Start timers
    start_time = time.time()
    prev_time = start_time

    all_losses_train = []
    all_accuracy_train = []
    all_losses_eval_train = []
    all_losses_eval_valid = []
    all_losses_eval_test = []
    all_accuracy_eval_train = []
    all_accuracy_eval_valid = []
    all_accuracy_eval_test = []
    all_mean_norm = []

    import data
    X_train, X_valid, y_train, y_valid, mask_train, mask_valid, num_seq_train \
        = data.get_train()
    print("y shape")
    print(y_valid.shape)
    print("X shape")
    print(X_valid.shape)

    # Start training
    for epoch in range(num_epochs):
        if (epoch % 10) == 0:
            print "Epoch %d of %d" % (epoch + 1, num_epochs)

        if epoch in learning_rate_schedule:
            lr = np.float32(learning_rate_schedule[epoch])
            print "  setting learning rate to %.7f" % lr
            learning_rate.set_value(lr)

        if optimizer == "nag":
            if epoch in momentum_schedule:
                mu = np.float32(momentum_schedule[epoch])
                print "  setting momentum to %.7f" % mu
                momentum.set_value(mu)

        # print "Shuffling data"
        seq_names = np.arange(0, num_seq_train)
        np.random.shuffle(seq_names)
        X_train = X_train[seq_names]
        y_train = y_train[seq_names]
        mask_train = mask_train[seq_names]

        num_batches = num_seq_train // batch_size
        losses = []
        preds = []
        norms = []
        for i in range(num_batches):
            idx = range(i * batch_size, (i + 1) * batch_size)
            x_batch = X_train[idx]
            y_batch = y_train[idx]
            mask_batch = mask_train[idx]
            loss, out, batch_norm = train(x_batch, y_batch, mask_batch)
            # print(batch_norm)
            norms.append(batch_norm)
            preds.append(out)
            losses.append(loss)

            # if ((i+1) % config.write_every_batch == 0) | (i == 0):
            #     if i == 0:
            #         start_place = 0
            #     else:
            #         start_place = i-config.write_every_batch
            #     print "Batch %d of %d" % (i + 1, num_batches)
            #     print "  curbatch training loss: %.5f" % np.mean(losses[start_place:(i+1)])
            #     print "  curbatch training acc: %.5f" % np.mean(accuracy[start_place:(i+1)])

        predictions = np.concatenate(preds, axis=0)
        loss_train = np.mean(losses)
        all_losses_train.append(loss_train)

        acc_train = utils.proteins_acc(predictions, y_train[0:num_batches * batch_size],
                                       mask_train[0:num_batches * batch_size])
        all_accuracy_train.append(acc_train)

        mean_norm = np.mean(norms)
        all_mean_norm.append(mean_norm)

        if 1 == 1:
            print "  average training loss: %.5f" % loss_train
            print "  average training accuracy: %.5f" % acc_train
            print "  average norm: %.5f" % mean_norm

        sets = [
            # ('train', X_train, y_train, mask_train, all_losses_eval_train, all_accuracy_eval_train),
            ('valid', X_valid, y_valid, mask_valid, all_losses_eval_valid, all_accuracy_eval_valid)]
        for subset, X, y, mask, all_losses, all_accuracy in sets:
            print "  validating: %s loss" % subset
            preds = []
            num_batches = np.size(X, axis=0) // config.batch_size
            for i in range(num_batches):  ## +1 to get the "rest"
                # print(i)
                idx = range(i * batch_size, (i + 1) * batch_size)
                x_batch = X[idx]
                y_batch = y[idx]
                mask_batch = mask[idx]
                loss, out = eval(x_batch, y_batch, mask_batch)
                preds.append(out)
                # acc = utils.proteins_acc(out, y_batch, mask_batch)
                losses.append(loss)
                # accuracy.append(acc)

            predictions = np.concatenate(preds, axis=0)
            # print "  pred"
            # print(predictions.shape)
            # print(predictions.dtype)
            loss_eval = np.mean(losses)
            all_losses.append(loss_eval)

            # acc_eval = np.mean(accuracy)
            acc_eval = utils.proteins_acc(predictions, y, mask)
            all_accuracy.append(acc_eval)

            print "  average evaluation loss (%s): %.5f" % (subset, loss_eval)
            print "  average evaluation accuracy (%s): %.5f" % (subset, acc_eval)

        now = time.time()
        time_since_start = now - start_time
        time_since_prev = now - prev_time
        prev_time = now
        est_time_left = time_since_start * num_epochs
        eta = datetime.now() + timedelta(seconds=est_time_left)
        eta_str = eta.strftime("%c")
        print "  %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev)
        print "  estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str)
        print

        if (epoch >= config.start_saving_at) and ((epoch % config.save_every) == 0):
            print "  saving parameters and metadata"
            with open((metadata_path + "-%d" % (epoch) + ".pkl"), 'w') as f:
                pickle.dump({
                    'config_name': config_name,
                    'param_values': nn.layers.get_all_param_values(l_out),
                    'losses_train': all_losses_train,
                    'accuracy_train': all_accuracy_train,
                    'losses_eval_train': all_losses_eval_train,
                    'losses_eval_valid': all_losses_eval_valid,
                    'losses_eval_test': all_losses_eval_test,
                    'accuracy_eval_valid': all_accuracy_eval_valid,
                    'accuracy_eval_train': all_accuracy_eval_train,
                    'accuracy_eval_test': all_accuracy_eval_test,
                    'mean_norm': all_mean_norm,
                    'time_since_start': time_since_start,
                    'i': i,
                }, f, pickle.HIGHEST_PROTOCOL)
            print "  stored in %s" % metadata_path
            print
# -*- coding: utf-8 -*-
import numpy as np
import data

# %% Simple mean-based model

# define the logmean
def logmean(x):
    return np.exp(np.mean(np.log(x + 1))) - 1

# load the training data
print('load training data')
df_train = data.get_train(nrows=10000)

# compute the means for different configurations
print('compute means')
mean_tab = df_train.groupby('ProductId').agg({'AdjDemand': logmean})
mean_tab2 = df_train.groupby(['ProductId', 'ClientId']).agg({'AdjDemand': logmean})
global_mean = logmean(df_train['AdjDemand'])

# generate estimation for each ProductId-ClientId pair
def estimate(key):
    key = tuple(key)  # key needs to be a tuple
    try:
        est = mean_tab2.at[key, 'AdjDemand']
    except KeyError:
        try:
def main():
    # unzip raw_content file
    os.system("unzip zipRawcontent; mkdir data; mv raw_content data/")
    os.system("cp trainfile data/train.tsv")
    os.system("cp testfile data/test.tsv")
    os.system("mkdir ../generated ")

    data = get_train() + get_test()

    f = file('extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        # parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()
                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)
            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue
            data['meta-' + prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >>f, json.dumps(data)

    f.close()
from multi_task_ridge import _multi_target_ridge as multi_task_ridge

beta_old = betas_indep
# beta_old = np.load("/auto/k8/meickenberg/cache/thresh_-1.00_0.80_mt_ridge_with_corr_gamma_300.00.npz")['beta']

import os
cachedir = os.environ["DEFAULT_CACHE_DIR"]

from delayed import make_delayed
from ridge import _multi_corr_score

X_train_raw = data.get_wordnet(mode="train")
X_train = make_delayed(X_train_raw, [2, 3, 4])

X_val_raw = data.get_wordnet(mode="val")
X_val = make_delayed(X_val_raw, [2, 3, 4])

Y_train = data.get_train()
Y_val = data.get_val()

print "Starting loop"
import time
for gamma in [100, 500, 1000, 5000, 10000]:
    t = time.time()
    print "evaluating gamma=%f" % gamma
    beta_new = multi_task_ridge(X_train, Y_train, M=M_matrix, gamma=gamma,
                                A=A, alpha=1., warmstart=beta_old, maxiter=61)
    y_pred_new = X_val.dot(beta_new)
""" import numpy as np import data #%% Simple Mean-based Modell # define the logmean def logmean(x): return np.exp(np.mean(np.log(x + 1))) - 1 # load the training data print('load training data') df_train = data.get_train(nrows=10000) # compute the means for different configurations print('compute means') mean_tab = df_train.groupby('ProductId').agg({'AdjDemand': logmean}) mean_tab2 = df_train.groupby(['ProductId', 'ClientId']).agg({'AdjDemand': logmean}) global_mean = logmean(df_train['AdjDemand']) # generate estimation for each ProductID-ClientID-pair def estimate(key): key = tuple(key) # key needs to be a tuple try: est = mean_tab2.at[key, 'AdjDemand'] except KeyError:
numtime = 1000

# load stimuli
trnstim = data.get_wordnet("train")
valstim = data.get_wordnet("val")[90:]

delays = [2, 3, 4]
deltrnstim = np.hstack([np.roll(trnstim, d, 0) for d in delays])
delvalstim = np.hstack([np.roll(valstim, d, 0) for d in delays])
sdeltrnstim = scipy.sparse.csr_matrix(deltrnstim)

ebamask = cortex.get_roi_mask("MLfs", "20121210ML_auto1", roi="EBA")["EBA"] > 0

trndata = data.get_train(masked=ebamask)

# use first block for noise covariance estimation
valdata_repeats = data.get_val(masked=ebamask, repeats=True)[:90]
# use second and third block for evaluation
valdata = data.get_val(masked=ebamask)[90:]

# zscore it?
valdata_repeats = ((valdata_repeats - valdata_repeats.mean(0)[np.newaxis, ...])
                   / valdata_repeats.std(0)[np.newaxis, ...])
valdata_noise = valdata_repeats - valdata_repeats.mean(-1)[..., np.newaxis]

# fit Independent Ridge Regression
def main():
    data = get_train() + get_test()

    f = file('generated/extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        # parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()
                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)
            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue
            data['meta-' + prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >> f, json.dumps(data)

    f.close()
outdir = "../data/clean/" index = data.index_wiki('../data/wiki-pages') for file in tqdm(index.keys()): wiki = data.get_wiki(file) lines = wiki["lines"].apply( lambda l: "<SPLIT>".join(data.clean_article(l))) wiki["text"] = lines wiki = wiki.drop("lines", axis=1).reset_index() new_file = outdir + file.split("/")[-1] wiki.to_json(new_file, orient="records", lines=True) ########################################################################### # Setup # ########################################################################### # Load the data train = data.get_train("../data/train.jsonl") train = train.explode("evidence").reset_index() train, test = train_test_split(train) # Load the model embedder = ret.SentEmbed("distilroberta-base-msmarco-v2") # Build the dataset objects and loaders train_dataset = data.SentenceDataset(train, embedder, "../data/wiki.db", 4) test_dataset = data.SentenceDataset(test, embedder, "../data/wiki.db", 4) train_loader = DataLoader( train_dataset, batch_size=64, shuffle=True, collate_fn=train_dataset.collate,
sdeltrnstim = deltrnstim = np.nan_to_num(zs(deltrnstim))
sdelvalstim = delvalstim = np.nan_to_num(zs(delvalstim))

# Select some voxels
cort_mask = cortex.get_cortical_mask("MLfs", "20121210ML_auto1", "thick")
# rois = ["V1", "V2", "V3"]
rois = ["V1"]
masks = [cortex.get_roi_mask("MLfs", "20121210ML_auto1", roi=roi)[roi] > 0
         for roi in rois]
roimask = reduce(lambda x, y: (x + y), masks)
wardmask = cort_mask - roimask

# Load training, test fMRI data
trndata_roi = np.nan_to_num(data.get_train(masked=roimask)[:numtime])
trndata_ward = np.nan_to_num(data.get_train(masked=wardmask)[:numtime])

connectivity = image.grid_to_graph(n_x=wardmask.shape[0], n_y=wardmask.shape[1],
                                   n_z=wardmask.shape[2], mask=wardmask)
ward = WardAgglomeration(n_clusters=numclusters, connectivity=connectivity,
                         memory='nilearn_cache')
ward.fit(trndata_ward)
labels = ward.labels_
trndata_collapsed = np.array([trndata_ward[:, labels == i].mean(1)
                              for i in range(numclusters)])

trndata = np.hstack((trndata_roi, trndata_collapsed.T))
valdata = data.get_val(masked=roimask)