def main(args):
    """Run a random hyperparameter search over token-sequence data.

    args[0] must be the data directory to read from.
    """
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)
    data_dir = args[0]

    print("Reading data...")
    labels, label_alphabet, x_all, feature_alphabet = \
        ctk_io.read_token_sequence_data(data_dir)
    y_all = np.array(labels)

    n_examples, _dim = x_all.shape
    # A binary task needs a single output unit; otherwise one per label.
    n_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    assert n_examples == len(labels)

    # NOTE(review): flattened outputs are computed but never used below —
    # retained in case flatten_outputs has side effects; confirm and remove.
    y_flat, flat_indices = ctk_io.flatten_outputs(y_all)

    x_train, x_valid, y_train, y_valid = train_test_split(
        x_all, y_all, test_size=0.2, random_state=18)

    searcher = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, x_train, y_train, x_valid, y_valid,
                                  len(feature_alphabet), n_outputs))
    best_config = searcher.optimize()
    print("Best config: %s" % best_config)
def main(args):
    """Random-search hyperparameter configs for a token-sequence classifier.

    args[0] must be the data directory to read from.
    """
    if not args:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]

    print("Reading data...")
    outcomes, label_alphabet, feats, feature_alphabet = \
        ctk_io.read_token_sequence_data(working_dir)
    outcome_arr = np.array(outcomes)

    total_rows, _ = feats.shape
    # One sigmoid output suffices when the label set is binary.
    out_dim = len(label_alphabet) if len(label_alphabet) != 2 else 1
    assert total_rows == len(outcomes)

    # Flattened outputs kept from the original flow (result unused here).
    flat_y, flat_idx = ctk_io.flatten_outputs(outcome_arr)

    tr_x, va_x, tr_y, va_y = train_test_split(
        feats, outcome_arr, test_size=0.2, random_state=18)

    optimizer = RandomSearch(
        lambda: get_random_config(),
        lambda a, b: run_one_eval(a, b, tr_x, tr_y, va_x, va_y,
                                  len(feature_alphabet), out_dim))
    best_config = optimizer.optimize()
    print("Best config: %s" % best_config)
def main(args):
    """Random hyperparameter search over entity-segmented sequence data.

    args[0] must be the data directory to read from.
    """
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]

    print("Reading data...")
    raw_y, label_alphabet, x_all, feature_alphabet = \
        ctk_io.read_token_sequence_data(working_dir)
    # Split the token matrix into three entity-relative segments.
    segments, dims = split_entity_data(x_all, feature_alphabet)
    y_arr = np.array(raw_y)
    flat_y, flat_idx = ctk_io.flatten_outputs(y_arr)

    # Binary label sets collapse to one output unit.
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(raw_y)

    # train_test_split interleaves (train, valid) pairs for each input array,
    # with the label splits last.
    splits = train_test_split(segments[0], segments[1], segments[2], y_arr,
                              test_size=0.2, random_state=18)
    train_x = [splits[0], splits[2], splits[4]]
    valid_x = [splits[1], splits[3], splits[5]]
    train_y, valid_y = splits[6], splits[7]

    searcher = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = searcher.optimize()
    print("Best config: %s" % best_config)
def main(args):
    """Search random configs on entity-segmented token-sequence data.

    args[0] must be the data directory to read from.
    """
    if not args:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]

    print("Reading data...")
    labels, label_alphabet, x_matrix, feature_alphabet = \
        ctk_io.read_token_sequence_data(working_dir)
    x_segments, seg_dims = split_entity_data(x_matrix, feature_alphabet)
    y_matrix = np.array(labels)
    y_flat, y_indices = ctk_io.flatten_outputs(y_matrix)

    # Two-class problems use a single output; otherwise one per label.
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(labels)

    (tr0, va0, tr1, va1, tr2, va2, tr_y, va_y) = train_test_split(
        x_segments[0], x_segments[1], x_segments[2], y_matrix,
        test_size=0.2, random_state=18)
    train_inputs = [tr0, tr1, tr2]
    valid_inputs = [va0, va1, va2]

    searcher = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_inputs, tr_y,
                                  valid_inputs, va_y,
                                  len(feature_alphabet), num_outputs))
    best_config = searcher.optimize()
    print("Best config: %s" % best_config)
def main(args):
    """Random search over model configs using pre-trained word embeddings.

    args[0] must be a directory containing 'training-data.liblinear' and
    'mimic.txt' (word2vec-style text embeddings with a header line).
    """
    # np.random.seed(1337)
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)

    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))
    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    # Load the pre-trained embedding file; the header line carries the
    # vocabulary size and embedding dimensionality.
    # FIX: use a context manager so the file is closed even on error.
    embeddings_index = {}
    with open(os.path.join(working_dir, 'mimic.txt')) as f:
        values = f.readline().split()
        EMBEDDING_WORDNUM = int(values[0])
        EMBEDDING_DIM = int(values[1])
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('load embeddings for %s=%s words.' % (len(embeddings_index), EMBEDDING_WORDNUM))

    # prepare embedding matrix; words not found in the embedding index
    # remain all-zeros rows
    nb_words = len(provider.word2int)
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in provider.word2int.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # FIX: these two lines were Python 2 print statements
    # (`print 'train_x shape:', ...`), a syntax error under Python 3.
    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    # train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.1, random_state=18)
    optim = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, maxlen,
                                  len(provider.word2int), classes,
                                  embedding_matrix, EMBEDDING_DIM))
    best_config = optim.optimize()
    print("Best config: %s" % best_config)
    sys.exit(0)
def main(args):
    """Random hyperparameter search for a multitask token-sequence model.

    args[0] is the data directory; optional args[1] is an embeddings file.
    Writes the best config found to <working_dir>/model_0.config.
    """
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]

    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = \
        ctk_io.read_multitask_token_sequence_data(working_dir)

    start_ind = feature_alphabet[start_symbol]
    end_ind = feature_alphabet[end_symbol]

    train_x, valid_x, train_y, valid_y = train_test_split(
        X, Y, test_size=0.2, random_state=7)
    # X_distance = get_distance_features(X, start_ind, end_ind)

    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))
    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    weights = None
    if len(args) > 1:
        # FIX: was `feats_alphabet`, which is undefined in this function
        # (NameError whenever a weights file was supplied); the alphabet
        # returned above is `feature_alphabet`.
        weights = ctk_io.read_embeddings(args[1], feature_alphabet)

    train_y_adj, train_indices = ctk_io.flatten_outputs(train_y)
    valid_y_adj, valid_indices = ctk_io.flatten_outputs(valid_y)
    if not train_indices == valid_indices:
        print("Error: training and valid sets have different index sets -- may be missing some labels in one set or the other")
        sys.exit(-1)

    output_dims_list = []
    train_y_list = []
    valid_y_list = []
    indices = train_indices
    for i in range(len(indices) - 1):
        label_dims = indices[i + 1] - indices[i]
        output_dims_list.append(label_dims)
        # Width-1 labels become 1-d vectors; wider ones keep 2-d one-hot slices.
        if label_dims == 1:
            train_y_list.append(train_y_adj[:, indices[i]])
            valid_y_list.append(valid_y_adj[:, indices[i]])
        else:
            train_y_list.append(train_y_adj[:, indices[i]:indices[i + 1]])
            valid_y_list.append(valid_y_adj[:, indices[i]:indices[i + 1]])
        print("Dimensions of label %d are %s" % (i, str(train_y_list[-1].shape)))

    ## pass a function to the search that it uses to get a random config
    ## and a function that it will get an eval given (e)pochs and (c)onfig file:
    optim = RandomSearch(
        lambda: get_random_config(weights),
        lambda e, c: run_one_eval(e, c, train_x, train_y_list, valid_x,
                                  valid_y_list, len(feature_alphabet),
                                  output_dims_list, weights))
    best_config = optim.optimize(max_iter=27)

    # FIX: close the config file deterministically instead of leaking the
    # handle from a bare open(...).write(...).
    with open(os.path.join(working_dir, 'model_0.config'), 'w') as config_file:
        config_file.write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))
def main(args):
    """Search configs for a BIO sequence tagger, retrain the best, and package it.

    args[0] is the data directory; optional args[1] is an embeddings file.
    Produces model_0.{config,json,h5}, alphabets.pkl, and script.model (zip).
    """
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory> [(optional) weights file]\n")
        sys.exit(-1)
    working_dir = args[0]

    (labels, label_alphabet, feats, feats_alphabet) = \
        ctk_io.read_bio_sequence_data(working_dir)

    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feats_alphabet)

    # Pad all sequences to the longest one so inputs form a dense matrix.
    maxlen = max([len(seq) for seq in feats])
    all_x = pad_sequences(feats, maxlen=maxlen)
    all_y = ctk_io.expand_labels(pad_sequences(labels, maxlen=maxlen), label_alphabet)

    train_x, valid_x, train_y, valid_y = train_test_split(
        all_x, all_y, test_size=0.2, random_state=7)

    optim = RandomSearch(
        lambda: get_random_config(weights),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feats_alphabet), len(label_alphabet),
                                  weights))
    best_config = optim.optimize()

    # FIX: use context managers so file handles are always closed (the
    # originals leaked handles via bare open(...).write(...)).
    with open(os.path.join(working_dir, 'model_0.config'), 'w') as config_file:
        config_file.write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))

    if not best_config['pretrain']:
        weights = None

    model = get_model_for_config(train_x.shape, len(feats_alphabet),
                                 len(label_alphabet), best_config,
                                 weights=weights)
    # Retrain the winning configuration on the full dataset before export.
    model.fit(all_x, all_y, nb_epoch=40, batch_size=best_config['batch_size'],
              verbose=1, validation_split=0.1)
    model.summary()

    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as json_file:
        json_file.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

    # FIX: pickle requires a binary-mode handle; text mode ('w') raises
    # TypeError under Python 3.
    with open(os.path.join(working_dir, 'alphabets.pkl'), 'wb') as fn:
        pickle.dump((feats_alphabet, label_alphabet), fn)

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
# Resolve data locations relative to the DATA_ROOT environment variable
# (KeyError here means the variable is unset).
base = os.environ['DATA_ROOT']
train_dir = os.path.join(base, cfg.get('data', 'train'))
code_file = os.path.join(base, cfg.get('data', 'codes'))

# Build the dataset reader with config-driven filtering thresholds.
# NOTE(review): use_cuis=False presumably selects raw tokens over UMLS
# CUIs — confirm against DatasetProvider.
provider = dataset.DatasetProvider(train_dir,
                                   code_file,
                                   cfg.getint('args', 'min_token_freq'),
                                   cfg.getint('args', 'max_tokens_in_file'),
                                   cfg.getint('args', 'min_examples_per_code'),
                                   use_cuis=False)

# Token sequences are kept ordered rather than collapsed to sets.
x, y = provider.load(tokens_as_set=False)

# Pad every sequence to the longest one so x becomes a dense matrix.
maxlen = max([len(seq) for seq in x])
x = pad_sequences(x, maxlen=maxlen)
y = np.array(y)

print('x shape:', x.shape)
print('y shape:', y.shape)
print('max seq len:', maxlen)
print('vocab size:', x.max() + 1)
print('number of features:', len(provider.token2int))
print('number of labels:', len(provider.code2int))

# Random-search the CNN's hyperparameters over the padded data.
model = CnnCodePredictionModel()
search = RandomSearch(model, x, y)
best_config = search.optimize(max_iter=64)
print('best config:', best_config)
def main(args):
    """Optimize, retrain, and package a BIO sequence-tagging model.

    args[0] is the data directory; optional args[1] is an embeddings file.
    Produces model_0.{config,json,h5}, alphabets.pkl, and script.model (zip).
    """
    if len(args) < 1:
        sys.stderr.write(
            "Error - one required argument: <data directory> [(optional) weights file]\n"
        )
        sys.exit(-1)
    working_dir = args[0]

    (labels, label_alphabet, feats, feats_alphabet) = \
        ctk_io.read_bio_sequence_data(working_dir)

    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feats_alphabet)

    # Pad sequences to a common length so the data forms dense matrices.
    maxlen = max([len(seq) for seq in feats])
    all_x = pad_sequences(feats, maxlen=maxlen)
    all_y = ctk_io.expand_labels(pad_sequences(labels, maxlen=maxlen),
                                 label_alphabet)

    train_x, valid_x, train_y, valid_y = train_test_split(all_x,
                                                          all_y,
                                                          test_size=0.2,
                                                          random_state=7)

    optim = RandomSearch(
        lambda: get_random_config(weights),
        lambda x, y: run_one_eval(
            x, y, train_x, train_y, valid_x, valid_y, len(feats_alphabet),
            len(label_alphabet), weights))
    best_config = optim.optimize()

    # FIX: close handles deterministically with context managers (the
    # original leaked them via bare open(...).write(...)).
    with open(os.path.join(working_dir, 'model_0.config'), 'w') as config_file:
        config_file.write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))

    if not best_config['pretrain']:
        weights = None

    model = get_model_for_config(train_x.shape,
                                 len(feats_alphabet),
                                 len(label_alphabet),
                                 best_config,
                                 weights=weights)
    # Retrain the best configuration on all data before export.
    model.fit(all_x,
              all_y,
              nb_epoch=40,
              batch_size=best_config['batch_size'],
              verbose=1,
              validation_split=0.1)
    model.summary()

    json_string = model.to_json()
    with open(os.path.join(working_dir, 'model_0.json'), 'w') as json_file:
        json_file.write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

    # FIX: pickle needs a binary-mode handle; text mode ('w') raises
    # TypeError under Python 3.
    with open(os.path.join(working_dir, 'alphabets.pkl'), 'wb') as fn:
        pickle.dump((feats_alphabet, label_alphabet), fn)

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
def main(args):
    """Random search for a multitask token-sequence model's hyperparameters.

    args[0] is the data directory; optional args[1] is an embeddings file.
    Writes the best config found to <working_dir>/model_0.config.
    """
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]

    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = \
        ctk_io.read_multitask_token_sequence_data(working_dir)

    start_ind = feature_alphabet[start_symbol]
    end_ind = feature_alphabet[end_symbol]

    train_x, valid_x, train_y, valid_y = train_test_split(X,
                                                          Y,
                                                          test_size=0.2,
                                                          random_state=7)
    # X_distance = get_distance_features(X, start_ind, end_ind)

    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))
    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    weights = None
    if len(args) > 1:
        # FIX: was `feats_alphabet`, undefined in this function (NameError
        # whenever a weights file was supplied); the alphabet returned
        # above is `feature_alphabet`.
        weights = ctk_io.read_embeddings(args[1], feature_alphabet)

    train_y_adj, train_indices = ctk_io.flatten_outputs(train_y)
    valid_y_adj, valid_indices = ctk_io.flatten_outputs(valid_y)
    if not train_indices == valid_indices:
        print(
            "Error: training and valid sets have different index sets -- may be missing some labels in one set or the other"
        )
        sys.exit(-1)

    output_dims_list = []
    train_y_list = []
    valid_y_list = []
    indices = train_indices
    for i in range(len(indices) - 1):
        label_dims = indices[i + 1] - indices[i]
        output_dims_list.append(label_dims)
        # Width-1 labels become 1-d vectors; wider ones keep 2-d one-hot slices.
        if label_dims == 1:
            train_y_list.append(train_y_adj[:, indices[i]])
            valid_y_list.append(valid_y_adj[:, indices[i]])
        else:
            train_y_list.append(train_y_adj[:, indices[i]:indices[i + 1]])
            valid_y_list.append(valid_y_adj[:, indices[i]:indices[i + 1]])
        print("Dimensions of label %d are %s" %
              (i, str(train_y_list[-1].shape)))

    ## pass a function to the search that it uses to get a random config
    ## and a function that it will get an eval given (e)pochs and (c)onfig file:
    optim = RandomSearch(
        lambda: get_random_config(weights),
        lambda e, c: run_one_eval(
            e, c, train_x, train_y_list, valid_x, valid_y_list,
            len(feature_alphabet), output_dims_list, weights))
    best_config = optim.optimize(max_iter=27)

    # FIX: close the config file deterministically instead of leaking the
    # handle from a bare open(...).write(...).
    with open(os.path.join(working_dir, 'model_0.config'), 'w') as config_file:
        config_file.write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))