def main(args): if len(args) < 1: sys.stderr.write("Error - one required arguments: <data directory>\n") sys.exit(-1) working_dir = args[0] print("Reading data...") Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir) Y_array = np.array(Y) # print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape))) num_examples, dimension = X_array.shape num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet) num_y_examples = len(Y) assert num_examples == num_y_examples Y_adj, indices = ctk_io.flatten_outputs(Y_array) train_x, valid_x, train_y, valid_y = train_test_split(X_array, Y_array, test_size=0.2, random_state=18) optim = RandomSearch( lambda: get_random_config(), lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs), ) best_config = optim.optimize() print("Best config: %s" % best_config)
def main(args): if len(args) < 1: sys.stderr.write("Error - one required arguments: <data directory>\n") sys.exit(-1) working_dir = args[0] print("Reading data...") Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data( working_dir) Y_array = np.array(Y) #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape))) num_examples, dimension = X_array.shape num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet) num_y_examples = len(Y) assert num_examples == num_y_examples Y_adj, indices = ctk_io.flatten_outputs(Y_array) train_x, valid_x, train_y, valid_y = train_test_split(X_array, Y_array, test_size=0.2, random_state=18) optim = RandomSearch( lambda: get_random_config(), lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs)) best_config = optim.optimize() print("Best config: %s" % best_config)
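# The random-search driver above assumes two helpers that are defined elsewhere in these
# scripts: get_random_config(), which samples one hyperparameter configuration, and
# run_one_eval(), whose first two arguments are supplied by RandomSearch (assumed here to be
# a training-epoch budget and the sampled configuration) and which returns a score for
# RandomSearch to compare. A minimal sketch of what such helpers might look like -- the
# parameter names, search ranges, and the use of nn_models.get_cnn_model() are illustrative
# assumptions, not the original settings:
import random

def get_random_config():
    config = {}
    config['embed_dim'] = random.choice((25, 50, 100, 200))
    config['num_filters'] = random.choice((64, 128, 256))
    config['filter_widths'] = random.choice(((2, 3), (2, 3, 4), (3, 4, 5)))
    config['fc_width'] = random.choice((64, 128, 256))
    config['batch_size'] = random.choice((32, 64, 128))
    return config

def run_one_eval(epochs, config, train_x, train_y, valid_x, valid_y, vocab_size, num_outputs):
    # Build a CNN for this configuration (get_cnn_model is assumed to return a compiled
    # model, as in the extend-model script further below), train it briefly, and report the
    # final validation loss as the figure of merit.
    model = nn_models.get_cnn_model(train_x.shape, vocab_size, num_outputs,
                                    conv_layers=(config['num_filters'],),
                                    fc_layers=(config['fc_width'],),
                                    embed_dim=config['embed_dim'],
                                    filter_widths=config['filter_widths'])
    history = model.fit(train_x, train_y,
                        nb_epoch=epochs,
                        batch_size=config['batch_size'],
                        verbose=0,
                        validation_data=(valid_x, valid_y))
    return history.history['val_loss'][-1]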
def main(args): if len(args) < 1: sys.stderr.write("Error - one required arguments: <data directory>\n") sys.exit(-1) working_dir = args[0] print("Reading data...") Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir) X_segments, dimensions = split_entity_data(X_array, feature_alphabet) Y_array = np.array(Y) Y_adj, indices = ctk_io.flatten_outputs(Y_array) num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet) num_y_examples = len(Y) train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(X_segments[0], X_segments[1], X_segments[2], Y_array, test_size=0.2, random_state=18) train_x = [train_x0, train_x1, train_x2] valid_x = [valid_x0, valid_x1, valid_x2] optim = RandomSearch(lambda: get_random_config(), lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs ) ) best_config = optim.optimize() print("Best config: %s" % best_config)
def main(args): if len(args) < 1: sys.stderr.write("Error - one required arguments: <data directory>\n") sys.exit(-1) working_dir = args[0] print("Reading data...") Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data( working_dir) X_segments, dimensions = split_entity_data(X_array, feature_alphabet) Y_array = np.array(Y) Y_adj, indices = ctk_io.flatten_outputs(Y_array) num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet) num_y_examples = len(Y) train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split( X_segments[0], X_segments[1], X_segments[2], Y_array, test_size=0.2, random_state=18) train_x = [train_x0, train_x1, train_x2] valid_x = [valid_x0, valid_x1, valid_x2] optim = RandomSearch( lambda: get_random_config(), lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs)) best_config = optim.optimize() print("Best config: %s" % best_config)
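# The entity-segment variant above additionally relies on split_entity_data(), defined
# elsewhere; from its use it must return a list of three parallel input matrices (one per
# model branch) together with their widths. A runnable stand-in, assuming a simple split of
# each padded sequence into three equal-width chunks -- the real function presumably splits
# around the annotated entity mentions instead:
def split_entity_data(X_array, feature_alphabet):
    num_examples, seq_len = X_array.shape
    cut = seq_len // 3
    segments = [X_array[:, :cut], X_array[:, cut:2 * cut], X_array[:, 2 * cut:]]
    dimensions = [seg.shape[1] for seg in segments]
    return segments, dimensions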
def main(args): if len(args) < 1: sys.stderr.write("Error - one required argument: <data directory>\n") sys.exit(-1) working_dir = args[0] (train_y, label_alphabet, train_x, feats_alphabet) = ctk_io.read_token_sequence_data(working_dir) init_vectors = None #used for pre-trained embeddings #load embeddings file embedingFile = '/Users/chenlin/Programming/ctakesWorkspace/neural-temporal/src/main/resources/org/apache/ctakes/temporal/thyme_word2vec_timex_50.vec' weights = ctk_io.read_embeddings(embedingFile, feats_alphabet) # if len(args) > 1 and best_config['pretrain'] == True: # weights = ctk_io.read_embeddings(args[1], feats_alphabet) # elif best_config['pretrain'] and len(args) == 1: # sys.stderr.write("Error: Pretrain specified but no weights file given!") # sys.exit(-1) # turn x and y into numpy array among other things maxlen = max([len(seq) for seq in train_x]) outcomes = set(train_y) classes = len(outcomes) train_x = pad_sequences(train_x, maxlen=maxlen) train_y = to_categorical(np.array(train_y), classes) #pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb")) #pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb")) #test_x = pad_sequences(test_x, maxlen=maxlen) #test_y = to_categorical(np.array(test_y), classes) print 'train_x shape:', train_x.shape print 'train_y shape:', train_y.shape branches = [] # models to be merged train_xs = [] # train x for each branch #test_xs = [] # test x for each branch filtlens = "1,2,3,4,5" for filter_len in filtlens.split(','): branch = Sequential() branch.add(Embedding(len(feats_alphabet), weights.shape[1], input_length=maxlen, weights=[weights], trainable = False)) branch.add(Convolution1D(nb_filter=200, filter_length=int(filter_len), border_mode='valid', activation='relu', subsample_length=1)) branch.add(MaxPooling1D(pool_length=2)) branch.add(Flatten()) branches.append(branch) train_xs.append(train_x) #test_xs.append(test_x) branch = Sequential() branch.add(Embedding(len(feats_alphabet), weights.shape[1], input_length=maxlen, weights=[weights], trainable = False)) branch.add(Convolution1D(nb_filter=200, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) branch.add(Convolution1D(nb_filter=200, filter_length=3, border_mode='same', activation='relu', subsample_length=1)) branch.add(MaxPooling1D(pool_length=2)) branch.add(Flatten()) branches.append(branch) train_xs.append(train_x) model = Sequential() model.add(Merge(branches, mode='concat')) model.add(Dense(250))#cfg.getint('cnn', 'hidden'))) model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout'))) model.add(Activation('relu')) model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout'))) model.add(Dense(classes)) model.add(Activation('softmax')) optimizer = RMSprop(lr=0.0001,#cfg.getfloat('cnn', 'learnrt'), rho=0.9, epsilon=1e-08) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) model.fit(train_xs, train_y, nb_epoch=20,#cfg.getint('cnn', 'epochs'), batch_size=50,#cfg.getint('cnn', 'batches'), verbose=1, validation_split=0.1, class_weight=None) model.summary() json_string = model.to_json() open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string) model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True) fn = open(os.path.join(working_dir, 'alphabets.pkl'), 'w') pickle.dump( (feats_alphabet, label_alphabet, maxlen), fn) fn.close() with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip: myzip.write(os.path.join(working_dir, 'model_0.json'), 
'model_0.json') myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5') myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl') sys.exit(0)
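# Because the merged model above is trained on train_xs -- a list holding one copy of the
# padded input per branch (five single-convolution branches plus one stacked branch) --
# prediction has to pass the same list-of-inputs structure. A hypothetical helper sketching
# that; the name, the default branch count, and the argmax decoding are illustrative, and
# label_lookup is assumed to map output indices back to label strings (the inverse of
# label_alphabet):
def classify(model, label_lookup, maxlen, token_seqs, num_branches=6):
    x = pad_sequences(token_seqs, maxlen=maxlen)
    probs = model.predict([x] * num_branches, verbose=0)    # same matrix fed to every branch
    return [label_lookup[int(ind)] for ind in probs.argmax(axis=1)]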
def main(args): if len(args) < 1: sys.stderr.write("Error - one required argument: <working_dir>\n") sys.exit(-1) working_dir = args[0] ### Extract existing model: print("Extracting existing model") with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip: myzip.extract('model.h5', working_dir) myzip.extract('alphabets.pkl', working_dir) (feature_alphabet, label_alphabet) = pickle.load( open(os.path.join(working_dir, 'alphabets.pkl'), 'r' ) ) label_lookup = {val:key for (key,val) in label_alphabet.iteritems()} model = load_model(os.path.join(working_dir, "model.h5")) #config = model.get_config() #model = Container.from_config(config) ## Find the model params needed by CNN method and get a cnn with one extra FC layer: # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs, fc_layers=layers, # embed_dim=embed_dim, filter_widths=width) print("Building new model with extra layer") convs = [] dense = [] for layer in model.layers: if 'convolution' in layer.name: convs.append(layer) if 'dense' in layer.name: dense.append(layer) filters = [x.filter_length for x in convs] nb_filters = (convs[0].nb_filter,) fc_widths = [x.output_dim for x in dense] fc_widths.append(fc_widths[-1] //2) new_model = nn_models.get_cnn_model(model.layers[0].input_shape, model.layers[1].input_dim, model.layers[-1].output_dim, conv_layers=nb_filters, fc_layers=fc_widths, embed_dim=model.layers[1].output_dim, filter_widths=filters ) ## Just so i don't accidentally try to refer to this later del model ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters: #new_model.layers[-1].name = "NewOutput" ## Load as many weights as possible taking advantage of consistently named layers: new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True) ## Re-load data and retrain model: print("Reading data...") Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir) Y_array = np.array(Y) #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape))) num_examples, dimension = X_array.shape num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet) num_y_examples = len(Y) Y_adj, indices = ctk_io.flatten_outputs(Y_array) out_counts = Y_adj.sum(0) stopper = nn_models.get_early_stopper() print("Retraining model") new_model.fit(X_array, Y_adj, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.2) #, #callbacks=[stopper]) #, #class_weight=class_weights) new_model.summary() new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True) with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip: myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5') myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
def main(args): if len(args) < 1: sys.stderr.write("Error - one required argument: <working_dir>\n") sys.exit(-1) working_dir = args[0] ### Extract existing model: print("Extracting existing model") with ZipFile(os.path.join(working_dir, 'script.model'), 'r') as myzip: myzip.extract('model.h5', working_dir) myzip.extract('alphabets.pkl', working_dir) (feature_alphabet, label_alphabet) = pickle.load( open(os.path.join(working_dir, 'alphabets.pkl'), 'r')) label_lookup = {val: key for (key, val) in label_alphabet.iteritems()} model = load_model(os.path.join(working_dir, "model.h5")) #config = model.get_config() #model = Container.from_config(config) ## Find the model params needed by CNN method and get a cnn with one extra FC layer: # nn_models.get_cnn_model(X_array.shape, len(feature_alphabet), num_outputs, conv_layers=convs, fc_layers=layers, # embed_dim=embed_dim, filter_widths=width) print("Building new model with extra layer") convs = [] dense = [] for layer in model.layers: if 'convolution' in layer.name: convs.append(layer) if 'dense' in layer.name: dense.append(layer) filters = [x.filter_length for x in convs] nb_filters = (convs[0].nb_filter, ) fc_widths = [x.output_dim for x in dense] #fc_widths.append(fc_widths[-1] //2) fc_widths.append(fc_widths[-1]) new_model = nn_models.get_cnn_model(model.layers[0].input_shape, model.layers[1].input_dim, model.layers[-1].output_dim, conv_layers=nb_filters, fc_layers=fc_widths, embed_dim=model.layers[1].output_dim, filter_widths=filters) ## Just so i don't accidentally try to refer to this later del model ## Change the name of the output layer so that we don't try to read those weights in -- we will have a different number of parameters: #new_model.layers[-1].name = "NewOutput" ## Load as many weights as possible taking advantage of consistently named layers: new_model.load_weights(os.path.join(working_dir, "model.h5"), by_name=True) ## Re-load data and retrain model: print("Reading data...") Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data( working_dir) Y_array = np.array(Y) #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape))) num_examples, dimension = X_array.shape num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet) num_y_examples = len(Y) Y_adj, indices = ctk_io.flatten_outputs(Y_array) out_counts = Y_adj.sum(0) stopper = nn_models.get_early_stopper() print("Retraining model") new_model.fit(X_array, Y_adj, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.2) #, #callbacks=[stopper]) #, #class_weight=class_weights) new_model.summary() new_model.save(os.path.join(working_dir, 'new_model.h5'), overwrite=True) with ZipFile(os.path.join(working_dir, 'extended.model'), 'w') as myzip: myzip.write(os.path.join(working_dir, 'new_model.h5'), 'model.h5') myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
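# The commented-out output-layer renaming in the two extend-model functions above matters
# when the extended model is retrained on a task with a different label set:
# load_weights(by_name=True) copies weights only for layers whose names match the saved
# file, so giving the new softmax layer a fresh name keeps the old, incompatibly shaped
# output weights from being loaded. A hypothetical helper sketching that case (name and
# signature are illustrative):
def load_weights_except_output(model, weights_path):
    model.layers[-1].name = "NewOutput"              # no layer with this name exists in the saved file
    model.load_weights(weights_path, by_name=True)   # all other layers load by matching name
    return model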
def main(args): if len(args) < 1: sys.stderr.write("Error - one required argument: <data directory>\n") sys.exit(-1) working_dir = args[0] (train_y, label_alphabet, train_x, feats_alphabet) = ctk_io.read_token_sequence_data(working_dir) init_vectors = None #used for pre-trained embeddings # turn x and y into numpy array among other things maxlen = max([len(seq) for seq in train_x]) outcomes = set(train_y) classes = len(outcomes) train_x = pad_sequences(train_x, maxlen=maxlen) train_y = to_categorical(np.array(train_y), classes) #pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb")) #pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb")) #test_x = pad_sequences(test_x, maxlen=maxlen) #test_y = to_categorical(np.array(test_y), classes) print 'train_x shape:', train_x.shape print 'train_y shape:', train_y.shape #branches = [] # models to be merged #train_xs = [] # train x for each branch #test_xs = [] # test x for each branch model = resnet(maxlen, feats_alphabet, classes) optimizer = RMSprop( lr=0.0001, #cfg.getfloat('cnn', 'learnrt'), rho=0.9, epsilon=1e-08) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) #{'0':'accuracy'})# model.fit( train_x, train_y, nb_epoch=10, #cfg.getint('cnn', 'epochs'), batch_size=50, #cfg.getint('cnn', 'batches'), verbose=1, validation_split=0.1, class_weight=None) model.summary() json_string = model.to_json() open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string) model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True) fn = open(os.path.join(working_dir, 'alphabets.pkl'), 'w') pickle.dump((feats_alphabet, label_alphabet, maxlen), fn) fn.close() with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip: myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json') myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5') myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl') sys.exit(0)
def main(args): if len(args) < 1: sys.stderr.write("Error - one required argument: <data directory>\n") sys.exit(-1) working_dir = args[0] (train_y, label_alphabet, train_x, feats_alphabet) = ctk_io.read_token_sequence_data(working_dir) init_vectors = None #used for pre-trained embeddings # turn x and y into numpy array among other things maxlen = max([len(seq) for seq in train_x]) outcomes = set(train_y) classes = len(outcomes) train_x = pad_sequences(train_x, maxlen=maxlen) train_y = to_categorical(np.array(train_y), classes) #pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb")) #pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb")) #test_x = pad_sequences(test_x, maxlen=maxlen) #test_y = to_categorical(np.array(test_y), classes) print 'train_x shape:', train_x.shape print 'train_y shape:', train_y.shape #branches = [] # models to be merged #train_xs = [] # train x for each branch #test_xs = [] # test x for each branch model = resnet(maxlen, feats_alphabet, classes) optimizer = RMSprop(lr=0.0001,#cfg.getfloat('cnn', 'learnrt'), rho=0.9, epsilon=1e-08) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics= ['accuracy'])#{'0':'accuracy'})# model.fit(train_x, train_y, nb_epoch=10,#cfg.getint('cnn', 'epochs'), batch_size=50,#cfg.getint('cnn', 'batches'), verbose=1, validation_split=0.1, class_weight=None) model.summary() json_string = model.to_json() open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string) model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True) fn = open(os.path.join(working_dir, 'alphabets.pkl'), 'w') pickle.dump( (feats_alphabet, label_alphabet, maxlen), fn) fn.close() with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip: myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json') myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5') myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl') sys.exit(0)
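# These training scripts are driven by the cTAKES pipeline with the data directory as their
# single argument; run standalone, the usual entry point would look like this (a minimal
# sketch, assuming the script is executed directly):
if __name__ == "__main__":
    main(sys.argv[1:])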