def main(
    model_dir=None,
    train_dir=None,
    dev_dir=None,
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=256,
    nr_examples=-1,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        model_dir.mkdir(parents=True, exist_ok=True)
        (model_dir / "vocab").mkdir(parents=True, exist_ok=True)
    if train_dir is None or dev_dir is None:
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype="int32")
        dev_labels = numpy.asarray(dev_labels, dtype="int32")
        lstm = train(
            train_texts,
            train_labels,
            dev_texts,
            dev_labels,
            {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
            {"dropout": dropout, "lr": learn_rate},
            {},
            nb_epoch=nb_epoch,
            batch_size=batch_size,
            # Only build the vocab path when an output directory was given.
            persist_vocab_path=model_dir / "vocab" if model_dir is not None else None,
        )
        weights = lstm.get_weights()
        if model_dir is not None:
            with (model_dir / "model").open("wb") as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / "config.json").open("w") as file_:
                file_.write(lstm.to_json())
def main(
    model_dir=None,
    train_dir=None,
    dev_dir=None,
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=256,
    nr_examples=-1):  # Training params
    if model_dir is not None:
        print('Using model_dir: ' + model_dir)
        model_dir = pathlib.Path(model_dir)
    if train_dir is None or dev_dir is None:
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        train_labels = np.asarray(train_labels, dtype='int32')
        dev_labels = np.asarray(dev_labels, dtype='int32')
        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
                     {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 1},
                     {'dropout': dropout, 'lr': learn_rate},
                     {},
                     nb_epoch=nb_epoch, batch_size=batch_size)
        weights = lstm.get_weights()
        if model_dir is not None:
            with (model_dir / 'model').open('wb') as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / 'config.json').open('w') as file_:
                file_.write(lstm.to_json())
def main(
    model_dir=None,
    train_dir=None,
    dev_dir=None,
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=256,
    nr_examples=-1,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if train_dir is None or dev_dir is None:
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype="int32")
        dev_labels = numpy.asarray(dev_labels, dtype="int32")
        lstm = train(
            train_texts,
            train_labels,
            dev_texts,
            dev_labels,
            {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
            {"dropout": dropout, "lr": learn_rate},
            {},
            nb_epoch=nb_epoch,
            batch_size=batch_size,
        )
        weights = lstm.get_weights()
        if model_dir is not None:
            with (model_dir / "model").open("wb") as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / "config.json").open("w") as file_:
                file_.write(lstm.to_json())
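# Hedged entry-point sketch (not part of the variants above): the upstream
# spaCy example exposes main() on the command line via plac, but a direct
# call with keyword arguments behaves the same. "model" is a hypothetical
# output directory chosen for illustration.
if __name__ == "__main__":
    main(model_dir="model", nb_epoch=5, batch_size=256)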
def main(
    model_dir='model_lstm',
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=1,
    batch_size=256,
    nr_examples=-1,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        model_dir.mkdir(parents=True, exist_ok=True)  # default 'model_lstm' may not exist yet
    # In this variant the data loading was moved out of main(); train() works
    # on preparsed docs, e.g.:
    #   print("Loading Train and Test Data.....")
    #   with open('dev.txt', 'rb') as f:
    #       dev_docs = pickle.load(f)
    #   with open('train.txt', 'rb') as f:
    #       train_docs = pickle.load(f)
    #   print('Loading Complete..')
    lstm = train(
        {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
        {"dropout": dropout, "lr": learn_rate},
        {},
        nb_epoch=nb_epoch,
        batch_size=batch_size,
    )
    weights = lstm.get_weights()
    if model_dir is not None:
        with (model_dir / "model").open("wb") as file_:
            pickle.dump(weights[1:], file_)
        with (model_dir / "config.json").open("w") as file_:
            file_.write(lstm.to_json())
def main():
    model_dir = pathlib.Path("./model/")
    nr_hidden = 64
    max_length = 100  # Shape
    dropout = 0.5
    learn_rate = 0.02
    nb_epoch = 10
    batch_size = 256
    nr_class = 4
    print("Read data")
    sentences, labels = load_corpus()
    sentences = remove_stopwords(sentences)
    size = len(sentences)
    # 60/20/20 split into train, dev and test sets
    train_texts, train_labels = sentences[:int(size * 0.6)], labels[:int(size * 0.6)]
    dev_texts, dev_labels = (sentences[int(size * 0.6):int(size * 0.8)],
                             labels[int(size * 0.6):int(size * 0.8)])
    test_texts, test_labels = sentences[int(size * 0.8):], labels[int(size * 0.8):]
    train_labels = numpy.asarray(train_labels, dtype='int32')
    dev_labels = numpy.asarray(dev_labels, dtype='int32')
    lstm = train(train_texts, train_labels, dev_texts, dev_labels,
                 {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': nr_class},
                 {'dropout': dropout, 'lr': learn_rate},
                 {},
                 nb_epoch=nb_epoch, batch_size=batch_size)
    print("Model has been trained!")
    weights = lstm.get_weights()
    if model_dir is not None:
        with (model_dir / 'model').open('wb') as file_:
            pickle.dump(weights[1:], file_)
        with (model_dir / 'config.json').open('w') as file_:
            file_.write(lstm.to_json())
    nb_correct = evaluate(model_dir, test_texts, test_labels)
    print("Percentage of correct predictions: " + str(nb_correct))
def main(
    model_dir=None,
    train_dir='test_plag_data.txt',
    dev_dir='dev_plag_data.txt',
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=100,
    nr_examples=-1):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if is_runtime:
        dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        print("Read data")
        train_labels, train_texts, train_texts2 = read_data(train_dir, limit=nr_examples)
        dev_labels, dev_texts, dev_texts2 = read_data(dev_dir, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype='int32')
        dev_labels = numpy.asarray(dev_labels, dtype='int32')
        lstm = train(train_texts, train_texts2, train_labels, dev_texts, dev_labels,
                     {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 1},
                     {'dropout': dropout, 'lr': learn_rate},
                     {},
                     nb_epoch=nb_epoch, batch_size=batch_size)
        weights = lstm.get_weights()
        if model_dir is not None:
            with (model_dir / 'model').open('wb') as file_:
                pickle.dump(weights[1:], file_)
            # to_json() returns a str, so the config must be opened in text mode
            with (model_dir / 'config.json').open('w') as file_:
                file_.write(lstm.to_json())
def main(
    model_dir='/Users/masha/Data/Model',
    train_dir='/Users/masha/Data/Train',
    nr_hidden=128,
    max_length=100,
    dropout=0.2,
    learn_rate=0.0001,
    nb_epoch=150,
    batch_size=64,
    # nr_examples=-1,
    training_portion=0.8,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if train_dir is None:
        print('Please provide a training directory!')
        return  # nothing to train on
    train_texts, train_labels, val_texts, val_labels = read_data(
        train_dir, training_portion)
    model = train_model(train_texts, train_labels, val_texts, val_labels,
                        {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
                        {"dropout": dropout, "lr": learn_rate},
                        {},
                        nb_epoch=nb_epoch, batch_size=batch_size)
    weights = model.get_weights()
    if model_dir is not None:
        with (model_dir / "model").open("wb") as file_:
            pickle.dump(weights[1:], file_)
        with (model_dir / "config.json").open("w") as file_:
            file_.write(model.to_json())
def train(
    model_dir, train_dir, dev_dir,  # fs locations
    model_type='lstm',
    feature_shapes=None,  # neural network type(s): overall or defined per feature via shapes
    nr_examples=-1, max_entries=-1,  # restrict data to a subset
    image_embedding_function=None,  # image data, e.g. enable by providing a function like 'vgg16.VGG16'
    dropout=0.5, learn_rate=0.001, setting=None,  # general NN config (via individual parameters or setting dict)
    nb_epoch=100, batch_size=100, early_stopping_window=5,  # training params
    nb_threads=1, nb_threads_parse=10,  # performance: resource restrictions
):
    global cache
    if nb_threads > 0:
        # restrict the number of tensorflow threads
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=nb_threads,
                                      inter_op_parallelism_threads=nb_threads)
        backend.set_session(backend.tf.Session(config=session_conf))
    assert dev_dir is not None, 'dev_dir is not set'
    dev_dir = pathlib.Path(dev_dir)
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        model_dir.mkdir(parents=True, exist_ok=True)
        logger_fh = logging.FileHandler(model_dir / 'log.txt')
        logger_fh.setLevel(logging.DEBUG)
        logger_fh.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        logger.addHandler(logger_fh)
    else:
        logger_fh = None
    assert train_dir is not None, 'train_dir is not set'
    train_dir = pathlib.Path(train_dir)

    # some defaults...
    if feature_shapes is None or feature_shapes == '':
        lstm_shapes = {
            'targetParagraphs': {'model': create_lstm.__name__, 'max_length': 500, 'nr_hidden': 64},
            'postText': {'model': create_lstm.__name__, 'max_length': 50, 'nr_hidden': 30},
            'targetTitle': {'model': create_lstm.__name__, 'max_length': 50, 'nr_hidden': 30},
            'targetKeywords': {'model': create_lstm.__name__, 'max_length': 100, 'nr_hidden': 30},
            'targetDescription': {'model': create_lstm.__name__, 'max_length': 100, 'nr_hidden': 30},
        }
        # Alternative: one combined LSTM over all text features, plus an image CNN:
        # lstm_shapes = {
        #     'postText,targetTitle,targetDescription,targetParagraphs,targetKeywords':
        #         {'model': create_lstm.__name__, 'max_length': 500, 'nr_hidden': 128},
        #     # 'postMedia': {'model': create_cnn_image.__name__, 'input_shape': None},
        # }
        # max_length, filter_length, nb_filter
        cnn_shapes = {
            'targetParagraphs': {'model': create_cnn.__name__, 'max_length': 500, 'filter_length': 10, 'nb_filter': 200},
            'postText': {'model': create_cnn.__name__, 'max_length': 50, 'filter_length': 3, 'nb_filter': 50},
            'targetTitle': {'model': create_cnn.__name__, 'max_length': 50, 'filter_length': 2, 'nb_filter': 50},
            'targetKeywords': {'model': create_cnn.__name__, 'max_length': 100, 'filter_length': 1, 'nb_filter': 50},
            'targetDescription': {'model': create_cnn.__name__, 'max_length': 100, 'filter_length': 5, 'nb_filter': 50},
        }
        if model_type == 'lstm':
            logger.info('use lstm model')
            feature_shapes = lstm_shapes
        elif model_type == 'cnn':
            logger.info('use cnn model')
            feature_shapes = cnn_shapes
        # elif model_type == 'cnn2':
        #     logger.info('use cnn2 model')
        #     feature_shapes = cnn_shapes
        # elif model_type == 'lstm_stacked':
        #     logger.info('use lstm_stacked model')
        #     feature_shapes = lstm_shapes
        else:
            raise ValueError('unknown model_type=%s. use one of: %s'
                             % (model_type, ' '.join(['lstm', 'cnn'])))
    else:
        feature_shapes = json.loads(feature_shapes)

    logger.info("Read data")
    train_records, _ = read_data(train_dir, limit=nr_examples, dont_shuffle=True)
    dev_records, _ = read_data(dev_dir, limit=nr_examples, dont_shuffle=True)

    if 'nlp' not in cache:
        cache['nlp'] = get_nlp()

    use_images = (image_embedding_function is not None
                  and image_embedding_function.strip() != '')
    if use_images:
        logger.debug('use image data')
    else:
        # drop the image feature if present; pop() avoids a KeyError when the
        # shapes (e.g. the defaults above) do not contain it
        feature_shapes.pop(IMAGE_KEY, None)

    # preprocessing is expensive, so cache the feature matrices per configuration
    cache['train_X_and_labels'] = cache.get('train_X_and_labels', {})
    preprocessing_cache_key = json.dumps(
        (feature_shapes, max_entries, image_embedding_function, str(train_dir), str(dev_dir)),
        sort_keys=True)
    if preprocessing_cache_key not in cache['train_X_and_labels']:
        cache['train_X_and_labels'][preprocessing_cache_key] = records_to_features(
            records=train_records, nlp=cache['nlp'], shapes=feature_shapes,
            nb_threads_parse=nb_threads_parse, max_entries=max_entries,
            key_image=IMAGE_KEY, data_dir=train_dir,
            image_model_function_name=image_embedding_function)
    train_X, train_labels = cache['train_X_and_labels'][preprocessing_cache_key]

    cache['dev_X_and_labels'] = cache.get('dev_X_and_labels', {})
    if preprocessing_cache_key not in cache['dev_X_and_labels']:
        cache['dev_X_and_labels'][preprocessing_cache_key] = records_to_features(
            records=dev_records, nlp=cache['nlp'], shapes=feature_shapes,
            nb_threads_parse=nb_threads_parse, max_entries=max_entries,
            key_image=IMAGE_KEY, data_dir=dev_dir,
            image_model_function_name=image_embedding_function)
    dev_X, dev_labels = cache['dev_X_and_labels'][preprocessing_cache_key]

    if setting is None or setting == '':
        # default setting
        setting = {'final_layers': [512]}
    else:
        setting = json.loads(setting)
    # set dropout and learning rate if not already in setting
    setting['dropout'] = setting.get('dropout', None) or dropout
    setting['learn_rate'] = setting.get('learn_rate', None) or learn_rate

    # set image data settings if not given
    if use_images:
        if IMAGE_KEY not in feature_shapes:
            feature_shapes[IMAGE_KEY] = {'model': create_cnn_image.__name__, 'layers': [128]}
        feature_shapes[IMAGE_KEY]['input_shape'] = train_X[IMAGE_KEY].shape[1:]
        # add "image available" flag
        feature_shapes[IMAGE_FLAG_KEY] = {
            'model': create_identity.__name__,
            'input_shape': train_X[IMAGE_FLAG_KEY].shape[1:],
        }

    logger.info('use setting: %s' % json.dumps(setting).replace(' ', ''))
    logger.info('use feature_shapes: %s' % json.dumps(feature_shapes).replace(' ', ''))
    model = create_model(embedding_weights=get_embeddings(cache['nlp'].vocab),
                         feature_shapes=feature_shapes, setting=setting)

    metric = 'val_mean_squared_error'
    metric_best_func = min
    early_stopping_callback = EarlyStopping(monitor=metric, min_delta=1e-4,
                                            patience=early_stopping_window, verbose=1)
    callbacks = [early_stopping_callback]
    if model_dir is not None:
        callbacks.append(ModelCheckpoint(filepath=str(model_dir / 'model_weights'),
                                         monitor=metric, verbose=0, save_best_only=True,
                                         save_weights_only=True, mode='auto', period=1))
        callbacks.append(CSVLogger(str(model_dir / "log.tsv"), append=True, separator='\t'))

    history_callback = model.fit(as_list(train_X), train_labels,
                                 validation_data=(as_list(dev_X), dev_labels),
                                 epochs=nb_epoch, batch_size=batch_size,
                                 callbacks=callbacks)
    metric_history = history_callback.history[metric]

    if model_dir is not None:
        logger.info('remove embeddings from model...')
        # reload the best weights and drop the embedding layer before saving:
        # the embeddings are already included in the spacy model
        model.load_weights(str(model_dir / 'model_weights'))
        weights = model.get_weights()
        with (model_dir / 'model_weights').open('wb') as file_:
            pickle.dump(weights[1:], file_)
        # save the model config together with the image embedding function name
        with (model_dir / 'model_config.json').open('w') as file_:
            config_dict = json.loads(model.to_json())
            config_dict[IMAGE_EMBEDDING_FUNCTION_KEY] = image_embedding_function
            json.dump(config_dict, file_)

    if logger_fh is not None:
        logger.removeHandler(logger_fh)
    return (metric,
            metric_best_func(metric_history),
            early_stopping_callback.stopped_epoch + 1
            if early_stopping_callback.stopped_epoch > 0 else nb_epoch)
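# Hedged usage sketch for train() above: it returns the monitored metric name,
# the best validation score, and the number of epochs actually run (early
# stopping may cut training short). The directory paths are hypothetical.
metric, best_score, epochs_run = train(
    model_dir='runs/clickbait_lstm',    # checkpoints, logs and config land here
    train_dir='data/clickbait17-train',
    dev_dir='data/clickbait17-validation',
    model_type='lstm',                  # or 'cnn' to use the convolutional shapes
    nb_epoch=100,
    early_stopping_window=5,
)
print('%s: best=%.4f after %d epochs' % (metric, best_score, epochs_run))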
def main(
    model_dir=None,
    train_dir=None,
    dev_dir=None,
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=100,
    nr_examples=-1):  # Training params
    df = load_document()
    data_set = DataSet.from_np_array(df['review'],
                                     numpy.asarray(df['sentimens'], dtype='int32'),
                                     class_names=[1, 2, 3, 4, 5],
                                     shuffle=True)
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    lstm = train(data_set.x_train, data_set.y_train, data_set.x_val, data_set.y_val,
                 {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 1},
                 {'dropout': dropout, 'lr': learn_rate},
                 {},
                 nb_epoch=nb_epoch, batch_size=batch_size)
    weights = lstm.get_weights()
    if model_dir is not None:
        with (model_dir / 'model').open('wb') as file_:
            pickle.dump(weights[1:], file_)
        # to_json() returns a str, so the config must be opened in text mode
        with (model_dir / 'config.json').open('w') as file_:
            file_.write(lstm.to_json())
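# Hedged runtime-side sketch showing why every variant above pickles
# weights[1:]: the first entry in get_weights() is the embedding table, which
# is not persisted because it can be rebuilt from the spaCy vocab at load
# time. get_embeddings() is the helper from the upstream spaCy example;
# pathlib and pickle are the same module-level imports the snippets rely on.
def load_model(model_dir, nlp):
    from keras.models import model_from_json
    model_dir = pathlib.Path(model_dir)
    # architecture comes from the saved JSON config
    with (model_dir / 'config.json').open() as file_:
        model = model_from_json(file_.read())
    # non-embedding weights come from the pickled list
    with (model_dir / 'model').open('rb') as file_:
        lstm_weights = pickle.load(file_)
    # prepend the embedding matrix rebuilt from the spaCy vocab
    embeddings = get_embeddings(nlp.vocab)
    model.set_weights([embeddings] + lstm_weights)
    return model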