def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm):
    '''
    generic_predict()

    Predict labels with a trained model; works for both prose and nonprose.

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all".
    @param tokenized_sents. A list of sentences, where each sentence is
                            tokenized into words.
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model (CRF or keras LSTM).
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    '''
    # If there is nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print('\tnothing to predict ' + p_or_n)
        return []

    print('\tvectorizing words ' + p_or_n)

    if use_lstm:
        # Vectorize tokenized sentences: map each token to its numeric id,
        # falling back to the out-of-vocabulary id for unseen words.
        X = []
        for sent in tokenized_sents:
            id_seq = []
            for w in sent:
                if w in vocab:
                    id_seq.append(vocab[w])
                else:
                    id_seq.append(vocab['oov'])
            X.append(id_seq)
    else:
        # Vectorize with hand-crafted features, flattening the nested
        # sentence structure for the vectorizer and restoring it after.
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print('\tpredicting labels ' + p_or_n)

    # Predict labels with the appropriate model
    if use_lstm:
        predictions = keras_ml.predict(clf, X)
    else:
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions
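
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The vocabulary and classifier names below
# are hypothetical placeholders for objects produced by the matching training
# code; only the call shape of generic_predict() is taken from this module.
#
#     sents  = [['Patient', 'denies', 'chest', 'pain', '.']]
#     labels = generic_predict('prose', sents, feat_vocab, crf_clf,
#                              use_lstm=False)
#     # 'labels' mirrors 'sents': one predicted tag per token per sentence.
# ---------------------------------------------------------------------------
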
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, hyperparams):
    '''
    generic_predict()

    Predict labels with a trained model; works for both prose and nonprose.

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all".
    @param tokenized_sents. A list of sentences, where each sentence is
                            tokenized into words.
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model.
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    @param hyperparams.     Extra settings consumed by the LSTM path.
    '''
    # use_lstm = self._use_lstm
    if use_lstm:
        # parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters = hyperparams  # assumed source of the LSTM settings dict
        parameters['use_pretrained_model'] = True
        # model_folder = "./models/NN_models"
        predictions = []
        sys.stdout.write('\n use_lstm \n')
        dataset = Exp.Dataset()

        # The NeuroNER-style loader requires labels even at prediction time,
        # so fabricate an all-'O' label sequence shaped like the input.
        fictional_labels = copy.deepcopy(tokenized_sents)
        for idx, x in enumerate(fictional_labels):
            for val_id, value in enumerate(x):
                fictional_labels[idx][val_id] = 'O'

        # Package the input as a 'deploy' split for the dataset loader.
        Datasets_tokens = {}
        Datasets_labels = {}
        Datasets_tokens['deploy'] = tokenized_sents
        Datasets_labels['deploy'] = fictional_labels

        # NOTE: tokens_to_vec and pretrained_dataset come from the enclosing
        # module scope.
        token_to_vector = dataset.load_dataset(Datasets_tokens,
                                               Datasets_labels,
                                               "",
                                               parameters,
                                               token_to_vector=tokens_to_vec,
                                               pretrained_dataset=pretrained_dataset)
        print(dataset.token_indices.keys())

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False

        dataset.update_dataset("", ['deploy'], Datasets_tokens, Datasets_labels)
        del Datasets_tokens
        del Datasets_labels

        # model = current_model
        model = entity_model.EntityLSTM(dataset, parameters)

        # Scratch directories for the CoNLL-style prediction output
        os.mkdir(parameters['conll_like_result_folder'])
        test_temp  = os.path.join(parameters['conll_like_result_folder'], 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/')
        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        sess = tf.Session()
        with sess.as_default():
            # model = entity_model.EntityLSTM(dataset, parameters)
            transition_params_trained = model.restore_from_pretrained_model(
                parameters, dataset, sess,
                token_to_vector=token_to_vector,
                pretrained_dataset=pretrained_dataset)
            del token_to_vector
            predictions = training_predict_LSTM.prediction_step(
                sess, dataset, "deploy", model, 0,
                parameters['conll_like_result_folder'],
                transition_params_trained)
        sess.close()
        tf.reset_default_graph()
        shutil.rmtree(parameters['conll_like_result_folder'])

        return predictions, model

    # If there is nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        sys.stdout.write('\tnothing to predict %s\n' % p_or_n)
        return []

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        # Unreachable in practice: the LSTM branch returns above.
        print('todo: incorporate lstm')
        # vectorize tokenized sentences
        # X = []
        # for sent in tokenized_sents:
        #     id_seq = []
        #     for w in sent:
        #         if w in vocab:
        #             id_seq.append(vocab[w])
        #         else:
        #             id_seq.append(vocab['oov'])
        #     X.append(id_seq)
    else:
        from cliner.feature_extraction.features import extract_features

        # Vectorize with hand-crafted features, flattening the nested
        # sentence structure for the vectorizer and restoring it after.
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    sys.stdout.write('\tpredicting labels %s\n' % p_or_n)

    # Predict labels
    if use_lstm:
        print("TEST_PREDICT")
        exit()
    else:
        from cliner.machine_learning import crf
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions
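
# ---------------------------------------------------------------------------
# Illustrative helper (not part of the original module): the LSTM branch of
# generic_predict() above inlines this same logic. NeuroNER-style dataset
# loaders expect a gold label for every token even at prediction time, so an
# all-'O' tag sequence with the same shape as the input is fabricated.
def _make_dummy_labels(tokenized_sents):
    # e.g. [['Patient', 'denies']] -> [['O', 'O']]
    return [['O' for _ in sent] for sent in tokenized_sents]
# ---------------------------------------------------------------------------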