def predict():
    # pick the most recently written model checkpoint
    list_of_files = glob.glob('output/models/*')  # * means all; use e.g. *.csv if a specific format is needed
    model_path = max(list_of_files, key=os.path.getctime)
    print("Generating ONNX from model:", model_path)
    model = load_model_from_disk(model_path, force_cpu=True)

    # embed and pad the example input sequences
    input_sequences = ["AAAAAAA", "AAA"]
    input_sequences_embedded = [x for x in model.embed(input_sequences)]
    input_sequences_padded = torch.nn.utils.rnn.pad_sequence(input_sequences_embedded)
    batch_sizes_list = []
    for x in input_sequences:
        batch_sizes_list.append(len(x))
    batch_sizes = torch.IntTensor(batch_sizes_list)

    # run the model and decode with both the torch and numpy implementations
    emissions, start_transitions, transitions, end_transitions = model(input_sequences_padded)
    predicted_labels, predicted_types, predicted_topologies = decode(
        emissions, batch_sizes, start_transitions, transitions, end_transitions)
    predicted_labels_2, predicted_types_2, predicted_topologies_2 = \
        decode_numpy(emissions.detach().numpy(),
                     batch_sizes.detach().numpy(),
                     start_transitions.detach().numpy(),
                     transitions.detach().numpy(),
                     end_transitions.detach().numpy())

    # both decoders must agree before the model is exported
    for idx, val in enumerate(predicted_labels):
        assert np.array_equal(val.detach().numpy(), predicted_labels_2[idx])
    assert np.array_equal(predicted_types.detach().numpy(), predicted_types_2)
    for idx, val in enumerate(predicted_topologies):
        for idx2, val2 in enumerate(val):
            assert np.array_equal(val2.detach().numpy(), predicted_topologies_2[idx][idx2])

    print("Exporting to ONNX...")
    output_path = "./tests/output/tmhmm3.onnx"
    onnx_from_model(model, input_sequences_padded, output_path)
    print("Wrote ONNX to", output_path)
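# A minimal sketch (not part of the original module) of how the exported ONNX file could be
# sanity-checked with onnxruntime after predict() has run. The helper name and the assumption
# that the graph's first input corresponds to the padded embedding tensor are illustrative only;
# the actual tensor names depend on how onnx_from_model builds the graph.
def _check_onnx_export(onnx_path, example_input):
    import onnxruntime as ort  # assumed to be installed separately
    import numpy as np

    session = ort.InferenceSession(onnx_path)
    # feed the same padded embedding tensor that was used for tracing
    input_name = session.get_inputs()[0].name
    outputs = session.run(None, {input_name: example_input.detach().numpy()})
    # the exported graph is expected to return the same tensors as model(...);
    # here we only check that every output is finite
    for array in outputs:
        assert np.all(np.isfinite(array))
    return outputs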
def run_experiment(parser, use_gpu):
    # parse experiment specific command line arguments
    parser.add_argument('--minibatch-size-validation', dest='minibatch_size_validation',
                        type=int, default=8,
                        help='Size of each minibatch during evaluation.')
    parser.add_argument('--hidden-size', dest='hidden_size',
                        type=int, default=64,
                        help='Hidden size.')
    parser.add_argument('--learning-rate', dest='learning_rate',
                        type=float, default=0.0002,
                        help='Learning rate to use during training.')
    parser.add_argument('--cv-partition', dest='cv_partition',
                        type=int, default=0,
                        help='Run a particular cross validation rotation.')
    parser.add_argument('--model-mode', dest='model_mode',
                        type=int, default=2,
                        help='Which model to use.')
    parser.add_argument('--input-data', dest='input_data',
                        type=str, default='data/raw/TMHMM3.train.3line.latest',
                        help='Path of input data file.')
    parser.add_argument('--pre-trained-model-paths', dest='pre_trained_model_paths',
                        type=str, default=None,
                        help='Paths of pre-trained models.')
    parser.add_argument('--profile-path', dest='profile_path',
                        type=str, default="",
                        help='Profiles to use for embedding.')
    args, _unknown = parser.parse_known_args()

    # accumulated 5x5 confusion matrix over all cross-validation rotations
    result_matrices = np.zeros((5, 5), dtype=np.int64)

    if args.model_mode == 0:
        model_mode = TMHMM3Mode.LSTM
    elif args.model_mode == 1:
        model_mode = TMHMM3Mode.LSTM_CRF
    elif args.model_mode == 2:
        model_mode = TMHMM3Mode.LSTM_CRF_HMM
    elif args.model_mode == 3:
        model_mode = TMHMM3Mode.LSTM_CRF_MARG
    else:
        # fail fast on an unknown model mode instead of continuing with an undefined model
        raise ValueError("No model defined for --model-mode " + str(args.model_mode))
    print("Using model:", model_mode)

    if args.profile_path != "":
        embedding = "PROFILE"
    else:
        embedding = "BLOSUM62"
    use_marg_prob = False
    all_prediction_data = []

    for cv_partition in [0, 1, 2, 3, 4]:
        # prepare data sets
        train_set, val_set, test_set = load_data_from_disk(filename=args.input_data,
                                                           partition_rotation=cv_partition)

        # topology data set
        train_set_topology = list(filter(lambda x: x[3] == 0 or x[3] == 1, train_set))
        val_set_topology = list(filter(lambda x: x[3] == 0 or x[3] == 1, val_set))
        test_set_topology = list(filter(lambda x: x[3] == 0 or x[3] == 1, test_set))

        if not args.silent:
            print("Loaded ", len(train_set), "training,",
                  len(val_set), "validation and", len(test_set), "test samples")
        print("Processing data...")

        pre_processed_path = "data/preprocessed/preprocessed_data_" \
            + str(hashlib.sha256(args.input_data.encode()).hexdigest())[:8] \
            + "_cv" + str(cv_partition) + ".pickle"
        if not os.path.isfile(pre_processed_path):
            input_data_processed = list([TMDataset.from_disk(data_set, use_gpu) for data_set in
                                         [train_set, val_set, test_set,
                                          train_set_topology, val_set_topology,
                                          test_set_topology]])
            with open(pre_processed_path, "wb") as output_file:
                pickle.dump(input_data_processed, output_file)
        with open(pre_processed_path, "rb") as input_file:
            input_data_processed = pickle.load(input_file)
        train_preprocessed_set = input_data_processed[0]
        validation_preprocessed_set = input_data_processed[1]
        test_preprocessed_set = input_data_processed[2]
        train_preprocessed_set_topology = input_data_processed[3]
        validation_preprocessed_set_topology = input_data_processed[4]
        _test_preprocessed_set_topology = input_data_processed[5]
        print("Completed preprocessing of data...")

        train_loader = tm_contruct_dataloader_from_disk(train_preprocessed_set,
                                                        args.minibatch_size,
                                                        balance_classes=True)
        validation_loader = tm_contruct_dataloader_from_disk(validation_preprocessed_set,
                                                             args.minibatch_size_validation,
                                                             balance_classes=True)
        test_loader = tm_contruct_dataloader_from_disk(
            test_preprocessed_set if args.evaluate_on_test else validation_preprocessed_set,
            args.minibatch_size_validation)
        train_loader_topology = \
            tm_contruct_dataloader_from_disk(train_preprocessed_set_topology,
                                             args.minibatch_size)
        validation_loader_topology = \
            tm_contruct_dataloader_from_disk(validation_preprocessed_set_topology,
                                             args.minibatch_size_validation)

        type_predictor_model_path = None
        if args.pre_trained_model_paths is None:
            # first train a type predictor, then a topology model on top of it
            for (experiment_id, train_data, validation_data) in [
                    ("TRAIN_TYPE_CV" + str(cv_partition) + "-" + str(model_mode)
                     + "-HS" + str(args.hidden_size)
                     + "-F" + str(args.input_data.split(".")[-2])
                     + "-P" + str(args.profile_path.split("_")[-1]),
                     train_loader, validation_loader),
                    ("TRAIN_TOPOLOGY_CV" + str(cv_partition) + "-" + str(model_mode)
                     + "-HS" + str(args.hidden_size)
                     + "-F" + str(args.input_data.split(".")[-2])
                     + "-P" + str(args.profile_path.split("_")[-1]),
                     train_loader_topology, validation_loader_topology)]:

                type_predictor = None
                if type_predictor_model_path is not None:
                    # reuse the weights of the type predictor trained in the previous round
                    type_predictor = load_model_from_disk(type_predictor_model_path,
                                                          force_cpu=False)
                    model = load_model_from_disk(type_predictor_model_path,
                                                 force_cpu=False)
                    model.type_classifier = type_predictor
                    model.type_01loss_values = []
                    model.topology_01loss_values = []
                else:
                    model = TMHMM3(embedding,
                                   args.hidden_size,
                                   use_gpu,
                                   model_mode,
                                   use_marg_prob,
                                   type_predictor,
                                   args.profile_path)

                model_path = train_model(data_set_identifier=experiment_id,
                                         model=model,
                                         train_loader=train_data,
                                         validation_loader=validation_data,
                                         learning_rate=args.learning_rate,
                                         minibatch_size=args.minibatch_size,
                                         eval_interval=args.eval_interval,
                                         hide_ui=args.hide_ui,
                                         use_gpu=use_gpu,
                                         minimum_updates=args.minimum_updates)

                # let the GC collect the model
                del model
                write_out(model_path)

                # if we just trained a type predictor, save it for later
                if "TRAIN_TYPE" in experiment_id:
                    type_predictor_model_path = model_path
        else:
            # use the pre-trained model
            model_path = args.pre_trained_model_paths.split(",")[cv_partition]

        # test model
        write_out("Testing model...")
        model = load_model_from_disk(model_path, force_cpu=False)
        _loss, json_data, prediction_data = model.evaluate_model(test_loader)
        all_prediction_data.append(post_process_prediction_data(prediction_data))
        result_matrix = np.array(json_data['confusion_matrix'])
        result_matrices += result_matrix
        write_out(result_matrix)

    set_experiment_id(
        "TEST-" + str(model_mode) + "-HS" + str(args.hidden_size)
        + "-F" + str(args.input_data.split(".")[-2]),
        args.learning_rate,
        args.minibatch_size)
    write_out(result_matrices)
    write_prediction_data_to_disk("\n".join(all_prediction_data))
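# A small helper sketch (not in the original experiment) showing how the accumulated 5x5
# confusion matrix could be summarised after all cross-validation rotations. The assumption
# that rows are true classes and columns are predicted classes mirrors the usual convention
# but is not confirmed by the source.
def summarise_confusion_matrix(result_matrices):
    import numpy as np

    matrix = np.asarray(result_matrices, dtype=np.float64)
    total = matrix.sum()
    # fraction of samples on the diagonal, i.e. overall accuracy
    overall_accuracy = np.trace(matrix) / total if total > 0 else 0.0
    # recall per true class: diagonal entry divided by the row sum
    row_sums = matrix.sum(axis=1)
    per_class_recall = np.divide(np.diag(matrix), row_sums,
                                 out=np.zeros_like(row_sums), where=row_sums > 0)
    return overall_accuracy, per_class_recall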
def run_experiment(parser, use_gpu):
    # parse experiment specific command line arguments
    parser.add_argument('--learning-rate', dest='learning_rate',
                        type=float, default=0.001,
                        help='Learning rate to use during training.')
    parser.add_argument('--embed-size', dest='embed_size',
                        type=int, default=21,
                        help='Embedding size.')
    args, _unknown = parser.parse_known_args()

    all_prediction_data = []
    result_matrices = []

    # pre-process data; training_file, validation_file and test_file are expected to be
    # module-level paths to the raw data files
    preprocessed_training_file = process_single_raw_data(
        training_file, use_gpu=use_gpu, force_pre_processing_overwrite=False)
    preprocessed_validation_file = process_single_raw_data(
        validation_file, use_gpu=use_gpu, force_pre_processing_overwrite=False)
    preprocessed_test_file = process_single_raw_data(
        test_file, use_gpu=use_gpu, force_pre_processing_overwrite=False)

    # run experiment
    # model = ExampleModel(args.embed_size, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21
    # model = SimpleRCNN(args.embed_size, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21
    model = DeepResRCNN_100(args.embed_size, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21

    train_loader = contruct_dataloader_from_disk(preprocessed_training_file,
                                                 args.minibatch_size)
    validation_loader = contruct_dataloader_from_disk(preprocessed_validation_file,
                                                      args.minibatch_size)

    train_model_path = train_model(data_set_identifier="TRAIN",
                                   model=model,
                                   train_loader=train_loader,
                                   validation_loader=validation_loader,
                                   learning_rate=args.learning_rate,
                                   minibatch_size=args.minibatch_size,
                                   eval_interval=args.eval_interval,
                                   hide_ui=args.hide_ui,
                                   use_gpu=use_gpu,
                                   minimum_updates=args.minimum_updates)
    print(train_model_path)

    # test model
    test_loader = contruct_dataloader_from_disk(preprocessed_test_file, args.minibatch_size)
    write_out("Testing model...")
    model = load_model_from_disk(train_model_path, force_cpu=False)
    _loss, json_data, _ = model.evaluate_model(test_loader)
    # store as a string so the joined prediction data can be written to disk below
    all_prediction_data.append(str(json_data))
    # all_prediction_data.append(model.post_process_prediction_data(prediction_data))
    result_matrix = np.array(json_data['confusion_matrix'])
    result_matrices += result_matrix
    write_out(result_matrix)

    # experiment id built from this experiment's own arguments
    set_experiment_id("TEST-" + str(args.embed_size),
                      args.learning_rate,
                      args.minibatch_size)
    write_out(result_matrices)
    write_prediction_data_to_disk("\n".join(all_prediction_data))
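# A hedged sketch of a driver for run_experiment above. The shared arguments below
# (--minibatch-size, --eval-interval, --hide-ui, --minimum-updates) are normally registered
# by the framework's own entry point; the option names match what run_experiment reads from
# args, but the defaults are illustrative assumptions only.
def _example_driver():
    import argparse

    parser = argparse.ArgumentParser(description="Example experiment driver")
    parser.add_argument('--minibatch-size', dest='minibatch_size', type=int, default=16)
    parser.add_argument('--eval-interval', dest='eval_interval', type=int, default=10)
    parser.add_argument('--hide-ui', dest='hide_ui', action='store_true')
    parser.add_argument('--minimum-updates', dest='minimum_updates', type=int, default=1000)
    # run_experiment adds its own --learning-rate and --embed-size arguments on top
    run_experiment(parser, use_gpu=False)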