def load_model(self, data_folder):
    index_path = join_path(data_folder, ['train_data', 'index.trained'])
    vals_path = join_path(data_folder, ['train_data', 'vals.trained'])
    dict_path = join_path(data_folder, ['train_data', 'dict_word2idx.pkl'])
    dict_word2idx = pickle_load(dict_path)
    self.dict_idx2word = {v: k for k, v in dict_word2idx.items()}
    self.index = faiss.read_index(index_path)
    self.vals = np.load(vals_path)
    print('Model loaded')
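# Usage sketch (assumption, not part of the original class): how the loaded
# index, vals, and dict_idx2word combine at query time. `query_model` and
# `query_vec` are hypothetical names; the query must have the same
# dimensionality as the indexed keys.
import numpy as np

def query_model(model, query_vec, k=8):
    query = np.asarray(query_vec, dtype='float32').reshape(1, -1)
    distances, slots = model.index.search(query, k)   # FAISS returns (D, I)
    neighbor_vals = model.vals[slots[0]]              # values stored at the hit slots
    words = [model.dict_idx2word[int(v)] for v in neighbor_vals]
    return words, distances[0]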
def __init__(self, output_path, es_patience, callbacks):
    if output_path is None:
        callbacks = None
        final_model = None
        cb = None
    else:
        # Make directory for outputs
        check_mkdir(join_path(output_path, 'checkpoints'))
        check_mkdir(join_path(output_path, 'models'))
        checkpoint_model = join_path(output_path, ['checkpoints', 'epoch{epoch}.h5'])
        best_model = join_path(output_path, ['models', 'best_model.h5'])
        final_model = join_path(output_path, ['models', 'final_model.h5'])
        if callbacks is None:
            callbacks = []  # guard: the membership tests below raise TypeError on None
        cb = []
        if 'es' in callbacks:
            cb.append(EarlyStopping(patience=es_patience, restore_best_weights=True))
        if 'checkpoint_model' in callbacks:
            cb.append(ModelCheckpoint(filepath=checkpoint_model, monitor='val_loss'))
        if 'best_model' in callbacks:
            cb.append(ModelCheckpoint(filepath=best_model,
                                      monitor='val_loss',
                                      save_best_only=True))
        if len(cb) == 0:
            cb = None
    self.params = {
        'batch_size': 10000,
        'epochs': 1000,
        'output_path': output_path,
        'callbacks': cb,
        'final_model': final_model
    }
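# Usage sketch (assumption, not shown in this excerpt): how the params dict
# built above would typically be consumed by a Keras training call. The `fit`
# method name, `self.model`, and the argument names are hypothetical.
def fit(self, X_train, y_train, X_val, y_val):
    self.model.fit(X_train, y_train,
                   batch_size=self.params['batch_size'],
                   epochs=self.params['epochs'],
                   validation_data=(X_val, y_val),
                   callbacks=self.params['callbacks'])
    # Persist the last-epoch weights when an output path was configured
    if self.params['final_model'] is not None:
        self.model.save(self.params['final_model'])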
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--extracted_pkl", default=None)
    parser.add_argument("--tensor_folder", default=None)
    parser.add_argument("--test_ratio", type=float, default=0.01)
    parser.add_argument("--data_folder", default=None)
    parser.add_argument("--random_seed", type=int, default=1)
    args = parser.parse_args()

    if args.data_folder is None:
        extracted_pkl = args.extracted_pkl
        tensor_folder = args.tensor_folder
    else:
        extracted_pkl = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl'])
        params_path = join_path(args.data_folder, 'params.pkl')
        try:
            params = pickle_load(params_path)
        except Exception:
            params = {}
        params['encode_data_test_ratio'] = args.test_ratio
        params['encode_data_random_seed'] = args.random_seed
        pickle_save(params, params_path)

    dataset = pickle_load(extracted_pkl)
    # Use the CLI seed rather than a hard-coded 1 so --random_seed takes effect
    random.Random(args.random_seed).shuffle(dataset)
    test_size = int(len(dataset) * args.test_ratio)
    test_set = dataset[:test_size]
parser = argparse.ArgumentParser()
parser.add_argument("--extracted_pkl", default=None)
parser.add_argument("--savepath", default=None)
parser.add_argument("--data_folder", default=None)
args = parser.parse_args()

if args.data_folder is None:
    if args.extracted_pkl is not None:
        extracted_pkl = args.extracted_pkl
    else:
        raise FileNotFoundError('No file path was provided')
    savepath = args.savepath
else:
    setup_folder(args.data_folder)
    extracted_pkl = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl'])
    savepath = join_path(args.data_folder, ['definition', 'def_dict.pkl'])

dataset = pickle_load(extracted_pkl)
print(f'Total number of applications: {len(dataset)}')
def_example = extract_definition(dataset)
print(f'Number of unique terms: {len(def_example)}')
print(f'Saving to {savepath}')
pickle_save(def_example, savepath)
from os import listdir

from src.utils.general import join_path

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_folder", default=None)
    parser.add_argument("--random_seed", type=int, default=1)
    parser.add_argument("--ncentroids", type=int, default=256)
    parser.add_argument("--code_size", type=int, default=64)
    parser.add_argument("--probe", type=int, default=4)
    args = parser.parse_args()

    train_folder_path = join_path(args.data_folder, 'train_data')
    train_files = [
        npfile for npfile in listdir(train_folder_path)
        if npfile.endswith('keys.npy')
    ]
    random.Random(args.random_seed).shuffle(train_files)
    sample_size = min(1000, int(len(train_files) * 0.1))
    # Take the first sample_size files; the original slice [sample_size:]
    # returned everything *except* the intended sample
    sample_files = train_files[:sample_size]
    sample_file_path = join_path(train_folder_path, sample_files[0])
    sample_np = np.load(sample_file_path)
    dimension = sample_np.shape[1]
    sample_np_list = []
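# Sketch (assumption): the likely way --ncentroids, --code_size, and --probe
# feed a FAISS IVF-PQ index once the sampled key files are stacked.
# `train_sample` is a hypothetical float32 array of shape (n, dimension);
# 8 bits per PQ sub-code is the common FAISS default.
import faiss

quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFPQ(quantizer, dimension, args.ncentroids,
                         args.code_size, 8)
index.train(train_sample)   # fits the coarse quantizer and PQ codebooks
index.nprobe = args.probe   # inverted lists scanned at query time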
parser.add_argument(
    "--layers_act",
    default=['tanh', 'linear', 'tanh', 'linear', 'tanh', 'linear'])
# type=int so CLI-supplied values are parsed as integers, not left as strings
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--es_patience", type=int, default=2)
args = parser.parse_args()

if args.data_folder is None:
    model_output = args.model_output
    tensor_folder = args.tensor_folder
else:
    date = datetime.datetime.today().strftime('%Y_%m_%d')
    if args.remark is not None:
        date = date + '_' + args.remark
    model_output = join_path(args.data_folder, ['models', date])
    check_mkdir(model_output)
    tensor_folder = join_path(args.data_folder, 'tensor')

tensor_list = listdir(tensor_folder)
model = DNN(model_output, es_patience=args.es_patience, callbacks=['es'])

if args.data_folder is not None:
    params_path = join_path(args.data_folder, 'params.pkl')
    try:
        params = pickle_load(params_path)
    except Exception:
        params = {}

model_params = model.get_params()
model_params['epochs'] = args.epochs
parser.add_argument("--combinefile", default=None) parser.add_argument("--savepath", default=None) parser.add_argument("--data_folder", default=None) args = parser.parse_args() if args.data_folder is None: checkpoint_save = args.checkpoint_folder combinefile = args.combinefile savepath = args.savepath if (checkpoint_save is None) and (combinefile is None): raise FileNotFoundError('No file path was provided') else: setup_folder(args.data_folder) checkpoint_save = join_path(args.data_folder, 'search_chunks') savepath = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl']) combinefile = None try: output_list = pickle_load(combinefile) except Exception: output_list = combine_checkpoint_file(checkpoint_save) app_list = extract_app(output_list) print(f'Total number of applications: {len(app_list)}') processed_data = [] for app in tqdm(app_list): intro_text = process_intro(app[0]) claims = process_claim(app[1])
parser.add_argument("--savepath", default=None) parser.add_argument("--data_folder", default=None) parser.add_argument("--max_length", type=int, default=50) parser.add_argument("--error_word_list", nargs='+', default=[]) parser.add_argument("--min_freq", type=int, default=4) args = parser.parse_args() if args.data_folder is None: if args.checkpoint_folder is not None: chunk_list = combine_checkpoint_file(args.checkpoint_folder) else: raise FileNotFoundError('No file path was provided') savepath = args.savepath else: savepath = join_path(args.data_folder, ['vocab', 'vocab_tensor.pkl']) chunk_folder = join_path(args.data_folder, 'search_chunks') chunk_list = combine_checkpoint_file(chunk_folder) params_path = join_path(args.data_folder, 'params.pkl') try: params = pickle_load(params_path) except Exception: params = {} params['extract_vocab_max_length'] = args.max_length params['extract_vocab_error_word_list'] = args.error_word_list params['extract_vocabmin_freq'] = args.min_freq pickle_save(params, params_path) app_list = extract_app(chunk_list) random.shuffle(app_list) subset_size = min(len(app_list), 100)
parser.add_argument("--extracted_pkl", default=None) parser.add_argument("--tensor_folder", default=None) parser.add_argument("--test_ratio", type=float, default=0.01) parser.add_argument("--data_folder", default=None) parser.add_argument("--random_seed", type=int, default=1) parser.add_argument("--chunk_size", type=int, default=300) parser.add_argument("--mode", default='attention') args = parser.parse_args() if args.data_folder is None: extracted_pkl = args.extracted_pkl tensor_folder = args.tensor_folder else: extracted_pkl = join_path(args.data_folder, ['extracted_txt', 'extracted.pkl']) tensor_folder = join_path(args.data_folder, ['tensor', args.mode]) params_path = join_path(args.data_folder, 'params.pkl') check_mkdir(tensor_folder) try: params = pickle_load(params_path) except Exception: params = {} params['encode_data_test_ratio'] = args.test_ratio params['encode_data_random_seed'] = args.random_seed params['encode_data_chunk_size'] = args.chunk_size pickle_save(params, params_path) dataset = pickle_load(extracted_pkl)
parser.add_argument("--mp", type=bool, default=False) parser.add_argument("--num_chunks", type=int, default=None) parser.add_argument("--keywords", nargs='+', default=['bio', 'pharm', 'medic']) args = parser.parse_args() target_doc = None if args.data_folder is None: checkpoint_save = args.checkpoint_save savepath = args.savepath params_path = None else: setup_folder(args.data_folder) checkpoint_save = join_path(args.data_folder, 'search_chunks') savepath = join_path(args.data_folder, ['search_chunks', 'raw.pkl']) params_path = join_path(args.data_folder, 'params.pkl') params = {'preprocess_numdays': args.numdays, 'preprocess_keywords': args.keywords} if args.numdays is not None: date_list = generate_datelist(args.numdays) params['preprocess_date_list'] = date_list if args.mp: if args.num_chunks is None: num_chunks = cpu_count() else: num_chunks = args.num_chunks
parser.add_argument("--pretrain_path", default=None) parser.add_argument("--model_output", default=None) parser.add_argument("--tensor_folder", default=None) parser.add_argument("--data_folder", default=None) parser.add_argument("--remark", default=None) args = parser.parse_args() if args.data_folder is None: model_output = args.model_output tensor_folder = args.tensor_folder else: date = datetime.datetime.today().strftime('%Y_%m_%d') if args.remark is not None: date = date + '_' + args.remark model_output = join_path(args.data_folder, ['models', date]) check_mkdir(model_output) tensor_folder = join_path(args.data_folder, 'tensor') tensor_list = listdir(tensor_folder) train_tensors = [pickle_load(join_path(tensor_folder, name)) for name in tensor_list if 'train' in name] X_train_list = [X for X, y in train_tensors] y_train_list = [y for X, y in train_tensors] X_train = tf.concat(X_train_list, axis=0) y_train = tf.concat(y_train_list, axis=0) X_test, y_test = pickle_load(join_path(tensor_folder, 'test.pkl')) del X_train_list, y_train_list gc.collect()