Example #1
    def load_model(self, data_folder):
        index_path = join_path(data_folder, ['train_data', 'index.trained'])
        vals_path = join_path(data_folder, ['train_data', 'vals.trained'])
        dict_path = join_path(data_folder, ['train_data', 'dict_word2idx.pkl'])

        dict_word2idx = pickle_load(dict_path)
        # Reverse mapping used to decode stored indices back into words
        self.dict_idx2word = {v: k for k, v in dict_word2idx.items()}

        # Trained FAISS index of key vectors, plus the aligned value array
        self.index = faiss.read_index(index_path)
        self.vals = np.load(vals_path)

        print('Model loaded')
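
For context, a hedged sketch of how a model loaded this way might be queried. The query method name and shapes are assumptions, not part of the original class; only index.search is the actual FAISS call:

    def query(self, query_vectors, k=4):
        # FAISS expects a float32 array of shape (n_queries, dimension)
        query_vectors = np.asarray(query_vectors, dtype='float32')
        distances, indices = self.index.search(query_vectors, k)
        # Assumption: self.vals holds word indices aligned with the index,
        # so dict_idx2word can decode each neighbour into a readable token
        words = [[self.dict_idx2word.get(int(v)) for v in self.vals[row]]
                 for row in indices]
        return distances, words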
Example #2
    def __init__(self, output_path, es_patience, callbacks):

        if output_path is None:
            # No output directory: disable checkpointing and callbacks
            final_model = None
            cb = None

        else:
            # Make directory for outputs
            check_mkdir(join_path(output_path, 'checkpoints'))
            check_mkdir(join_path(output_path, 'models'))

            checkpoint_model = join_path(output_path,
                                         ['checkpoints', 'epoch{epoch}.h5'])
            best_model = join_path(output_path, ['models', 'best_model.h5'])
            final_model = join_path(output_path, ['models', 'final_model.h5'])

            cb = []

            if 'es' in callbacks:
                cb.append(
                    EarlyStopping(patience=es_patience,
                                  restore_best_weights=True))

            if 'checkpoint_model' in callbacks:
                cb.append(
                    ModelCheckpoint(filepath=checkpoint_model,
                                    monitor='val_loss'))

            if 'best_model' in callbacks:
                cb.append(
                    ModelCheckpoint(filepath=best_model,
                                    monitor='val_loss',
                                    save_best_only=True))

            if len(cb) == 0:
                cb = None

        self.params = {
            'batch_size': 10000,
            'epochs': 1000,
            'output_path': output_path,
            'callbacks': cb,
            'final_model': final_model
        }
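
join_path and check_mkdir are project helpers from src.utils, not standard-library functions. A plausible minimal implementation, assuming join_path accepts either one segment or a list of segments:

import os

def join_path(base, parts):
    # Accept a single name or a list of path segments (assumed behaviour)
    if isinstance(parts, str):
        parts = [parts]
    return os.path.join(base, *parts)

def check_mkdir(path):
    # Create the directory (and parents) if it does not already exist
    os.makedirs(path, exist_ok=True)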
Example #3
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--extracted_pkl", default=None)
    parser.add_argument("--tensor_folder", default=None)
    parser.add_argument("--test_ratio", type=float, default=0.01)
    parser.add_argument("--data_folder", default=None)
    parser.add_argument("--random_seed", type=int, default=1)

    args = parser.parse_args()

    if args.data_folder is None:
        extracted_pkl = args.extracted_pkl
        tensor_folder = args.tensor_folder
    else:
        extracted_pkl = join_path(args.data_folder,
                                  ['extracted_txt', 'extracted.pkl'])
        params_path = join_path(args.data_folder, 'params.pkl')

        try:
            # Reuse the run's existing params file if present, else start fresh
            params = pickle_load(params_path)
        except Exception:
            params = {}
        params['encode_data_test_ratio'] = args.test_ratio
        params['encode_data_random_seed'] = args.random_seed
        pickle_save(params, params_path)

    dataset = pickle_load(extracted_pkl)

    # Shuffle with the user-provided seed instead of a hardcoded one
    random.Random(args.random_seed).shuffle(dataset)
    test_size = int(len(dataset) * args.test_ratio)
    test_set = dataset[:test_size]
Example #4
    parser = argparse.ArgumentParser()

    parser.add_argument("--extracted_pkl", default=None)
    parser.add_argument("--savepath", default=None)
    parser.add_argument("--data_folder", default=None)

    args = parser.parse_args()

    if args.data_folder is None:
        if args.extracted_pkl is not None:
            extracted_pkl = args.extracted_pkl
        else:
            raise FileNotFoundError('No file path was provided')
        savepath = args.savepath
    else:
        setup_folder(args.data_folder)
        extracted_pkl = join_path(args.data_folder,
                                  ['extracted_txt', 'extracted.pkl'])
        savepath = join_path(args.data_folder, ['definition', 'def_dict.pkl'])

    dataset = pickle_load(extracted_pkl)

    print(f'Total number of applications: {len(dataset)}')

    def_example = extract_definition(dataset)

    print(f'Number of unique term: {len(def_example)}')

    print(f'Saving to {savepath}')
    pickle_save(def_example, savepath)
Example #5
import argparse
import random
from os import listdir

import numpy as np

from src.utils.general import join_path

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_folder", default=None)
    parser.add_argument("--random_seed", type=int, default=1)
    parser.add_argument("--ncentroids", type=int, default=256)
    parser.add_argument("--code_size", type=int, default=64)
    parser.add_argument("--probe", type=int, default=4)

    args = parser.parse_args()

    train_folder_path = join_path(args.data_folder, 'train_data')

    train_files = [
        npfile for npfile in listdir(train_folder_path)
        if npfile.endswith('keys.npy')
    ]

    random.Random(args.random_seed).shuffle(train_files)
    sample_size = min(1000, int(len(train_files) * 0.1))
    # Keep the first sample_size files as the training sample
    sample_files = train_files[:sample_size]

    sample_file_path = join_path(train_folder_path, sample_files[0])
    sample_np = np.load(sample_file_path)
    dimension = sample_np.shape[1]

    sample_np_list = []
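
The snippet stops while collecting sample arrays, but the --ncentroids, --code_size and --probe arguments correspond to FAISS IVF-PQ parameters, and Example #1 loads an index.trained file. A sketch of the presumed continuation, assuming import faiss at the top of the script, L2 distance, and 8 bits per sub-quantizer:

    for npfile in sample_files:
        sample_np_list.append(np.load(join_path(train_folder_path, npfile)))
    training_data = np.concatenate(sample_np_list).astype('float32')

    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFPQ(quantizer, dimension,
                             args.ncentroids, args.code_size, 8)
    index.train(training_data)
    index.nprobe = args.probe  # inverted lists visited per query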
Example #6
    parser.add_argument(
        "--layers_act",
        default=['tanh', 'linear', 'tanh', 'linear', 'tanh', 'linear'])
    parser.add_argument("--epochs", default=10)
    parser.add_argument("--es_patience", default=2)

    args = parser.parse_args()

    if args.data_folder is None:
        model_output = args.model_output
        tensor_folder = args.tensor_folder
    else:
        date = datetime.datetime.today().strftime('%Y_%m_%d')
        if args.remark is not None:
            date = date + '_' + args.remark
        model_output = join_path(args.data_folder, ['models', date])
        check_mkdir(model_output)
        tensor_folder = join_path(args.data_folder, 'tensor')

    tensor_list = listdir(tensor_folder)

    model = DNN(model_output, es_patience=args.es_patience, callbacks=['es'])

    if args.data_folder is not None:
        params_path = join_path(args.data_folder, 'params.pkl')
        try:
            params = pickle_load(params_path)
        except Exception:
            params = {}
        model_params = model.get_params()
        model_params['epochs'] = args.epochs
    parser.add_argument("--combinefile", default=None)
    parser.add_argument("--savepath", default=None)
    parser.add_argument("--data_folder", default=None)

    args = parser.parse_args()

    if args.data_folder is None:
        checkpoint_save = args.checkpoint_folder
        combinefile = args.combinefile
        savepath = args.savepath
        if (checkpoint_save is None) and (combinefile is None):
            raise FileNotFoundError('No file path was provided')

    else:
        setup_folder(args.data_folder)
        checkpoint_save = join_path(args.data_folder, 'search_chunks')
        savepath = join_path(args.data_folder, ['extracted_txt',
                                                'extracted.pkl'])
        combinefile = None
    # Prefer the pre-combined file; fall back to stitching checkpoint
    # chunks together if it is missing or unreadable
    try:
        output_list = pickle_load(combinefile)
    except Exception:
        output_list = combine_checkpoint_file(checkpoint_save)

    app_list = extract_app(output_list)
    print(f'Total number of applications: {len(app_list)}')

    processed_data = []
    for app in tqdm(app_list):
        intro_text = process_intro(app[0])
        claims = process_claim(app[1])
Example #8
    parser.add_argument("--savepath", default=None)
    parser.add_argument("--data_folder", default=None)
    parser.add_argument("--max_length", type=int, default=50)
    parser.add_argument("--error_word_list", nargs='+', default=[])
    parser.add_argument("--min_freq", type=int, default=4)

    args = parser.parse_args()

    if args.data_folder is None:
        if args.checkpoint_folder is not None:
            chunk_list = combine_checkpoint_file(args.checkpoint_folder)
        else:
            raise FileNotFoundError('No file path was provided')
        savepath = args.savepath
    else:
        savepath = join_path(args.data_folder, ['vocab', 'vocab_tensor.pkl'])
        chunk_folder = join_path(args.data_folder, 'search_chunks')
        chunk_list = combine_checkpoint_file(chunk_folder)
        params_path = join_path(args.data_folder, 'params.pkl')
        try:
            params = pickle_load(params_path)
        except Exception:
            params = {}
        params['extract_vocab_max_length'] = args.max_length
        params['extract_vocab_error_word_list'] = args.error_word_list
        params['extract_vocab_min_freq'] = args.min_freq
        pickle_save(params, params_path)

    app_list = extract_app(chunk_list)
    random.shuffle(app_list)
    subset_size = min(len(app_list), 100)
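
The excerpt ends before the vocabulary itself is assembled. A minimal sketch of frequency-based filtering consistent with --min_freq and --error_word_list; the tokenize helper is hypothetical:

    from collections import Counter

    freq = Counter(token
                   for app in app_list[:subset_size]
                   for token in tokenize(app))  # tokenize is hypothetical
    vocab = [word for word, count in freq.items()
             if count >= args.min_freq and word not in args.error_word_list]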
Example #9
    parser.add_argument("--extracted_pkl", default=None)
    parser.add_argument("--tensor_folder", default=None)
    parser.add_argument("--test_ratio", type=float, default=0.01)
    parser.add_argument("--data_folder", default=None)
    parser.add_argument("--random_seed", type=int, default=1)
    parser.add_argument("--chunk_size", type=int, default=300)
    parser.add_argument("--mode", default='attention')

    args = parser.parse_args()

    if args.data_folder is None:
        extracted_pkl = args.extracted_pkl
        tensor_folder = args.tensor_folder
    else:
        extracted_pkl = join_path(args.data_folder,
                                  ['extracted_txt', 'extracted.pkl'])
        tensor_folder = join_path(args.data_folder, ['tensor', args.mode])
        params_path = join_path(args.data_folder, 'params.pkl')

        check_mkdir(tensor_folder)
        try:
            params = pickle_load(params_path)
        except Exception:
            params = {}
        params['encode_data_test_ratio'] = args.test_ratio
        params['encode_data_random_seed'] = args.random_seed
        params['encode_data_chunk_size'] = args.chunk_size
        pickle_save(params, params_path)

    dataset = pickle_load(extracted_pkl)
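
A sketch of how the script plausibly continues, reusing the seeded shuffle and test split seen in Example #3 and writing fixed-size chunks to tensor_folder; encode_chunk is hypothetical:

    random.Random(args.random_seed).shuffle(dataset)
    test_size = int(len(dataset) * args.test_ratio)
    test_set, train_set = dataset[:test_size], dataset[test_size:]

    for i in range(0, len(train_set), args.chunk_size):
        chunk = train_set[i:i + args.chunk_size]
        # encode_chunk is hypothetical: it would turn raw text into tensors
        pickle_save(encode_chunk(chunk),
                    join_path(tensor_folder,
                              f'train_{i // args.chunk_size}.pkl'))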
Example #10
    parser.add_argument("--mp", type=bool, default=False)
    parser.add_argument("--num_chunks", type=int, default=None)
    parser.add_argument("--keywords", nargs='+',
                        default=['bio', 'pharm', 'medic'])

    args = parser.parse_args()
    target_doc = None

    if args.data_folder is None:
        checkpoint_save = args.checkpoint_save
        savepath = args.savepath
        params_path = None
        params = {}  # still needed below when --numdays is set

    else:
        setup_folder(args.data_folder)
        checkpoint_save = join_path(args.data_folder, 'search_chunks')
        savepath = join_path(args.data_folder, ['search_chunks', 'raw.pkl'])
        params_path = join_path(args.data_folder, 'params.pkl')

        params = {'preprocess_numdays': args.numdays,
                  'preprocess_keywords': args.keywords}

    if args.numdays is not None:
        date_list = generate_datelist(args.numdays)
        params['preprocess_date_list'] = date_list

        if args.mp:
            if args.num_chunks is None:
                num_chunks = cpu_count()
            else:
                num_chunks = args.num_chunks
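
The excerpt stops after choosing num_chunks. A sketch of the presumed multiprocessing fan-out, splitting the date list across a worker pool; search_dates is hypothetical:

            from multiprocessing import Pool

            # One roughly equal slice of dates per worker
            chunks = [date_list[i::num_chunks] for i in range(num_chunks)]
            with Pool(num_chunks) as pool:
                # search_dates is hypothetical: it would fetch and filter
                # applications for the given dates and keywords
                results = pool.starmap(
                    search_dates,
                    [(chunk, args.keywords) for chunk in chunks])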
Example #11
    parser.add_argument("--pretrain_path", default=None)
    parser.add_argument("--model_output", default=None)
    parser.add_argument("--tensor_folder", default=None)
    parser.add_argument("--data_folder", default=None)
    parser.add_argument("--remark", default=None)

    args = parser.parse_args()

    if args.data_folder is None:
        model_output = args.model_output
        tensor_folder = args.tensor_folder
    else:
        date = datetime.datetime.today().strftime('%Y_%m_%d')
        if args.remark is not None:
            date = date + '_' + args.remark
        model_output = join_path(args.data_folder, ['models', date])
        check_mkdir(model_output)
        tensor_folder = join_path(args.data_folder, 'tensor')

    tensor_list = listdir(tensor_folder)
    train_tensors = [pickle_load(join_path(tensor_folder, name))
                     for name in tensor_list
                     if 'train' in name]
    X_train_list = [X for X, y in train_tensors]
    y_train_list = [y for X, y in train_tensors]
    X_train = tf.concat(X_train_list, axis=0)
    y_train = tf.concat(y_train_list, axis=0)
    X_test, y_test = pickle_load(join_path(tensor_folder, 'test.pkl'))

    del X_train_list, y_train_list
    gc.collect()
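
The snippet ends once the tensors are assembled. Assuming the DNN wrapper from Examples #2 and #6 exposes a Keras-style fit method that applies its stored batch_size, epochs and callbacks, training plausibly continues along these lines:

    # Both the DNN constructor arguments and the fit signature are
    # assumptions based on the wrapper shown in Examples #2 and #6
    model = DNN(model_output, es_patience=2, callbacks=['es'])
    model.fit(X_train, y_train, validation_data=(X_test, y_test))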