def params_setup():
    parser = argparse.ArgumentParser()
    parser.add_argument('--attention_len', type=int, default=16)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--data_set', type=str, default='muse')
    parser.add_argument('--decay', type=int, default=0)
    parser.add_argument('--dropout', type=float, default=0.2)
    parser.add_argument('--file_output', type=int, default=1)
    parser.add_argument('--highway', type=int, default=0)
    parser.add_argument('--horizon', type=int, default=3)
    parser.add_argument('--init_weight', type=float, default=0.1)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--max_gradient_norm', type=float, default=5.0)
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--model_dir', type=str, default='./models/model')
    parser.add_argument('--mts', type=int, default=1)
    parser.add_argument('--num_epochs', type=int, default=40)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--num_units', type=int, default=338)
    para = parser.parse_args()

    if para.data_set == "muse" or para.data_set == "lpd5":
        para.mts = 0

    para.logging_level = logging.INFO
    if para.attention_len == -1:
        para.attention_len = para.max_len

    create_dir(para.model_dir)
    json_path = para.model_dir + '/parameters.json'
    json.dump(vars(para), open(json_path, 'w'), indent=4)

    return para

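# A minimal usage sketch, not part of the original sources: it assumes the same TensorFlow 1.x
# entry-point pattern that appears later in this section (create_graph / config_setup /
# load_weights / train), so treat it as an illustration rather than the repository's actual
# main script.
if __name__ == '__main__':
    para = params_setup()
    graph, model, data_generator = create_graph(para)
    with tf.Session(config=config_setup(), graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        load_weights(para, sess, model)
        train(para, sess, model, data_generator)
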
def __init__(self,
             start_ratio=0.0,
             end_ratio=0.98,
             _sample_rate=1.0,
             data_params={},
             tokenizer_pl=[],
             encoder_pl=[],
             _tokenizer_dir='cdlm',
             _dataset='cdlm'):
    # initialize variables
    self.__data_params = data_params
    self.__tokenizer_pl = tokenizer_pl
    self.__encoder_pl = encoder_pl
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # load data from files
    # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
    zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)
    zh_en_list = list(filter(
        lambda x: 'translation' in x[1] and x[1]['translation'],
        zh_en_dict.items()))
    zh_en_list = list(map(
        lambda x: [[x[0]] * len(x[1]['translation']), x[1]['translation']],
        zh_en_list))
    # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)

    zh_data = []
    en_data = []
    length = len(zh_en_list)
    for i, val in enumerate(zh_en_list):
        if i % 50 == 0:
            progress = float(i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        zh_data += val[0]
        en_data += val[1]
    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get tokenizer
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)

def __init__(self, para):
    DataGenerator.__init__(self, para)
    self.split = list(para.split_date)
    self.split_names = ["train", "validation", "test"]
    self.h = para.horizon
    self.DATA_PATH = os.path.join(self.DIRECTORY, para.data_set + str(self.h))
    create_dir(self.DATA_PATH)
    self._load(para)
    self._preprocess(para)

def __init__(self, para):
    DataGenerator.__init__(self, para)
    self.h = para.horizon
    self.DATA_PATH = os.path.join(self.DIRECTORY, para.data_set + str(self.h))
    create_dir(self.DATA_PATH)
    self._download_file()
    self.split = [0, 0.6, 0.8, 1]
    self.split_names = ["train", "validation", "test"]
    self._preprocess(para)
    del self.raw_dat, self.dat

def __init__(self,
             _is_train,
             _sample_rate=1.0,
             data_params={},
             tokenizer_pl=[],
             encoder_pl=[],
             _tokenizer_dir='cdlm',
             _dataset='cdlm'):
    # initialize variables
    self.__data_params = data_params
    self.__tokenizer_pl = tokenizer_pl
    self.__encoder_pl = encoder_pl
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # initialize wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio, 0.2)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # combine data
    zh_data += zh_data_2
    en_data += en_data_2
    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get tokenizer
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)

def __init__(self,
             tokenizer_dir,
             un_preprocess_dirs,
             data_params={},
             pretrain_params={},
             encoder_pl=[]):
    # initialize variables
    self.__data_params = data_params
    self.__pretrain_params = pretrain_params
    self.__encoder_pl = encoder_pl
    self.__dirs = un_preprocess_dirs
    self.__running = True
    self.__cur_index = 0
    self.__data = []
    self.__file_list = []
    self.__tokenizer = load_pkl(
        get_file_path(data_dir, 'tokenizer', tokenizer_dir, 'tokenizer.pkl'))

    # get the list of all files
    for dir_name in self.__dirs:
        _dir_path = create_dir(data_dir, 'un_preprocessed', dir_name)
        self.__file_list += list(
            map(lambda x: os.path.join(_dir_path, x), os.listdir(_dir_path)))
    self.__len_files = len(self.__file_list)

    random.seed(self.RANDOM_STATE)
    random.shuffle(self.__file_list)

    self.start()

def __init__(self, data_params={}, tokenizer_pl=[], _tokenizer_dir='only_news_commentary'): # initialize variables self.__data_params = data_params self.__tokenizer_pl = tokenizer_pl self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}' self.tokenizer_path = os.path.join( create_dir(data_dir, 'tokenizer', self.tokenizer_dir), 'tokenizer.pkl') if os.path.isfile(self.tokenizer_path): return data = self.__load_from_news_commentary() data += self.__load_from_wmt_news() data += self.__load_from_um_corpus() data += self.__load_from_dict() # shuffle the data random.seed(self.RANDOM_STATE) random.shuffle(data) # get tokenizer self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data)) self.get_tokenizer()
def __init__(self, data_params={}, tokenizer_pl=[], _tokenizer_dir='only_news_commentary'): # initialize variables self.__data_params = data_params self.__tokenizer_pl = tokenizer_pl self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}' self.tokenizer_path = os.path.join( create_dir(data_dir, 'tokenizer', self.tokenizer_dir), 'tokenizer.pkl') if os.path.isfile(self.tokenizer_path): return # load data from files data = news_commentary.zh_en() data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO) data = reduce(lambda x, y: x + y, data) # shuffle the data random.seed(self.RANDOM_STATE) random.shuffle(data) # get tokenizer self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data)) self.get_tokenizer()
def __init__(self,
             data_params={},
             preprocess_zh_pl=[],
             tokenizer_pl=[],
             _tokenizer_dir='only_news_commentary'):
    # initialize variables
    self.__data_params = data_params
    self.__preprocess_zh_pl = preprocess_zh_pl
    self.__tokenizer_pl = tokenizer_pl
    self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}'
    self.tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', self.tokenizer_dir), 'tokenizer.pkl')

    # nothing to do if the tokenizer has already been generated
    if os.path.isfile(self.tokenizer_path):
        return

    # load zh en data
    data = self.__load_from_news_commentary()
    data += self.__load_from_wmt_news()

    # preprocess Chinese (word segmentation)
    zh_data, en_data = list(zip(*data))
    zh_data = self.__preprocess_zh(zh_data)
    data = list(zip(zh_data, en_data))

    # load ro en data
    data += self.__load_from_ro_en()

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get tokenizer
    self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
    self.get_tokenizer()

def __init__(self, _is_train, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    # initialize wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # combine data
    zh_data += zh_data_2
    en_data += en_data_2
    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)

def sess_params_setup():
    sess_parser = argparse.ArgumentParser()
    sess_parser.add_argument('--attention_len', type=int, default=16)
    sess_parser.add_argument('--batch_size', type=int, default=32)
    sess_parser.add_argument('--data_set', type=str, default='muse')
    sess_parser.add_argument('--decay', type=int, default=0)
    sess_parser.add_argument('--dropout', type=float, default=0.2)
    sess_parser.add_argument('--file_output', type=int, default=1)
    sess_parser.add_argument('--highway', type=int, default=0)
    sess_parser.add_argument('--horizon', type=int, default=3)
    sess_parser.add_argument('--init_weight', type=float, default=0.1)
    sess_parser.add_argument('--learning_rate', type=float, default=1e-5)
    sess_parser.add_argument('--max_gradient_norm', type=float, default=5.0)
    sess_parser.add_argument('--mode', type=str, default='train')
    sess_parser.add_argument('--model_dir', type=str, default='./models/model')
    sess_parser.add_argument('--mts', type=int, default=1)
    sess_parser.add_argument('--num_epochs', type=int, default=40)
    sess_parser.add_argument('--num_layers', type=int, default=3)
    sess_parser.add_argument('--num_units', type=int, default=338)
    para, unknown = sess_parser.parse_known_args()
    # para = parser.parse_args()

    # hard-coded overrides for a validation / explain session on the traffic data set
    para.mode = "validation"
    para.mode2 = "explain"
    para.attention_len = para.highway = 16
    para.horizon = 3
    para.data_set = "traffic"
    para.batch_size = 32
    para.learning_rate = 1e-3
    para.model_dir = "./models/traffic"
    para.num_epochs = 40
    para.num_units = 25

    if para.data_set == "muse" or para.data_set == "lpd5":
        para.mts = 0

    para.logging_level = logging.INFO
    if para.attention_len == -1:
        para.attention_len = para.max_len

    create_dir(para.model_dir)
    json_path = para.model_dir + '/parameters.json'
    json.dump(vars(para), open(json_path, 'w'), indent=4)

    return para

def params_setup():
    parser = argparse.ArgumentParser()
    parser.add_argument('--attention_len', type=int, default=16)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--data_set', type=str, default='muse')
    parser.add_argument('--decay', type=int, default=0)
    parser.add_argument('--dropout', type=float, default=0.2)
    parser.add_argument('--file_output', type=int, default=1)
    parser.add_argument('--highway', type=int, default=0)
    parser.add_argument('--horizon', type=int, default=5)
    parser.add_argument('--init_weight', type=float, default=0.1)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--max_gradient_norm', type=float, default=5.0)
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--initial_weights', type=str, default='')
    parser.add_argument('--model_dir', type=str, default='./models/model')
    parser.add_argument('--mts', type=int, default=1)
    parser.add_argument('--split', type=float, default=0.2)
    parser.add_argument('--num_epochs', type=int, default=40)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--num_units', type=int, default=338)
    parser.add_argument('--first_epoch', type=int, default=1)
    parser.add_argument('--save_final_model_path', type=str, default='')
    parser.add_argument('--samples', type=int, default=1)
    para = parser.parse_args()

    if para.data_set == "muse" or para.data_set == "lpd5":
        para.mts = 0

    para.logging_level = logging.DEBUG
    if para.attention_len == -1:
        para.attention_len = para.max_len

    if not 0.01 <= para.split <= 0.5:
        para.split = 0.1
        logging.error('Split param must be in [0.01, 0.5]. Reset to 0.1')

    create_dir(para.model_dir)
    para.first_epoch = 1
    json_path = para.model_dir + '/parameters.json'
    json.dump(vars(para), open(json_path, 'w'), indent=4)

    return para

def __init__(self,
             start_ratio=0.0,
             end_ratio=0.98,
             _sample_rate=1.0,
             data_params={},
             tokenizer_pl=[],
             encoder_pl=[],
             _tokenizer_dir='cdlm',
             _dataset='cdlm'):
    # initialize variables
    self.__data_params = data_params
    self.__tokenizer_pl = tokenizer_pl
    self.__encoder_pl = encoder_pl
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get tokenizer
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        tmp_data = reduce(lambda x, y: x + y, data)
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*tmp_data))
        self.get_tokenizer()

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)
    data = reduce(lambda x, y: x + y, data)

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)

def __init__(self, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    zh_data, en_data = um_corpus.zh_en(get_test=False)
    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)

def extract_data(**context):
    logging.info('EXECUTION_DATE: %s' % context['task_instance'].execution_date)
    time_step = (context['task_instance'].execution_date - default_args['start_date']).days + 1
    logging.info('TIME_STEP: %d' % time_step)

    with open(input_file_path, mode='rb') as input_file:
        movielens = pickle.load(input_file)

    train = movielens['train'][:time_step]
    shape = movielens['shape']

    # TODO: OPTIMIZE
    rows = np.concatenate([i[0] for i in train])
    cols = np.concatenate([i[1] for i in train])
    dta = np.concatenate([i[2] for i in train])
    train_data = coo_matrix((dta, (rows, cols)), shape=shape)
    logging.info('NNZ: %d' % train_data.nnz)

    create_dir(output_path)
    with open(output_path + 'output_train.pickle', mode='wb') as output_file:
        pickle.dump(train_data, output_file)
        path = output_file.name

    # send the path for the next task
    context['task_instance'].xcom_push(key='time_step', value=time_step)
    context['task_instance'].xcom_push(key='path', value=path)

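# A hedged sketch of how extract_data might be wired into an Airflow 1.x DAG: the function
# reads execution_date and default_args and pushes its output path via XCom, which matches
# the PythonOperator(provide_context=True) pattern. The DAG id, schedule, and task id below
# are hypothetical, not taken from the original project.
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('movielens_pipeline', default_args=default_args, schedule_interval='@daily')

extract_task = PythonOperator(
    task_id='extract_data',
    python_callable=extract_data,
    provide_context=True,  # passes execution_date / task_instance into **context
    dag=dag,
)
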
def __init__(self, *args):
    # initialize variables
    self.__running = True
    self.__cur_index = 0
    self.__data = []
    self.__file_list = []
    self.__dirs = args

    # get the list of all files
    for dir_name in args:
        processed_dir_path = create_dir(data_dir, 'preprocessed', dir_name)
        self.__file_list += list(
            map(lambda x: os.path.join(processed_dir_path, x),
                os.listdir(processed_dir_path)))
    self.__len_files = len(self.__file_list)

    random.seed(self.RANDOM_STATE)
    random.shuffle(self.__file_list)

    self.start()

def __init__(self, _is_train, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    # initialize wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # um corpus data is only for training
    if _is_train:
        zh_data_3, en_data_3 = um_corpus.zh_en(get_test=False)
        # combine the um corpus data
        zh_data += tuple(zh_data_3)
        en_data += tuple(en_data_3)

    # combine data
    zh_data += zh_data_2
    en_data += en_data_2

    # word segmentation for zh_data
    zh_data = utils.pipeline(seg_zh_by_jieba_pipeline, zh_data)

    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)

def __init__(self, start_ratio=0.0, end_ratio=0.98, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)
    data = reduce(lambda x, y: x + y, data)

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)

parser.add_argument('--custom', type=bool, default=True)
parser.add_argument('--split_date', type=list, default=['20181201', '20190320'])
parser.add_argument('--dataset_address', type=str, default='./data/raw_time_series.parquet')
parser.add_argument('--output_dir', type=str, default='./output')

# %%
para = parser.parse_args(args=[])
para.logging_level = logging.INFO
logging_config_setup(para)

# %%
create_dir(para.model_dir)
create_dir(para.output_dir)
json_path = para.model_dir + '/parameters.json'
json.dump(vars(para), open(json_path, 'w'), indent=4)

# %%
graph = tf.Graph()

# %%
graph, model, data_generator = create_graph(para)

# %%
with tf.Session(config=config_setup(), graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    load_weights(para, sess, model)
    print_num_of_trainable_parameters()
    train(para, sess, model, data_generator)

def _download_file(self):
    logging.info("Downloading %s dataset from Google drive..." % self.para.data_set)
    create_dir(self.DATA_PATH)
    download_file_from_google_drive(self.DATASET_ID, self.DATA_FULL_PATH + ".tar")

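# download_file_from_google_drive is not shown in this section. Below is a minimal sketch of
# what such a helper commonly looks like (the widely used requests-based confirm-token
# pattern); this is an assumption about the helper, not its actual implementation here.
import requests

def download_file_from_google_drive(file_id, destination, chunk_size=32768):
    url = 'https://docs.google.com/uc?export=download'
    session = requests.Session()
    response = session.get(url, params={'id': file_id}, stream=True)
    # large files trigger a "can't scan for viruses" page; re-request with the confirm token
    token = next((v for k, v in response.cookies.items()
                  if k.startswith('download_warning')), None)
    if token:
        response = session.get(url, params={'id': file_id, 'confirm': token}, stream=True)
    with open(destination, 'wb') as f:
        for chunk in response.iter_content(chunk_size):
            if chunk:
                f.write(chunk)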