def main():
    print(utils.get_max_vowels(first_sentence))
    print()
    print(utils.get_max_len(first_sentence))
    print()
    print(utils.reverse(second_sentence))
    print()
    print(utils.get_info_for_obj(os))
    print()
    print(utils.get_info_for_obj(sys))
    print()
    print(utils.get_pseudo_sum(124))
    print()
    print(utils.get_primes(10000))
def print_stats(players, tokens):
    header = ("NOME", "FICHAS")
    body = zip(players, tokens)
    max_names_len = utils.get_max_len(players + [header[0]])
    max_tokens_len = utils.get_max_len(tokens + [header[1]])
    print(
        f"┌ {header[0]} { '─' * (max_names_len - 6 + 3) } {header[1]} { '─' * (max_tokens_len - 6) }┐"
    )
    for line in body:
        name, player_tokens = line
        name_length = len(name)
        name_spacer = " " * (max_names_len - name_length + 3)
        tokens_length = len(str(player_tokens))
        tokens_spacer = " " * (max_tokens_len - tokens_length)
        utils.print_colored(
            f"│ §y{name}{name_spacer}§g{player_tokens}{tokens_spacer} §0│")
    print("└" + "─" * (max_names_len + 3 + max_tokens_len + 2) + "┘")
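# `utils.get_max_len` itself is not shown in these snippets. In `print_stats`
# above it is applied to a mixed list of name strings and chip counts, so a
# minimal sketch (an assumption, not the project's actual helper) would measure
# the printed width of each item:
def get_max_len(items):
    """Return the length of the longest item, measured as printed text."""
    return max(len(str(item)) for item in items)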
def load_dataset(data_dir, model_params, inference_mode=False):
    """Loads the .npz file, and splits the set into train/valid/test."""
    # Normalizes the x and y columns using the training set.
    # Applies the same scaling factor to the valid and test sets.
    datasets = []
    if isinstance(model_params.data_set, list):
        datasets = model_params.data_set
    else:
        datasets = [model_params.data_set]

    train_strokes = None
    valid_strokes = None
    test_strokes = None

    for dataset in datasets:
        data_filepath = os.path.join(data_dir, dataset)
        if data_dir.startswith('http://') or data_dir.startswith('https://'):
            tf.logging.info('Downloading %s', data_filepath)
            response = requests.get(data_filepath)
            data = np.load(StringIO(response.content))
        else:
            data = np.load(data_filepath)  # load this into dictionary
        tf.logging.info('Loaded {}/{}/{} from {}'.format(
            len(data['train']), len(data['valid']), len(data['test']),
            dataset))
        if train_strokes is None:
            train_strokes = data['train']
            valid_strokes = data['valid']
            test_strokes = data['test']
        else:
            train_strokes = np.concatenate((train_strokes, data['train']))
            valid_strokes = np.concatenate((valid_strokes, data['valid']))
            test_strokes = np.concatenate((test_strokes, data['test']))

    all_strokes = np.concatenate((train_strokes, valid_strokes, test_strokes))
    num_points = 0
    for stroke in all_strokes:
        num_points += len(stroke)
    avg_len = num_points / len(all_strokes)
    tf.logging.info('Dataset combined: {} ({}/{}/{}), avg len {}'.format(
        len(all_strokes), len(train_strokes), len(valid_strokes),
        len(test_strokes), int(avg_len)))

    # calculate the max strokes we need.
    max_seq_len = utils.get_max_len(all_strokes)
    # overwrite the hps with this calculation.
    model_params.max_seq_len = max_seq_len

    tf.logging.info('model_params.max_seq_len %i.', model_params.max_seq_len)

    eval_model_params = sketch_rnn_model.copy_hparams(model_params)
    eval_model_params.use_input_dropout = 0
    eval_model_params.use_recurrent_dropout = 0
    eval_model_params.use_output_dropout = 0
    eval_model_params.is_training = 1

    if inference_mode:
        eval_model_params.batch_size = 1
        eval_model_params.is_training = 0

    sample_model_params = sketch_rnn_model.copy_hparams(eval_model_params)
    sample_model_params.batch_size = 1  # only sample one at a time
    sample_model_params.max_seq_len = 1  # sample one point at a time

    train_set = utils.DataLoader(
        train_strokes,
        model_params.batch_size,
        max_seq_length=model_params.max_seq_len,
        random_scale_factor=model_params.random_scale_factor,
        augment_stroke_prob=model_params.augment_stroke_prob)

    normalizing_scale_factor = train_set.calculate_normalizing_scale_factor()
    train_set.normalize(normalizing_scale_factor)
    print('Length original', len(train_strokes), len(valid_strokes),
          len(test_strokes))

    valid_set = utils.DataLoader(
        valid_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    valid_set.normalize(normalizing_scale_factor)

    test_set = utils.DataLoader(
        test_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    test_set.normalize(normalizing_scale_factor)

    tf.logging.info('normalizing_scale_factor %4.4f.', normalizing_scale_factor)

    result = [
        train_set, valid_set, test_set, model_params, eval_model_params,
        sample_model_params
    ]
    return result
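# In the sketch-data loaders in this collection, `utils.get_max_len` is applied
# to arrays of stroke sequences rather than text. There it is assumed to return
# the point count of the longest sequence, which then becomes max_seq_len; this
# is a sketch of the assumed behaviour, not the library source:
def get_max_len(strokes):
    """Return the number of points in the longest stroke sequence."""
    max_len = 0
    for stroke in strokes:
        max_len = max(max_len, len(stroke))
    return max_len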
def load_dataset(data_dir, model_params, testing_mode=False):
    """Loads the .npz file, and splits the set into train/valid/test."""
    # Normalizes the x and y columns using scale_factor.
    dataset = model_params.data_set
    data_filepath = os.path.join(data_dir, dataset)
    data = np.load(data_filepath, allow_pickle=True, encoding='latin1')

    # target data
    train_strokes = data['train']
    valid_strokes = data['valid']
    test_strokes = data['test']
    all_strokes = np.concatenate((train_strokes, valid_strokes, test_strokes))

    # standard data (reference data in paper)
    std_train_strokes = data['std_train']
    std_valid_strokes = data['std_valid']
    std_test_strokes = data['std_test']
    all_std_strokes = np.concatenate(
        (std_train_strokes, std_valid_strokes, std_test_strokes))

    print('Dataset combined: %d (train=%d/validate=%d/test=%d)' %
          (len(all_strokes), len(train_strokes), len(valid_strokes),
           len(test_strokes)))

    # calculate the max strokes we need.
    max_seq_len = utils.get_max_len(all_strokes)
    max_std_seq_len = utils.get_max_len(all_std_strokes)

    # overwrite the hps with this calculation.
    model_params.max_seq_len = max(max_seq_len, max_std_seq_len)
    print('model_params.max_seq_len set to %d.' % model_params.max_seq_len)

    eval_model_params = copy_hparams(model_params)
    eval_model_params.rnn_dropout_keep_prob = 1.0
    eval_model_params.is_training = True

    if testing_mode:  # for testing
        eval_model_params.batch_size = 1
        eval_model_params.is_training = False  # sample mode

    train_set = utils.DataLoader(
        train_strokes,
        model_params.batch_size,
        max_seq_length=model_params.max_seq_len,
        random_scale_factor=model_params.random_scale_factor,
        augment_stroke_prob=model_params.augment_stroke_prob)
    normalizing_scale_factor = model_params.scale_factor
    train_set.normalize(normalizing_scale_factor)

    valid_set = utils.DataLoader(
        valid_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    valid_set.normalize(normalizing_scale_factor)

    test_set = utils.DataLoader(
        test_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    test_set.normalize(normalizing_scale_factor)

    # process the reference dataset
    std_train_set = utils.DataLoader(
        std_train_strokes,
        model_params.batch_size,
        max_seq_length=model_params.max_seq_len,
        random_scale_factor=model_params.random_scale_factor,
        augment_stroke_prob=model_params.augment_stroke_prob)
    std_train_set.normalize(normalizing_scale_factor)

    std_valid_set = utils.DataLoader(
        std_valid_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    std_valid_set.normalize(normalizing_scale_factor)

    std_test_set = utils.DataLoader(
        std_test_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    std_test_set.normalize(normalizing_scale_factor)

    result = [
        train_set, valid_set, test_set, std_train_set, std_valid_set,
        std_test_set, model_params, eval_model_params
    ]
    return result
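# `copy_hparams` is called above but not defined in these snippets. It only
# needs to produce an independent hyper-parameter object that can be mutated
# for eval/sample settings without touching the original; a minimal sketch,
# assuming a plain deep copy is sufficient for the hparams object in use:
import copy

def copy_hparams(hparams):
    """Return an independent copy of the hyper-parameter object."""
    return copy.deepcopy(hparams)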
def load_dataset(data_dir, model_params, inference_mode=False, contain_labels=False):
    """Loads the .npz file, and splits the set into train/valid/test."""
    # Normalizes the x and y columns using the training set.
    # Applies the same scaling factor to the valid and test sets.
    # contain_labels: set to True to return labels for classification tasks,
    # defaults to False.
    datasets = []
    if isinstance(model_params.data_set, list):
        datasets = model_params.data_set
    else:
        datasets = [model_params.data_set]

    train_strokes = None
    valid_strokes = None
    test_strokes = None

    label_index = 0
    class_num = len(datasets)

    for dataset in datasets:
        # Get input data
        data_filepath = os.path.join(data_dir, "sketch", dataset)
        onv_left_filepath = os.path.join(data_dir, "onv_9936_thick", dataset)
        onv_right_filepath = os.path.join(data_dir, "onv_9936_thick_right", dataset)

        if data_dir.startswith('http://') or data_dir.startswith('https://'):
            tf.logging.info('Downloading %s', data_filepath)
            response = requests.get(data_filepath)
            data = np.load(StringIO(response.content))
        else:
            tf.logging.info('Getting data from %s', data_filepath)
            data = np.load(data_filepath)  # load this into dictionary

        tf.logging.info('Getting left onv from %s', onv_left_filepath)
        onv_left = np.load(onv_left_filepath)
        tf.logging.info('Getting right onv from %s', onv_right_filepath)
        onv_right = np.load(onv_right_filepath)

        train_size = len(onv_left['train'])
        valid_size = len(onv_left['valid'])
        test_size = len(onv_left['test'])
        tf.logging.info('Loaded {}/{}/{} from {}'.format(
            train_size, valid_size, test_size, dataset))

        # set labels for classification task
        cur_train_labels = np.zeros((train_size, class_num))
        cur_valid_labels = np.zeros((valid_size, class_num))
        cur_test_labels = np.zeros((test_size, class_num))
        cur_train_labels[:, label_index] = 1
        cur_valid_labels[:, label_index] = 1
        cur_test_labels[:, label_index] = 1
        # print("label_index", label_index, cur_train_labels[0])

        if train_strokes is None:
            train_strokes = data['train'][0:train_size]
            valid_strokes = data['valid'][0:valid_size]
            test_strokes = data['test'][0:test_size]
            train_onvs_left = onv_left['train'][0:train_size]
            valid_onvs_left = onv_left['valid'][0:valid_size]
            test_onvs_left = onv_left['test'][0:test_size]
            train_onvs_right = onv_right['train'][0:train_size]
            valid_onvs_right = onv_right['valid'][0:valid_size]
            test_onvs_right = onv_right['test'][0:test_size]
            train_labels = cur_train_labels[0:train_size]
            valid_labels = cur_valid_labels[0:valid_size]
            test_labels = cur_test_labels[0:test_size]
        else:
            train_strokes = np.concatenate((train_strokes, data['train'][0:train_size]))
            valid_strokes = np.concatenate((valid_strokes, data['valid'][0:valid_size]))
            test_strokes = np.concatenate((test_strokes, data['test'][0:test_size]))
            train_onvs_left = np.concatenate((train_onvs_left, onv_left['train'][0:train_size]))
            valid_onvs_left = np.concatenate((valid_onvs_left, onv_left['valid'][0:valid_size]))
            test_onvs_left = np.concatenate((test_onvs_left, onv_left['test'][0:test_size]))
            train_onvs_right = np.concatenate((train_onvs_right, onv_right['train'][0:train_size]))
            valid_onvs_right = np.concatenate((valid_onvs_right, onv_right['valid'][0:valid_size]))
            test_onvs_right = np.concatenate((test_onvs_right, onv_right['test'][0:test_size]))
            train_labels = np.concatenate((train_labels, cur_train_labels[0:train_size]))
            valid_labels = np.concatenate((valid_labels, cur_valid_labels[0:valid_size]))
            test_labels = np.concatenate((test_labels, cur_test_labels[0:test_size]))

        label_index += 1

    all_strokes = np.concatenate((train_strokes, valid_strokes, test_strokes))
    num_points = 0
    for stroke in all_strokes:
        num_points += len(stroke)
    avg_len = num_points / len(all_strokes)
    tf.logging.info('Dataset combined: {} ({}/{}/{}), avg len {}'.format(
        len(all_strokes), len(train_strokes), len(valid_strokes),
        len(test_strokes), int(avg_len)))

    # calculate the max strokes we need.
    max_seq_len = utils.get_max_len(all_strokes)
    # overwrite the hps with this calculation.
    model_params.max_seq_len = max_seq_len

    tf.logging.info('model_params.max_seq_len %i.', model_params.max_seq_len)

    eval_model_params = sketch_rnn_model.copy_hparams(model_params)
    eval_model_params.use_input_dropout = 0
    eval_model_params.use_recurrent_dropout = 0
    eval_model_params.use_output_dropout = 0
    eval_model_params.is_training = 1

    if inference_mode:
        eval_model_params.batch_size = 1
        eval_model_params.is_training = 0

    sample_model_params = sketch_rnn_model.copy_hparams(eval_model_params)
    sample_model_params.batch_size = 1  # only sample one at a time
    sample_model_params.max_seq_len = 1  # sample one point at a time

    train_set = utils.DataLoader(
        train_strokes,
        model_params.batch_size,
        max_seq_length=model_params.max_seq_len,
        random_scale_factor=model_params.random_scale_factor,
        augment_stroke_prob=model_params.augment_stroke_prob)
    normalizing_scale_factor = train_set.calculate_normalizing_scale_factor()
    train_set.normalize(normalizing_scale_factor)

    valid_set = utils.DataLoader(
        valid_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    valid_set.normalize(normalizing_scale_factor)

    test_set = utils.DataLoader(
        test_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    test_set.normalize(normalizing_scale_factor)

    tf.logging.info('normalizing_scale_factor %4.4f.', normalizing_scale_factor)

    # onv preprocess
    print("unique", np.unique(train_onvs_left))
    train_onvs_left = train_onvs_left / 255.0
    valid_onvs_left = valid_onvs_left / 255.0
    test_onvs_left = test_onvs_left / 255.0
    train_onvs_right = train_onvs_right / 255.0
    valid_onvs_right = valid_onvs_right / 255.0
    test_onvs_right = test_onvs_right / 255.0

    if not contain_labels:
        result = [
            train_set, valid_set, test_set, model_params, eval_model_params,
            sample_model_params, train_onvs_left, valid_onvs_left,
            test_onvs_left, train_onvs_right, valid_onvs_right, test_onvs_right
        ]
    else:  # return labels for classification tasks
        result = [
            train_set, valid_set, test_set, model_params, eval_model_params,
            sample_model_params, train_onvs_left, valid_onvs_left,
            test_onvs_left, train_onvs_right, valid_onvs_right,
            test_onvs_right, train_labels, valid_labels, test_labels
        ]
    return result
def load_dataset(data_dir, model_params, inference_mode=False):
    """Loads the .npz file, and splits the set into train/valid/test."""
    # Normalizes the x and y columns using the training set.
    # Applies the same scaling factor to the valid and test sets.
    if isinstance(model_params.data_set, list):
        datasets = model_params.data_set
    else:
        datasets = [model_params.data_set]

    train_strokes = None
    valid_strokes = None
    test_strokes = None
    train_data = []
    valid_data = []
    test_data = []
    dataset_lengths = []
    all_strokes = []

    for i, dataset in enumerate(datasets):
        data_filepath = os.path.join(data_dir, dataset)
        if six.PY3:
            tmp_data = np.load(data_filepath, encoding='latin1', allow_pickle=True)
        else:
            tmp_data = np.load(data_filepath, allow_pickle=True)
        all_strokes = np.concatenate(
            (all_strokes, tmp_data['train'], tmp_data['valid'], tmp_data['test']))

    max_seq_len = utils.get_max_len(all_strokes)
    model_params.max_seq_len = max_seq_len
    print('Max sequence length: ', max_seq_len)

    for i, dataset in enumerate(datasets):
        data_filepath = os.path.join(data_dir, dataset)
        if six.PY3:
            data = np.load(data_filepath, encoding='latin1', allow_pickle=True)
        else:
            data = np.load(data_filepath, allow_pickle=True)
        logger.info('Loaded {}/{}/{} from {}'.format(
            len(data['train']), len(data['valid']), len(data['test']), dataset))

        train_strokes = data['train']
        valid_strokes = data['valid']
        test_strokes = data['test']

        train_set = utils.DataLoader(
            train_strokes,
            model_params.batch_size,
            max_seq_length=max_seq_len,
            random_scale_factor=model_params.random_scale_factor,
            augment_stroke_prob=model_params.augment_stroke_prob)
        normalizing_scale_factor = train_set.calculate_normalizing_scale_factor()
        train_set.normalize(normalizing_scale_factor)
        train_set.strokes = [
            utils.to_big_strokes(stroke, max_seq_len)
            for stroke in train_set.strokes
        ]
        train_set.strokes = [
            np.insert(stroke, 0, [0, 0, 1, 0, 0], axis=0)
            for stroke in train_set.strokes
        ]

        valid_set = utils.DataLoader(
            valid_strokes,
            model_params.batch_size,
            max_seq_length=max_seq_len,
            random_scale_factor=model_params.random_scale_factor,
            augment_stroke_prob=model_params.augment_stroke_prob)
        valid_set.normalize(normalizing_scale_factor)
        valid_set.strokes = [
            utils.to_big_strokes(stroke, max_seq_len)
            for stroke in valid_set.strokes
        ]
        valid_set.strokes = [
            np.insert(stroke, 0, [0, 0, 1, 0, 0], axis=0)
            for stroke in valid_set.strokes
        ]

        test_set = utils.DataLoader(
            test_strokes,
            model_params.batch_size,
            max_seq_length=max_seq_len,
            random_scale_factor=model_params.random_scale_factor,
            augment_stroke_prob=model_params.augment_stroke_prob)
        test_set.normalize(normalizing_scale_factor)
        test_set.strokes = [
            utils.to_big_strokes(stroke, max_seq_len)
            for stroke in test_set.strokes
        ]
        test_set.strokes = [
            np.insert(stroke, 0, [0, 0, 1, 0, 0], axis=0)
            for stroke in test_set.strokes
        ]

        train_sketches = [{'dataset': dataset, 'draw': sketch}
                          for sketch in train_set.strokes]
        valid_sketches = [{'dataset': dataset, 'draw': sketch}
                          for sketch in valid_set.strokes]
        test_sketches = [{'dataset': dataset, 'draw': sketch}
                         for sketch in test_set.strokes]

        train_data.append(train_sketches)
        valid_data.append(valid_sketches)
        test_data.append(test_sketches)

    return [train_data, valid_data, test_data]
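# The variant above converts every sketch to the 5-element "big stroke" format
# and then prepends a start-of-sequence point [0, 0, 1, 0, 0]. A minimal sketch
# of what `utils.to_big_strokes` is assumed to do, following the usual
# sketch-rnn stroke-3 -> stroke-5 convention (not necessarily this project's
# exact implementation):
import numpy as np

def to_big_strokes(stroke, max_len=250):
    """Convert a [N, 3] stroke-3 array to a padded [max_len, 5] stroke-5 array.

    Assumes len(stroke) <= max_len. Columns: dx, dy, pen-down, pen-up, end.
    """
    result = np.zeros((max_len, 5), dtype=float)
    length = len(stroke)
    result[0:length, 0:2] = stroke[:, 0:2]          # dx, dy offsets
    result[0:length, 3] = stroke[:, 2]              # pen lifted after this point
    result[0:length, 2] = 1 - result[0:length, 3]   # pen touching the paper
    result[length:, 4] = 1                          # end-of-sketch padding
    return result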
def load_datasets(data_dir, model_params, inference_mode=False):
    """Load and preprocess data."""
    data = utils.load_dataset(data_dir)
    train_strokes = data['train']
    valid_strokes = data['valid']
    test_strokes = data['test']

    all_strokes = np.concatenate((train_strokes, valid_strokes, test_strokes))
    num_points = 0
    for stroke in all_strokes:
        num_points += len(stroke)
    avg_len = num_points / len(all_strokes)
    tf.logging.info('{} Shapes / {} Total points'.format(len(all_strokes), num_points))
    tf.logging.info('Dataset combined: {} ({}/{}/{}), avg len {}'.format(
        len(all_strokes), len(train_strokes), len(valid_strokes),
        len(test_strokes), int(avg_len)))

    # calculate the max strokes we need.
    max_seq_len = utils.get_max_len(all_strokes)
    # overwrite the hps with this calculation.
    model_params.max_seq_len = max_seq_len

    tf.logging.info('model_params.max_seq_len %i.', model_params.max_seq_len)

    eval_model_params = derender_model.copy_hparams(model_params)
    eval_model_params.use_input_dropout = 0
    eval_model_params.use_recurrent_dropout = 0
    eval_model_params.use_output_dropout = 0
    eval_model_params.is_training = 1

    if inference_mode:
        eval_model_params.batch_size = 1
        eval_model_params.is_training = 0

    sample_model_params = derender_model.copy_hparams(eval_model_params)
    sample_model_params.batch_size = 1  # only sample one at a time
    sample_model_params.max_seq_len = 1  # sample one point at a time

    train_set = utils.DataLoader(
        train_strokes,
        model_params.batch_size,
        max_seq_length=model_params.max_seq_len,
        random_scale_factor=model_params.random_scale_factor,
        augment_stroke_prob=model_params.augment_stroke_prob)
    normalizing_scale_factor = train_set.calculate_normalizing_scale_factor()
    train_set.normalize(normalizing_scale_factor)

    valid_set = utils.DataLoader(
        valid_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    valid_set.normalize(normalizing_scale_factor)

    test_set = utils.DataLoader(
        test_strokes,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    test_set.normalize(normalizing_scale_factor)

    tf.logging.info('normalizing_scale_factor %4.4f.', normalizing_scale_factor)

    result = [
        train_set, valid_set, test_set, model_params, eval_model_params,
        sample_model_params
    ]
    return result
def load_dataset(data_dir, model_params, inference_mode=False):
    """Loads the .npz file, and splits the set into train/valid/test."""
    # Normalizes the x and y columns using the training set.
    # Applies the same scaling factor to the valid and test sets.
    if isinstance(model_params.data_set, list):
        datasets = model_params.data_set
    else:
        datasets = [model_params.data_set]

    train_strokes = None
    valid_strokes = None
    test_strokes = None

    png_paths_map = {'train': [], 'valid': [], 'test': []}

    for dataset in datasets:
        if data_dir.startswith('http://') or data_dir.startswith('https://'):
            data_filepath = '/'.join([data_dir, dataset])
            print('Downloading %s' % data_filepath)
            response = requests.get(data_filepath)
            data = np.load(six.BytesIO(response.content), encoding='latin')
        else:
            data_filepath = os.path.join(data_dir, 'npz', dataset)
            if six.PY3:
                data = np.load(data_filepath, encoding='latin1')
            else:
                data = np.load(data_filepath)
        print('Loaded {}/{}/{} from {}'.format(
            len(data['train']), len(data['valid']), len(data['test']), dataset))

        if train_strokes is None:
            train_strokes = data['train']  # [N (#sketches),], each with [S (#points), 3]
            valid_strokes = data['valid']
            test_strokes = data['test']
        else:
            train_strokes = np.concatenate((train_strokes, data['train']))
            valid_strokes = np.concatenate((valid_strokes, data['valid']))
            test_strokes = np.concatenate((test_strokes, data['test']))

        splits = ['train', 'valid', 'test']
        for split in splits:
            for im_idx in range(len(data[split])):
                png_path = os.path.join(
                    data_dir, 'png', dataset[:-4], split,
                    str(model_params.img_H) + 'x' + str(model_params.img_W),
                    str(im_idx) + '.png')
                png_paths_map[split].append(png_path)

    all_strokes = np.concatenate((train_strokes, valid_strokes, test_strokes))
    num_points = 0
    for stroke in all_strokes:
        num_points += len(stroke)
    avg_len = num_points / len(all_strokes)
    print('Dataset combined: {} ({}/{}/{}), avg len {}'.format(
        len(all_strokes), len(train_strokes), len(valid_strokes),
        len(test_strokes), int(avg_len)))

    assert len(train_strokes) == len(png_paths_map['train'])
    assert len(valid_strokes) == len(png_paths_map['valid'])
    assert len(test_strokes) == len(png_paths_map['test'])

    # calculate the max strokes we need.
    max_seq_len = utils.get_max_len(all_strokes)
    # overwrite the hps with this calculation.
    model_params.max_seq_len = max_seq_len
    print('model_params.max_seq_len %i.' % model_params.max_seq_len)

    eval_model_params = sketch_rnn_model.copy_hparams(model_params)
    eval_model_params.use_input_dropout = 0
    eval_model_params.use_recurrent_dropout = 0
    eval_model_params.use_output_dropout = 0
    eval_model_params.is_training = 1

    if inference_mode:
        eval_model_params.batch_size = 1
        eval_model_params.is_training = 0

    sample_model_params = sketch_rnn_model.copy_hparams(eval_model_params)
    sample_model_params.batch_size = 1  # only sample one at a time
    sample_model_params.max_seq_len = 1  # sample one point at a time

    train_set = utils.DataLoader(
        train_strokes,
        png_paths_map['train'],
        model_params.img_H,
        model_params.img_W,
        model_params.batch_size,
        max_seq_length=model_params.max_seq_len,
        random_scale_factor=model_params.random_scale_factor,
        augment_stroke_prob=model_params.augment_stroke_prob)
    normalizing_scale_factor = train_set.calculate_normalizing_scale_factor()
    train_set.normalize(normalizing_scale_factor)

    valid_set = utils.DataLoader(
        valid_strokes,
        png_paths_map['valid'],
        eval_model_params.img_H,
        eval_model_params.img_W,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    valid_set.normalize(normalizing_scale_factor)

    test_set = utils.DataLoader(
        test_strokes,
        png_paths_map['test'],
        eval_model_params.img_H,
        eval_model_params.img_W,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    test_set.normalize(normalizing_scale_factor)

    print('normalizing_scale_factor %4.4f.' % normalizing_scale_factor)

    result = [
        train_set, valid_set, test_set, model_params, eval_model_params,
        sample_model_params
    ]
    return result
def load_dataset(data_dir, datasets, inference_mode=False):
    """Loads the .npz file, and splits the set into train/valid/test."""
    # Normalizes the x and y columns using the training set.
    # Applies the same scaling factor to the valid and test sets.
    train_strokes = None
    valid_strokes = None
    test_strokes = None

    for dataset in datasets:
        data_filepath = os.path.join(data_dir, dataset)
        if data_dir.startswith('http://') or data_dir.startswith('https://'):
            tf.logging.info('Downloading %s', data_filepath)
            response = requests.get(data_filepath)
            data = np.load(StringIO(response.content))
        else:
            if six.PY3:
                data = np.load(data_filepath, encoding='latin1')
            else:
                data = np.load(data_filepath)
        tf.logging.info('Loaded {}/{}/{} from {}'.format(
            len(data['train']), len(data['valid']), len(data['test']),
            dataset))
        if train_strokes is None:
            train_strokes = data['train']
            valid_strokes = data['valid']
            test_strokes = data['test']
        else:
            train_strokes = np.concatenate((train_strokes, data['train']))
            valid_strokes = np.concatenate((valid_strokes, data['valid']))
            test_strokes = np.concatenate((test_strokes, data['test']))

    all_strokes = np.concatenate((train_strokes, valid_strokes, test_strokes))
    num_points = 0
    for stroke in all_strokes:
        num_points += len(stroke)
    avg_len = num_points / len(all_strokes)
    tf.logging.info('Dataset combined: {} ({}/{}/{}), avg len {}'.format(
        len(all_strokes), len(train_strokes), len(valid_strokes),
        len(test_strokes), int(avg_len)))

    # calculate the max strokes we need.
    max_seq_len = utils.get_max_len(all_strokes)
    tf.logging.info('model_params.max_seq_len %i.', max_seq_len)

    train_set = utils.DataLoader(
        train_strokes, random_scale_factor=0.1, augment_stroke_prob=0.1)
    normalizing_scale_factor = train_set.calculate_normalizing_scale_factor()
    train_set.normalize(normalizing_scale_factor)

    valid_set = utils.DataLoader(
        valid_strokes, random_scale_factor=0.0, augment_stroke_prob=0.0)
    valid_set.normalize(normalizing_scale_factor)

    test_set = utils.DataLoader(
        test_strokes, random_scale_factor=0.0, augment_stroke_prob=0.0)
    test_set.normalize(normalizing_scale_factor)

    tf.logging.info('normalizing_scale_factor %4.4f.', normalizing_scale_factor)

    result = [train_set, valid_set, test_set]
    return result
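# Every loader in this collection normalizes offsets with a single scale factor
# computed from the training split only. The DataLoader methods are not shown
# here; a standalone sketch of the assumed behaviour (the factor is the standard
# deviation of all dx/dy offsets, and normalize() divides those columns by it):
import numpy as np

def calculate_normalizing_scale_factor(strokes):
    """Std of the dx/dy offsets across every training stroke sequence."""
    offsets = np.concatenate([np.asarray(s)[:, 0:2].reshape(-1) for s in strokes])
    return np.std(offsets)

def normalize(strokes, scale_factor):
    """Divide the dx/dy columns of each stroke sequence by scale_factor (in place).

    Assumes each sequence is a float array of shape [N, 3] or [N, 5].
    """
    for s in strokes:
        s[:, 0:2] /= scale_factor
    return strokes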
def load_dataset(sketch_data_dir, photo_data_dir, model_params, inference_mode=False):
    """Loads the .npz file, and splits the set into train/test."""
    # Normalizes the x and y columns using the training set.
    # Applies the same scaling factor to the test set.
    if isinstance(model_params.data_set, list):
        datasets = model_params.data_set
    else:
        datasets = [model_params.data_set]

    train_strokes = None
    test_strokes = None
    train_image_paths = []
    test_image_paths = []

    for dataset in datasets:
        if model_params.data_type == 'QMUL':
            train_data_filepath = os.path.join(sketch_data_dir, dataset,
                                               'train_svg_sim_spa_png.h5')
            test_data_filepath = os.path.join(sketch_data_dir, dataset,
                                              'test_svg_sim_spa_png.h5')

            train_data_dict = utils.load_hdf5(train_data_filepath)
            test_data_dict = utils.load_hdf5(test_data_filepath)

            train_sketch_data = utils.reassemble_data(
                train_data_dict['image_data'],
                train_data_dict['data_offset'])  # list of [N_sketches], each [N_points, 4]
            train_photo_names = train_data_dict['image_base_name']  # [N_sketches, 1], byte
            train_photo_paths = [
                os.path.join(photo_data_dir, train_photo_names[i, 0].decode() + '.png')
                for i in range(train_photo_names.shape[0])
            ]  # [N_sketches], str

            test_sketch_data = utils.reassemble_data(
                test_data_dict['image_data'],
                test_data_dict['data_offset'])  # list of [N_sketches], each [N_points, 4]
            test_photo_names = test_data_dict['image_base_name']  # [N_sketches, 1], byte
            test_photo_paths = [
                os.path.join(photo_data_dir, test_photo_names[i, 0].decode() + '.png')
                for i in range(test_photo_names.shape[0])
            ]  # [N_sketches], str

            # transfer stroke-4 to stroke-3
            train_sketch_data = utils.to_normal_strokes_4to3(train_sketch_data)
            test_sketch_data = utils.to_normal_strokes_4to3(
                test_sketch_data)  # [N_sketches,], each with [N_points, 3]

            if train_strokes is None:
                train_strokes = train_sketch_data
                test_strokes = test_sketch_data
            else:
                train_strokes = np.concatenate((train_strokes, train_sketch_data))
                test_strokes = np.concatenate((test_strokes, test_sketch_data))
        elif model_params.data_type == 'QuickDraw':
            data_filepath = os.path.join(sketch_data_dir, dataset, 'npz',
                                         'sketchrnn_' + dataset + '.npz')
            if six.PY3:
                data = np.load(data_filepath, encoding='latin1')
            else:
                data = np.load(data_filepath)

            if train_strokes is None:
                train_strokes = data['train']  # [N_sketches,], each with [N_points, 3]
                test_strokes = data['test']
            else:
                train_strokes = np.concatenate((train_strokes, data['train']))
                test_strokes = np.concatenate((test_strokes, data['test']))

            train_photo_paths = [
                os.path.join(
                    sketch_data_dir, dataset, 'png', 'train',
                    str(model_params.image_size) + 'x' + str(model_params.image_size),
                    str(im_idx) + '.png')
                for im_idx in range(len(data['train']))
            ]
            test_photo_paths = [
                os.path.join(
                    sketch_data_dir, dataset, 'png', 'test',
                    str(model_params.image_size) + 'x' + str(model_params.image_size),
                    str(im_idx) + '.png')
                for im_idx in range(len(data['test']))
            ]
        else:
            raise Exception('Unknown data type:', model_params.data_type)

        print('Loaded {}/{} from {} {}'.format(
            len(train_photo_paths), len(test_photo_paths),
            model_params.data_type, dataset))
        train_image_paths += train_photo_paths
        test_image_paths += test_photo_paths

    all_strokes = np.concatenate((train_strokes, test_strokes))
    num_points = 0
    for stroke in all_strokes:
        num_points += len(stroke)
    avg_len = num_points / len(all_strokes)
    print('Dataset combined: {} ({}/{}), avg len {}'.format(
        len(all_strokes), len(train_strokes), len(test_strokes), int(avg_len)))

    assert len(train_image_paths) == len(train_strokes)
    assert len(test_image_paths) == len(test_strokes)

    # calculate the max strokes we need.
    max_seq_len = utils.get_max_len(all_strokes)
    # overwrite the hps with this calculation.
    model_params.max_seq_len = max_seq_len
    print('model_params.max_seq_len %i.' % model_params.max_seq_len)

    eval_model_params = sketch_p2s_model.copy_hparams(model_params)
    eval_model_params.use_input_dropout = 0
    eval_model_params.use_recurrent_dropout = 0
    eval_model_params.use_output_dropout = 0
    eval_model_params.is_training = 1

    if inference_mode:
        eval_model_params.batch_size = 1
        eval_model_params.is_training = 0

    sample_model_params = sketch_p2s_model.copy_hparams(eval_model_params)
    sample_model_params.batch_size = 1  # only sample one at a time
    sample_model_params.max_seq_len = 1  # sample one point at a time

    train_set = utils.DataLoader(
        train_strokes,
        train_image_paths,
        model_params.image_size,
        model_params.image_size,
        model_params.batch_size,
        max_seq_length=model_params.max_seq_len,
        random_scale_factor=model_params.random_scale_factor,
        augment_stroke_prob=model_params.augment_stroke_prob)
    normalizing_scale_factor = train_set.calculate_normalizing_scale_factor()
    train_set.normalize(normalizing_scale_factor)

    # valid_set = utils.DataLoader(
    #     valid_strokes,
    #     eval_model_params.batch_size,
    #     max_seq_length=eval_model_params.max_seq_len,
    #     random_scale_factor=0.0,
    #     augment_stroke_prob=0.0)
    # valid_set.normalize(normalizing_scale_factor)

    test_set = utils.DataLoader(
        test_strokes,
        test_image_paths,
        model_params.image_size,
        model_params.image_size,
        eval_model_params.batch_size,
        max_seq_length=eval_model_params.max_seq_len,
        random_scale_factor=0.0,
        augment_stroke_prob=0.0)
    test_set.normalize(normalizing_scale_factor)

    print('normalizing_scale_factor %4.4f.' % normalizing_scale_factor)

    result = [
        train_set, None, test_set, model_params, eval_model_params,
        sample_model_params
    ]
    return result
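# These loaders return their pieces as a positional list, so callers unpack
# them in order. A minimal usage sketch for the paired photo/sketch loader
# above; the directory arguments are placeholders, not values taken from the
# snippet, and `hps` stands in for the caller's hyper-parameter object:
def prepare_data(hps):
    (train_set, _, test_set, hps,
     eval_hps, sample_hps) = load_dataset('data/sketch', 'data/photo', hps)
    print('max_seq_len after loading: %d' % hps.max_seq_len)
    return train_set, test_set, hps, eval_hps, sample_hps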