import pickle

import numpy as np

# Note: helpers such as get_data, prepare_data, numpify, get_feed_dicts, get_buckets,
# GeneratorWithRestart, the random state `rs`, and the constant VOCAB_DATA_DIR are assumed
# to be defined or imported elsewhere in this module.


def load_data(placeholders):
    train_data = get_data()
    train_data, vocab = prepare_data(train_data)
    with open(VOCAB_DATA_DIR + "vocab.pkl", "wb") as f:
        pickle.dump(vocab, f)
    # pad to the same length and convert lists to numpy arrays
    train_data = numpify(train_data, pad=0)
    train_feed_dicts = get_feed_dicts(train_data, placeholders, batch_size=100,
                                      inst_length=len(train_data["sentences"]))
    return train_feed_dicts, vocab

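# --- Usage sketch (not part of the original code; names below are hypothetical) ---
# `placeholders` is presumably a dict of TensorFlow placeholders that the feed dicts are
# built for, and the output can then be consumed in a standard training loop:
#
#     train_feed_dicts, vocab = load_data(placeholders)
#     for epoch in range(num_epochs):
#         for feed_dict in train_feed_dicts:
#             sess.run(train_op, feed_dict=feed_dict)
#
# `num_epochs`, `sess`, and `train_op` stand in for the caller's own training setup.
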
def get_batches(data, batch_size=32, pad=0, bucket_order=None,
                bucket_structure=None, exact_epoch=False):
    """Creates a generator that batches `data`.

    To avoid biases, it is advised to keep `bucket_order=None` and `bucket_structure=None`
    if computationally possible (which will sample batches from all instances).

    Args:
        data: dict with (multi-dimensional) numpy arrays or (nested) lists; the first inner
            dimension (`num_instances`) should be the same over all data values.
        batch_size: the desired batch size
        pad: padding symbol in case data contains lists of lists of different sizes
        bucket_order: argument `order` in `get_buckets` (list with keys); `None` if no bucketing
        bucket_structure: argument `structure` in `get_buckets`; `None` if no bucketing
        exact_epoch: if set to `True`, the final batch per bucket may be smaller, but each
            instance will be seen exactly once during training. Default: `False`, to be certain
            during training that each instance per batch gets the same weight in the total loss
            (but not all instances are observed per epoch if bucket sizes are not a multiple
            of `batch_size`).

    Returns:
        a generator that generates a dict with the same keys as `data`, and as values data
        batches consisting of `[batch_size x num_instances]` 2D numpy tensors (the 1st
        dimension is at most `batch_size`, but may be smaller to cover all instances exactly
        once per epoch if `exact_epoch=True`)
    """
    assert isinstance(data, dict)
    data0 = list(data.values())[0]
    if not isinstance(data0, np.ndarray):
        # still need the original data for length-based bucketing
        data_np = numpify(data, pad)
    else:
        data_np = data

    def get_bucket_probs(_buckets2instances):
        N = float(np.sum([len(ids) for ids in _buckets2instances.values()]))
        return {bid: len(ids) / N if N > 0. else 0.
                for bid, ids in _buckets2instances.items()}

    def shuffle_buckets(_buckets2instances):
        for bid in sorted(_buckets2instances.keys()):  # sorted: to keep deterministic
            rs.shuffle(_buckets2instances[bid])

    buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure)
    n_buckets = len(buckets2instances)

    # if the average number of instances per bucket is smaller than batch_size:
    # set exact_epoch = True to avoid empty batches during debugging on small data samples
    exact_epoch = True if len(data0) < n_buckets * batch_size else exact_epoch

    def bucket_generator():
        buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure)
        shuffle_buckets(buckets2instances)
        all_seen = False
        while not all_seen:
            # sorted keys: to keep deterministic
            bids, probs = zip(*sorted(get_bucket_probs(buckets2instances).items(),
                                      key=lambda x: x[0]))
            if np.sum(probs) == 0.:
                all_seen = True
            else:
                # sample bucket according to remaining size
                bid = rs.choice(bids, replace=False, p=probs)
                batch_indices = buckets2instances[bid][:batch_size]
                buckets2instances[bid] = buckets2instances[bid][batch_size:]
                # if required by exact_epoch: also include the last batch in a bucket if too small
                if len(batch_indices) == batch_size or exact_epoch:
                    yield {k: data_np[k][batch_indices] for k in data_np}

    return GeneratorWithRestart(bucket_generator)
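
# --- Usage sketch (not part of the original code; toy data and values are illustrative) ---
# With `bucket_order=None` and `bucket_structure=None`, all instances end up in a single
# bucket and batches are sampled across the whole dataset. The values of `data` only need to
# share the same first dimension (`num_instances`); nested lists are padded via `numpify`:
#
#     data = {
#         "sentences": [[1, 2, 3], [4, 5], [6]],  # variable-length lists, padded with `pad`
#         "labels": [0, 1, 0],
#     }
#     for batch in get_batches(data, batch_size=2, exact_epoch=True):
#         print(batch["sentences"].shape, batch["labels"].shape)
#
# With `exact_epoch=True`, the final batch may be smaller than `batch_size` so that every
# instance is yielded exactly once per epoch.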