Example #1
import pickle

# get_data, prepare_data, numpify, get_feed_dicts and VOCAB_DATA_DIR are
# assumed to be defined in the surrounding module.
def load_data(placeholders):

    train_data = get_data()

    train_data, vocab = prepare_data(train_data)

    with open(VOCAB_DATA_DIR + "vocab.pkl", "wb") as f:
        pickle.dump(vocab, f)

    # Pad to a common length and convert nested lists to numpy arrays.
    train_data = numpify(train_data, pad=0)
    train_feed_dicts = get_feed_dicts(train_data, placeholders, batch_size=100,
                                      inst_length=len(train_data["sentences"]))
    return train_feed_dicts, vocab
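
Both examples rely on a `numpify` helper to turn ragged lists into fixed-shape arrays. The following is a minimal sketch of such a helper, written here for illustration only (the name `numpify_sketch` and its exact behavior are assumptions, not the library's actual implementation):

import numpy as np

def numpify_sketch(data, pad=0):
    """Sketch of a `numpify`-style helper: pads each list of lists in
    `data` to its maximum inner length and stacks it into a 2D numpy
    array. Values that are already numpy arrays pass through unchanged."""
    out = {}
    for key, value in data.items():
        if isinstance(value, np.ndarray):
            out[key] = value
            continue
        max_len = max(len(seq) for seq in value)
        arr = np.full((len(value), max_len), pad, dtype=np.int64)
        for i, seq in enumerate(value):
            arr[i, :len(seq)] = seq
        out[key] = arr
    return out

print(numpify_sketch({"sentences": [[1, 2, 3], [4, 5]]})["sentences"])
# -> [[1 2 3]
#     [4 5 0]]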
Example #2
import numpy as np

# numpify, get_buckets and GeneratorWithRestart are helpers from the
# surrounding module; `rs` is its module-level numpy random state
# (the seed here is an assumption, added to keep the snippet self-contained).
rs = np.random.RandomState(0)


def get_batches(data,
                batch_size=32,
                pad=0,
                bucket_order=None,
                bucket_structure=None,
                exact_epoch=False):
    """
    Creates generator that batches `data`.
    To avoid biases, it is advised to keep `bucket_order=None` and
    `bucket_structure=None` if computationally feasible, in which case
    batches are sampled from all instances.

    Args:
        `data`: dict with (multi-dimensional) numpy arrays or (nested) lists;
            the first dimension (`num_instances`) must be the same across all data values.
        `batch_size`: the desired batch size
        `pad`: padding symbol in case data contains lists of lists of different sizes
        `bucket_order`: argument `order` in get_buckets (list with keys); `None` if no bucketing
        `bucket_structure`: argument `structure` in get_buckets; `None` if no bucketing
        `exact_epoch`: if set to `True`, the final batch per bucket may be
            smaller than `batch_size`, but each instance will be seen exactly
            once during training. Default: `False`, which guarantees that
            every batch has exactly `batch_size` instances, so each instance
            per batch gets the same weight in the total loss (but not all
            instances are observed per epoch if bucket sizes are not
            multiples of `batch_size`).

    Returns:
        a generator that yields dicts with the same keys as `data`; each value
        is a batch of the corresponding data, a numpy array whose first
        dimension is at most `batch_size` (it is smaller only for the final
        batch of a bucket when `exact_epoch=True`).
    """
    assert isinstance(data, dict)

    # Use the first data value to check whether `data` is already numpified.
    data0 = list(data.values())[0]
    if not isinstance(data0, np.ndarray):
        # The original `data` is still needed for length-based bucketing below.
        data_np = numpify(data, pad)
    else:
        data_np = data

    def get_bucket_probs(_buckets2instances):
        N = float(np.sum([len(ids) for ids in _buckets2instances.values()]))
        return {
            bid: len(ids) / N if N > 0. else 0.
            for bid, ids in _buckets2instances.items()
        }

    def shuffle_buckets(_buckets2instances):
        # Iterate over sorted keys to keep shuffling deterministic.
        for bid in sorted(_buckets2instances.keys()):
            rs.shuffle(_buckets2instances[bid])

    buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure)
    n_buckets = len(buckets2instances)

    # If the average number of instances per bucket is smaller than batch_size,
    # force exact_epoch = True to avoid empty batches when debugging on small
    # data samples.
    if len(data0) < n_buckets * batch_size:
        exact_epoch = True

    def bucket_generator():
        buckets2instances, _ = get_buckets(data, bucket_order,
                                           bucket_structure)
        shuffle_buckets(buckets2instances)
        all_seen = False
        while not all_seen:
            # Sort by bucket id to keep iteration deterministic.
            bids, probs = zip(*sorted(get_bucket_probs(buckets2instances).items(),
                                      key=lambda x: x[0]))
            if np.sum(probs) == 0.:
                all_seen = True
            else:
                # Sample a bucket with probability proportional to its
                # remaining number of instances.
                bid = rs.choice(bids, replace=False, p=probs)
                batch_indices = buckets2instances[bid][:batch_size]
                buckets2instances[bid] = buckets2instances[bid][batch_size:]
                # With exact_epoch, also yield the final batch of a bucket
                # even if it is smaller than batch_size.
                if len(batch_indices) == batch_size or exact_epoch:
                    yield {k: data_np[k][batch_indices] for k in data_np}

    return GeneratorWithRestart(bucket_generator)
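
`get_batches` returns a `GeneratorWithRestart` rather than the generator itself, because a plain Python generator is exhausted after one pass and could not be reused across epochs. A minimal sketch of such a wrapper (an assumption about its behavior, not the library's actual implementation):

class GeneratorWithRestartSketch:
    """Sketch: re-invokes the generator function on every iteration,
    so the batch stream can be consumed once per epoch, any number of
    times. Not the library's actual implementation."""

    def __init__(self, iterator_fn):
        self.iterator_fn = iterator_fn

    def __iter__(self):
        return self.iterator_fn()  # fresh generator for each epoch

A usage sketch for `get_batches` itself, assuming the surrounding module's helpers are available:

data = {"sentences": [[1, 2, 3], [4, 5], [6]]}
batches = get_batches(data, batch_size=2, exact_epoch=True)
for epoch in range(2):
    for batch in batches:  # iteration restarts cleanly each epoch
        print(epoch, batch["sentences"].shape)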