def all_training_examples_cached():
    """
    Return the list of all training examples, building it at most once.

    The list is memoized in the module-level ``_all_examples``. On the first
    call we try to unpickle an ``(examples, count)`` pair from the file named
    by ``training_examples_cache_filename()``. If reading fails, or the stored
    count does not match, the examples are regenerated from every corpus
    listed by ``bicorpora_filenames()``, shuffled in place, and written back
    to the same cache file.

    NOTE(review): the cache is read with pickle, which executes arbitrary
    code on load; the cache file must only ever come from a trusted location.
    """
    global _all_examples
    if _all_examples is None:
        cache_filename = training_examples_cache_filename()
        try:
            cache_file = myopen(cache_filename)
            try:
                _all_examples, cnt = cPickle.load(cache_file)
            finally:
                # Release the handle whether or not unpickling succeeded.
                cache_file.close()
            assert len(_all_examples) == cnt
            logging.info("Successfully read %d training examples from %s", cnt, cache_filename)
            logging.info(stats())
        except Exception:
            # Any ordinary failure (missing file, corrupt pickle, count
            # mismatch) means "rebuild the cache". KeyboardInterrupt and
            # SystemExit are deliberately NOT caught, so the user can still
            # abort instead of triggering a full rebuild.
            logging.info("(Couldn't read training examples from %s, sorry)", cache_filename)
            logging.info("Caching all training examples...")
            logging.info(stats())
            _all_examples = []
            for l1, l2, f1, f2, falign in bicorpora_filenames():
                for e in get_training_biexample(l1, l2, f1, f2, falign):
                    _all_examples.append(e)
                    # Periodic progress report.
                    if len(_all_examples) % 10000 == 0:
                        logging.info("\tcurrently have read %d training examples", len(_all_examples))
                        logging.info(stats())
            random.shuffle(_all_examples)
            logging.info("...done caching all %d training examples", len(_all_examples))
            logging.info(stats())

            cnt = len(_all_examples)
            out_file = myopen(cache_filename, "wb")
            try:
                # The count is stored next to the list so a truncated cache
                # can be detected on the next read.
                cPickle.dump((_all_examples, cnt), out_file, protocol=-1)
            finally:
                # Close explicitly so the cache is flushed to disk.
                out_file.close()
            logging.info("Wrote %d training examples to %s", cnt, cache_filename)
            logging.info(stats())
    assert _all_examples is not None
    return _all_examples
# Example no. 2
# 0
def all_training_examples_cached():
    """
    Return the list of all training examples, building it at most once.

    The list is memoized in the module-level ``_all_examples``. On the first
    call we try to unpickle an ``(examples, count)`` pair from the file named
    by ``training_examples_cache_filename()``. If reading fails, or the stored
    count does not match, the examples are regenerated from every corpus
    listed by ``bicorpora_filenames()``, shuffled in place, and written back
    to the same cache file.

    NOTE(review): the cache is read with pickle, which executes arbitrary
    code on load; the cache file must only ever come from a trusted location.
    """
    global _all_examples
    if _all_examples is None:
        cache_filename = training_examples_cache_filename()
        try:
            cache_file = myopen(cache_filename)
            try:
                _all_examples, cnt = cPickle.load(cache_file)
            finally:
                # Release the handle whether or not unpickling succeeded.
                cache_file.close()
            assert len(_all_examples) == cnt
            logging.info("Successfully read %d training examples from %s",
                         cnt, cache_filename)
            logging.info(stats())
        except Exception:
            # Any ordinary failure (missing file, corrupt pickle, count
            # mismatch) means "rebuild the cache". KeyboardInterrupt and
            # SystemExit are deliberately NOT caught, so the user can still
            # abort instead of triggering a full rebuild.
            logging.info("(Couldn't read training examples from %s, sorry)",
                         cache_filename)
            logging.info("Caching all training examples...")
            logging.info(stats())
            _all_examples = []
            for l1, l2, f1, f2, falign in bicorpora_filenames():
                for e in get_training_biexample(l1, l2, f1, f2, falign):
                    _all_examples.append(e)
                    # Periodic progress report.
                    if len(_all_examples) % 10000 == 0:
                        logging.info(
                            "\tcurrently have read %d training examples",
                            len(_all_examples))
                        logging.info(stats())
            random.shuffle(_all_examples)
            logging.info("...done caching all %d training examples",
                         len(_all_examples))
            logging.info(stats())

            cnt = len(_all_examples)
            out_file = myopen(cache_filename, "wb")
            try:
                # The count is stored next to the list so a truncated cache
                # can be detected on the next read.
                cPickle.dump((_all_examples, cnt), out_file, protocol=-1)
            finally:
                # Close explicitly so the cache is flushed to disk.
                out_file.close()
            logging.info("Wrote %d training examples to %s",
                         cnt, cache_filename)
            logging.info(stats())
    assert _all_examples is not None
    return _all_examples
# Example no. 3
# 0
def get_training_minibatch_online():
    """
    Yield minibatches (lists of examples), cycling over the bilingual corpora.

    One example generator is created per entry of ``bicorpora_filenames()``.
    The generators are visited round-robin; on each visit up to
    ``MINIBATCH SIZE`` examples (from the "language-model" hyperparameters)
    are pulled from the current generator and yielded as one list. Iteration
    stops once every generator in a full cycle produced nothing.

    NOTE: this generator is currently disabled by the ``assert 0`` below, so
    the first ``next()`` raises AssertionError (unless asserts are stripped
    with ``-O``).

    Warning: The approach has the weird property that if one language
    pair's corpus is way longer than others, it will be the only examples
    for a while after the other corpora are exhausted.
    """

    assert 0  # We need to filter validation examples

    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    MINIBATCH_SIZE = HYPERPARAMETERS["MINIBATCH SIZE"]

    generators = []
    for l1, l2, f1, f2, falign in bicorpora_filenames():
        generators.append(get_training_biexample(l1, l2, f1, f2, falign))
    for l, f in monocorpora_filenames():
        # Any monolingual corpus trips this assert.
        assert 0

    if not generators:
        # No corpora at all: nothing to yield (previously an IndexError).
        return

    # Cycles over generators.
    idx = 0
    # Number of generators in a row that produced no examples. Counting
    # these, rather than remembering the index of the last productive
    # generator, also terminates when no generator ever yields anything.
    consecutive_empty = 0
    while 1:
        minibatch = []
        for e in generators[idx]:
            minibatch.append(e)
            if len(minibatch) >= MINIBATCH_SIZE:
                break
        if len(minibatch) > 0:
            consecutive_empty = 0
            yield minibatch
        else:
            consecutive_empty += 1
            if consecutive_empty >= len(generators):
                # We haven't had any minibatch in the last cycle over the
                # generators, so we are done with all corpora.
                break

        # Go to the next corpus
        idx = (idx + 1) % len(generators)
def get_training_minibatch_online():
    """
    Yield minibatches (lists of examples), cycling over the bilingual corpora.

    One example generator is created per entry of ``bicorpora_filenames()``.
    The generators are visited round-robin; on each visit up to
    ``MINIBATCH SIZE`` examples (from the "language-model" hyperparameters)
    are pulled from the current generator and yielded as one list. Iteration
    stops once every generator in a full cycle produced nothing.

    NOTE: this generator is currently disabled by the ``assert 0`` below, so
    the first ``next()`` raises AssertionError (unless asserts are stripped
    with ``-O``).

    Warning: The approach has the weird property that if one language
    pair's corpus is way longer than others, it will be the only examples
    for a while after the other corpora are exhausted.
    """

    assert 0 # We need to filter validation examples

    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    MINIBATCH_SIZE = HYPERPARAMETERS["MINIBATCH SIZE"]

    generators = []
    for l1, l2, f1, f2, falign in bicorpora_filenames():
        generators.append(get_training_biexample(l1, l2, f1, f2, falign))
    for l, f in monocorpora_filenames():
        # Any monolingual corpus trips this assert.
        assert 0

    if not generators:
        # No corpora at all: nothing to yield (previously an IndexError).
        return

    # Cycles over generators.
    idx = 0
    # Number of generators in a row that produced no examples. Counting
    # these, rather than remembering the index of the last productive
    # generator, also terminates when no generator ever yields anything.
    consecutive_empty = 0
    while 1:
        minibatch = []
        for e in generators[idx]:
            minibatch.append(e)
            if len(minibatch) >= MINIBATCH_SIZE:
                break
        if len(minibatch) > 0:
            consecutive_empty = 0
            yield minibatch
        else:
            consecutive_empty += 1
            if consecutive_empty >= len(generators):
                # We haven't had any minibatch in the last cycle over the
                # generators, so we are done with all corpora.
                break

        # Go to the next corpus
        idx = (idx + 1) % len(generators)