import cPickle
import logging
import random

# myopen, stats, training_examples_cache_filename, bicorpora_filenames, and
# get_training_biexample are assumed to be defined elsewhere in this project.

_all_examples = None

def all_training_examples_cached():
    global _all_examples
    if _all_examples is None:
        try:
            # Fast path: load the shuffled examples from the pickle cache.
            _all_examples, cnt = cPickle.load(myopen(training_examples_cache_filename()))
            assert len(_all_examples) == cnt
            logging.info("Successfully read %d training examples from %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
        except (IOError, EOFError, ValueError, AssertionError, cPickle.UnpicklingError):
            logging.info("(Couldn't read training examples from %s, sorry)" % (training_examples_cache_filename()))
            logging.info("Caching all training examples...")
            logging.info(stats())
            # Slow path: regenerate all examples from the bilingual corpora.
            _all_examples = []
            for l1, l2, f1, f2, falign in bicorpora_filenames():
                for e in get_training_biexample(l1, l2, f1, f2, falign):
                    _all_examples.append(e)
                    if len(_all_examples) % 10000 == 0:
                        logging.info("\tcurrently have read %d training examples" % len(_all_examples))
                        logging.info(stats())
            random.shuffle(_all_examples)
            logging.info("...done caching all %d training examples" % len(_all_examples))
            logging.info(stats())
            # Store the example count alongside the examples, as a cheap
            # integrity check for future loads; protocol=-1 selects the
            # highest (binary) pickle protocol.
            cnt = len(_all_examples)
            cPickle.dump((_all_examples, cnt), myopen(training_examples_cache_filename(), "wb"), protocol=-1)
            assert len(_all_examples) == cnt
            logging.info("Wrote %d training examples to %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
    assert _all_examples is not None
    return _all_examples
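
# A minimal sketch of the load-or-rebuild pattern that
# all_training_examples_cached() follows above, shown in isolation. The
# names load_or_rebuild, cache_filename, and rebuild_fn are hypothetical,
# and plain open() stands in for the project's myopen() helper.
def load_or_rebuild(cache_filename, rebuild_fn):
    try:
        # Reuse the pickled cache if it exists and is intact; the stored
        # count guards against truncated or stale cache files.
        value, cnt = cPickle.load(open(cache_filename, "rb"))
        assert len(value) == cnt
    except (IOError, EOFError, ValueError, AssertionError, cPickle.UnpicklingError):
        # Cache miss: rebuild from scratch and write the cache for next time.
        value = rebuild_fn()
        cPickle.dump((value, len(value)), open(cache_filename, "wb"), protocol=-1)
    return value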
def get_training_minibatch_online():
    """
    Warning: This approach has the weird property that if one language
    pair's corpus is way longer than the others, it will be the only
    source of examples for a while after the other corpora are exhausted.
    """
    assert 0    # We need to filter validation examples

    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    MINIBATCH_SIZE = HYPERPARAMETERS["MINIBATCH SIZE"]

    # Build one example generator per bilingual corpus.
    generators = []
    for l1, l2, f1, f2, falign in bicorpora_filenames():
#        print l1, l2, f1, f2, falign
        generators.append(get_training_biexample(l1, l2, f1, f2, falign))
    for l, f in monocorpora_filenames():
        assert 0    # Monolingual corpora are not supported yet.

    # Cycle over the generators round-robin, drawing one minibatch from
    # each corpus in turn.
    idx = 0
    last_minibatch = None
    while 1:
        minibatch = []
        for e in generators[idx]:
            minibatch.append(e)
            if len(minibatch) >= MINIBATCH_SIZE:
                break
        if len(minibatch) > 0:
            last_minibatch = idx
            yield minibatch
        elif last_minibatch == idx or \
                (last_minibatch is None and idx + 1 == len(generators)):
            # We haven't had any minibatch in the last full cycle over the
            # generators, so we are done with all corpora. (The second
            # condition covers the case where every corpus is empty from
            # the start, which would otherwise loop forever.)
            break
        # Go to the next corpus.
        idx = (idx + 1) % len(generators)
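
# A toy demonstration (hypothetical data, not part of the original module)
# of the caveat in get_training_minibatch_online()'s docstring: the short
# corpus interleaves with the long one at first, and once it is exhausted,
# every remaining minibatch is drawn from the long corpus alone.
def _demo_round_robin_skew():
    gens = [iter(["a%d" % i for i in range(4)]),    # short toy corpus
            iter(["b%d" % i for i in range(12)])]   # much longer toy corpus
    idx = 0
    last_minibatch = None
    while 1:
        minibatch = []
        for e in gens[idx]:
            minibatch.append(e)
            if len(minibatch) >= 2:     # toy MINIBATCH_SIZE of 2
                break
        if len(minibatch) > 0:
            last_minibatch = idx
            # Prints ['a0', 'a1'], ['b0', 'b1'], ['a2', 'a3'], ['b2', 'b3'],
            # then only b-minibatches until the long corpus runs dry.
            print minibatch
        elif last_minibatch == idx:
            break
        idx = (idx + 1) % len(gens)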