def test_hypertrain_loadcheckpoints():
    """A previously generated checkpoints file can be used to continue the hyperoptimization"""
    modelclass = SmallWavenet
    corpus = Corpus([
        "This is a very small corpus for testing the hypertrain procedure.",
        "Hope it works!!!"
    ])
    encoder = Encoder(corpus, CharTokenizer)
    checkpointsfile = DATAFOLDER + "checkpoints"
    with NamedTemporaryFile("r") as tempfile:
        tempdir = mkdtemp()
        # Copy the stored checkpoints into the temporary file so that
        # hypertrain resumes from them instead of starting fresh.
        # NOTE(review): copying over an open NamedTemporaryFile works on
        # POSIX but would fail on Windows — confirm target platforms.
        tmpname = tempfile.name
        copyfile(checkpointsfile, tmpname)
        try:
            model = hypertrain(modelclass, encoder, corpus, tempdir,
                               n_calls=15, verbose=2, valmask=[False, True],
                               patience=1, maxepochs=10, checkpointfile=tmpname)
        finally:
            # Always remove the scratch directory, even if hypertrain raises
            # (the original leaked it on failure).
            rmtree(tempdir)
    assert model is not None
def test_corpus_jsonload():
    """Loading a JSON corpus works as expected"""
    expected = [
        "Na Boca da Noite",
        "The Other Side of the Wind",
        "Prata Palomares"
    ]
    corpus = Corpus.load_json(DATAFOLDER + "jsoncorpus.json")
    # Iterating the corpus yields documents in file order
    for doc, exp in zip(corpus, expected):
        print("Expected", exp)
        print("Obtained", doc)
        assert doc == exp
    # The corpus reports its document count
    assert len(corpus) == 3
    # Documents are also reachable by index
    for idx, exp in enumerate(expected):
        assert corpus[idx] == exp
    # Conditioners can be iterated alongside documents
    expected = [
        {"genres": ["Drama"]},
        {"genres": ["Drama"]},
        {"genres": ["Thriller"]}
    ]
    for cond, exp in zip(corpus.iterconditioners(), expected):
        print("Expected", exp)
        print("Obtained", cond)
        assert cond == exp
def tokenize(inputcorpus, corpusformat, outputcorpus, tokenizername):
    """Tokenizes a corpus and produces a new corpus of tokens in JSON format

    Arguments:
        inputcorpus: path to the corpus file to read
        corpusformat: key into FORMATTERSBYNAME selecting the corpus loader
        outputcorpus: path where the tokenized corpus is saved as JSON
        tokenizername: name of the tokenizer class, resolved via tokenizerbyname
    """
    # Read corpus using the loader registered for the given format
    corpus = FORMATTERSBYNAME[corpusformat](inputcorpus)
    # Show a small sample (up to 3 documents) for visual inspection
    print(corpus[0:min(3, len(corpus))])
    # Fit tokenizer on corpus
    tokenizer = tokenizerbyname(tokenizername)()
    tokenizer.fit(corpus)
    # Transform corpus document by document
    transformed = Corpus([tokenizer.transform(doc) for doc in corpus])
    # Bound the sample by the transformed corpus itself (the original used
    # len(corpus), which only worked because both lengths coincide)
    print(transformed[0:min(3, len(transformed))])
    # Save resultant processed corpus
    transformed.save_json(outputcorpus)
def test_writer_beamsearch():
    """Beam search works as expected"""
    # A mock model with a character encoder over a tiny corpus
    encoder = Encoder(corpus=Corpus(["abc"]), tokenizer=CharTokenizer())
    writer = Writer(MockModel(), encoder, creativity=0, beamsize=3, batchsize=3)
    # With zero creativity the search from this seed is fully deterministic
    obtained = writer.beamsearch(np.array([0, 0]))
    expected = [0, 0, 0]
    print("Expected", expected)
    print("Obtained", obtained)
    assert obtained == expected
def test_corpus_multilineload():
    """Loading a multiline text corpus works as expected"""
    expected = [
        "This is a multidocument.",
        "The file stores one document per line.",
        "So there are three documents here."
    ]
    corpus = Corpus.load_multilinetxt(DATAFOLDER + "multiline.txt")
    # Iterating the corpus yields one document per input line, in order
    for doc, exp in zip(corpus, expected):
        print("Expected", exp)
        print("Obtained", doc)
        assert doc == exp
    # The corpus reports its document count
    assert len(corpus) == 3
    # Documents are also reachable by index
    for idx, exp in enumerate(expected):
        assert corpus[idx] == exp
def test_corpus_singletxtload():
    """Loading a single text corpus works as expected"""
    # The whole file, newlines included, must come back as one document
    expected = (
        "This is is a single document corpus.\n"
        "All the lines from this file belong to the same document.\n"
        "And now for something different!\n"
        "\n"
        "PINEAPPLES!!!\n"
    )
    corpus = Corpus.load_singletxt(DATAFOLDER + "singledoc.txt")
    # Iteration yields the single document
    for doc in corpus:
        print("Expected", expected)
        print("Obtained", doc)
        assert doc == expected
    # Exactly one document, also reachable by index
    assert len(corpus) == 1
    assert corpus[0] == expected
def test_hypertrain_run():
    """A small hypertraining procedure can be run"""
    modelclass = PerceptronModel
    corpus = Corpus([
        "This is a very small corpus for testing the hypertrain procedure.",
        "Hope it works!!!"
    ])
    encoder = Encoder(corpus, CharTokenizer)
    with NamedTemporaryFile("r") as tempfile:
        tempdir = mkdtemp()
        tmpname = tempfile.name
        try:
            model = hypertrain(modelclass, encoder, corpus, tempdir,
                               n_calls=15, verbose=2, valmask=[False, True],
                               patience=1, maxepochs=10, checkpointfile=tmpname)
        finally:
            # Always remove the scratch directory, even if hypertrain raises
            # (the original leaked it on failure).
            rmtree(tempdir)
    assert model is not None