def test_hypertrain_loadcheckpoints():
    """A previously generated checkpoints file can be used to continue the hyperoptimization"""
    modelclass = SmallWavenet
    corpus = Corpus([
        "This is a very small corpus for testing the hypertrain procedure.",
        "Hope it works!!!"
    ])
    encoder = Encoder(corpus, CharTokenizer)
    checkpointsfile = DATAFOLDER + "checkpoints"

    with NamedTemporaryFile("r") as tempfile:
        tempdir = mkdtemp()
        try:
            tmpname = tempfile.name
            # Seed the temporary file with the pre-generated checkpoints so
            # hypertrain resumes from them instead of starting from scratch
            copyfile(checkpointsfile, tmpname)
            model = hypertrain(modelclass,
                               encoder,
                               corpus,
                               tempdir,
                               n_calls=15,
                               verbose=2,
                               valmask=[False, True],
                               patience=1,
                               maxepochs=10,
                               checkpointfile=tmpname)
        finally:
            # Remove the scratch directory even when hypertrain raises,
            # so a failing test run does not leak temporary folders
            rmtree(tempdir)
        assert model is not None
# --- Example 2 ---
def test_corpus_jsonload():
    """Loading a JSON corpus works as expected"""
    expected_docs = [
        "Na Boca da Noite",
        "The Other Side of the Wind",
        "Prata Palomares"
    ]
    corpus = Corpus.load_json(DATAFOLDER + "jsoncorpus.json")

    # Iteration yields the documents in file order
    for document, reference in zip(corpus, expected_docs):
        print("Expected", reference)
        print("Obtained", document)
        assert document == reference

    # The corpus reports the right number of documents
    assert len(corpus) == 3

    # Indexed access agrees with iteration order
    for position, reference in enumerate(expected_docs):
        assert corpus[position] == reference

    # Conditioners are loaded alongside the documents
    expected_conditioners = [
        {"genres": ['Drama']},
        {"genres": ['Drama']},
        {"genres": ['Thriller']}
    ]
    for conditioner, reference in zip(corpus.iterconditioners(), expected_conditioners):
        print("Expected", reference)
        print("Obtained", conditioner)
        assert conditioner == reference
def tokenize(inputcorpus, corpusformat, outputcorpus, tokenizername):
    """Tokenizes a corpus and produces a new corpus of tokens in JSON format"""
    # Load the input corpus with the formatter matching the requested format
    corpus = FORMATTERSBYNAME[corpusformat](inputcorpus)
    preview = min(3, len(corpus))
    print(corpus[0:preview])

    # Build the requested tokenizer and fit it on the whole corpus
    tokenizer = tokenizerbyname(tokenizername)()
    tokenizer.fit(corpus)

    # Tokenize every document into a fresh Corpus
    transformed = Corpus([tokenizer.transform(document) for document in corpus])
    print(transformed[0:preview])

    # Persist the tokenized corpus as JSON
    transformed.save_json(outputcorpus)
def test_writer_beamsearch():
    """Beam search works as expected"""
    dummy = MockModel()
    enc = Encoder(corpus=Corpus(["abc"]), tokenizer=CharTokenizer())
    # creativity=0 makes the search deterministic; beam and batch size of 3
    writer = Writer(dummy, enc, creativity=0, beamsize=3, batchsize=3)

    expected = [0, 0, 0]
    obtained = writer.beamsearch(np.array([0, 0]))
    print("Expected", expected)
    print("Obtained", obtained)
    assert obtained == expected
# --- Example 5 ---
def test_corpus_multilineload():
    """Loading a multiline text corpus works as expected"""
    references = [
        "This is a multidocument.",
        "The file stores one document per line.",
        "So there are three documents here."
    ]
    corpus = Corpus.load_multilinetxt(DATAFOLDER + "multiline.txt")

    # Iteration yields one document per input line, in order
    for document, reference in zip(corpus, references):
        print("Expected", reference)
        print("Obtained", document)
        assert document == reference

    # One document per line of the file
    assert len(corpus) == 3

    # Random access agrees with the expected documents
    for position, reference in enumerate(references):
        assert corpus[position] == reference
# --- Example 6 ---
def test_corpus_singletxtload():
    """Loading a single text corpus works as expected"""
    wholefile = (
        "This is is a single document corpus.\n"
        "All the lines from this file belong to the same document.\n"
        "And now for something different!\n"
        "\n"
        "PINEAPPLES!!!\n"
    )
    corpus = Corpus.load_singletxt(DATAFOLDER + "singledoc.txt")

    # Iterating yields the full file contents as the only document
    for document in corpus:
        print("Expected", wholefile)
        print("Obtained", document)
        assert document == wholefile

    # The whole file is loaded as a single document
    assert len(corpus) == 1

    # Index 0 holds the complete text
    assert corpus[0] == wholefile
def test_hypertrain_run():
    """A small hypertraining procedure can be run"""
    modelclass = PerceptronModel
    corpus = Corpus([
        "This is a very small corpus for testing the hypertrain procedure.",
        "Hope it works!!!"
    ])
    encoder = Encoder(corpus, CharTokenizer)

    with NamedTemporaryFile("r") as tempfile:
        tempdir = mkdtemp()
        try:
            tmpname = tempfile.name
            model = hypertrain(modelclass,
                               encoder,
                               corpus,
                               tempdir,
                               n_calls=15,
                               verbose=2,
                               valmask=[False, True],
                               patience=1,
                               maxepochs=10,
                               checkpointfile=tmpname)
        finally:
            # Remove the scratch directory even when hypertrain raises,
            # so a failing test run does not leak temporary folders
            rmtree(tempdir)
        assert model is not None