Ejemplo n.º 1
0
def develop_model(model, dev_file_name):
    """
    Develop the model by tuning the lambda values to optimize perplexity.

    Tuning is only performed for smoothed models (`2s` or `3s`); for any
    other model the step is skipped, since it is time consuming and has
    no effect on an unsmoothed model.

    Arguments:
        model (Model): the model to train
        dev_file_name (str): the file to read dev data from

    Returns:
        (Model): the newly trained model
    """
    smoothed = model.is_smoothed_bigram or model.is_smoothed_trigram
    # Lambda tuning only matters when smoothing is enabled
    if not smoothed:
        print('Unsmoothed model; skipping tuning')
        return model

    print('Smoothed model; tuning with development data...')
    with open(dev_file_name) as dev_file:
        dev_sentences = make_sentences(dev_file.read())
    model.tune(dev_sentences)
    return model
Ejemplo n.º 2
0
def develop_model(model, dev_file_name):
    """
    Tune the model's lambda values against development data.

    If the model is unsmoothed (anything other than `2s` or `3s`) the
    tuning pass is skipped entirely — it is slow and would be a no-op.

    Arguments:
        model (Model): the model to train
        dev_file_name (str): the file to read dev data from

    Returns:
        (Model): the newly trained model
    """
    if model.is_smoothed_bigram or model.is_smoothed_trigram:
        print('Smoothed model; tuning with development data...')
        with open(dev_file_name) as handle:
            raw_dev_data = handle.read()
        model.tune(make_sentences(raw_dev_data))
    else:
        # Nothing to tune for an unsmoothed model
        print('Unsmoothed model; skipping tuning')
    return model
Ejemplo n.º 3
0
def test_model(model, test_file_name):
    """
    Evaluate the model on a test file and print per-sentence perplexity.

    Arguments:
        model (Model): the trained model to evaluate
        test_file_name (str): the file to read test data from
    """
    with open(test_file_name) as handle:
        raw_test_data = handle.read()

    # One (sentence, perplexity) pair per evaluated sentence
    for sentence, perplexity in model.evaluate(make_sentences(raw_test_data)):
        print('{} : {}'.format(sentence, perplexity))
Ejemplo n.º 4
0
def test_model(model, test_file_name):
    """
    Run the model over the test file, printing each sentence's perplexity.

    Arguments:
        model (Model): the trained model to evaluate
        test_file_name (str): the file to read test data from
    """
    with open(test_file_name) as source:
        test_sentences = make_sentences(source.read())

    results = model.evaluate(test_sentences)
    for sentence, perplexity in results:
        print('{} : {}'.format(sentence, perplexity))
Ejemplo n.º 5
0
def main(training_data_directory, test_file_path):
    """
    Train, tune, and evaluate a smoothed trigram model.

    Arguments:
        training_data_directory (str): directory of training text files
        test_file_path (str): file containing the test sentences
    """
    model = train(Model('3s'), training_data_directory)

    with open(test_file_path) as source:
        sentences = make_sentences(source.read())

    # The first 2600-sentence chunk doubles as the development set
    development_sentences = list(chunks(sentences, 520 * 5))[0]

    model = develop(model, development_sentences)
    results = test(model, sentences)
    print('Finished testing model', file=sys.stderr)
    for sentence, perplexity in results:
        print('{}\t{}'.format(format_for_output(sentence), perplexity))
 def test_real_text(self):
     # Natural prose with extra whitespace/newlines should be split into
     # clean one-line sentences (second arg enables the full tokenizer —
     # presumably; confirm against make_sentences' signature).
     sentences = make_sentences('''
         This is some real text.  It is pretty long, and has some
         stuff in it.  But that's cool, since we need to
         handle that.
     ''', True)
     assert str(sentences[0]) == 'This is some real text.'
     assert str(sentences[1]) == \
         'It is pretty long, and has some stuff in it.'
     assert str(sentences[2]) == \
         'But that\'s cool, since we need to handle that.'
     # Every result is a Sentence wrapped in start/end markers
     for sentence in sentences:
         assert isinstance(sentence, Sentence)
         assert sentence.words[0] == '<s>'
         assert sentence.words[len(sentence) - 1] == '</s>'
Ejemplo n.º 7
0
def main(training_data_directory, test_file_path):
    """
    End-to-end driver: build a smoothed trigram model, tune it on a
    slice of the test data, then print tab-separated perplexities.

    Arguments:
        training_data_directory (str): directory of training text files
        test_file_path (str): file containing the test sentences
    """
    model = Model('3s')
    model = train(model, training_data_directory)

    with open(test_file_path) as f:
        text = f.read()
    sentences = make_sentences(text)

    # Take only the first chunk (520 * 5 sentences) for development
    all_chunks = list(chunks(sentences, 520 * 5))
    model = develop(model, all_chunks[0])

    test_results = test(model, sentences)
    print('Finished testing model', file=sys.stderr)
    for sentence, perplexity in test_results:
        line = '{}\t{}'.format(format_for_output(sentence), perplexity)
        print(line)
Ejemplo n.º 8
0
def train(model, path):
    """
    Read every text file under *path* and train the model on its sentences.

    Arguments:
        model (Model): the model to train
        path (str): directory containing the training text files

    Returns:
        (Model): the trained model
    """
    training_sentences = list()
    number_of_files = 1
    for filename in os.listdir(path):
        filename = '{}/{}'.format(path, filename)

        print('Reading file #{}'.format(number_of_files), file=sys.stderr)
        number_of_files += 1

        # Ignore undecodable bytes so one bad file doesn't abort training
        with open(filename, encoding='utf-8', errors='ignore') as f:
            text = f.read()
        training_sentences += make_sentences(text, full_tokenizer=True)

    print('Finished reading training data', file=sys.stderr)
    Sentence.replaced_words = True

    # Replace unknown words in the training sentences.
    # The work is fanned out to one worker Process per 1000-sentence chunk.
    # NOTE(review): child processes get a *copy* of each chunk, so mutations
    # may not propagate back to `training_sentences` — verify the intent.
    threads = list()
    number_of_threads = 0
    for sentences in chunks(training_sentences, 1000):
        print('Starting thread #{} to replace {} sentences'
              .format(number_of_threads, len(sentences)),
              file=sys.stderr)
        number_of_threads += 1
        # BUG FIX: `args` must be a tuple. `(sentences)` is just `sentences`,
        # so Process spread the list's items as separate positional arguments.
        thread = Process(
            target=replace_unknown_words_in_sentences,
            args=(sentences,)
        )
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    print('Finished doing word replacement on training data', file=sys.stderr)

    # Typo fix: "Staring" -> "Starting"
    print('Starting model training...', file=sys.stderr)
    model.train(training_sentences)
    print('Finished training model', file=sys.stderr)

    return model
Ejemplo n.º 9
0
def train(model, path):
    """
    Read all training files in a directory and train the model from them.

    Arguments:
        model (Model): the model to train
        path (str): directory containing the training text files

    Returns:
        (Model): the trained model
    """
    training_sentences = list()
    number_of_files = 1
    for filename in os.listdir(path):
        filename = '{}/{}'.format(path, filename)

        print('Reading file #{}'.format(number_of_files), file=sys.stderr)
        number_of_files += 1

        # Skip undecodable bytes rather than failing on a bad file
        with open(filename, encoding='utf-8', errors='ignore') as f:
            text = f.read()
        training_sentences += make_sentences(text, full_tokenizer=True)

    print('Finished reading training data', file=sys.stderr)
    Sentence.replaced_words = True

    # Replace unknown words in the training sentences, splitting the task
    # across one worker Process per 1000-sentence chunk.
    # NOTE(review): each Process receives a copy of its chunk, so in-place
    # replacement may not reach the parent's list — confirm this is intended.
    threads = list()
    number_of_threads = 0
    for sentences in chunks(training_sentences, 1000):
        print('Starting thread #{} to replace {} sentences'.format(
            number_of_threads, len(sentences)),
              file=sys.stderr)
        number_of_threads += 1
        # BUG FIX: `(sentences)` is not a tuple — Process then treats the
        # list itself as the args sequence; a trailing comma is required.
        thread = Process(target=replace_unknown_words_in_sentences,
                         args=(sentences,))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    print('Finished doing word replacement on training data', file=sys.stderr)

    # Typo fix: "Staring" -> "Starting"
    print('Starting model training...', file=sys.stderr)
    model.train(training_sentences)
    print('Finished training model', file=sys.stderr)

    return model
Ejemplo n.º 10
0
def train_model(model, train_file_name):
    """
    Train the model on the contents of a training file.

    Arguments:
        model (Model): the model to train
        train_file_name (str): the training file to read data from

    Returns:
        (Model): the trained model
    """
    with open(train_file_name) as source:
        training_sentences = make_sentences(source.read())

    # Enable word replacement now that training data has been gathered
    Sentence.replaced_words = True

    # Apply replacement to the sentences we just read
    for sentence in training_sentences:
        sentence.replace_words()

    model.train(training_sentences)
    return model
Ejemplo n.º 11
0
def train_model(model, train_file_name):
    """
    Read training data from a file and fit the model to it.

    Arguments:
        model (Model): the model to train
        train_file_name (str): the training file to read data from

    Returns:
        (Model): the trained model
    """
    with open(train_file_name) as f:
        raw_training_data = f.read()
    sentences = make_sentences(raw_training_data)

    # Turn on word replacement once all training data is loaded,
    # then apply it to every sentence before training.
    Sentence.replaced_words = True
    for sentence in sentences:
        sentence.replace_words()

    model.train(sentences)
    return model
 def test_problematic_sentences(self):
     # Pre-tokenized input with punctuation tokens should render back
     # into a readable sentence string (comma spacing, trailing period).
     sentence = make_sentences('<s> a a , a b , b </s>')[0]
     assert sentence == 'a a, a b, b.'
 def test_newline_at_end_of_file(self):
     # A trailing newline must not produce a spurious empty sentence
     s = make_sentences('<s> a b b a </s>\n<s> b a b b </s>\n')
     assert len(s) == 2
 def test_handle_processed_text(self):
     # Input already wrapped in <s>...</s> markers still yields Sentence
     # objects whose string form is the plain text with a trailing period.
     s = make_sentences('<s> a b b a </s>\n<s> b a b b </s>')
     assert s[0] == 'a b b a.'
     assert isinstance(s[0], Sentence)
     assert s[1] == 'b a b b.'
     assert isinstance(s[1], Sentence)
 def test_handle_multiline_text(self):
     # Sentences spanning a line break are stitched back together;
     # sentence boundaries come from the periods, not the newlines.
     s = make_sentences('a b b\na. b a b b.')
     assert s[0] == 'a b b a.'
     assert isinstance(s[0], Sentence)
     assert s[1] == 'b a b b.'
     assert isinstance(s[1], Sentence)
 def test_handle_unprocessed_text(self):
     # Plain period-delimited text is split into individual Sentence objects
     sentences = make_sentences('a b b a. b a b b.')
     assert sentences[0] == 'a b b a.'
     assert sentences[1] == 'b a b b.'
     for sentence in sentences:
         assert isinstance(sentence, Sentence)