Example 1
import sys

# read_corpus, NGram, get_sentences and optimize_lambdas are helpers defined
# elsewhere in the project; sys is the only standard-library dependency here.
def main():
    # Expect exactly two command-line arguments: the questions file and the answers file.
    questions_path, answers_path = sys.argv[1:]

    print("Reading Corpus:")
    train_sentences = read_corpus('train_data', disp=True)

    print('\nTraining on Corpus')
    model = NGram.train_model(train_sentences, disp=True)

    with open(answers_path, 'r') as answer_file:
        answers = get_sentences(untokenized_text=answer_file.read(),
                                is_tokenized=True,
                                token_start_end=('<s>', '</s>'))

    # Hold out the first 520 answer sentences as a development set.
    dev_sentences = answers[:520]

    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(dev_sentences, disp=True)
    # Tune the interpolation weights (lambdas) on the development sentences.
    lambdas = optimize_lambdas(model)

    with open(questions_path, 'r') as question_file:
        questions = get_sentences(untokenized_text=question_file.read(),
                                  is_tokenized=True,
                                  token_start_end=('<s>', '</s>'))

    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    # Per-sentence perplexities on the test sentences, computed with the tuned lambdas.
    _, sentences_perplexity = model.perplexity(lambdas=lambdas)

    print('Writing sentences and perplexities to file')
    with open('output.txt', 'w') as out_file:
        for i, perplexity in enumerate(sentences_perplexity):
            # Collapse the model's two start tokens back into a single '<s>' marker.
            out_file.write('{}\t{}\n'.format(
                ' '.join(questions[i]).replace('<s0> <s1>', '<s>'),
                perplexity))
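The entry point is not shown in the excerpt above; a minimal sketch of how main() would typically be invoked, assuming the code lives in a script run with two file paths (the script and data file names below are placeholders):

if __name__ == '__main__':
    # e.g. python ngram_lm.py questions.txt answers.txt
    # main() unpacks exactly two values from sys.argv, so exactly two paths
    # must be supplied on the command line.
    main()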
Example 2
import os
from os import listdir
from os.path import isfile, join

# sent_tokenize is assumed here to be NLTK's sentence tokenizer; get_sentences
# and progress_bar are helpers defined elsewhere in the project.
from nltk.tokenize import sent_tokenize


def read_corpus(path, disp=False):
    sentences = []

    if os.path.isdir(path):
        files = [
            join(path, file) for file in listdir(path)
            if isfile(join(path, file))
        ]
        for i, file in enumerate(files):
            # Update the progress bar roughly every 2% of the files processed.
            if disp:
                progress = int((i / len(files)) * 100)
                if progress % 2 == 0:
                    progress_bar(progress)

            with open(file, encoding='utf-8', errors='ignore') as data_file:
                # Read and lower-case the whole file before sentence tokenization.
                file_data = data_file.read().lower()

                sentences += get_sentences(file_data,
                                           sentence_tokenizer=sent_tokenize)

        if disp:
            progress_bar(100)
            print()

    return sentences
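A minimal usage sketch for read_corpus, assuming a directory of plain-text files such as the 'train_data' directory used in Example 1:

# Hypothetical usage: collect lower-cased, tokenized sentences from a corpus directory.
train_sentences = read_corpus('train_data', disp=True)
print('Read {} sentences'.format(len(train_sentences)))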