import pandas as pd

# Ngram, transform_to_vocab, the ds serialization helpers, and the
# DF_COLUMN_* constants are assumed to come from the surrounding project.


def process_train_data(v: int, n: int, delta: float, vocab_size: int,
                       train_file: str) -> Ngram:
    """
    Wrapper function for the training data processing.
    Either fetch or generate necessary Ngrams based on the training information.
    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param vocab_size: The size of the vocabulary
    :param train_file: Path to training data
    :return: Ngram
    """
    ngrams = Ngram(n)
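    # Reuse a model serialized earlier under the same (v, n, delta) parameters.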
    if ds.data_ser_exists(v, n, delta):
        print("Model with parameters already stored. Retrieving")
        ngrams = ds.data_ser_load(v, n, delta)
    else:
        print("Model with these parameters not stored. "
              "Generating model from the provided training data.")
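        # The training file is tab-separated with columns: id, name, language, tweet.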
        train_data = pd.read_csv(train_file,
                                 delimiter='\t',
                                 names=[
                                     DF_COLUMN_ID, DF_COLUMN_NAME,
                                     DF_COLUMN_LANG, DF_COLUMN_TWEET
                                 ])
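        # Map the raw tweets onto the chosen vocabulary; transform_to_vocab
        # appears to mutate the frame in place, since its return value is unused.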
        transform_to_vocab(train_data, v)
        print("Shape of Training Data (Rows, Columns) => {}".format(
            train_data.shape))
        ngrams.generate(train_data, delta, vocab_size)
        ds.data_ser_save(ngrams, v, n, delta)
    return ngrams
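
A hypothetical invocation of the wrapper above; the parameter values and the
training-file path are illustrative assumptions, not taken from the source:

# Hypothetical call: vocabulary scheme 1, bigrams, a smoothing delta of 0.5,
# a 26-letter vocabulary, and an assumed path to a TSV of training tweets.
model = process_train_data(v=1, n=2, delta=0.5, vocab_size=26,
                           train_file='data/training-tweets.txt')
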
Example #2

import os
from math import exp

# Corpus and Ngram are assumed to be provided by the surrounding project.


def main(args):
    print(f'Loading corpus from `{args.data}`...')
    corpus = Corpus(args.data,
                    order=args.order,
                    lower=args.lower,
                    max_lines=args.max_lines)
    model = Ngram(order=args.order)
    name = f'{args.name}.{args.order}gram'

    print('Example data:')
    print('Train:', corpus.train[:20])
    print('Valid:', corpus.valid[:20])

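    # Train with the selected smoothing options (add-k, interpolation, backoff).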
    print('Training model...')
    model.train(corpus.train,
                add_k=args.add_k,
                interpolate=args.interpolate,
                backoff=args.backoff)
    print(f'Vocab size: {len(model.vocab):,}')

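    # ARPA is a standard plain-text format for n-gram language models.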
    if args.save_arpa:
        print(f'Saving model to `{name}`...')
        model.save_arpa(name)

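    # Spot-check that the model's conditional distributions sum to one.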
    assert model.sum_to_one(n=10)

    print('Generating text...')
    text = model.generate(100)
    text = ' '.join(text)
    path = os.path.join(args.out, f'generated.{name}.txt')
    print(text)
    with open(path, 'w') as f:
        print(text, file=f)

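    # An unsmoothed model assigns probability 0 to unseen test n-grams,
    # so only smoothed models are evaluated.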
    if model.is_smoothed:
        print('\nPredicting test set NLL...')
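        # model(corpus.test) is assumed to return the total log-probability of
        # the test set; dividing by token count gives per-token NLL, and
        # exp(NLL) is the perplexity.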
        logprob = model(corpus.test)
        nll = -logprob / len(corpus.test)
        print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}')
        path = os.path.join(args.out, f'result.{name}.txt')
        with open(path, 'w') as f:
            print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}', file=f)
    else:
        exit(
            'No evaluation with unsmoothed model: probability is probably 0 anyway.'
        )
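
A minimal sketch of the argument parsing that main() expects; the flag names
mirror the args.* attributes read above, while every default value here is an
assumption:

import argparse

if __name__ == '__main__':
    # argparse maps dashes to underscores (e.g. --add-k -> args.add_k), so
    # these flags line up with the attributes main() reads. Defaults are guesses.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', required=True, help='path to the corpus')
    parser.add_argument('--order', type=int, default=3)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--max-lines', type=int, default=-1)
    parser.add_argument('--name', default='model')
    parser.add_argument('--add-k', type=float, default=0.0)
    parser.add_argument('--interpolate', action='store_true')
    parser.add_argument('--backoff', action='store_true')
    parser.add_argument('--save-arpa', action='store_true')
    parser.add_argument('--out', default='.')
    main(parser.parse_args())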