Example #1
import pickle
import sys

# Note: SentenceTokenizer comes from the project's sentence_tokenizer module;
# texts, labels, data, dset, maxlen, format_pickle and coverage are
# module-level names defined elsewhere in the original script.
def convert_dataset(filepath, extend_with, vocab):
    """Tokenize the dataset, pickle its train/val/test splits and report coverage."""
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(
        texts,
        labels, [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    # Open in binary mode so pickle.dump also works under Python 3.
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print('     done. Coverage: {}'.format(cover))
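
convert_dataset takes a base vocabulary (an empty dict to build one from the dataset itself, or a pretrained vocabulary) plus an extension budget that split_train_val_test can use to add dataset-specific words. A minimal usage sketch follows; it assumes the module-level names used above (texts, labels, data, dset, maxlen) are already set up, and 'vocabulary.json' and the output filenames are placeholders rather than paths from the original script:

import json

# Load a pretrained vocabulary (placeholder path, not from the original script).
with open('vocabulary.json', 'r') as f:
    vocab = json.load(f)

# Pretrained vocabulary as-is, and extended with up to 10000 dataset-specific words.
convert_dataset('dataset_last.pickle', 0, vocab)
convert_dataset('dataset_full.pickle', 10000, vocab)

# Vocabulary built entirely from the dataset itself.
convert_dataset('dataset_own.pickle', 10000, {})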
Example #2

    # Fragment of a loop body: p is the path of the current dataset pickle,
    # vocab is the pretrained vocabulary and coverage_result collects the scores.
    # Open in binary mode so pickle.load also works under Python 3.
    with open(p, 'rb') as f:
        s = pickle.load(f)

    # Decode data: texts from older (Python 2) pickles may be stored as bytes.
    s['texts'] = [x.decode('utf-8') if isinstance(x, bytes) else x
                  for x in s['texts']]

    # Coverage using a vocabulary built purely from this dataset
    # (empty base vocabulary, extended with up to 10000 dataset words).
    st = SentenceTokenizer({}, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)
    coverage_result.append(coverage(tests[2]))

    # Coverage using the pretrained vocabulary as-is (no extension).
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=0)
    coverage_result.append(coverage(tests[2]))

    # Coverage using the pretrained vocabulary extended with up to
    # 10000 dataset-specific words.
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)
    coverage_result.append(coverage(tests[2]))
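
The coverage helper used above is not included in this excerpt. As a rough illustration only, the sketch below computes the share of non-padding tokens that were found in the vocabulary (i.e. not mapped to the unknown token); the name coverage_sketch and the padding_id=0 / unknown_id=1 defaults are assumptions for illustration, not values taken from the original project.

def coverage_sketch(tokenized, unknown_id=1, padding_id=0):
    """Hypothetical coverage metric: fraction of non-padding token ids
    that are not the unknown-token id. The default ids are assumptions."""
    total = 0
    known = 0
    for sentence in tokenized:
        for token in sentence:
            if token == padding_id:
                continue
            total += 1
            if token != unknown_id:
                known += 1
    return known / total if total else 0.0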