# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)


def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = len(data) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data[0:nbatch * bsz]

    # Evenly divide the data across the bsz batches.
    def list2batch(x_list):
        # Pad every sentence up to the longest one in the batch.
        maxlen = max([len(x) for x in x_list])
        input = torch.LongTensor(maxlen, bsz).zero_()
        mask = torch.FloatTensor(maxlen, bsz).zero_()
        target = torch.LongTensor(maxlen, bsz).zero_()
        for idx, x in enumerate(x_list):
            # (Assumed completion -- the original excerpt breaks off here.)
            # Each sentence fills one column; mask flags the real tokens, and
            # the target is the input shifted by one step for LM training.
            input[:len(x), idx] = torch.LongTensor(x)
            mask[:len(x), idx] = 1
            target[:len(x) - 1, idx] = torch.LongTensor(x[1:])
        return input, target, mask

    # (Assumed completion) Group bsz consecutive sentences into each batch.
    return [list2batch(data[i:i + bsz]) for i in range(0, len(data), bsz)]
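# Minimal sketch (not part of the original script): what the trimming step in
# batchify does. The `_demo_*` names are invented for illustration only.
_demo_data = list(range(103))
_demo_bsz = 10
_demo_nbatch = len(_demo_data) // _demo_bsz          # 10 full groups of 10
_demo_data = _demo_data[0:_demo_nbatch * _demo_bsz]  # the 3 leftovers are dropped
assert len(_demo_data) == 100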
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

# Load model
with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
if args.cuda:
    model.cuda()
    torch.cuda.manual_seed(args.seed)
else:
    model.cpu()

# Load data
# We are going to test the PTB parses, so the default corpus is PTB.
corpus = data_ptb.Corpus('./data/ptb')

# Load the dictionary that was used in training.
if 'ptb' in args.data:
    c2_dict = corpus.dictionary
else:
    corpus2 = data.Corpus(args.data)
    c2_dict = corpus2.dictionary

sys.stdout.flush()

# test(model, corpus, args.cuda, mode='test', dictionary=None, prt=True)
test(model, corpus, args.cuda, mode=args.mode, dictionary=c2_dict, prt=True)
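# Minimal sketch (not in the original script): why the training-time dictionary
# is reloaded -- the model's embedding rows are tied to those word ids, so a
# freshly built dictionary would scramble them. `_demo_*` names are invented.
_demo_word2idx = {'<unk>': 0, 'the': 1, 'dog': 2}
_demo_ids = [_demo_word2idx.get(w, _demo_word2idx['<unk>'])
             for w in ['the', 'cat']]  # -> [1, 0]; unseen 'cat' maps to <unk>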
torch.manual_seed(args.seed)

# Load model
with open(args.checkpoint, 'rb') as f:
    model, _, _ = torch.load(f)
model.cpu()
if args.cuda:
    # Seed CUDA only when the GPU is actually going to be used.
    torch.cuda.manual_seed(args.seed)
    model.cuda()

# Load data
import hashlib
fn = 'corpus.{}.data'.format(hashlib.md5('data/penn'.encode()).hexdigest())
print('Loading cached dataset...')
corpus = torch.load(fn)
dictionary = corpus.dictionary

# test_batch_size = 1
# test_data = batchify(corpus.test, test_batch_size, args)
# test_loss = evaluate(test_data, test_batch_size)
# print('=' * 89)
# print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'.format(
#     test_loss, math.exp(test_loss), test_loss / math.log(2)))
# print('=' * 89)

print('Loading PTB dataset...')
corpus = data_ptb.Corpus(args.data)
corpus.dictionary = dictionary

test(model, corpus, args.cuda, prt=True)
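# Minimal sketch (not in the original script): the cache filename depends only
# on the hashed data path, so any script hashing the same string resolves to
# the same cache file the training run wrote. Purely illustrative.
_demo_key = hashlib.md5('data/penn'.encode()).hexdigest()
print('cache file: corpus.{}.data'.format(_demo_key))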
###############################################################################
# Load data
###############################################################################

def batchify(data, bsz, cuda=False):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if cuda:
        data = data.cuda()
    return data


corpus = data.Corpus(hps['data'])

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = RNNModel(hps['model'], ntokens, hps['emsize'], hps['nhid'],
                 hps['nlayers'], hps['dropout'], hps['tied'])
if hps['cuda']:
    model.cuda()

criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0)

###############################################################################
# Training code
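# Minimal sketch (not in the original file) of the layout produced by the
# view(bsz, -1).t() call in batchify above: with 12 tokens and bsz=3, each
# *column* is one contiguous stream of 4 tokens. `_demo_*` names are invented.
_demo = torch.arange(12)
_demo_batched = _demo.view(3, -1).t().contiguous()  # shape (4, 3)
# _demo_batched[:, 0] is tensor([0, 1, 2, 3]) -- column 0 holds the first stream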
torch.cuda.synchronize()

# Load data
import hashlib
# fn = 'corpus.{}.data'.format(hashlib.md5('data/penn/'.encode()).hexdigest())
# fn = 'corpus.{}.data'.format(hashlib.md5((args.data + args.wvec).encode()).hexdigest())
fn = 'corpus.{}.data'.format(hashlib.md5((args.data_train + args.wvec).encode()).hexdigest())
tools.print_log(args.save, 'Loading cached dataset...')
corpus = torch.load(fn)
dictionary = corpus.dictionary

tools.print_log(args.save, 'Loading PTB dataset...')
if args.wvec:
    word2idx = tools.pkl_loader(os.path.join('data/wordvec', args.wvec, 'words2idx'))
    idx2word = tools.pkl_loader(os.path.join('data/wordvec', args.wvec, 'idx2words'))
    corpus = data_ptb.Corpus(args.data, args.wvec, word2idx, idx2word)
else:
    corpus = data_ptb.Corpus(args.data)
corpus.dictionary = dictionary

test(model, corpus, args.cuda, prt=True)
fn = 'corpus'
if os.path.exists(fn):
    print('Loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('Producing dataset...')
    corpus = data.Corpus(args.data, max_span_length=args.max_span_length)
    torch.save(corpus, fn)

fn_ptb = 'corpus_ptb'
if os.path.exists(fn_ptb):
    print('Loading cached PTB dataset...')
    corpus_ptb = torch.load(fn_ptb)
else:
    print('Producing PTB dataset...')
    corpus_ptb = data_ptb.Corpus(args.data_ptb)
    torch.save(corpus_ptb, fn_ptb)

sys.stdout.flush()

eval_batch_size = 10
test_batch_size = 1
train_data, train_trees = batchify(corpus.train, args.batch_size, args, corpus.train_trees)
val_data, _ = batchify(corpus.valid, eval_batch_size, args)
test_data, _ = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################
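# Minimal sketch (not in the original file): the load-or-build pattern above,
# factored into a helper. `_load_or_build` and `build_fn` are invented names.
def _load_or_build(path, build_fn):
    if os.path.exists(path):
        print('Loading cached dataset...')
        return torch.load(path)
    print('Producing dataset...')
    obj = build_fn()
    torch.save(obj, path)
    return obj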
from nltk import Tree
from tqdm import tqdm
import json

import data_ptb

CORPUS_FILE = '/home/ritesh/Content_alignment/diora_snli/data/snli_1.0/snli_1.0_train.jsonl'
RULE_FILE = '/home/ritesh/Content_alignment/diora_snli/Basic-CYK-Parser/grammar_wsj_no_cnf.txt'

if __name__ == '__main__':
    rules = dict()
    # data = open(CORPUS_FILE, "r")
    corpus = data_ptb.Corpus("/home/ritesh/Content_alignment/Tree-Transformer/data/")
    # corpus.dictionary = dictionary
    dataset = zip(corpus.test_sens, corpus.test_trees, corpus.test_nltktrees)
    output = open("wsj_bracket.jsonl", "w")
    idx = 0
    for sen, sen_tree, sen_nltktree in tqdm(dataset):
        tree = sen_nltktree
        # Remove children whose leaves are all empty-category traces (-NONE-).
        for sub in tree.subtrees():
            # Iterate in reverse so deletions don't skip the following sibling.
            for n in range(len(sub) - 1, -1, -1):
                child = sub[n]
                if isinstance(child, str):
                    continue
                none_nodes = list(child.subtrees(filter=lambda x: x.label() == '-NONE-'))
                if len(none_nodes) == len(child.leaves()):
                    del sub[n]
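    # Minimal sketch (not in the original script): the -NONE- pruning rule on a
    # toy tree. The (NP (-NONE- *T*)) child is deleted because every one of its
    # leaves sits under a -NONE- node. `_demo_tree` is invented for illustration.
    _demo_tree = Tree.fromstring('(S (NP (-NONE- *T*)) (VP (VBZ runs)))')
    for _n in range(len(_demo_tree) - 1, -1, -1):
        _child = _demo_tree[_n]
        if (not isinstance(_child, str)
                and len(list(_child.subtrees(filter=lambda x: x.label() == '-NONE-')))
                == len(_child.leaves())):
            del _demo_tree[_n]
    assert _demo_tree == Tree.fromstring('(S (VP (VBZ runs)))')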
def test(model, corpus, sess, seq_len):
    prt = True
    # Note: the passed-in corpus is overridden with the PTB corpus below.
    corpus = data_ptb.Corpus('data/penn')
    prec_list = []
    reca_list = []
    f1_list = []

    pred_tree_list = []
    targ_tree_list = []

    nsens = 0
    word2idx = corpus.dict.word2idx
    if True:  # args.wsj10
        dataset = zip(corpus.train_sens, corpus.train_trees, corpus.train_nltktrees)
    else:
        dataset = zip(corpus.test_sens, corpus.test_trees, corpus.test_nltktrees)

    corpus_sys = {}
    corpus_ref = {}
    print(len(corpus.test_sens))
    for sen, sen_tree, sen_nltktree in dataset:
        if len(sen) > 12:  # args.wsj10 and len(sen) > 12
            continue

        # Map words to ids, falling back to <unk> for out-of-vocabulary words.
        input = numpy.array([word2idx[w] if w in word2idx else word2idx['<unk>']
                             for w in sen])
        # Pad the batch out to 80 rows with zeros; only row 0 is a real sentence.
        input = numpy.stack([input] + [numpy.zeros(input.shape) for i in range(79)])

        _, _, distance_forget, distance_input = \
            sess.run([model.cell.forward_propagate(input.shape[1])],
                     feed_dict={model.cell.input: input,
                                model.cell.seq_len: seq_len,
                                model.targets: numpy.zeros((80, 1))})[0]

        # Keep only the gate distances for the first (real) sequence.
        distance_forget = distance_forget[:, :, 0]
        distance_input = distance_input[:, :, 0]

        nsens += 1
        if prt and nsens % 100 == 0:
            for i in range(len(sen)):
                print('%15s\t%s\t%s' % (sen[i],
                                        str(distance_forget[:, i]),
                                        str(distance_input[:, i])))
            print('Standard output:', sen_tree)

        # Strip the boundary tokens before building the tree.
        sen_cut = sen[1:-1]
        for gates in [
            # distance[0],
            distance_forget[1],
            # distance[2],
            # distance.mean(axis=0)
        ]:
            depth = gates[1:-1]
            parse_tree = build_tree(depth, sen_cut)

            corpus_sys[nsens] = MRG(parse_tree)
            corpus_ref[nsens] = MRG_labeled(sen_nltktree)

            pred_tree_list.append(parse_tree)
            targ_tree_list.append(sen_tree)

            # Score the bracket overlap between the model parse and the gold parse.
            model_out, _ = get_brackets(parse_tree)
            std_out, _ = get_brackets(sen_tree)
            overlap = model_out.intersection(std_out)

            prec = float(len(overlap)) / (len(model_out) + 1e-8)
            reca = float(len(overlap)) / (len(std_out) + 1e-8)
            if len(std_out) == 0:
                reca = 1.
                if len(model_out) == 0:
                    prec = 1.
            f1 = 2 * prec * reca / (prec + reca + 1e-8)
            prec_list.append(prec)
            reca_list.append(reca)
            f1_list.append(f1)

            if prt and nsens % 1 == 0:
                print('Model output:', parse_tree)
                print('Prec: %f, Reca: %f, F1: %f' % (prec, reca, f1))

        if prt and nsens % 100 == 0:
            print('-' * 80)
            # Plot the forget/input gate distances for each of the three layers.
            _, axarr = plt.subplots(3, sharex=True,
                                    figsize=(distance_forget.shape[1] // 2, 6))
            for layer, label in enumerate(['1st layer', '2nd layer', '3rd layer']):
                axarr[layer].bar(numpy.arange(distance_forget.shape[1]) - 0.2,
                                 distance_forget[layer], width=0.4)
                axarr[layer].bar(numpy.arange(distance_input.shape[1]) + 0.2,
                                 distance_input[layer], width=0.4)
                axarr[layer].set_ylim([0., 1.])
                axarr[layer].set_ylabel(label)
            plt.sca(axarr[2])
            plt.xlim(xmin=-0.5, xmax=distance_forget.shape[1] - 0.5)
            plt.xticks(numpy.arange(distance_forget.shape[1]), sen,
                       fontsize=10, rotation=45)
            plt.savefig('figure/%d.png' % nsens)
            plt.close()

    prec_list = numpy.array(prec_list).reshape((-1, 1))
    reca_list = numpy.array(reca_list).reshape((-1, 1))
    f1_list = numpy.array(f1_list).reshape((-1, 1))
    if prt:
        print('-' * 80)
        numpy.set_printoptions(precision=4)
        print('Mean Prec:', prec_list.mean(axis=0),
              ', Mean Reca:', reca_list.mean(axis=0),
              ', Mean F1:', f1_list.mean(axis=0))
        print('Number of sentence: %i' % nsens)

        # Labelled constituent statistics for a few phrase types.
        correct, total = corpus_stats_labeled(corpus_sys, corpus_ref)
        print(correct)
        print(total)
        print('ADJP:', correct['ADJP'], total['ADJP'])
        print('NP:', correct['NP'], total['NP'])
        print('PP:', correct['PP'], total['PP'])
        print('INTJ:', correct['INTJ'], total['INTJ'])
        print(corpus_average_depth(corpus_sys))

        evalb(pred_tree_list, targ_tree_list)

    return f1_list.mean(axis=0)
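# Minimal sketch (not from the original file): the bracket precision/recall/F1
# arithmetic used above, on a toy example with plain span sets standing in for
# get_brackets(). `_demo_*` names are invented for illustration.
_demo_pred = {(0, 4), (0, 2), (2, 4)}    # predicted (start, end) brackets
_demo_gold = {(0, 4), (1, 4), (2, 4)}    # gold brackets
_demo_overlap = _demo_pred & _demo_gold  # {(0, 4), (2, 4)}
_demo_prec = len(_demo_overlap) / (len(_demo_pred) + 1e-8)  # ~0.667
_demo_reca = len(_demo_overlap) / (len(_demo_gold) + 1e-8)  # ~0.667
_demo_f1 = 2 * _demo_prec * _demo_reca / (_demo_prec + _demo_reca + 1e-8)  # ~0.667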