Ejemplo n.º 1
0
    def test_t5(self):
        """Check converted instances and batches of the sequence data in TESTFILE4.

        The first converted instance is expected to be a sequence of three
        elements with 18 independent feature values each, and three targets
        that all map back to the label "ADJ". Afterwards batches of size 2
        are fetched both without and with reshaping and their outer
        dimensions are checked.
        """
        ds = Dataset(TESTFILE4)
        instances = ds.instances_converted(train=False, convert=True)
        indep, dep = next(iter(instances))

        # Pass the values as logger arguments instead of pre-formatting with
        # "%": the message is then only built if DEBUG is enabled, and a
        # tuple-valued argument cannot break the format string.
        logger.debug("TESTFILE4: indep=%r", indep)
        logger.debug("TESTFILE4: dep=%r", dep)
        logger.debug("TESTFILE4 info=%r", ds.get_info())

        # The first row is a sequence of 3 elements, each with 18 independent
        # features, and there is one target per sequence element.
        assert len(dep) == 3
        assert len(indep) == 3  # 3 elements in the sequence
        for element in indep:
            assert len(element) == 18

        # Every one of the three targets must map back to the class "ADJ".
        for target_idx in dep:
            assert ds.target.vocab.idx2string(target_idx) == "ADJ"

        # Non-reshaped batch: expected to contain one entry per instance.
        plain_batches = ds.batches_converted(train=False,
                                             convert=True,
                                             batch_size=2,
                                             reshape=False)
        plain_batch = next(iter(plain_batches))
        assert len(plain_batch) == 2

        # Reshaped batch: a pair (independent part, targets). The independent
        # part has one entry per feature, each holding one value per instance.
        reshaped_batches = ds.batches_converted(train=False,
                                                convert=True,
                                                batch_size=2,
                                                reshape=True)
        bindep, bdep = next(iter(reshaped_batches))
        assert len(bindep) == 18
        assert len(bdep) == 2
        assert len(bindep[0]) == 2
Ejemplo n.º 2
0
 def test_t9(self):
     """Check the shape of one-hot targets in a reshaped batch of TESTFILE4.

     The dataset is created without file reuse and without target padding,
     the target is switched to one-hot mode, and the first reshaped batch
     of four instances is inspected.
     """
     dataset = Dataset(TESTFILE4,
                       reuse_files=False,
                       targets_need_padding=False)
     dataset.target.set_as_onehot(True)
     batches = dataset.batches_converted(train=False,
                                         convert=True,
                                         reshape=True,
                                         batch_size=4)
     first_batch = next(iter(batches))
     features = first_batch[0]
     labels = first_batch[1]

     # One target sequence per example in the batch.
     assert len(labels) == 4

     # Reference length: number of values of the first feature for the first
     # instance. All feature and target sequences are expected to be padded
     # to the same (maximum) sequence length.
     seq_len = len(features[0][0])
     for row in range(4):
         assert len(labels[row]) == seq_len

     # TODO: check why this is supposed to be one hot vectors and not
     # indices here!!
     # Every target entry has to be a list (one-hot vector) of length 17.
     for row in range(4):
         label_seq = labels[row]
         for pos in range(seq_len):
             onehot = label_seq[pos]
             assert isinstance(onehot, list)
             assert len(onehot) == 17
Ejemplo n.º 3
0
 def test_t7(self):
     """Check validation sets and training batches of TESTFILE3 after a split.

     The dataset is split with a validation set of size 3 and a fixed random
     seed; original and converted data are then compared against known
     values, both as plain instance lists and in reshaped (batch) form.
     """
     dataset = Dataset(TESTFILE3)
     dataset.split(convert=True,
                   keep_orig=True,
                   validation_size=3,
                   random_seed=1)

     # --- validation set, original values ---
     expected_val_orig = [[
         "you", "think", "this", "place", "is", "nice", "VERB", "DET", "a",
         "a", "a", "a", "a", "a", "", "nk", "is", "ce", "", "ce", "", "ink",
         "", "ace", "", ""
     ], "NOUN"]
     val_orig = dataset.validation_set_orig()
     assert len(val_orig) == 3
     assert val_orig[1] == expected_val_orig

     # --- validation set, converted values ---
     expected_val_conv = [[
         13, 157, 25, 104, 12, 319, 2, 5, 2, 2, 2, 2, 2, 2, 0, 151, 28, 14,
         0, 14, 0, 215, 0, 101, 0, 0
     ], 0]
     val_conv = dataset.validation_set_converted()
     assert len(val_conv) == 3
     assert val_conv[1] == expected_val_conv

     # --- validation set, converted and in batch form ---
     val_conv_batch = dataset.validation_set_converted(as_batch=True)
     # a pair of (independent part, dependent part) is expected
     assert len(val_conv_batch) == 2
     val_indep, _ = val_conv_batch
     # the independent part has one entry per feature ...
     assert len(val_indep) == dataset.nFeatures
     # ... and the first feature holds one value per validation instance
     assert len(val_indep[0]) == 3

     # --- training batch, original values, not reshaped ---
     expected_orig_instance = [[
         "Bill", "Bradford", "in", "Credit", "are", "supposed", "PROPN",
         "ADP", "Aa", "Aa", "a", "Aa", "a", "a", "ll", "rd", "", "it", "",
         "ed", "", "ord", "", "dit", "", "sed"
     ], "NOUN"]
     orig_plain = next(iter(
         dataset.batches_original(train=True, batch_size=4, reshape=False)))
     # without reshaping the batch is just a list of original instances
     assert len(orig_plain) == 4
     assert orig_plain[1] == expected_orig_instance

     # --- training batch, original values, reshaped ---
     orig_reshaped = next(iter(
         dataset.batches_original(train=True, batch_size=4, reshape=True)))
     # with reshaping the batch is a pair whose first element is the
     # list of features
     assert len(orig_reshaped) == 2
     first_feature = orig_reshaped[0][0]
     assert first_feature[1] == "Bill"

     # --- training batch, converted values, not reshaped ---
     # TODO: check why some indices changed between previously and now and
     # if this is still correct!
     expected_conv_instance = [[
         1210, 1495, 9, 796, 23, 3075, 6, 3, 3, 3, 2, 3, 2, 2, 20, 54, 0,
         86, 0, 2, 0, 391, 0, 300, 0, 77
     ], 0]
     conv_plain = next(iter(
         dataset.batches_converted(train=True, batch_size=4, reshape=False)))
     assert len(conv_plain) == 4
     assert conv_plain[1] == expected_conv_instance

     # --- training batch, converted values, reshaped ---
     conv_reshaped = next(iter(
         dataset.batches_converted(train=True, batch_size=4, reshape=True)))
     assert len(conv_reshaped) == 2
     first_feature = conv_reshaped[0][0]
     assert first_feature[1] == 1210
Ejemplo n.º 4
0
 def test_t6(self):
     """Check validation sets and training batches of TESTFILE2 after a split.

     The dataset is split with a validation set of size 3 and a fixed random
     seed. Each instance has a single feature holding a token sequence;
     original and converted data are compared against known values, both as
     plain instance lists and in reshaped (batch) form.
     """
     dataset = Dataset(TESTFILE2)
     dataset.split(convert=True,
                   keep_orig=True,
                   validation_size=3,
                   random_seed=1)

     # --- validation set, original values ---
     val_orig = dataset.validation_set_orig()
     assert len(val_orig) == 3
     assert val_orig[1] == [[[
         "a", "very", "well-made", ",", "funny", "and", "entertaining",
         "picture", "."
     ]], "pos"]

     # --- validation set, converted values ---
     val_conv = dataset.validation_set_converted()
     assert len(val_conv) == 3
     assert val_conv[1] == [[[5, 84, 1530, 4, 75, 6, 190, 175, 2]], 1]

     # --- validation set, converted and in batch form ---
     val_conv_batch = dataset.validation_set_converted(as_batch=True)
     # a pair of (independent part, dependent part) is expected
     assert len(val_conv_batch) == 2
     val_indep, _ = val_conv_batch
     # only one feature, so the independent part has length one ...
     assert len(val_indep) == 1
     # ... and that feature holds one value per validation instance
     assert len(val_indep[0]) == 3

     # Known second instance of the first training batch, as tokens and as
     # converted indices.
     expected_tokens = [
         "rife", "with", "nutty", "cliches", "and", "far", "too", "much",
         "dialogue", "."
     ]
     expected_indices = [6694, 17, 6469, 544, 6, 168, 51, 59, 237, 2]

     # --- training batch, original values, not reshaped ---
     orig_plain = next(iter(
         dataset.batches_original(train=True, batch_size=4, reshape=False)))
     # without reshaping the batch is just a list of original instances
     assert len(orig_plain) == 4
     assert orig_plain[1] == [[expected_tokens], "neg"]

     # --- training batch, original values, reshaped ---
     orig_reshaped = next(iter(
         dataset.batches_original(train=True, batch_size=4, reshape=True)))
     # with reshaping the batch is a pair whose first element is the
     # list of features
     assert len(orig_reshaped) == 2
     first_feature = orig_reshaped[0][0]
     # in reshaped form the token sequence is followed by 26 empty strings
     assert first_feature[1] == expected_tokens + [""] * 26

     # --- training batch, converted values, not reshaped ---
     conv_plain = next(iter(
         dataset.batches_converted(train=True, batch_size=4, reshape=False)))
     assert len(conv_plain) == 4
     assert conv_plain[1] == [[expected_indices], 0]

     # --- training batch, converted values, reshaped ---
     conv_reshaped = next(iter(
         dataset.batches_converted(train=True, batch_size=4, reshape=True)))
     assert len(conv_reshaped) == 2
     first_feature = conv_reshaped[0][0]
     # in reshaped form the index sequence is followed by 26 zeros
     assert first_feature[1] == expected_indices + [0] * 26