Beispiel #1
0
    def test_EncodedDataset_constructor(self):
        """Check the (types, values, attribute) triples EncodedDataset yields."""
        entries = [
            Entry("entry1", [Example(([10, 20, 30], ), 10)],
                  {"HEAD": True, "SORT": False}),
            Entry("entry2", [Example(([30, 20, 10], ), [10, 20, 30])],
                  {"HEAD": False, "SORT": True}),
        ]
        metadata = DatasetMetadata(1, {"HEAD", "SORT"}, 256, 5)
        encoded = EncodedDataset(
            Dataset(ch.datasets.TupleDataset(entries), metadata))

        # Exactly two encoded entries are expected, each one a 3-tuple.
        first, second = list(encoded)
        types0, values0, attribute0 = first
        types1, values1, attribute1 = second

        # entry1: a single list input with an integer output.
        expected_types0 = [[[0, 1], [1, 0]]]
        expected_values0 = [[[266, 276, 286, 512, 512],
                             [266, 512, 512, 512, 512]]]
        self.assertTrue(np.all(expected_types0 == types0))
        self.assertTrue(np.all(expected_values0 == values0))
        self.assertTrue(np.all(np.array([1, 0]) == attribute0))

        # entry2: a single list input with a list output.
        expected_types1 = [[[0, 1], [0, 1]]]
        expected_values1 = [[[286, 276, 266, 512, 512],
                             [266, 276, 286, 512, 512]]]
        self.assertTrue(np.all(expected_types1 == types1))
        self.assertTrue(np.all(expected_values1 == values1))
        self.assertTrue(np.all(np.array([0, 1]) == attribute1))
    def test_search(self):
        """Run the search on an access task with a predictor favouring ACCESS."""
        # Both examples are instances of list access.
        examples = [
            Example([2, [10, 20, 30]], 30),
            Example([1, [-10, 30, 40]], 30)
        ]

        def pred(examples):
            # ACCESS gets probability 0.8, every other symbol gets 0.2.
            functions, _ = generate_io_samples.get_language(50)
            prob = {}
            for function in functions:
                if "IDT" in function.src:
                    continue
                for name in function.src.split(" "):
                    prob[name] = 0.8 if name == "ACCESS" else 0.2
            return prob

        search_binary = os.path.join(os.getcwd(), "DeepCoder_Utils",
                                     "enumerative-search", "search")
        result = search(search_binary, 1000, 256, examples, 2, pred)

        self.assertTrue(result.is_solved)
        self.assertAlmostEqual(0.8, result.probabilities["ACCESS"])
        self.assertAlmostEqual(0.2, result.probabilities["HEAD"])
        self.assertEqual(1, result.explored_nodes)
        self.assertEqual(" %2 <- access %0 %1\n", result.solution)
Beispiel #3
0
def to_batch_seq(sql_data, table_data, idxes, st, ed,
                 is_train=True):
    """Build one ``Example`` per sample for positions ``st`` .. ``ed - 1``.

    :param sql_data: indexable collection of per-sample dicts. Each dict is
        read for ``'db_id'``, ``'question'``, ``'query'`` and, when present,
        ``'rule_label'``.
    :param table_data: mapping from ``db_id`` to the table description that
        is passed to ``process``.
    :param idxes: indirection list; position ``i`` refers to
        ``sql_data[idxes[i]]``.
    :param st: first position in ``idxes`` to use (inclusive).
    :param ed: last position in ``idxes`` to use (exclusive).
    :param is_train: when true, the result is ordered by descending
        ``len(example.src_sent)``.
    :return: list of ``Example`` objects. Samples whose ``rule_label`` is
        rejected by ``is_valid`` are left out.
    """
    examples = []

    for i in range(st, ed):
        sql = sql_data[idxes[i]]
        table = table_data[sql['db_id']]

        process_dict = process(sql, table)

        # For every column set, count the entries of ``q_iter_small`` that
        # occur in it; the count is accumulated in slot 0 of its type vector.
        for c_id, col_ in enumerate(process_dict['col_set_iter']):
            for ori in process_dict['q_iter_small']:
                if ori in col_:
                    process_dict['col_set_type'][c_id][0] += 1

        schema_linking(process_dict['question_arg'], process_dict['question_arg_type'],
                       process_dict['one_hot_type'], process_dict['col_set_type'], process_dict['col_set_iter'], sql)

        col_table_dict = get_col_table_dict(process_dict['tab_cols'], process_dict['tab_ids'], sql)
        table_col_name = get_table_colNames(process_dict['tab_ids'], process_dict['col_iter'])

        # The first column set is overwritten only after schema linking ran.
        process_dict['col_set_iter'][0] = ['count', 'number', 'many']

        rule_label = None
        if 'rule_label' in sql:
            # NOTE(security): eval() executes every token of the label string
            # as Python code; this is only acceptable for trusted data files.
            rule_label = [eval(x) for x in sql['rule_label'].strip().split(' ')]
            if is_valid(rule_label, col_table_dict=col_table_dict, sql=sql) is False:
                continue

        example = Example(
            src_sent=process_dict['question_arg'],
            col_num=len(process_dict['col_set_iter']),
            vis_seq=(sql['question'], process_dict['col_set_iter'], sql['query']),
            tab_cols=process_dict['col_set_iter'],
            sql=sql['query'],
            one_hot_type=process_dict['one_hot_type'],
            col_hot_type=process_dict['col_set_type'],
            table_names=process_dict['table_names'],
            table_len=len(process_dict['table_names']),
            col_table_dict=col_table_dict,
            cols=process_dict['tab_cols'],
            table_col_name=table_col_name,
            table_col_len=len(table_col_name),
            tokenized_src_sent=process_dict['col_set_type'],
            tgt_actions=rule_label
        )
        example.sql_json = copy.deepcopy(sql)
        examples.append(example)

    if is_train:
        # Longest source sentence first; list.sort is stable, so ties keep
        # their original relative order.
        examples.sort(key=lambda e: -len(e.src_sent))
    return examples
Beispiel #4
0
    def test_example_embed_embed_minibatch_with_different_number_of_inputs(
            self):
        """Embed a minibatch whose entries have one and two inputs."""
        # Embedding table handed to ExampleEmbed (EmbedId (value) -> embedding):
        #   0 (-2)   -> 1
        #   1 (-1)   -> 2
        #   2 ( 0)   -> 3
        #   3 ( 1)   -> 4
        #   4 (NULL) -> 5
        embed = ExampleEmbed(2, 2, 1, np.arange(1, 6).reshape((5, 1)))

        metadata = DatasetMetadata(2, set(), 2, 2)
        single_input = examples_encoding(
            [Example([[0, 1]], 0), Example([[1]], 1)], metadata)
        double_input = examples_encoding(
            [Example([1, [0, 1]], [0]),
             Example([0, [0, 1]], [])], metadata)

        types = np.array([single_input.types, double_input.types])
        values = np.array([single_input.values, double_input.values])
        state_embeddings = embed.forward(types, values)
        self.assertEqual((2, 2, 3, 2 + 2 * 1), state_embeddings.shape)

        # (entry, example, slot) -> expected embedding row.
        # Slots 0 and 1 are the inputs, slot 2 is the output.
        expected_rows = [
            ((0, 0, 0), [0, 1, 3, 4]),  # Input of e00
            ((0, 0, 1), [0, 0, 5, 5]),  # Input of e00
            ((0, 0, 2), [1, 0, 3, 5]),  # Output of e00
            ((0, 1, 0), [0, 1, 4, 5]),  # Input of e01
            ((0, 1, 1), [0, 0, 5, 5]),  # Input of e01
            ((0, 1, 2), [1, 0, 4, 5]),  # Output of e01
            ((1, 0, 0), [1, 0, 4, 5]),  # Input of e10
            ((1, 0, 1), [0, 1, 3, 4]),  # Input of e10
            ((1, 0, 2), [0, 1, 3, 5]),  # Output of e10
            ((1, 1, 0), [1, 0, 3, 5]),  # Input of e11
            ((1, 1, 1), [0, 1, 3, 4]),  # Input of e11
            ((1, 1, 2), [0, 1, 5, 5]),  # Output of e11
        ]
        for index, row in expected_rows:
            self.assertTrue(
                np.allclose(row, state_embeddings.array[index]))
Beispiel #5
0
def to_batch_seq(sql_data, schema_data, idxes, st, ed,
                 is_train=True):
    """Build one ``Example`` per sample for positions ``st`` .. ``ed - 1``.

    :param sql_data: indexable collection of per-sample dicts. Each dict is
        read for ``'db_id'``, ``'column_names'``, ``'question_tokens'``,
        ``'column_features'``, ``'table_names'``, ``'values'`` and, when
        present, ``'label_str'``.
    :param schema_data: mapping from ``db_id`` to a dict with a
        ``"column_names"`` list; element 0 of each entry is used as the
        table id and element 1 as the column name.
    :param idxes: indirection list; position ``i`` refers to
        ``sql_data[idxes[i]]``.
    :param st: first position in ``idxes`` to use (inclusive).
    :param ed: last position in ``idxes`` to use (exclusive).
    :param is_train: when true, the result is ordered by descending
        ``len(example.src_sent)``.
    :return: list of ``Example`` objects. Samples whose ``label_str`` cannot
        be evaluated are left out.
    """
    examples = []

    for i in range(st, ed):
        sql = sql_data[idxes[i]]
        schema_id = sql['db_id']
        # A dedicated name is used here so the outer loop index ``i`` is not
        # overwritten while walking the schema columns.
        schema_columns = schema_data[schema_id]["column_names"]
        tab_cols = [column[1] for column in schema_columns]
        tab_ids = [column[0] for column in schema_columns]
        # The last entry of the sample's column names is dropped.
        col_set = [''.join(col) for col in sql['column_names']][:-1]
        col_table_dict = get_col_table_dict(tab_cols, tab_ids, col_set)

        rule_label = None
        if 'label_str' in sql:
            try:
                # NOTE(security): eval() executes every token of the label
                # string as Python code; only acceptable for trusted data.
                rule_label = [eval(x) for x in sql['label_str'].strip().split(' ')]
            except Exception:
                # Skip samples with unparsable labels, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue
            # TODO: revisit this part later (validation is currently disabled):
            # if is_valid(rule_label, col_table_dict=col_table_dict, sql=sql) is False:
            #     print('*'*50)
            #     continue
        example = Example(
            src_sent=sql['question_tokens'],
            src_len=len(sql['question_tokens']),

            col_names=sql['column_names'],
            col_len=len(sql['column_names']),
            feature_c=sql['column_features'],

            table_names=sql['table_names'],
            table_len=len(sql['table_names']),

            value_name=sql['values'],
            value_len=len(sql['values']),

            col_table_dict=col_table_dict,
            tgt_actions=rule_label,

        )
        example.sql_json = copy.deepcopy(sql)
        examples.append(example)

    if is_train:
        # Longest source sentence first; list.sort is stable, so ties keep
        # their original relative order.
        examples.sort(key=lambda e: -len(e.src_sent))
    return examples
Beispiel #6
0
 def test_dataset_metadata(self):
     """Check max_num_inputs and symbols as computed by dataset_metadata."""
     head_entry = Entry("HEAD", [Example([[10, 20]], 10)],
                        {"HEAD": True, "TAKE": False})
     take_entry = Entry("TAKE", [Example([1, [10, 20]], 10)],
                        {"HEAD": False, "TAKE": True})
     entries = ch.datasets.TupleDataset([head_entry, take_entry])

     stats = dataset_metadata(entries)

     # The TAKE entry has two inputs, the HEAD entry only one.
     self.assertEqual(2, stats.max_num_inputs)
     self.assertEqual({"HEAD", "TAKE"}, stats.symbols)
Beispiel #7
0
    def test_TrainingClassifier(self):
        """The classifier loss must support a backward pass."""
        network = ch.Sequential(ExampleEmbed(1, 2, 2), Encoder(10), Decoder(2))
        classifier = TrainingClassifier(network)

        encoding = examples_encoding(
            [Example([[0, 1]], 0), Example([[1]], 1)],
            DatasetMetadata(1, set(), 2, 2))
        types = np.array([encoding.types])
        values = np.array([encoding.values])
        labels = np.array([[1, 1]])

        loss = classifier(types, values, labels)
        loss.grad = np.ones(loss.shape, dtype=np.float32)

        # The test passes as long as backward() does not raise.
        loss.backward()
    def test_predict_with_neural_network(self):
        """The predictor's probabilities must match the raw model output."""
        examples = [
            Example([2, [10, 20, 30]], 30),
            Example([1, [-10, 30, 40]], 30)
        ]
        metadata = DatasetMetadata(2, {"MAP", "HEAD"}, 256, 5)
        model_shape = ModelShapeParameters(metadata, 3, 2, 10)
        inference_model = InferenceModel(model_shape)

        predictor = predict_with_neural_network(model_shape, inference_model)
        prob = predictor(examples)

        # Run the same examples through the underlying model directly.
        encoding = examples_encoding(examples, metadata)
        types = np.array([encoding.types])
        values = np.array([encoding.values])
        prob_dnn = inference_model.model(types, values).array[0]

        # Output index 0 is compared with HEAD and index 1 with MAP.
        self.assertAlmostEqual(prob_dnn[0], prob["HEAD"])
        self.assertAlmostEqual(prob_dnn[1], prob["MAP"])
    def test_search_when_pred_throws_error(self):
        """A predictor that raises must produce an unsolved, empty result."""
        # Examples that do not correspond to any program.
        examples = [
            Example([2, [10, 20, 30]], -255),
            Example([1, [-10, 30, 40]], -255)
        ]

        def pred(examples):
            raise RuntimeError("test")

        search_binary = os.path.join(os.getcwd(), "DeepCoder_Utils",
                                     "enumerative-search", "search")
        result = search(search_binary, 1000, 256, examples, 2, pred)

        self.assertFalse(result.is_solved)
        self.assertEqual(-1, result.explored_nodes)
        self.assertEqual({}, result.probabilities)
        self.assertEqual("", result.solution)
    def test_search_with_invalid_examples(self):
        """Unsatisfiable examples must produce an unsolved result."""
        # Examples that do not correspond to any program.
        examples = [
            Example([2, [10, 20, 30]], -255),
            Example([1, [-10, 30, 40]], -255)
        ]

        def pred(examples):
            # Every symbol of every non-IDT function gets probability 1.0.
            functions, _ = generate_io_samples.get_language(50)
            prob = {}
            for function in functions:
                if "IDT" in function.src:
                    continue
                prob.update(dict.fromkeys(function.src.split(" "), 1.0))
            return prob

        search_binary = os.path.join(os.getcwd(), "DeepCoder_Utils",
                                     "enumerative-search", "search")
        result = search(search_binary, 1000, 256, examples, 2, pred)

        self.assertFalse(result.is_solved)
        self.assertEqual(-1, result.explored_nodes)
        self.assertEqual("", result.solution)
Beispiel #11
0
    def test_Encoder(self):
        """Compare the Encoder output with three stacked sigmoids of the sum."""
        embed = ExampleEmbed(1, 2, 1, np.arange(1, 6).reshape((5, 1)))

        # All weights one and all biases zero, so every layer reduces to a
        # sigmoid of the summed input (this is what the loop below checks).
        encoder = Encoder(1,
                          initialW=ch.initializers.One(),
                          initial_bias=ch.initializers.Zero())
        self.assertEqual(6, len(list(encoder.params())))
        # Shape note kept from the original author:
        # state_embeddings: (N, e, 2, 4) -> h1: (N, e, 1) -> h2: (N, e, 2) -> output: (N, e, 2)
        # (the assertion below expects an output of shape (1, 2, 1)).

        encoding = examples_encoding(
            [Example([[0, 1]], 0), Example([[1]], 1)],
            DatasetMetadata(1, set(), 2, 2))

        state_embeddings = embed(np.array([encoding.types]),
                                 np.array([encoding.values]))
        layer_encodings = encoder(state_embeddings)

        self.assertEqual((1, 2, 1), layer_encodings.shape)
        for i, j in [(0, 0), (0, 1)]:
            h = np.array(state_embeddings[i, j, :, :].array.sum())
            for _ in range(3):
                h = F.sigmoid(h)
            self.assertEqual(h.array, layer_encodings.array[i, j])
Beispiel #12
0
    def test_example_embed_embed_one_sample(self):
        """Embed a single entry with two examples and check every row."""
        # Embedding table handed to ExampleEmbed (EmbedId (value) -> embedding):
        #   0 (-2)   -> 1
        #   1 (-1)   -> 2
        #   2 ( 0)   -> 3
        #   3 ( 1)   -> 4
        #   4 (NULL) -> 5
        embed = ExampleEmbed(1, 2, 1, np.arange(1, 6).reshape((5, 1)))
        self.assertEqual(1, len(list(embed.params())))

        metadata = DatasetMetadata(1, set(), 2, 2)
        encoding = examples_encoding(
            [Example([[0, 1]], 0), Example([[1]], 1)], metadata)

        types = np.array([encoding.types])
        values = np.array([encoding.values])
        state_embeddings = embed.forward(types, values)
        self.assertEqual((1, 2, 2, 2 + 2 * 1), state_embeddings.shape)

        # (entry, example, slot) -> expected embedding row.
        expected_rows = [
            ((0, 0, 0), [0, 1, 3, 4]),  # Input of e1
            ((0, 0, 1), [1, 0, 3, 5]),  # Output of e1
            ((0, 1, 0), [0, 1, 4, 5]),  # Input of e2
            ((0, 1, 1), [1, 0, 4, 5]),  # Output of e2
        ]
        for index, row in expected_rows:
            self.assertTrue(
                np.allclose(row, state_embeddings.array[index]))

        # The test passes as long as backward() does not raise.
        state_embeddings.grad = np.ones(state_embeddings.shape,
                                        dtype=np.float32)
        state_embeddings.backward()
Beispiel #13
0
def to_batch_seq(datas, idxes, st, ed, is_train=True):
    """Build one ``Example`` per sample for positions ``st`` .. ``ed - 1``.

    :param datas: indexable collection of per-sample dicts with the keys
        ``'feature'``, ``'label'``, ``'decoder_pob'`` and ``'type'``.
    :param idxes: indirection list; position ``i`` refers to
        ``datas[idxes[i]]``.
    :param st: first position in ``idxes`` to use (inclusive).
    :param ed: last position in ``idxes`` to use (exclusive).
    :param is_train: when true, the result is ordered by descending
        ``len(example.confidence)``.
    :return: list of ``Example`` objects.
    :raises NotImplementedError: if a ``'type'`` entry is not one of the four
        known action names.
    """
    # Column of the one-hot action-type matrix for every known action name.
    type_columns = {
        'WikiSqlSelectColumnAction': 0,
        'ApplyRuleAction': 1,
        'GenTokenAction': 2,
        'ReduceAction': 3,
    }

    examples = []
    for i in range(st, ed):
        sample = datas[idxes[i]]
        feature = sample['feature']
        label = sample['label']
        decoder_pob = sample['decoder_pob']
        # Keep the last dotted component of each type string and drop its
        # final two characters.
        action_names = [x.split('.')[-1][:-2] for x in sample['type']]

        one_hot_type = np.zeros((len(feature), 4))
        for id_x, action_name in enumerate(action_names):
            if action_name not in type_columns:
                # Previously the name was passed as a second positional
                # argument, so it never ended up in a readable message.
                raise NotImplementedError(
                    "wrong type for {}".format(action_name))
            one_hot_type[id_x][type_columns[action_name]] = 1

        assert len(feature) == len(label)
        example = Example(
            feature,
            np.expand_dims(np.array(label), axis=1),
            decoder_pob=decoder_pob,
            one_hot_type=one_hot_type
        )
        examples.append(example)

    if is_train:
        # NOTE(review): relies on Example exposing ``confidence`` -- confirm
        # against the Example class, which is not visible here.
        examples.sort(key=lambda e: -len(e.confidence))
    return examples
Beispiel #14
0
 def test_examples_encoding_if_num_inputs_is_too_large(self):
     """Encoding two-input examples with a zero-input metadata must fail."""
     metadata = DatasetMetadata(0, set(), 2, 2)
     # The examples are built inside the guarded block, as in the lambda form.
     with self.assertRaises(RuntimeError):
         examples_encoding(
             [Example([1, [0, 1]], [0]),
              Example([0, [0, 1]], [])], metadata)