def test_basic_crossjoin_no_conflicts(self):
        """Cross join two scans whose rows share no column names.

        Checks that the first joined tuple carries every column of the
        first left row and the first right row, and that the number of
        joined tuples is the product of the two input sizes.
        """
        left_rows = [
            {'id': 1, 'property_1': 'hello'},
            {'id': 2, 'property_1': 'world'},
        ]
        right_rows = [
            {'name': 1, 'property_2': 'bye'},
            {'name': 2, 'property_2': 'moon'},
        ]
        left_scan = _op.JSONScan(object_payload=left_rows)
        right_scan = _op.JSONScan(object_payload=right_rows)

        joined = list(_op.CrossJoin(left_scan, right_scan))

        # Column names of one left row followed by one right row.
        expected_columns = list(left_rows[0].keys()) + list(right_rows[0].keys())
        self.assertSequenceEqual(
            sorted(expected_columns),
            sorted(joined[0].keys()),
            'expected cross join tuple to contain all columns from both left and right tuples'
        )

        expected_count = len(left_rows) * len(right_rows)
        self.assertEqual(
            len(joined),
            expected_count,
            'expected cross join to contain left * right number of tuples')
Esempio n. 2
0
    def test_near_matches(self):
        """Similarity-join species strings against a small reference table.

        The join condition is a ``Similar`` built from the 'species',
        'name' and 'synonyms' attributes with the edit-distance function.
        Checks that the first joined tuple has every left and right
        column, and that there is one joined tuple per left row.
        """
        species_names = (
            'Mus musculus',
            'Danio rerio',
            'mus musculus',
            'danio rerio',
            'Zebrafish',
            'Mouse',
            'Zbrafish',
            'Muse',
        )
        # ids run 1..8 in the same order as the names above
        left_data = [
            {'id': number, 'species': species}
            for number, species in enumerate(species_names, 1)
        ]
        right_data = [
            {'ID': 1, 'name': 'Mus musculus', 'synonyms': ['Mouse']},
            {'ID': 2, 'name': 'Danio rerio', 'synonyms': ['Zebrafish']},
        ]

        left_scan = _op.JSONScan(object_payload=left_data)
        right_scan = _op.JSONScan(object_payload=right_data)
        similar = _opt.Similar('species', 'name', 'synonyms',
                               _util.edit_distance_fn, None)

        joined = list(
            _op.NestedLoopsSimilarityJoin(left_scan, right_scan, similar))
        logger.debug(joined)

        expected_columns = list(left_data[0].keys()) + list(right_data[0].keys())
        self.assertSequenceEqual(
            sorted(expected_columns),
            sorted(joined[0].keys()),
            'expected join to contain all columns from both left and right tuples'
        )

        self.assertEqual(len(joined), len(left_data),
                         "expected join to contain left's number of tuples")
    def test_basic_crossjoin_w_conflicts(self):
        """Cross join two scans that both define a 'foo' column.

        Checks that the first joined tuple has as many columns as both
        inputs combined, that the tuple count is left * right, that each
        non-conflicting column is present, and that exactly two columns
        of the joined tuple end with the conflicting name.
        """
        shared = 'foo'
        left_rows = [
            {'id': 1, 'foo': 'a', 'property_1': 'hello'},
            {'id': 2, 'foo': 'b', 'property_1': 'world'},
        ]
        right_rows = [
            {'name': 1, 'foo': 'x', 'property_2': 'bye'},
            {'name': 2, 'foo': 'y', 'property_2': 'moon'},
        ]
        left_scan = _op.JSONScan(object_payload=left_rows)
        right_scan = _op.JSONScan(object_payload=right_rows)

        joined = list(_op.CrossJoin(left_scan, right_scan))
        first = joined[0]

        self.assertEqual(
            len(first.keys()),
            len(left_rows[0].keys()) + len(right_rows[0].keys()),
            'expected the cross join columns to be as many of sum of left and right columns'
        )
        self.assertEqual(
            len(joined),
            len(left_rows) * len(right_rows),
            'expected cross join to contain left * right number of tuples')

        # Left columns are checked first, then right columns; the shared
        # column is skipped on both sides.
        for source_row in (left_rows[0], right_rows[0]):
            for column in source_row:
                if column == shared:
                    continue
                self.assertIn(
                    column, first,
                    'expected non-conflicting column "%s" in cross join tuple' % column)

        variants = [column for column in first if column.endswith(shared)]
        self.assertEqual(
            len(variants), 2,
            'expected two variants of conflicting column "%s" in cross join tuple'
            % shared)
 def test_grouping_single_attr_no_nesting_w_distinct(self):
     """Group on 'name' after duplicating the data and applying a distinct.

     ``self.data`` is replaced by the output of
     ``generate_duplicate_data(self.data, 100)`` before the plan is built.
     Expects the aggregation to yield exactly 2 tuples.
     """
     self.data = self.generate_duplicate_data(self.data, 100)
     scan = _op.JSONScan(object_payload=self.data)
     # inject a distinct between the scan and the aggregation
     deduplicated = _op.HashDistinct(scan, ('name', ))
     aggregation = _op.NestedLoopsSimilarityAggregation(
         deduplicated, ('name', ), (), _util.edit_distance_fn, None)
     groups = list(aggregation)
     self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
 def test_grouping_single_attr_no_nesting(self):
     """Group ``self.data`` on 'name' with no nested attributes.

     Logs the resulting tuples and the operator description at debug
     level, then expects exactly 2 tuples.
     """
     scan = _op.JSONScan(object_payload=self.data)
     aggregation = _op.NestedLoopsSimilarityAggregation(
         scan, ('name', ), (), _util.edit_distance_fn, None)
     groups = list(aggregation)
     logger.debug(groups)
     logger.debug(aggregation.description)
     self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
 def test_grouping_and_nesting_single_attrs(self):
     """Group on 'name' and nest 'synonyms'.

     Each record of ``self.data`` is first given a 'synonyms' entry equal
     to its 'name' (the records are modified in place). Expects 2 result
     tuples whose nested 'synonyms' lengths add up to ``len(self.data)``.
     """
     # extend raw data with synonyms
     for record in self.data:
         record['synonyms'] = record['name']

     scan = _op.JSONScan(object_payload=self.data)
     aggregation = _op.NestedLoopsSimilarityAggregation(
         scan, ('name', ), ('synonyms', ), _util.edit_distance_fn, None)
     groups = list(aggregation)
     logger.debug(groups)
     logger.debug(aggregation.description)

     self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
     nested_total = sum(len(group['synonyms']) for group in groups)
     self.assertEqual(len(self.data),
                      nested_total,
                      "expected all synonyms to be nested")
    def test_grouping_and_nesting_single_attrs_w_distinct(self):
        """Group on 'name' and nest 'synonyms' over duplicated, distinct data.

        Each record gets a 'synonyms' entry equal to its 'name', then
        ``self.data`` is replaced by ``generate_duplicate_data`` output
        using a multiplier of 100. A distinct on ('name', 'synonyms') is
        placed under the aggregation. Expects 2 result tuples whose nested
        'synonyms' lengths add up to ``len(self.data) / multiplier``.
        """
        multiplier = 100

        # generate test data: extend raw data with synonyms, then duplicate
        for record in self.data:
            record['synonyms'] = record['name']
        self.data = self.generate_duplicate_data(self.data, multiplier)

        # create physical plan: scan -> distinct -> similarity aggregation
        scan = _op.JSONScan(object_payload=self.data)
        deduplicated = _op.HashDistinct(scan, ('name', 'synonyms'))
        aggregation = _op.NestedLoopsSimilarityAggregation(
            deduplicated, ('name', ), ('synonyms', ),
            _util.edit_distance_fn, None)
        groups = list(aggregation)

        # assertions
        self.assertEqual(len(groups), 2, "expected 2 groups/tuples")
        expected_nested = len(self.data) / multiplier
        nested_total = sum(len(group['synonyms']) for group in groups)
        self.assertEqual(
            expected_nested,
            nested_total,
            "expected all synonyms to be nested")
Esempio n. 8
0
class TestSelect(unittest.TestCase):
    """Basic tests for Select operator.

    Every test wraps the shared class-level ``_child`` scan in a
    ``Select`` with a different predicate, checks that the operator's
    description equals the child's, and checks the number of rows
    returned by ``count``.
    """

    # Built once at class-definition time and shared by all tests.
    _test_helper = TestHelper()
    _child = _op.JSONScan(object_payload=_test_helper.test_data)

    def test_select_eq_on_field_0(self):
        """Equality on FIELDS[0] == 0 is expected to return one row."""
        comparison = _opt.Comparison(self._test_helper.FIELDS[0], 'eq', 0)
        oper = _op.Select(self._child, comparison)
        self.assertDictEqual(self._child.description, oper.description,
                             "table definition should match source")
        self.assertEqual(1, count(oper),
                         'incorrect number of rows returned by operator')

    def test_select_eq_on_field_1(self):
        """Equality on FIELDS[1] using the first row's value returns > 1 row."""
        helper = self._test_helper
        field = helper.FIELDS[1]
        comparison = _opt.Comparison(field, 'eq', helper.test_data[0][field])
        oper = _op.Select(self._child, comparison)
        self.assertDictEqual(self._child.description, oper.description,
                             "table definition should match source")
        self.assertLess(1, count(oper),
                        'incorrect number of rows returned by operator')

    def test_select_conjunction(self):
        """Conjunction of the two equality predicates returns one row."""
        helper = self._test_helper
        field_1 = helper.FIELDS[1]
        comparisons = [
            _opt.Comparison(helper.FIELDS[0], 'eq', 0),
            _opt.Comparison(field_1, 'eq', helper.test_data[0][field_1]),
        ]
        comparison = _opt.Conjunction(comparisons)
        oper = _op.Select(self._child, comparison)
        self.assertDictEqual(self._child.description, oper.description,
                             "table definition should match source")
        self.assertEqual(1, count(oper),
                         'incorrect number of rows returned by operator')

    def test_select_disjunction(self):
        """FIELDS[0] < i OR FIELDS[0] > i is expected to drop exactly one row."""
        helper = self._test_helper
        # Precondition as a real unittest assertion: a bare ``assert`` is
        # removed under ``python -O`` and reports no values on failure.
        self.assertGreater(helper.num_test_rows, 2)
        i = helper.num_test_rows // 2
        comparisons = [
            _opt.Comparison(helper.FIELDS[0], 'lt', i),
            _opt.Comparison(helper.FIELDS[0], 'gt', i),
        ]
        comparison = _opt.Disjunction(comparisons)
        oper = _op.Select(self._child, comparison)
        self.assertDictEqual(self._child.description, oper.description,
                             "table definition should match source")
        self.assertEqual(helper.num_test_rows - 1, count(oper),
                         'incorrect number of rows returned by operator')
 def setUp(self):
     """Bind a JSONScan built from ``payload`` to ``self._op``."""
     scan = _op.JSONScan(object_payload=payload)
     self._op = scan
Esempio n. 10
0
class TestProjection(unittest.TestCase):
    """Basic tests for Project operator.

    All tests project from one class-level ``JSONScan`` over ``payload``.
    """

    _child = _op.JSONScan(object_payload=payload)

    def test_simple_projection_description(self):
        """A one-attribute projection describes exactly one column."""
        attributes = ('property_1', )
        description = _op.Project(self._child, attributes).description
        self.assertIsNotNone(description, 'description is None')
        column_definitions = description['column_definitions']
        self.assertIsNotNone(column_definitions,
                             'column_definitions is None')
        self.assertEqual(len(column_definitions), len(attributes),
                         'incorrect number of columns in description')

    def test_simple_projection_iter(self):
        """Iterating a projection yields one dict per payload row."""
        attributes = ('property_1', )
        iterator = iter(_op.Project(self._child, attributes))
        self.assertIsNotNone(iterator, 'must return an iterable')
        rows = list(iterator)
        self.assertEqual(len(rows), len(payload),
                         'did not return correct number of rows')
        first_row = rows[0]
        self.assertTrue(isinstance(first_row, dict), 'row is not a dictionary')
        self.assertEqual(len(first_row.keys()), len(attributes),
                         'did not project correct number of attributes')

    def test_project_all_attributes(self):
        """Projecting ``AllAttributes`` describes every payload column."""
        everything = (_opt.AllAttributes(), )
        description = _op.Project(self._child, everything).description
        self.assertEqual(len(description['column_definitions']),
                         len(payload[0].keys()),
                         'did not project all attributes')

    def test_project_and_rename_same_attribute_twice(self):
        """'property_1' can be aliased to both 'name' and 'synonyms'."""
        aliases = ('name', 'synonyms')
        renames = tuple(
            _opt.AttributeAlias(name='property_1', alias=alias)
            for alias in aliases)
        projection = _op.Project(self._child, renames)

        first_row = list(projection)[0]
        for alias in aliases:
            self.assertIn(alias, first_row)
        self.assertNotIn('RID', first_row)

        described_names = [
            definition['name']
            for definition in projection.description['column_definitions']
        ]
        logger.debug(described_names)
        for alias in aliases:
            self.assertIn(
                alias, described_names,
                "column missing in projected relation's description")

    def test_project_introspect_RID(self):
        """The introspection function yields a '<table_name>_RID' column."""
        attributes = (_opt.IntrospectionFunction(_util.introspect_key_fn),
                      'property_1')
        oper = _op.Project(self._child, attributes)
        renamed_rid = self._child.description['table_name'] + "_RID"
        # One boolean per described column: does its name match?
        name_matches = [
            definition['name'] == renamed_rid
            for definition in oper.description['column_definitions']
        ]
        self.assertTrue(any(name_matches),
                        "'RID' not renamed to '%s'" % renamed_rid)

    def test_project_preserve_unique_on_rid(self):
        """Projecting 'RID' keeps a key defined on exactly ('RID')."""
        oper = _op.Project(self._child, ('RID', ))
        unique_column_sets = [
            key['unique_columns'] for key in oper.description['keys']
        ]
        # One boolean per key: is it a single-column key on 'RID'?
        rid_only_flags = [
            len(columns) == 1 and columns[0] == 'RID'
            for columns in unique_column_sets
        ]
        self.assertTrue(
            any(rid_only_flags),
            'could not find a key defined on (RID) when RID was projected from child relation'
        )