Ejemplo n.º 1
0
def test_splitter_datetime():
    """Test that structures containing datetime instances can be pushed in.

    Three rows, each made of a string, a ``datetime.datetime`` and a
    ``datetime.date``, are given to ``get_splitter`` with ``max_items``
    set to 0. The resulting files are read back with
    ``get_input_item_flow`` and compared with the rows that were pushed in.
    """

    elements = [
        [
            '1',
            datetime.datetime(2004, 1, 22, 10, 0, 0),
            datetime.date(2010, 12, 24),
        ],
        [
            '1',
            datetime.datetime(2005, 2, 23, 11, 27, 32),
            datetime.date(2015, 12, 24),
        ],
        [
            '1',
            datetime.datetime(2014, 8, 22, 9, 54, 0),
            datetime.date(2001, 11, 6),
        ],
    ]

    fnames = get_splitter(elements, 0).split()

    # Running position in `elements`, kept across files so that the final
    # check can confirm that every row was read back.
    count = 0
    for fname in fnames:
        # The context manager closes the file even if an assertion fails.
        with open(fname, 'rb') as f:
            for item in get_input_item_flow(f):
                # The expected row goes first, the row read back second,
                # as the message template requires.
                assert item == elements[count], (
                    "Expected: '%s' got '%s'" % (elements[count], item))
                count += 1
    assert count == len(elements), (
        "We got %s items after splitting, instead of the %s we had before" %
        (count, len(elements)))
Ejemplo n.º 2
0
    def test_simple_splitter_eleven_max3(self):
        """Split a list of items into files of at most three items.

        The ten input items are expected to come back as four files and,
        reading the files in the order they were returned, in the same
        order as they were given to the splitter.
        """
        # one 'ref0' item followed by nine 'ref1' items
        input_items = [TestItem('ref0', 'value0')]
        input_items.extend(
            [TestItem('ref1', 'value%s' % i) for i in range(1, 10)])

        # list of items with max_items < nb items
        self.splitter = get_splitter(input_items, 3)
        result = self.splitter.split()
        assert len(result) == 4, (
            "The splitter returned %s files instead of 4" % len(result))

        count = 0
        for filename in result:
            # The context manager closes the file even if an assertion
            # fails while it is being read.
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    assert item == input_items[count], (
                        "[File %s][Item %s] Got %s instead of %s" %
                        (f.name, count, item, input_items[count]))
                    count += 1
        assert count == len(input_items), (
            "We got %s items after splitting, instead of the %s we had before"
            % (count, len(input_items)))
Ejemplo n.º 3
0
    def test_simple_splitter_one_nomax(self):
        """Split a single item with no max_items and read it back.

        Exactly one file is expected, and it must hold exactly the item
        that was given to the splitter.
        """
        # only one item with no max_items
        input_items = [TestItem('ref0', 'value0')]

        self.splitter = get_splitter(input_items, 0)
        result = self.splitter.split()
        assert len(result) == 1, (
            "The splitter returned %s files instead of 1" % len(result))

        count = 0
        # The context manager closes the file even if an assertion fails.
        with open(result[0], 'rb') as f:
            for item in get_input_item_flow(f):
                assert item == input_items[count], (
                    "[File %s][Item %s] Got %s, %s instead of %s, %s" % (
                        f.name,
                        count,
                        item,
                        type(item),
                        input_items[count],
                        type(input_items[count]),
                    ))
                # Advance, so that an unexpected extra item is not compared
                # with the first input item a second time.
                count += 1
        # Without this check the test would pass on an empty file.
        assert count == len(input_items), (
            "We got %s items after splitting, instead of the %s we had before"
            % (count, len(input_items)))
Ejemplo n.º 4
0
def test_splitter_unicode():
    """Test that unicode attributes are preserved after splitting.

    A single item whose second field is a non-ASCII unicode string goes
    through the simple splitter, the splitter by attribute and the
    splitter by attribute with ``force_split``. Each must return one file
    whose items still carry the same value in ``field2``.
    """
    max_items = 2
    unicode_field = u'$£ø'
    split_attribute = 'field1'

    input_items = [TestItem('ref1', unicode_field)]

    def check_single_file(result, splitter_label):
        # `result` must be a single file whose items all carry
        # unicode_field; splitter_label only feeds the failure message.
        assert len(result) == 1, (
            "The splitter returned %s files instead of 1" % len(result))

        # The context manager closes the file even if an assertion fails.
        with open(result[0], 'rb') as f:
            for item in get_input_item_flow(f):
                assert item.field2 == unicode_field, (
                    "%s didn't preserve unicode value: "
                    "got %s instead of %s" %
                    (splitter_label, item.field2, unicode_field))

    # test the simple splitter
    check_single_file(
        get_splitter(input_items, max_items).split(),
        "Simple splitter")

    # test the splitter by attribute
    check_single_file(
        get_splitter(input_items, max_items, split_attribute).split(),
        "Splitter by attribute")

    # test the splitter by attribute, forcing it to split
    check_single_file(
        get_splitter(input_items, max_items, split_attribute,
                     force_split=True).split(),
        "Splitter by attribute (forcing to split)")
Ejemplo n.º 5
0
    def retrieve(self, shelves=None, auto_clean=False):
        """ Retrieve the content of the storage as a data flow

        This is a generator: the items of each shelf are yielded in turn,
        one shelf after the other.

        @param shelves: List of shelves to retrieve all items from. If
        shelves is None, the items of all the shelves in self.shelves are
        retrieved.
        @type shelves: List of filename
        @param auto_clean: If true, each shelf file is deleted once all of
        its items have been yielded.
        @type auto_clean: bool
        """
        if shelves is None:
            shelves = self.shelves

        for shelve in shelves:
            # Binary mode: the stream reaches get_input_item_flow without
            # any newline translation or text decoding.
            with open(shelve, 'rb') as shelve_file:
                for item in get_input_item_flow(shelve_file):
                    yield item

            # Only reached once the shelf has been read to the end and its
            # file closed.
            if auto_clean:
                os.unlink(shelve)
0
def test_chaining_splitters():
    """Test chaining different splitters.

    Twenty items are built from 'ref0'..'ref4' and 'value0'..'value2',
    then sent through two chains of two splitters. In both chains the
    second splitter forces a split on 'field2', so every resulting file
    must hold a single 'field2' value and no item may be lost.
    """
    first_split_attribute = 'field1'
    second_split_attribute = 'field2'

    input_items = [
        TestItem('ref%s' % (i % 5), 'value%s' % (i % 3))
        for i in range(1, 21)
    ]

    # chain two splitters without max_items
    # the first one won't do anything, so this is strictly
    # equivalent to having only the second one
    params_splitters = [
        {
            'split_attribute': first_split_attribute
        },
        {
            'split_attribute': second_split_attribute,
            'force_split': True
        },
    ]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 3, ("The splitter returned %s files instead of 3" %
                              len(result))

    count = 0
    for filename in result:
        bucket_value = None
        # The context manager closes the file even if an assertion fails.
        with open(filename, 'rb') as f:
            for item in get_input_item_flow(f):
                value = getattr(item, second_split_attribute)
                # The first item of a file sets the value that every
                # other item of that file has to share.
                if bucket_value is None:
                    bucket_value = value
                assert value == bucket_value, (
                    "We got an item with %s for its split_attribute "
                    "in a file of items with %s" % (value, bucket_value))
                count += 1
    assert count == len(input_items), (
        "We got %s items after splitting, instead of the "
        "%s we had before" % (count, len(input_items)))

    # chain two splitters with first max_items < nb items
    params_splitters = [{
        'max_items': 8,
        'split_attribute': first_split_attribute
    }, {
        'split_attribute': second_split_attribute,
        'force_split': True
    }]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 9, ("The splitter returned %s files instead of 9" %
                              len(result))

    count = 0
    # (field1, field2) pairs found in the files already read.
    already_seen = set()
    for filename in result:
        bucket_value = None
        ref_values = set()

        with open(filename, 'rb') as f:
            for item in get_input_item_flow(f):
                value = getattr(item, second_split_attribute)
                if bucket_value is None:
                    bucket_value = value
                assert value == bucket_value, (
                    "We got an item with %s for its split_attribute in a "
                    "file of items with %s" % (value, bucket_value))

                # A given (field1, field2) pair may repeat within one
                # file, but must not show up again in a later file.
                ref_value = (getattr(item, first_split_attribute), value)
                ref_values.add(ref_value)
                assert ref_value not in already_seen, (
                    "The value %s should have been in a previous file" %
                    (ref_value, ))

                count += 1
        already_seen.update(ref_values)
    assert count == len(input_items), (
        "We got %s items after splitting, instead of the "
        "%s we had before" % (count, len(input_items)))
Ejemplo n.º 7
0
def test_chaining_splitters_single():
    """Make sure that chaining only one splitter doesn't
    change the behavior.

    Ten items built from 'ref0'..'ref4' and 'value0'..'value2' are sent
    through chains made of a single splitter: the simple splitter, the
    splitter by attribute and the splitter by attribute with force_split,
    each without and with max_items.
    """
    split_attribute = 'field1'

    input_items = [
        TestItem('ref%s' % (i % 5), 'value%s' % (i % 3))
        for i in range(1, 11)
    ]

    def check_item_count(count):
        # No item may be lost or duplicated by the split.
        assert count == len(input_items), (
            "We got %s items after splitting, instead of the "
            "%s we had before" % (count, len(input_items)))

    def check_order_preserved(filenames):
        # Read file after file, the items must match input_items in order.
        count = 0
        for filename in filenames:
            # The context manager closes the file even if an assertion
            # fails.
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    assert item == input_items[count], (
                        "[File %s][Item %s] Got %s instead of %s" %
                        (f.name, count, item, input_items[count]))
                    count += 1
        check_item_count(count)

    def check_values_not_repeated(filenames):
        # A split_attribute value found in one file must not show up
        # again in a later file.
        count = 0
        already_seen = set()
        for filename in filenames:
            ref_values = set()
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    ref_value = getattr(item, split_attribute)
                    ref_values.add(ref_value)
                    assert ref_value not in already_seen, (
                        "The value %s should have been in "
                        "a previous file" % ref_value)
                    count += 1
            already_seen.update(ref_values)
        check_item_count(count)

    def check_one_value_per_file(filenames):
        # All the items of a file must share one split_attribute value.
        count = 0
        for filename in filenames:
            bucket_value = None
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    value = getattr(item, split_attribute)
                    if bucket_value is None:
                        bucket_value = value
                    assert value == bucket_value, (
                        "We got an item with %s for its split_attribute "
                        "in a file of items with %s" % (value, bucket_value))
                    count += 1
        check_item_count(count)

    # simple splitter without max_items
    params_splitters = [{}]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 1, ("The splitter returned "
                              "%s files instead of 1" % len(result))
    check_order_preserved(result)

    # simple splitter with max_items < nb items
    params_splitters = [{'max_items': 4}]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 3, ("The splitter returned %s files instead of 3" %
                              len(result))
    check_order_preserved(result)

    # split by attribute without max_items
    params_splitters = [{'split_attribute': split_attribute}]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 1, ("The splitter returned %s files instead of 1" %
                              len(result))
    check_values_not_repeated(result)

    # split by attribute with max_items < nb items
    params_splitters = [{'max_items': 8, 'split_attribute': split_attribute}]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 2, ("The splitter returned %s files instead of 2" %
                              len(result))
    check_values_not_repeated(result)

    # split by attribute forcing split without max_items
    params_splitters = [{
        'split_attribute': split_attribute,
        'force_split': True
    }]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 5, ("The splitter returned %s files instead of 5" %
                              len(result))
    check_one_value_per_file(result)

    # split by attribute forcing split with max_items < nb items
    params_splitters = [{
        'max_items': 6,
        'split_attribute': split_attribute,
        'force_split': True
    }]
    result = chain_splitters(input_items, params_splitters)
    assert len(result) == 5, ("The splitter returned %s files instead of 5" %
                              len(result))
    check_one_value_per_file(result)
Ejemplo n.º 8
0
def test_splitter_force_attr():
    """Test splitting by attribute (forcing split) in various scenarios.

    Covers an empty input, a single item, and a list of five items spread
    over two values of the split attribute, each with several values of
    ``max_items``.
    """
    split_attribute = 'field1'

    # Filled in step by step below; the helpers always read its current
    # content.
    input_items = list()

    def split(max_items):
        # Force a split by attribute of the current input_items.
        return get_splitter(input_items, max_items, split_attribute,
                            force_split=True).split()

    def check_item_count(count):
        # No item may be lost or duplicated by the split.
        assert count == len(input_items), (
            "We got %s items after splitting, instead of the "
            "%s we had before" % (count, len(input_items)))

    def check_same_items(filename):
        # The file must hold the input items, in their original order.
        count = 0
        # The context manager closes the file even if an assertion fails.
        with open(filename, 'rb') as f:
            for item in get_input_item_flow(f):
                # What was read goes first and what was expected second,
                # as the message template requires.
                assert item == input_items[count], (
                    "Got %s instead of %s" % (item, input_items[count]))
                count += 1
        check_item_count(count)

    def check_one_value_per_file(filenames):
        # All the items of a file must share one split_attribute value.
        count = 0
        for filename in filenames:
            bucket_value = None
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    value = getattr(item, split_attribute)
                    if bucket_value is None:
                        bucket_value = value
                    assert value == bucket_value, (
                        "We got an item with %s for its split_attribute "
                        "in a file of items with %s" % (value, bucket_value))
                    count += 1
        check_item_count(count)

    # empty input: with no max_items (0), then with max_items (1)
    for max_items in (0, 1):
        result = split(max_items)
        assert len(result) == 0, (
            "The splitter returned %s files instead of none" % len(result))

    # only one item: with no max_items (0), with max_items = 1,
    # then with max_items > 1
    input_items.append(TestItem('ref0', 'value0'))
    for max_items in (0, 1, 10):
        result = split(max_items)
        assert len(result) == 1, (
            "The splitter returned %s files instead of 1" % len(result))
        check_same_items(result[0])

    # list of items (two 'ref0' and three 'ref1'): with no max_items (0),
    # then with max_items < nb items, = nb items and > nb items
    input_items.append(TestItem('ref0', 'value1'))
    input_items.extend([TestItem('ref1', 'value%s' % i) for i in range(3)])
    for max_items in (0, 3, 5, 10):
        result = split(max_items)
        assert len(result) == 2, (
            "The splitter returned %s files instead of 2" % len(result))
        check_one_value_per_file(result)
Ejemplo n.º 9
0
def test_splitter_attr():
    """Test splitting by attribute in various scenarios.

    Covers an empty input, a single item, and a list of five items spread
    over two values of the split attribute, each with several values of
    ``max_items``.
    """
    split_attribute = 'field1'

    # Filled in step by step below; the helpers always read its current
    # content.
    input_items = list()

    def split(max_items):
        # Split the current input_items by attribute.
        return get_splitter(input_items, max_items, split_attribute).split()

    def check_item_count(count):
        # No item may be lost or duplicated by the split.
        assert count == len(input_items), (
            "We got %s items after splitting, instead of the "
            "%s we had before" % (count, len(input_items)))

    def check_same_items(filename):
        # The file must hold the input items, in their original order.
        count = 0
        # The context manager closes the file even if an assertion fails.
        with open(filename, 'rb') as f:
            for item in get_input_item_flow(f):
                # What was read goes first and what was expected second,
                # as the message template requires.
                assert item == input_items[count], (
                    "Got %s instead of %s" % (item, input_items[count]))
                count += 1
        check_item_count(count)

    def check_values_not_repeated(filenames):
        # A split_attribute value found in one file must not show up
        # again in a later file.
        count = 0
        already_seen = set()
        for filename in filenames:
            ref_values = set()
            with open(filename, 'rb') as f:
                for item in get_input_item_flow(f):
                    ref_value = getattr(item, split_attribute)
                    ref_values.add(ref_value)
                    assert ref_value not in already_seen, (
                        "The value %s should have been in a previous file" %
                        (ref_value))
                    count += 1
            already_seen.update(ref_values)
        check_item_count(count)

    # empty input: with no max_items (0), then with max_items (1)
    for max_items in (0, 1):
        result = split(max_items)
        assert len(result) == 0, (
            "The splitter returned %s files instead of none" % len(result))

    # only one item: with no max_items (0), with max_items = 1,
    # then with max_items > 1
    input_items.append(TestItem('ref0', 'value0'))
    for max_items in (0, 1, 10):
        result = split(max_items)
        assert len(result) == 1, (
            "The splitter returned %s files instead of 1" % len(result))
        check_same_items(result[0])

    # list of items (two 'ref0' and three 'ref1')
    input_items.append(TestItem('ref0', 'value1'))
    input_items.extend([TestItem('ref1', 'value%s' % i) for i in range(3)])

    # (max_items, expected number of files) pairs
    scenarios = (
        (0, 1),   # no max_items
        (3, 2),   # max_items < nb items
        (5, 1),   # max_items = nb items
        (10, 1),  # max_items > nb items
    )
    for max_items, expected_files in scenarios:
        result = split(max_items)
        assert len(result) == expected_files, (
            "The splitter returned %s files instead of %s" %
            (len(result), expected_files))
        check_values_not_repeated(result)