Ejemplo n.º 1
0
    def map(self, value):
        """Parse ``value`` as XML and yield one mapped ``Hash`` per item.

        ``value`` is parsed with ``etree.fromstring``. When ``self.xpath`` is
        set, the items are the elements returned by ``findall(self.xpath)`` on
        the parsed root; otherwise the parsed root itself is iterated.

        Each item is passed to ``self.map_item``. A falsy result skips the
        item. A generator result yields one hash per truthy generated value;
        any other result is merged into a single hash. Every yielded hash
        starts with ``self.field`` mapped to the source item.

        Raises:
            TypeError: if a non-generator ``map_item`` result cannot be
                merged into the hash.
        """
        items = etree.fromstring(value)

        if self.xpath:
            items = items.findall(self.xpath)

        for item in items:
            mapped = Hash(((self.field, item), ))
            value_for_item = self.map_item(item)
            if not value_for_item:
                continue

            if isgenerator(value_for_item):
                for _value in value_for_item:
                    if _value:
                        # Yield the updated copy itself, not the return value
                        # of update(), so the result does not depend on
                        # update() returning the hash.
                        item_hash = copy(mapped)
                        item_hash.update(_value)
                        yield item_hash
            else:
                # Only the merge is guarded, so that exceptions raised by the
                # consumer of this generator are not rewritten.
                try:
                    mapped.update(value_for_item)
                except TypeError:
                    raise TypeError(
                        '{name}.map_item(...) must be iterable.'.format(
                            name=type(self).__name__))
                yield mapped
Ejemplo n.º 2
0
 def test_restrict_with_renamer(self):
     # Expect restrict() to keep only the keys accepted by ``tester`` and to
     # store them under the name returned by ``renamer``.
     h = Hash({'foo': 'bar', 'bar': 'baz', })
     h.restrict(tester=lambda k: k in ('bar', ), renamer=lambda k: k.upper())
     # Dedicated membership assertions report the container on failure.
     self.assertNotIn('foo', h)
     self.assertNotIn('bar', h)
     self.assertIn('BAR', h)
     self.assertEqual(h['BAR'], 'baz')
Ejemplo n.º 3
0
    def map(self, value):
        """Parse ``value`` as XML and yield one mapped ``Hash`` per item.

        ``value`` is parsed with ``etree.fromstring``. When ``self.xpath`` is
        set, the items are the elements returned by ``findall(self.xpath)`` on
        the parsed root; otherwise the parsed root itself is iterated.

        Each item is passed to ``self.map_item``. A falsy result skips the
        item. A generator result yields one hash per truthy generated value;
        any other result is merged into a single hash. Every yielded hash
        starts with ``self.field`` mapped to the source item.

        Raises:
            TypeError: if a non-generator ``map_item`` result cannot be
                merged into the hash.
        """
        items = etree.fromstring(value)

        if self.xpath:
            items = items.findall(self.xpath)

        for item in items:
            mapped = Hash(((self.field, item, ), ))
            value_for_item = self.map_item(item)
            if not value_for_item:
                continue

            if isgenerator(value_for_item):
                for _value in value_for_item:
                    if _value:
                        # Yield the updated copy itself, not the return value
                        # of update(), so the result does not depend on
                        # update() returning the hash.
                        item_hash = copy(mapped)
                        item_hash.update(_value)
                        yield item_hash
            else:
                # Only the merge is guarded, so that exceptions raised by the
                # consumer of this generator are not rewritten.
                try:
                    mapped.update(value_for_item)
                except TypeError:
                    raise TypeError('{name}.map_item(...) must be iterable.'.format(
                        name=type(self).__name__
                    ))
                yield mapped
Ejemplo n.º 4
0
 def test_copy(self):
     # copy() is expected to return an independent hash; copy(mapping) is
     # expected to additionally apply the given overrides to the new hash.
     source = Hash({'foo': 'bar', 'bar': 'baz', })
     plain_copy = source.copy()
     patched_copy = source.copy({'bar': 'oh my bar'})

     # Mutate the source and the plain copy only after the copies were taken.
     source['foo'] = 'original foo'
     plain_copy['foo'] = 'new foo'

     self.assertEqual(source['foo'], 'original foo')
     self.assertEqual(plain_copy['foo'], 'new foo')
     self.assertEqual(patched_copy['foo'], 'bar')
     self.assertEqual(patched_copy['bar'], 'oh my bar')
Ejemplo n.º 5
0
    def test_base_class_generator(self):
        # Extract is given a generator function as its ``extract`` source.
        def generator():
            for position in (0, 1):
                yield INPUT_DATA[position]

        extractor = Extract(extract=generator)
        output = extractor(Hash())
        self.assertStreamEqual(output, INPUT_DATA)
Ejemplo n.º 6
0
    def test_base_class_decorator(self):
        # Extract used as a decorator on a plain function returning an
        # iterable; the decorated object must keep the function's name.
        @Extract
        def my_iterable():
            return INPUT_DATA

        self.assertEqual(my_iterable.__name__, 'my_iterable')

        output = my_iterable(Hash())
        self.assertStreamEqual(output, INPUT_DATA)
Ejemplo n.º 7
0
 def test_remove(self):
     # Fields registered through remove() are expected to be absent from the
     # transformed row, while the other fields are kept.
     transform = self._create_transform()
     transform.remove('foo', 'boo')

     source = Hash({'foo': 'bar', 'bar': 'baz', 'boo': 'hiya'})
     result = transform.transform(source)

     self.assertIn('bar', result)
     self.assertNotIn('foo', result)
     self.assertNotIn('boo', result)
Ejemplo n.º 8
0
    def test_base_class_decorator_generator(self):
        # Extract used as a decorator on a generator function; the decorated
        # object must keep the function's name.
        @Extract
        def my_generator():
            for position in (0, 1):
                yield INPUT_DATA[position]

        self.assertEqual(my_generator.__name__, 'my_generator')

        output = my_generator(Hash())
        self.assertStreamEqual(output, INPUT_DATA)
Ejemplo n.º 9
0
    def __call__(self, *stream, **options):
        """Transform every item of ``stream`` and yield the resulting lines.

        Positional arguments are the input rows; any row that is not already
        a ``Hash`` is wrapped in one. The optional ``channel`` keyword option
        is forwarded to ``self.transform`` and defaults to ``STDIN``.
        """
        channel = options.get('channel', STDIN)

        for row in stream:
            row = row if isinstance(row, Hash) else Hash(row)
            for line in self.transform(row, channel):
                yield line
Ejemplo n.º 10
0
    def validate(self):
        """Prime the queues of every registered transform.

        Each unplugged input queue receives ``Begin``, one empty ``Hash`` and
        ``End``, so that an input nobody feeds still sees a single empty row.
        ``Begin`` is then passed to the transform's ``_output.put_all``.
        """
        for _key, node in self._transforms.items():
            unplugged_queues = node._input.unplugged
            for pending in unplugged_queues:
                pending.put(Begin)
                pending.put(Hash())
                pending.put(End)

            node._output.put_all(Begin)
Ejemplo n.º 11
0
 def test_method_setter(self):
     # set() is expected to add new keys, overwrite existing ones and return
     # the hash it was called on.
     h = Hash({'foo': 'bar'})

     h.set('bar', 'heya')
     self.assertEqual(h['bar'], 'heya')
     self.assertEqual(h['foo'], 'bar')

     h.set('foo', 'yoho')
     self.assertEqual(h['foo'], 'yoho')

     self.assertEqual(h.set('any', 'thing'), h)
Ejemplo n.º 12
0
 def commit(self):
     """Drain the buffer inside one ``self.connection.begin()`` block.

     Buffered rows are removed from the front one at a time; a copy of each
     row is passed to ``self.do_transform`` and the result is yielded. When
     an exception is raised, a ``(Hash, STDERR)`` pair is yielded instead,
     the hash holding the original row, this transform and the exception
     under the ``_input``, ``_transform`` and ``_error`` keys.
     """
     with self.connection.begin():
         while self._buffer:
             row = self._buffer.pop(0)
             try:
                 yield self.do_transform(copy(row))
             except Exception as error:
                 failure = Hash((
                     ('_input', row),
                     ('_transform', self),
                     ('_error', error),
                 ))
                 yield failure, STDERR
Ejemplo n.º 13
0
    def test_base_class_decorator(self):
        # Map used as a decorator: every line of the input string becomes one
        # hash whose colon-separated fields are stored as f0, f1, f2, ...
        @Map
        def my_map(s):
            for line in s.split('\n'):
                cells = line.split(':')
                yield Hash(
                    ('f%d' % (index, ), cell)
                    for index, cell in enumerate(cells))

        source = Hash({'_': 'a:b:c\nb:c:d\nc:d:e'})
        expected = (
            OrderedDict((('f0', 'a'), ('f1', 'b'), ('f2', 'c'), )),
            OrderedDict((('f0', 'b'), ('f1', 'c'), ('f2', 'd'), )),
            OrderedDict((('f0', 'c'), ('f1', 'd'), ('f2', 'e'), )),
        )
        self.assertStreamEqual(my_map(source), expected)
Ejemplo n.º 14
0
 def test_get_values(self):
     # get_values() is expected to return the values in the order of the
     # requested keys.
     h = Hash({'foo': 'bar', 'bar': 'baz', 'baz': 'boo', })
     self.assertEqual(
         h.get_values(('baz', 'foo', 'bar', )), ['boo', 'bar', 'baz', ])
Ejemplo n.º 15
0
    def test_base_class_decorator(self):
        # XmlMap used as a decorator: the wrapped function receives one XML
        # element and returns the ordered fields expected for it.
        @XmlMap
        def xml_map(item):
            return OrderedDict((
                ('id', item.attrib['id']),
                ('name', item.findtext('name')),
                ('value', ';'.join(
                    (data.text for data in item.findall('values/data')))),
            ))

        xml_map.xpath = './path/to/items/item'

        # The literal is kept verbatim, including its inner whitespace.
        document = '''<root>
                <path>
                    <to>
                        <items>
                            <item id="one">
                                <name>foo</name>
                                <values>
                                    <data>bar</data>
                                </values>
                            </item>
                            <item id="two">
                                <name>bar</name>
                                <values>
                                    <data>baz</data>
                                </values>
                            </item>
                            <item id="three">
                                <name>baz</name>
                                <values>
                                    <data>toto</data>
                                    <data>titi</data>
                                </values>
                            </item>
                        </items>
                    </to>
                </path>
            </root>'''

        expected = (
            OrderedDict((('id', 'one'), ('name', 'foo'), ('value', 'bar'), )),
            OrderedDict((('id', 'two'), ('name', 'bar'), ('value', 'baz'), )),
            OrderedDict((
                ('id', 'three'),
                ('name', 'baz'),
                ('value', 'toto;titi'),
            )),
        )
        self.assertStreamEqual(xml_map(Hash({'_': document})), expected)
Ejemplo n.º 16
0
class Join(Transform):
    """
    Join key => value pairs, possibly computed from the source hash, to each row.

    This element can change the stream length: a row grows into several rows when more than one item of join data
    is returned, and disappears when none is returned (inner join).

    .. automethod:: join

    Example::

        >>> from rdc.etl.transform.join import Join
        >>> from rdc.etl.transform.util import clean

        >>> @Join
        ... def my_join(hash, channel=STDIN):
        ...     return ({'a':1}, {'b':2}, )

        >>> map(clean, my_join({'foo': 'bar'}, {'foo': 'baz'}, ))
        [H{'foo': 'bar', 'a': 1}, H{'foo': 'bar', 'b': 2}, H{'foo': 'baz', 'a': 1}, H{'foo': 'baz', 'b': 2}]

    """

    #: When true (outer join), a row that generated no join data is still emitted, merged with
    #: ``default_outer_join_data``. When false (inner join, the default), such a row produces no output.
    is_outer = False

    #: Data merged into the row when an outer join is requested but the join data is empty. Not used in the
    #: default inner join case, because no row is returned if the current row did not generate join data.
    default_outer_join_data = Hash()

    def __init__(self,
                 join=None,
                 is_outer=False,
                 default_outer_join_data=None):
        """
        Configure the join; every falsy argument falls back to the attribute of the same name already defined
        on the class.

        """
        super(Join, self).__init__()
        self.is_outer = is_outer if is_outer else self.is_outer
        self.default_outer_join_data = (
            default_outer_join_data
            if default_outer_join_data else self.default_outer_join_data)
        self.join = join if join else self.join

    def join(self, hash, channel=STDIN):
        """
        Abstract method that must be implemented in concrete subclasses (or supplied through the ``join``
        constructor argument), to return the data that should be joined with the given row.

        It should be iterable, or equivalent to False in a test.

        If the result is iterable and its length is superior to 0, the result of this transform will be a cartesian
        product between this method result and the original input row.

        If the result is false or iterable but 0-length, the result of this transform will depend on the join type,
        determined by the is_outer attribute.

        - If is_outer == True, the transform output will be the input row merged with
          ``self.default_outer_join_data``.
        - If is_outer == False, this row will be dropped, and will not generate any output from this transform.

        Default join type is inner, to preserve backward compatibility.

        """
        raise AbstractError(self.join)

    def transform(self, hash, channel=STDIN):
        """
        Yield one copy of ``hash`` per item of join data, each updated with that item; for an outer join that
        yielded nothing, yield a copy updated with ``default_outer_join_data`` instead.

        """
        join_data = self.join(hash, channel)

        matched = False
        # A falsy join result is treated like an empty one.
        for data in join_data or ():
            yield hash.copy(data)
            matched = True

        if self.is_outer and not matched:
            yield hash.copy(self.default_outer_join_data)
Ejemplo n.º 17
0
 def test_base_class(self):
     # Extract is given a plain iterable as its ``extract`` source.
     extractor = Extract(extract=INPUT_DATA)
     output = extractor(Hash())
     self.assertStreamEqual(output, INPUT_DATA)
Ejemplo n.º 18
0
 def test_restrict(self):
     # Expect restrict() to keep only the keys accepted by ``tester``.
     h = Hash({'foo': 'bar', 'bar': 'baz', })
     h.restrict(tester=lambda k: k in ('bar', ))
     # Dedicated membership assertions report the container on failure.
     self.assertNotIn('foo', h)
     self.assertIn('bar', h)
Ejemplo n.º 19
0
def extract_supported_domains():
    """Yield one ``Hash`` with a ``tld`` key per line of the fetched text.

    The text is the body returned by ``requests.get(url)``, where ``url`` is
    a name defined outside this function, split on newline characters.
    """
    response = requests.get(url)
    lines = response.text.split('\n')
    for tld in lines:
        yield Hash((('tld', tld), ))
Ejemplo n.º 20
0
 def test_method_in(self):
     # has() is expected to report a key holding None as missing, unless its
     # second argument is true.
     h = Hash({'foo': 'bar', 'bar': None})

     self.assertEqual(h.has('foo'), True)
     self.assertEqual(h.has('bar'), False)
     self.assertEqual(h.has('bar', True), True)
     self.assertEqual(h.has('baz'), False)
Ejemplo n.º 21
0
 def test_method_getter(self):
     # get() is expected to use the default only for missing keys: a stored
     # None is returned as is.
     h = Hash({'foo': 'bar', 'bar': None})

     self.assertEqual(h.get('foo', 'baz'), 'bar')
     self.assertEqual(h.get('bar', 'baz'), None)
     self.assertEqual(h.get('boo', 'foo'), 'foo')
     self.assertEqual(h.get('boo'), None)
Ejemplo n.º 22
0
 def test_constructor_zippedtuples(self):
     # A Hash is built from a tuple of (key, value) pairs.
     pairs = (('foo', 'bar', ), ('bar', 'baz', ), )
     h = Hash(pairs)

     self.assertEqual(h.get('foo'), 'bar')
     self.assertEqual(h.get('bar'), 'baz')
Ejemplo n.º 23
0
 def test_constructor_hash(self):
     # A Hash is built from a plain dict.
     mapping = {'foo': 'bar', 'bar': 'baz'}
     h = Hash(mapping)

     self.assertEqual(h.get('foo'), 'bar')
     self.assertEqual(h.get('bar'), 'baz')
Ejemplo n.º 24
0
def H(*args):
    """Return a ``Hash`` built from the positional arguments.

    Each argument is expected to be a ``(key, value)`` pair; the tuple of all
    arguments is handed to the ``Hash`` constructor unchanged.
    """
    # Imported when the helper is called, not when this module is loaded.
    from rdc.etl.hash import Hash as hash_type
    pairs = args
    return hash_type(pairs)
Ejemplo n.º 25
0
 def test_remove(self):
     # Expect remove() to drop every key it is given and keep the others.
     h = Hash({'foo': 'bar', 'bar': 'baz', 'baz': 'boo', })
     h.remove('foo', 'baz')
     # Dedicated membership assertions report the container on failure.
     self.assertNotIn('foo', h)
     self.assertIn('bar', h)
     self.assertNotIn('baz', h)
Ejemplo n.º 26
0
 def my_map(s):
     # Yield one hash per line of ``s``; the colon-separated fields of the
     # line are stored under the keys f0, f1, f2, ...
     for line in s.split('\n'):
         cells = line.split(':')
         yield Hash(
             ('f%d' % (index, ), cell)
             for index, cell in enumerate(cells))