def map(self, value):
    """Generator that yields one mapped hash per XML item found in ``value``.

    ``value`` is parsed with ``etree.fromstring``.  When ``self.xpath`` is
    set, the elements returned by ``findall(self.xpath)`` are iterated;
    otherwise the parsed root element itself is iterated.

    Every item is passed to ``self.map_item``.  The result is merged over a
    base hash that stores the item under ``self.field``:

    - a falsy result produces no output for that item;
    - a generator produces one output hash per truthy value it yields;
    - anything else is merged with ``update()`` and yields a single hash.

    Raises ``TypeError`` when a non-generator result cannot be merged.
    """
    items = etree.fromstring(value)
    if self.xpath:
        items = items.findall(self.xpath)

    for item in items:
        mapped = Hash(((self.field, item), ))
        value_for_item = self.map_item(item)

        if not value_for_item:
            continue

        if isgenerator(value_for_item):
            for _value in value_for_item:
                if _value:
                    # Update first and yield the hash itself, so the output
                    # never depends on what update() returns.
                    item_hash = copy(mapped)
                    item_hash.update(_value)
                    yield item_hash
        else:
            # Only the merge can signal a non-iterable result; keep the
            # yield outside the try so consumer errors are not rewritten.
            try:
                mapped.update(value_for_item)
            except TypeError:
                raise TypeError(
                    '{name}.map_item(...) must be iterable.'.format(
                        name=type(self).__name__))
            yield mapped
def test_restrict_with_renamer(self):
    """restrict() keeps the keys accepted by ``tester`` and renames them."""
    h = Hash({'foo': 'bar', 'bar': 'baz'})
    h.restrict(tester=lambda k: k in ('bar', ), renamer=lambda k: k.upper())

    # Dedicated membership assertions show the container when they fail.
    self.assertNotIn('foo', h)
    self.assertNotIn('bar', h)
    self.assertIn('BAR', h)
    self.assertEqual(h['BAR'], 'baz')
def map(self, value):
    """Generator that yields mapped items.

    Parses ``value`` with ``etree.fromstring`` and, when ``self.xpath`` is
    set, narrows the iteration to ``findall(self.xpath)``.  For each item a
    base hash ``{self.field: item}`` is built and combined with whatever
    ``self.map_item(item)`` returns:

    - falsy result: the item is skipped;
    - generator: one hash is yielded per truthy generated value;
    - other value: merged through ``update()`` and yielded once.

    Raises ``TypeError`` if a non-generator result cannot be merged.
    """
    items = etree.fromstring(value)
    if self.xpath:
        items = items.findall(self.xpath)

    for item in items:
        mapped = Hash(((self.field, item), ))
        value_for_item = self.map_item(item)

        if not value_for_item:
            continue

        if isgenerator(value_for_item):
            for _value in value_for_item:
                if not _value:
                    continue
                # Do not yield the return value of update(): build the
                # per-value hash, merge into it, then yield the hash.
                merged = copy(mapped)
                merged.update(_value)
                yield merged
        else:
            # Keep the try body minimal: only update() is expected to
            # raise the TypeError that gets reworded below.
            try:
                mapped.update(value_for_item)
            except TypeError:
                raise TypeError(
                    '{name}.map_item(...) must be iterable.'.format(
                        name=type(self).__name__))
            yield mapped
def test_copy(self):
    """copy() gives independent hashes, optionally with overridden values."""
    original = Hash({'foo': 'bar', 'bar': 'baz'})
    plain_copy = original.copy()
    overridden_copy = original.copy({'bar': 'oh my bar'})

    # Mutate after copying: neither side may see the other's change.
    original['foo'] = 'original foo'
    plain_copy['foo'] = 'new foo'

    self.assertEqual(original['foo'], 'original foo')
    self.assertEqual(plain_copy['foo'], 'new foo')
    self.assertEqual(overridden_copy['foo'], 'bar')
    self.assertEqual(overridden_copy['bar'], 'oh my bar')
def test_base_class_generator(self):
    """Extract accepts a generator function as its ``extract`` source."""
    def generator():
        # Emit the first two reference rows, one at a time.
        for index in (0, 1):
            yield INPUT_DATA[index]

    extractor = Extract(extract=generator)
    self.assertStreamEqual(extractor(Hash()), INPUT_DATA)
def test_base_class_decorator(self):
    """Extract works as a decorator on a function returning an iterable."""
    @Extract
    def my_iterable():
        return INPUT_DATA

    # The decorated object must keep the wrapped function's name.
    self.assertEqual(my_iterable.__name__, 'my_iterable')

    stream = my_iterable(Hash())
    self.assertStreamEqual(stream, INPUT_DATA)
def test_remove(self):
    """remove() on the transform drops the listed fields from its output."""
    transform = self._create_transform()
    transform.remove('foo', 'boo')

    source = Hash({'foo': 'bar', 'bar': 'baz', 'boo': 'hiya'})
    result = transform.transform(source)

    self.assertIn('bar', result)
    self.assertNotIn('foo', result)
    self.assertNotIn('boo', result)
def test_base_class_decorator_generator(self):
    """Extract works as a decorator on a generator function."""
    @Extract
    def my_generator():
        for index in (0, 1):
            yield INPUT_DATA[index]

    # The decorated object must keep the wrapped function's name.
    self.assertEqual(my_generator.__name__, 'my_generator')

    stream = my_generator(Hash())
    self.assertStreamEqual(stream, INPUT_DATA)
def __call__(self, *stream, **options):
    """Run ``self.transform`` over every positional argument and yield its output.

    Each item of ``stream`` that is not already a ``Hash`` is wrapped in one.
    The ``channel`` keyword option, defaulting to ``STDIN``, is passed to
    ``transform`` as the second argument.
    """
    channel = options.get('channel', STDIN)
    for row in stream:
        row = row if isinstance(row, Hash) else Hash(row)
        for line in self.transform(row, channel):
            yield line
def validate(self):
    """Validation of transform graph validity.

    For every registered transform, each unplugged input queue receives the
    sequence ``Begin``, an empty ``Hash``, ``End``; then ``Begin`` is sent to
    all of the transform's outputs.
    """
    for _id, transform in self._transforms.items():
        # Adds a special single empty hash queue to unplugged inputs
        for queue in transform._input.unplugged:
            for token in (Begin, Hash(), End):
                queue.put(token)

        transform._output.put_all(Begin)
def test_method_setter(self):
    """set() adds or overwrites a value and returns the hash itself."""
    h = Hash({'foo': 'bar'})

    # Adding a new key leaves existing keys untouched.
    h.set('bar', 'heya')
    self.assertEqual(h['bar'], 'heya')
    self.assertEqual(h['foo'], 'bar')

    # Setting an existing key overwrites its value.
    h.set('foo', 'yoho')
    self.assertEqual(h['foo'], 'yoho')

    # The setter is fluent.
    returned = h.set('any', 'thing')
    self.assertEqual(returned, h)
def commit(self):
    """Drain ``self._buffer``, yielding the transformed result of each entry.

    The whole drain runs inside the context manager returned by
    ``self.connection.begin()``.  Entries are removed from the front of the
    buffer one at a time and a copy is passed to ``self.do_transform``.

    If the transform raises, a ``(Hash, STDERR)`` pair is yielded instead;
    the hash carries the original entry (``_input``), this transform
    (``_transform``) and the exception (``_error``).
    """
    with self.connection.begin():
        while self._buffer:
            row = self._buffer.pop(0)
            try:
                yield self.do_transform(copy(row))
            except Exception as e:
                error_row = Hash((
                    ('_input', row),
                    ('_transform', self),
                    ('_error', e),
                ))
                yield error_row, STDERR
def test_base_class_decorator(self):
    """Map used as a decorator turns one input value into several hashes."""
    @Map
    def my_map(s):
        # One output hash per line; colon-separated fields become f0, f1, ...
        for line in s.split('\n'):
            yield Hash(
                ('f%d' % (index, ), field)
                for index, field in enumerate(line.split(':')))

    stream = my_map((Hash({'_': 'a:b:c\nb:c:d\nc:d:e'})))
    expected = (
        OrderedDict((('f0', 'a'), ('f1', 'b'), ('f2', 'c'), )),
        OrderedDict((('f0', 'b'), ('f1', 'c'), ('f2', 'd'), )),
        OrderedDict((('f0', 'c'), ('f1', 'd'), ('f2', 'e'), )),
    )
    self.assertStreamEqual(stream, expected)
def test_get_values(self):
    """get_values() returns the values in the order the keys are requested."""
    h = Hash({'foo': 'bar', 'bar': 'baz', 'baz': 'boo'})

    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists in recent unittest versions.
    self.assertEqual(
        h.get_values(('baz', 'foo', 'bar', )),
        ['boo', 'bar', 'baz'])
def test_base_class_decorator(self):
    """XmlMap used as a decorator maps every matched XML item to one row."""
    @XmlMap
    def xml_map(item):
        # Multiple <data> values are flattened into a ';'-separated string.
        joined_values = ';'.join(
            data.text for data in item.findall('values/data'))
        return OrderedDict((
            ('id', item.attrib['id']),
            ('name', item.findtext('name')),
            ('value', joined_values),
        ))

    xml_map.xpath = './path/to/items/item'

    document = '''<root> <path> <to> <items> <item id="one"> <name>foo</name> <values> <data>bar</data> </values> </item> <item id="two"> <name>bar</name> <values> <data>baz</data> </values> </item> <item id="three"> <name>baz</name> <values> <data>toto</data> <data>titi</data> </values> </item> </items> </to> </path> </root>'''

    stream = xml_map((Hash({'_': document})))
    expected = (
        OrderedDict((('id', 'one'), ('name', 'foo'), ('value', 'bar'), )),
        OrderedDict((('id', 'two'), ('name', 'bar'), ('value', 'baz'), )),
        OrderedDict((('id', 'three'), ('name', 'baz'), ('value', 'toto;titi'), )),
    )
    self.assertStreamEqual(stream, expected)
class Join(Transform):
    """
    Join some key => value pairs, that can depend on the source hash.

    This element can change the stream length, either positively (joining >1
    item data) or negatively (joining <1 item data).

    .. automethod:: join

    Example::

        >>> from rdc.etl.transform.join import Join
        >>> from rdc.etl.transform.util import clean

        >>> @Join
        ... def my_join(hash, channel=STDIN):
        ...     return ({'a':1}, {'b':2}, )

        >>> map(clean, my_join({'foo': 'bar'}, {'foo': 'baz'}, ))
        [H{'foo': 'bar', 'a': 1}, H{'foo': 'bar', 'b': 2}, H{'foo': 'baz', 'a': 1}, H{'foo': 'baz', 'b': 2}]

    """

    #: Whether this is an outer join.  Defaults to False (inner join).
    is_outer = False

    #: Join data used when an outer join is requested but the join data is
    #: empty.  Not used in the default inner join case, because no row is
    #: returned when the current row did not generate join data.
    default_outer_join_data = Hash()

    def __init__(self, join=None, is_outer=False, default_outer_join_data=None):
        super(Join, self).__init__()

        # Falsy arguments fall back to the values defined on the class.
        self.is_outer = is_outer or self.is_outer
        self.default_outer_join_data = (
            default_outer_join_data or self.default_outer_join_data)
        self.join = join or self.join

    def join(self, hash, channel=STDIN):
        """
        Abstract method that must be implemented in concrete subclasses (or
        passed to the constructor) to return the data that should be joined
        with the given row.

        The result should be iterable, or equivalent to False in a test.

        If it is iterable and yields at least one item, the output of this
        transform is one copy of the input row per item, each extended with
        that item.

        If it is false, or iterable but empty, the output depends on the
        join type given by the is_outer attribute:

        - If is_outer == True, the output is a single copy of the input row
          extended with self.default_outer_join_data.
        - If is_outer == False, the row is dropped and generates no output
          from this transform.

        Default join type is inner, to preserve backward compatibility.

        """
        raise AbstractError(self.join)

    def transform(self, hash, channel=STDIN):
        """Yield the input row combined with each item of its join data."""
        join_data = self.join(hash, channel)

        produced = False
        for data in join_data or ():
            yield hash.copy(data)
            produced = True

        # Outer join: a row with no join data still produces one output.
        if not produced and self.is_outer:
            yield hash.copy(self.default_outer_join_data)
def test_base_class(self):
    """Extract accepts a plain iterable as its ``extract`` source."""
    extractor = Extract(extract=INPUT_DATA)
    stream = extractor(Hash())
    self.assertStreamEqual(stream, INPUT_DATA)
def test_restrict(self):
    """restrict() keeps only the keys accepted by ``tester``."""
    h = Hash({'foo': 'bar', 'bar': 'baz'})
    h.restrict(tester=lambda k: k in ('bar', ))

    # Dedicated membership assertions show the container when they fail.
    self.assertNotIn('foo', h)
    self.assertIn('bar', h)
def extract_supported_domains():
    """Fetch ``url`` and yield one ``Hash`` with a ``tld`` field per line.

    The response text is split on newline characters; every resulting
    piece, including empty ones, produces a row.
    """
    response = requests.get(url)
    lines = response.text.split('\n')
    for tld in lines:
        yield Hash((('tld', tld), ))
def test_method_in(self):
    """has() reports key presence; a None value only counts with the flag."""
    h = Hash({'foo': 'bar', 'bar': None})

    self.assertEqual(h.has('foo'), True)
    # 'bar' exists but holds None: only the two-argument form reports it.
    self.assertEqual(h.has('bar'), False)
    self.assertEqual(h.has('bar', True), True)
    self.assertEqual(h.has('baz'), False)
def test_method_getter(self):
    """get() returns stored values and uses the default for missing keys."""
    h = Hash({'foo': 'bar', 'bar': None})

    # Existing keys win over the default, even when the value is None.
    self.assertEqual(h.get('foo', 'baz'), 'bar')
    self.assertEqual(h.get('bar', 'baz'), None)

    # Missing keys give the default, or None when no default is passed.
    self.assertEqual(h.get('boo', 'foo'), 'foo')
    self.assertEqual(h.get('boo'), None)
def test_constructor_zippedtuples(self):
    """Hash can be built from a tuple of (key, value) pairs."""
    pairs = (('foo', 'bar', ), ('bar', 'baz', ), )
    h = Hash(pairs)

    self.assertEqual(h.get('foo'), 'bar')
    self.assertEqual(h.get('bar'), 'baz')
def test_constructor_hash(self):
    """Hash can be built from a plain dict."""
    source = {'foo': 'bar', 'bar': 'baz'}
    h = Hash(source)

    self.assertEqual(h.get('foo'), 'bar')
    self.assertEqual(h.get('bar'), 'baz')
def H(*args):
    """Build a ``Hash`` from the positional arguments.

    The arguments are passed to ``Hash`` as one tuple, so each is expected
    to be a ``(key, value)`` pair.
    """
    # Imported at call time, as in the rest of this helper's history.
    from rdc.etl.hash import Hash

    pairs = args
    return Hash(pairs)
def test_remove(self):
    """remove() deletes the listed keys and leaves the others in place."""
    h = Hash({'foo': 'bar', 'bar': 'baz', 'baz': 'boo'})
    h.remove('foo', 'baz')

    # Dedicated membership assertions show the container when they fail.
    self.assertNotIn('foo', h)
    self.assertIn('bar', h)
    self.assertNotIn('baz', h)
def my_map(s):
    """Yield one ``Hash`` per line of ``s``.

    Each line is split on ':' and its fields are keyed ``f0``, ``f1``, ...
    in order.
    """
    for line in s.split('\n'):
        fields = line.split(':')
        yield Hash(
            ('f%d' % (index, ), field) for index, field in enumerate(fields))