def test_base_class_decorator(self):
    """Check ``Extract`` used as a decorator on a function returning an iterable.

    The decorated object must expose the wrapped function's ``__name__`` and,
    when called with an empty ``Hash``, produce a stream equal to
    ``INPUT_DATA``.
    """
    @Extract
    def my_iterable():
        return INPUT_DATA

    decorated_name = my_iterable.__name__
    self.assertEqual(decorated_name, 'my_iterable')

    stream = my_iterable(Hash())
    self.assertStreamEqual(stream, INPUT_DATA)
def map(self, value):
    """Generator that yields mapped items.

    ``value`` is parsed as XML with ``etree.fromstring``. When ``self.xpath``
    is set, the items are the elements matched by ``findall(self.xpath)`` on
    the parsed root; otherwise the root element itself is iterated.

    For each item, a hash holding the item under ``self.field`` is built and
    ``self.map_item(item)`` is called:

    - a false result skips the item;
    - a generator result yields one hash per true value it produces, each
      being a copy of the base hash updated with that value;
    - any other result is merged into the base hash, which is then yielded.

    :raises TypeError: if a non-generator ``map_item`` result cannot be used
        to update the hash.
    """
    items = etree.fromstring(value)
    if self.xpath:
        items = items.findall(self.xpath)

    for item in items:
        mapped = Hash(((self.field, item, ), ))
        value_for_item = self.map_item(item)

        if not value_for_item:
            continue

        if isgenerator(value_for_item):
            for _value in value_for_item:
                if _value:
                    # Update then yield in two steps, so the result does not
                    # depend on update() returning the hash (a plain mapping
                    # update() returns None).
                    item_hash = copy(mapped)
                    item_hash.update(_value)
                    yield item_hash
        else:
            # Only the update is guarded: the yield stays outside so that an
            # unrelated TypeError is never reported as a map_item problem.
            try:
                mapped.update(value_for_item)
            except TypeError:
                raise TypeError(
                    '{name}.map_item(...) must be iterable.'.format(
                        name=type(self).__name__))
            yield mapped
def test_base_class_generator(self):
    """Check ``Extract`` built from a generator function passed as ``extract``.

    The generator yields the first two entries of ``INPUT_DATA``; the stream
    produced for an empty ``Hash`` is compared against ``INPUT_DATA``.
    """
    def generator():
        for index in (0, 1):
            yield INPUT_DATA[index]

    extractor = Extract(extract=generator)
    stream = extractor(Hash())
    self.assertStreamEqual(stream, INPUT_DATA)
def test_remove(self):
    """Check that ``remove`` drops the named fields and keeps the others."""
    transform = self._create_transform()
    transform.remove('foo', 'boo')

    source = Hash({'foo': 'bar', 'bar': 'baz', 'boo': 'hiya'})
    result = transform.transform(source)

    # The untouched field must survive ...
    self.assertIn('bar', result)
    # ... while both removed fields must be gone.
    for removed in ('foo', 'boo'):
        self.assertNotIn(removed, result)
def test_base_class_decorator_generator(self):
    """Check ``Extract`` used as a decorator on a generator function.

    The decorated object must expose the wrapped function's ``__name__`` and,
    when called with an empty ``Hash``, produce a stream equal to
    ``INPUT_DATA``.
    """
    @Extract
    def my_generator():
        for index in (0, 1):
            yield INPUT_DATA[index]

    decorated_name = my_generator.__name__
    self.assertEqual(decorated_name, 'my_generator')

    stream = my_generator(Hash())
    self.assertStreamEqual(stream, INPUT_DATA)
def __call__(self, *stream, **options):
    """Run ``self.transform`` over each positional argument and yield its output.

    Every element of ``stream`` that is not already a ``Hash`` is wrapped in
    one first. The ``channel`` keyword option selects the channel handed to
    ``transform``; it defaults to ``STDIN``.
    """
    channel = options.get('channel', STDIN)

    for item in stream:
        row = item if isinstance(item, Hash) else Hash(item)
        for line in self.transform(row, channel):
            yield line
def validate(self):
    """Validation of transform graph validity.

    For every registered transform, each unplugged input queue is fed a
    ``Begin`` marker, one empty ``Hash`` and an ``End`` marker, and ``Begin``
    is put on all of the transform's outputs.
    """
    for _name, transform in self._transforms.items():
        # Adds a special single empty hash queue to unplugged inputs
        for queue in transform._input.unplugged:
            for token in (Begin, Hash(), End):
                queue.put(token)

        transform._output.put_all(Begin)
def commit(self):
    """Drain the buffer inside ``self.connection.begin()``, yielding results.

    Buffered hashes are popped from the front one at a time and a copy of each
    is passed to ``self.do_transform``. When that raises, a ``(Hash, STDERR)``
    pair is yielded instead; the hash carries the original input
    (``_input``), this transform (``_transform``) and the exception
    (``_error``).
    """
    with self.connection.begin():
        while self._buffer:
            row = self._buffer.pop(0)
            try:
                # A copy is transformed so the error report below still
                # carries the original input.
                yield self.do_transform(copy(row))
            except Exception as error:
                failure = Hash((
                    ('_input', row, ),
                    ('_transform', self, ),
                    ('_error', error, ),
                ))
                yield failure, STDERR
def test_base_class_decorator(self):
    """Check ``Map`` used as a decorator on a generator function.

    The mapper splits its input on newlines, then each line on ``:``, naming
    the cells ``f0``, ``f1``, ... The stream produced for a three-line input
    is compared against the three expected ordered rows.
    """
    @Map
    def my_map(s):
        for line in s.split('\n'):
            cells = line.split(':')
            yield Hash((('f%d' % (index, ), cell) for index, cell in enumerate(cells)))

    source = Hash({'_': 'a:b:c\nb:c:d\nc:d:e'})

    field_names = ('f0', 'f1', 'f2')
    expected = tuple(
        OrderedDict(zip(field_names, cells))
        for cells in (('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e'))
    )

    self.assertStreamEqual(my_map(source), expected)
def test_base_class(self):
    """Check ``Extract`` built directly from the ``INPUT_DATA`` iterable."""
    extractor = Extract(extract=INPUT_DATA)
    stream = extractor(Hash())
    self.assertStreamEqual(stream, INPUT_DATA)
def my_map(s):
    """Yield one ``Hash`` per newline-separated line of ``s``.

    Each line is split on ``:`` and its cells are stored under the keys
    ``f0``, ``f1``, ... in order.
    """
    for line in s.split('\n'):
        cells = line.split(':')
        pairs = (('f%d' % (index, ), cell) for index, cell in enumerate(cells))
        yield Hash(pairs)
def extract_supported_domains():
    """Fetch ``url`` and yield a ``Hash`` per line of the response text.

    The response body is split on ``'\\n'``; every resulting piece (empty
    ones included) is yielded under the ``tld`` key.
    """
    response = requests.get(url)
    lines = response.text.split('\n')
    for tld in lines:
        yield Hash((('tld', tld), ))
class Join(Transform):
    """
    Join some key => value pairs, that can depend on the source hash.

    This element can change the stream length, either positively (joining >1 item data) or negatively (joining <1
    item data)

    .. automethod:: join

    Example::

        >>> from rdc.etl.transform.join import Join
        >>> from rdc.etl.transform.util import clean

        >>> @Join
        ... def my_join(hash, channel=STDIN):
        ...     return ({'a':1}, {'b':2}, )

        >>> map(clean, my_join({'foo': 'bar'}, {'foo': 'baz'}, ))
        [H{'foo': 'bar', 'a': 1}, H{'foo': 'bar', 'b': 2}, H{'foo': 'baz', 'a': 1}, H{'foo': 'baz', 'b': 2}]

    """

    #: Join type: ``False`` (the default) is an inner join, ``True`` an outer join.
    is_outer = False

    #: Default join data used when an outer join is requested but join data is empty. Not used in the default
    #: inner join case, because no row will be returned if current row did not generate join data.
    default_outer_join_data = Hash()

    def __init__(self, join=None, is_outer=False, default_outer_join_data=None):
        """Configure the join.

        Each argument overrides the matching class-level attribute only when
        it is true; a false value (including the defaults) keeps the value
        already available on the class.
        """
        super(Join, self).__init__()

        self.is_outer = is_outer or self.is_outer
        self.default_outer_join_data = default_outer_join_data or self.default_outer_join_data
        self.join = join or self.join

    def join(self, hash, channel=STDIN):
        """
        Abstract method that must be implemented in concrete subclasses, to return the data that should be joined
        with the given row.

        It should be iterable, or equivalent to False in a test.

        If the result is iterable and its length is superior to 0, the result of this transform will be a cartesian
        product between this method result and the original input row.

        If the result is false or iterable but 0-length, the result of this transform will depend on the join type,
        determined by the is_outer attribute.

        - If is_outer == True, the transform output will be a simple union between the input row and
          self.default_outer_join_data
        - If is_outer == False, this row will be sinked, and will not generate any output from this transform.

        Default join type is inner, to preserve backward compatibility.

        """
        raise AbstractError(self.join)

    def transform(self, hash, channel=STDIN):
        """Yield a copy of ``hash`` updated with each item of the join data.

        When nothing was yielded and ``is_outer`` is true, a single copy of
        ``hash`` updated with ``default_outer_join_data`` is yielded instead.
        """
        join_data = self.join(hash, channel)

        emitted = False
        # A false join result is treated as "no join data at all".
        for data in join_data or ():
            yield hash.copy(data)
            emitted = True

        if self.is_outer and not emitted:
            yield hash.copy(self.default_outer_join_data)
def test_base_class_decorator(self):
    """Check ``XmlMap`` used as a decorator on a per-item mapping function.

    The mapper extracts the ``id`` attribute, the ``name`` text and the
    ``;``-joined ``values/data`` texts of each item selected by the
    ``xpath`` set on the decorated object.
    """
    @XmlMap
    def xml_map(item):
        data_texts = (node.text for node in item.findall('values/data'))
        return OrderedDict((
            ('id', item.attrib['id'], ),
            ('name', item.findtext('name'), ),
            ('value', ';'.join(data_texts), ),
        ))

    xml_map.xpath = './path/to/items/item'

    document = '''<root> <path> <to> <items> <item id="one"> <name>foo</name> <values> <data>bar</data> </values> </item> <item id="two"> <name>bar</name> <values> <data>baz</data> </values> </item> <item id="three"> <name>baz</name> <values> <data>toto</data> <data>titi</data> </values> </item> </items> </to> </path> </root>'''

    expected = (
        OrderedDict((('id', 'one'), ('name', 'foo'), ('value', 'bar'), )),
        OrderedDict((('id', 'two'), ('name', 'bar'), ('value', 'baz'), )),
        OrderedDict((('id', 'three'), ('name', 'baz'), ('value', 'toto;titi'), )),
    )

    stream = xml_map((Hash({'_': document})))
    self.assertStreamEqual(stream, expected)
def H(*args):
    """Builds a hash from a list of pairs.

    The positional arguments are passed, as a single tuple, to ``Hash``.
    """
    # Imported at call time rather than at module level.
    # NOTE(review): presumably to avoid a circular import - confirm.
    from rdc.etl.hash import Hash

    pairs = args
    return Hash(pairs)