def __or__(self, pipe):
    """Attach this cache to ``pipe`` and return the pipe to continue from.

    When no refresh is requested and the cache folder is flagged as
    existing, the incoming pipe is not used; a meta source reading from
    the cache folder is returned in its place. Otherwise the incoming
    pipe is branched: one branch is connected to a binary sink on the
    cache folder, and a second, separately named branch is returned for
    further processing.
    """
    if self.__refresh or not self.__hdfs_folder_exists:
        # The cache has to be (re)written, so the data is split into a
        # storing pipeline and a processing pipeline.
        storing_branch = pipe | Pipe(random_pipe_name("cache"))
        storing_branch | self.__taps.binary_sink(self.__cache_folder)
        return pipe | Pipe(random_pipe_name("no_cache"))

    # The cache replaces every source upstream of it; returning only the
    # meta source drops them, since Cascading complains about unused
    # source taps.
    return self.__taps.meta_source(self.__cache_folder)
def __or__(self, pipe):
    """Connect ``pipe`` to this cache and return the pipe to build on.

    Returns a meta source over the cache folder when the cache can be
    reused (no refresh requested and the folder is flagged as existing).
    Otherwise connects one branch of ``pipe`` to a binary sink on the
    cache folder and returns another branch of ``pipe`` for processing.
    """
    reuse_cache = not self.__refresh and self.__hdfs_folder_exists
    if reuse_cache:
        # All sources replaced by this cache are removed by returning
        # only the meta source; otherwise Cascading complains about
        # unused source taps.
        return self.__taps.meta_source(self.__cache_folder)

    # Split the data into a storing pipeline and a processing pipeline.
    to_cache = pipe | Pipe(random_pipe_name('cache'))
    to_cache | self.__taps.binary_sink(self.__cache_folder)
    to_process = pipe | Pipe(random_pipe_name('no_cache'))
    return to_process
def _create_with_parent(self, parent):
    """Register ``parent`` as a tail that feeds this sink's Cascading tap.

    A freshly named pipe is appended to ``parent``. The tap is stored in
    the taps' sink map under that pipe's assembly name, and the pipe is
    added to the list of tails.

    Always returns None.
    """
    # Every tail gets a different name so that Cascading can assign a
    # tail map to all sinks.
    # TODO: revise this after every pipe part is named separately
    tail = parent | Pipe(name=random_pipe_name("sink"))
    tail_name = tail.get_assembly().getName()

    self.__taps.sink_map[tail_name] = self.__cascading_tap
    self.__taps.tails.append(tail)
    return None
def _create_with_parent(self, parent):
    """Terminate ``parent`` with a uniquely named tail bound to this sink.

    The new tail pipe is recorded twice on the taps object: its assembly
    name is mapped to this sink's Cascading tap in ``sink_map``, and the
    pipe itself is appended to ``tails``.

    Always returns None.
    """
    # Each tail must carry its own name so that Cascading can assign a
    # tail map to all sinks.
    # TODO: revise this after every pipe part is named separately
    named_tail = parent | Pipe(name=random_pipe_name('sink'))

    sink_tap = self.__cascading_tap
    registry = self.__taps
    registry.sink_map[named_tail.get_assembly().getName()] = sink_tap
    registry.tails.append(named_tail)
    return None
def _create_with_parent(self, parent):
    """Build a Cascading ``Each`` on ``parent`` and wrap it in a named pipe.

    The ``Each`` constructor receives, in order: the parent's assembly,
    the argument selector coerced to fields (only if one is set), the
    function, and the output selector coerced to fields (only if one is
    set).

    Returns a ``cascading.pipe.Pipe`` with a random 'each' name that
    wraps the new ``Each``.
    """
    if self.__argument_selector:
        leading = [coerce_to_fields(self.__argument_selector)]
    else:
        leading = []
    operation = self.__function
    if self.__output_selector:
        trailing = [coerce_to_fields(self.__output_selector)]
    else:
        trailing = []
    constructor_args = leading + [operation] + trailing

    each = cascading.pipe.Each(parent.get_assembly(), *constructor_args)
    # Another Pipe is placed after the Each because joins may otherwise
    # not work: the names of pipes apparently have to be different for
    # Cascading.
    return cascading.pipe.Pipe(random_pipe_name('each'), each)
def source(self, cascading_tap):
    """Create the head of a pipeline backed by a Cascading tap.

    Arguments:
    cascading_tap -- the Cascading tap object to connect as the source

    Returns the newly created, randomly named head pipe.
    """
    # The source tap can be connected right away; a Pipe is used to give
    # the head of this pipeline a name.
    head = Pipe(name=random_pipe_name('source'))
    head.hash = hash(cascading_tap)
    head.add_context([head.get_assembly().getName()])

    # The tap is connected under the head's assembly name.
    self._connect_source(head.get_assembly().getName(), cascading_tap)
    return head