def meta_source(self, input_path):
    """Create a source tap over a data folder, reading its scheme from meta data.

    The folder given by input_path (expected to be an HDFS folder) is used
    as the input. The scheme to read it with is not supplied by the caller:
    it is obtained from MetaScheme.getSourceScheme for that folder, i.e. from
    the meta data kept alongside the data.

    Arguments:
    input_path -- the (HDFS) folder to read the data from
    """
    resolved_path = expand_path_with_home(input_path)
    # The scheme is looked up from the folder itself, using the expanded path.
    stored_scheme = MetaScheme.getSourceScheme(resolved_path)
    return self.source(
        cascading.tap.Hfs(stored_scheme, resolved_path))
def meta_sink(self, cascading_scheme, output_path):
    """Store data together with meta information about the scheme used.

    Defines a sink that, besides the data, also stores in a file information
    about the scheme used to write it, plus human-readable descriptions of
    the field names and their types in the .pycascading_header and
    .pycascading_types files, respectively.

    Arguments:
    cascading_scheme -- the Cascading Scheme used to store data
    output_path -- the folder where the output tuples should be stored.
        If it exists, it will be erased and replaced!

    Returns whatever self.sink returns for the constructed Hfs tap.
    """
    output_path = expand_path_with_home(output_path)
    # Wrap the caller's scheme so the meta information is written as well.
    sink_scheme = MetaScheme.getSinkScheme(cascading_scheme, output_path)
    # SinkMode.REPLACE is what makes an existing output folder get replaced.
    return self.sink(
        cascading.tap.Hfs(sink_scheme, output_path,
                          cascading.tap.SinkMode.REPLACE))