def _parse_serializers(self):
    """
    Resolve the 'input', 'output' and 'inter' serializers and store them

    The configuration is fetched with ``self._get_env_conf`` using
    SERIALIZATION_CONF_PICKE_FILE_PATH; an empty or missing configuration
    is treated as an empty dict. For each kind the configured value (or
    the matching DEFAULT_*_SERIALIZED constant when the key is absent) is
    passed to ``get_protocol_from_name`` and the resulting mapping is
    handed to ``self._set_serializers``.
    """
    conf = self._get_env_conf(SERIALIZATION_CONF_PICKE_FILE_PATH) or {}

    # (configuration key, fallback used when the key is absent)
    fallbacks = (
        ('input', DEFAULT_INPUT_SERIALIZED),
        ('output', DEFAULT_OUTPUT_SERIALIZED),
        ('inter', DEFAULT_INTER_SERIALIZED),
    )

    self._set_serializers({
        kind: get_protocol_from_name(conf.get(kind, fallback))
        for kind, fallback in fallbacks
    })
def cat(self, path, serializer='raw', tab_separated=False):
    """
    Returns a generator over the decoded lines of the files defined by path

    The hadoop ``fs -cat`` command is issued through ``self._hdpm`` and
    every line of its stdout is decoded with the requested serializer.

    :param path: path to the files
    :param serializer: input serializer. Options are json, pickle and raw(default)
    :param tab_separated: boolean if input is tab separated. When true, a
        line containing at least one tab is yielded as a tuple with one
        decoded value per field; every other line is yielded as a single
        decoded value
    """
    job = self._hdpm._run_hadoop_cmd('fs', ('-cat', path))
    output = job.yield_stdout()
    decoder = get_protocol_from_name(serializer)

    for line in output:
        # rstrip() without arguments removes all trailing whitespace, so
        # trailing tabs are dropped along with the line terminator.
        line = line.rstrip()

        if tab_separated:
            # Split once and reuse the result for both the length check
            # and the per-field decoding.
            fields = line.split('\t')
            if len(fields) > 1:
                yield tuple(decoder.decode(field) for field in fields)
                continue

        # Either not tab separated, or the line holds a single field.
        yield decoder.decode(line)

    # Reached only after the caller has consumed every line of output.
    job.join()
def cat(self, path, serializer='raw', tab_separated=False):
    """
    Returns a generator over the decoded lines of the files defined by path

    The hadoop ``fs -cat`` command is issued through ``self._hdpm`` and
    every line of its stdout is decoded with the requested serializer.

    :param path: path to the files
    :param serializer: input serializer. Options are json, pickle and raw(default)
    :param tab_separated: boolean if input is tab separated. When true, a
        line containing at least one tab is yielded as a tuple with one
        decoded value per field; every other line is yielded as a single
        decoded value
    """
    job = self._hdpm._run_hadoop_cmd('fs', ('-cat', path))
    output = job.yield_stdout()
    decoder = get_protocol_from_name(serializer)

    for line in output:
        # rstrip() without arguments removes all trailing whitespace, so
        # trailing tabs are dropped along with the line terminator.
        line = line.rstrip()

        if tab_separated:
            # Split once and reuse the result for both the length check
            # and the per-field decoding.
            fields = line.split('\t')
            if len(fields) > 1:
                yield tuple(decoder.decode(field) for field in fields)
                continue

        # Either not tab separated, or the line holds a single field.
        yield decoder.decode(line)

    # Reached only after the caller has consumed every line of output.
    job.join()
def _parse_serializers(self):
    """
    Resolve the 'input', 'output' and 'inter' serializers and store them

    The configuration is fetched with ``self._get_env_conf`` using
    SERIALIZATION_CONF_PICKE_FILE_PATH; an empty or missing configuration
    is treated as an empty dict. For each kind the configured value (or
    the matching DEFAULT_*_SERIALIZED constant when the key is absent) is
    passed to ``get_protocol_from_name`` and the resulting mapping is
    handed to ``self._set_serializers``.
    """
    env_conf = self._get_env_conf(SERIALIZATION_CONF_PICKE_FILE_PATH)
    if not env_conf:
        env_conf = {}

    resolved = {}
    # Kinds are processed in the fixed order input, output, inter.
    for kind, fallback in (
            ('input', DEFAULT_INPUT_SERIALIZED),
            ('output', DEFAULT_OUTPUT_SERIALIZED),
            ('inter', DEFAULT_INTER_SERIALIZED)):
        protocol_name = env_conf.get(kind, fallback)
        resolved[kind] = get_protocol_from_name(protocol_name)

    self._set_serializers(resolved)