コード例 #1
0
ファイル: streamer.py プロジェクト: Zemanta/hadoop-manager
    def _parse_serializers(self):
        """
        Resolve the input, output and inter serializers and register them.

        Serializer names are looked up in the configuration returned by
        ``_get_env_conf``; a stage without an entry uses its default name.
        The resolved protocols are handed to ``_set_serializers``.
        """
        # A falsy configuration is treated the same as an empty one.
        conf = self._get_env_conf(SERIALIZATION_CONF_PICKE_FILE_PATH) or {}

        # (stage key, fallback serializer name), resolved in this order.
        stage_defaults = (
            ('input', DEFAULT_INPUT_SERIALIZED),
            ('output', DEFAULT_OUTPUT_SERIALIZED),
            ('inter', DEFAULT_INTER_SERIALIZED),
        )

        resolved = {
            stage: get_protocol_from_name(conf.get(stage, fallback))
            for stage, fallback in stage_defaults
        }

        self._set_serializers(resolved)
コード例 #2
0
ファイル: hdpfs.py プロジェクト: Zemanta/hadoop-manager
    def cat(self, path, serializer='raw', tab_separated=False):
        """
        Returns a generator over files defined by path

        :param path: path to the files
        :param serializer: input serializer. Options are json, pickle and raw(default)
        :param tab_separated: boolean if input is tab separated. When set, a
            line holding more than one tab-separated field is yielded as a
            tuple of decoded fields; any other line is yielded as a single
            decoded value.
        """
        job = self._hdpm._run_hadoop_cmd('fs', ('-cat', path))
        output = job.yield_stdout()
        output_serializer = get_protocol_from_name(serializer)

        for line in output:
            # rstrip() without arguments removes every trailing whitespace
            # character, so trailing tabs go away together with the newline.
            line = line.rstrip()

            if tab_separated:
                # Split once and reuse the result for both the field count
                # check and the decoding.
                parts = line.split('\t')
                if len(parts) > 1:
                    yield tuple(output_serializer.decode(part) for part in parts)
                    continue

            # Not tab separated, or only a single field on the line.
            yield output_serializer.decode(line)

        # Only reached once the caller has consumed the generator completely.
        job.join()
コード例 #3
0
    def cat(self, path, serializer='raw', tab_separated=False):
        """
        Returns a generator over files defined by path

        :param path: path to the files
        :param serializer: input serializer. Options are json, pickle and raw(default)
        :param tab_separated: boolean if input is tab separated. A line with
            several tab-separated fields is then yielded as a tuple with one
            decoded value per field; a line with a single field is yielded
            as one decoded value.
        """
        job = self._hdpm._run_hadoop_cmd('fs', ('-cat', path))
        output = job.yield_stdout()
        output_serializer = get_protocol_from_name(serializer)

        for line in output:
            # Drops all trailing whitespace (newline, but also trailing tabs).
            line = line.rstrip()

            # Split at most once per line; the fields are reused below
            # instead of splitting the same line a second time.
            fields = line.split('\t') if tab_separated else None

            if fields is not None and len(fields) > 1:
                yield tuple(
                    output_serializer.decode(field)
                    for field in fields)
            else:
                yield output_serializer.decode(line)

        # Runs only after the output has been iterated to the end.
        job.join()
コード例 #4
0
ファイル: streamer.py プロジェクト: hamaxx/hadoop-manager
    def _parse_serializers(self):
        """
        Build the serializer mapping for the three stages and store it.

        Each of 'input', 'output' and 'inter' takes its serializer name from
        the configuration returned by ``_get_env_conf`` and falls back to the
        matching default when the key is missing. The mapping of resolved
        protocols is passed to ``_set_serializers``.
        """
        # Fall back to an empty mapping when no configuration is returned.
        settings = self._get_env_conf(SERIALIZATION_CONF_PICKE_FILE_PATH) or {}

        protocols = {}
        for key, default_name in (
                ('input', DEFAULT_INPUT_SERIALIZED),
                ('output', DEFAULT_OUTPUT_SERIALIZED),
                ('inter', DEFAULT_INTER_SERIALIZED)):
            chosen_name = settings.get(key, default_name)
            protocols[key] = get_protocol_from_name(chosen_name)

        self._set_serializers(protocols)