Ejemplo n.º 1
0
    def json(self, json_file):
        """
        Reads and parses the input of a json file handler or file.

        Json files are parsed differently depending on if the root is a dictionary or an array.

        1) If the json's root is a dictionary, these are parsed into a sequence of (Key, Value)
        pairs

        2) If the json's root is an array, these are parsed into a sequence
        of entries

        >>> seq.json('examples/users.json').first()
        [u'sarah', {u'date_created': u'08/08', u'news_email': True, u'email': u'*****@*****.**'}]

        :param json_file: path to a json file, or an object exposing ``read()``
        :return: Sequence wrapping json file
        :raises ValueError: if json_file is neither a str path nor an object with ``read``
        """
        if isinstance(json_file, str):
            # A path: let get_read_function choose the opener for it, then parse its content.
            file_open = get_read_function(json_file, self.disable_compression)
            json_input = jsonapi.load(file_open(json_file))
        elif hasattr(json_file, 'read'):
            json_input = jsonapi.load(json_file)
        else:
            # The accepted non-path input is anything with read(), which is what
            # jsonapi.load consumes; the message names that requirement.
            raise ValueError('json_file must be a file path or implement the read interface')

        if isinstance(json_input, list):
            return self(json_input)
        # Any other root is treated as a mapping and exposed as (key, value) pairs.
        return self(six.viewitems(json_input))
Ejemplo n.º 2
0
    def open(self, path, delimiter=None, mode='r', buffering=-1, encoding=None, errors=None,
             newline=None):
        """
        Reads and parses input files as defined.

        If delimiter is not None, then the file is read in bulk then split on it. If it is None
        (the default), then the file is parsed as sequence of lines. The rest of the options are
        passed directly to builtins.open with the exception that write/append file modes is not
        allowed.

        The delimiter must match the kind of data the mode produces: a text delimiter for text
        modes and a bytes delimiter for binary modes.

        >>> seq.open('examples/gear_list.txt').take(1)
        [u'tent\\n']

        :param path: path to file
        :param delimiter: delimiter to split joined text on. if None, defaults to per line split
        :param mode: file open mode
        :param buffering: passed to builtins.open
        :param encoding: passed to builtins.open
        :param errors: passed to builtins.open
        :param newline: passed to builtins.open
        :return: output of file depending on options wrapped in a Sequence via seq
        :raises ValueError: if mode contains anything other than one to three of r, b and t
        """
        if not re.match('^[rbt]{1,3}$', mode):
            raise ValueError('mode argument must be only have r, b, and t')

        file_open = get_read_function(path, self.disable_compression)
        input_file = file_open(path, mode=mode, buffering=buffering, encoding=encoding,
                               errors=errors, newline=newline)
        if delimiter is None:
            return self(input_file)

        # delimiter[:0] is the empty value of the delimiter's own type ('' or b''), so the
        # pieces are joined correctly in binary mode as well as text mode. Joining with a
        # hard-coded '' raised TypeError for every binary-mode read.
        joined = delimiter[:0].join(input_file)
        return self(joined.split(delimiter))
Ejemplo n.º 3
0
    def csv(self, csv_file, dialect='excel', **fmt_params):
        """
        Build a sequence of rows from csv content.

        csv_file is either a str path, which is opened through the function returned by
        get_read_function, or an object exposing next() or __next__(), which is handed to
        csv.reader unchanged.

        >>> seq.csv('examples/camping_purchases.csv').take(2)
        [['1', 'tent', '300'], ['2', 'food', '100']]

        :param csv_file: path to file or iterator object
        :param dialect: dialect of csv, passed to csv.reader
        :param fmt_params: options passed to csv.reader
        :return: Sequence wrapping csv file
        :raises ValueError: if csv_file is neither a str nor an iterator object
        """
        is_path = isinstance(csv_file, str)
        # Guard clause: reject anything that is neither a path nor an iterator.
        if not is_path and not hasattr(csv_file, 'next') and not hasattr(csv_file, '__next__'):
            raise ValueError('csv_file must be a file path or implement the iterator interface')

        if is_path:
            opener = get_read_function(csv_file, self.disable_compression)
            row_source = opener(csv_file)
        else:
            row_source = csv_file

        reader = csvapi.reader(row_source, dialect=dialect, **fmt_params)
        wrapped = self(reader)
        return wrapped.cache(delete_lineage=True)
Ejemplo n.º 4
0
    def open(self, path, delimiter=None, mode='r', buffering=-1, encoding=None, errors=None,
             newline=None):
        """
        Reads and parses input files as defined.

        If delimiter is not None, then the file is read in bulk then split on it. If it is None
        (the default), then the file is parsed as sequence of lines. The rest of the options are
        passed directly to builtins.open with the exception that write/append file modes is not
        allowed.

        The delimiter must match the kind of data the mode produces: a text delimiter for text
        modes and a bytes delimiter for binary modes.

        >>> seq.open('examples/gear_list.txt').take(1)
        [u'tent\\n']

        :param path: path to file
        :param delimiter: delimiter to split joined text on. if None, defaults to per line split
        :param mode: file open mode
        :param buffering: passed to builtins.open
        :param encoding: passed to builtins.open
        :param errors: passed to builtins.open
        :param newline: passed to builtins.open
        :return: output of file depending on options wrapped in a Sequence via seq
        :raises ValueError: if mode contains anything other than one to three of r, b and t
        """
        if not re.match('^[rbt]{1,3}$', mode):
            raise ValueError('mode argument must be only have r, b, and t')

        file_open = get_read_function(path, self.disable_compression)
        input_file = file_open(path, mode=mode, buffering=buffering, encoding=encoding,
                               errors=errors, newline=newline)
        if delimiter is None:
            return self(input_file)

        # delimiter[:0] is the empty value of the delimiter's own type ('' or b''), so the
        # pieces are joined correctly in binary mode as well as text mode. Joining with a
        # hard-coded '' raised TypeError for every binary-mode read.
        joined = delimiter[:0].join(input_file)
        return self(joined.split(delimiter))
Ejemplo n.º 5
0
    def json(self, json_file):
        """
        Reads and parses the input of a json file handler or file.

        Json files are parsed differently depending on if the root is a dictionary or an array.

        1) If the json's root is a dictionary, these are parsed into a sequence of (Key, Value)
        pairs

        2) If the json's root is an array, these are parsed into a sequence
        of entries

        >>> seq.json('examples/users.json').first()
        [u'sarah', {u'date_created': u'08/08', u'news_email': True, u'email': u'*****@*****.**'}]

        :param json_file: path to a json file, or an object exposing ``read()``
        :return: Sequence wrapping json file
        :raises ValueError: if json_file is neither a str path nor an object with ``read``
        """
        if isinstance(json_file, str):
            # A path: let get_read_function choose the opener for it, then parse its content.
            file_open = get_read_function(json_file, self.disable_compression)
            json_input = jsonapi.load(file_open(json_file))
        elif hasattr(json_file, 'read'):
            json_input = jsonapi.load(json_file)
        else:
            # The accepted non-path input is anything with read(), which is what
            # jsonapi.load consumes; the message names that requirement.
            raise ValueError('json_file must be a file path or implement the read interface')

        if isinstance(json_input, list):
            return self(json_input)
        # Any other root is treated as a mapping and exposed as (key, value) pairs.
        return self(six.viewitems(json_input))
Ejemplo n.º 6
0
    def csv(self, csv_file, dialect='excel', **fmt_params):
        """
        Build a sequence of rows from csv content.

        csv_file is either a str path, which is opened through the function returned by
        get_read_function, or an object exposing next() or __next__(), which is handed to
        csv.reader unchanged.

        >>> seq.csv('examples/camping_purchases.csv').take(2)
        [['1', 'tent', '300'], ['2', 'food', '100']]

        :param csv_file: path to file or iterator object
        :param dialect: dialect of csv, passed to csv.reader
        :param fmt_params: options passed to csv.reader
        :return: Sequence wrapping csv file
        :raises ValueError: if csv_file is neither a str nor an iterator object
        """
        is_path = isinstance(csv_file, str)
        # Guard clause: reject anything that is neither a path nor an iterator.
        if not is_path and not hasattr(csv_file, 'next') and not hasattr(csv_file, '__next__'):
            raise ValueError('csv_file must be a file path or implement the iterator interface')

        if is_path:
            opener = get_read_function(csv_file, self.disable_compression)
            row_source = opener(csv_file)
        else:
            row_source = csv_file

        reader = csvapi.reader(row_source, dialect=dialect, **fmt_params)
        wrapped = self(reader)
        return wrapped.cache(delete_lineage=True)
Ejemplo n.º 7
0
    def csv_dict_reader(self, csv_file, fieldnames=None, restkey=None, restval=None,
                        dialect='excel', **kwds):
        """
        Reads and parses the input of a csv stream or file using csv.DictReader.

        csv_file can be a str path, which is opened through the function returned by
        get_read_function, or an object that defines next() or __next__(), which is passed
        to csv.DictReader as is.

        :param csv_file: path to file or iterator object
        :param fieldnames: passed to csv.DictReader
        :param restkey: passed to csv.DictReader
        :param restval: passed to csv.DictReader
        :param dialect: dialect of csv, passed to csv.DictReader
        :param kwds: extra keyword options passed to csv.DictReader
        :return: Sequence wrapping the rows produced by csv.DictReader
        :raises ValueError: if csv_file is neither a str nor an iterator object
        """
        is_path = isinstance(csv_file, str)
        # Guard clause: reject anything that is neither a path nor an iterator.
        if not is_path and not hasattr(csv_file, 'next') and not hasattr(csv_file, '__next__'):
            raise ValueError('csv_file must be a file path or implement the iterator interface')

        if is_path:
            opener = get_read_function(csv_file, self.disable_compression)
            row_source = opener(csv_file)
        else:
            row_source = csv_file

        reader_options = dict(fieldnames=fieldnames, restkey=restkey, restval=restval,
                              dialect=dialect)
        dict_rows = csvapi.DictReader(row_source, **reader_options, **kwds)
        return self(dict_rows).cache(delete_lineage=True)
Ejemplo n.º 8
0
    def jsonl(self, jsonl_file):
        """
        Build a sequence of parsed values from jsonl content.

        Every item yielded by the input is passed to the json module's loads, so each line
        must hold a single valid json value. A str argument is treated as a path and opened
        through the function returned by get_read_function; anything else is used directly
        as the source of lines.

        >>> seq.jsonl('examples/chat_logs.jsonl').first()
        {u'date': u'10/09', u'message': u'hello anyone there?', u'user': u'bob'}

        :param jsonl_file: path or file containing jsonl content
        :return: Sequence wrapping jsonl file
        """
        line_source = jsonl_file
        if isinstance(line_source, str):
            opener = get_read_function(line_source, self.disable_compression)
            line_source = opener(line_source)

        parsed = self(line_source).map(jsonapi.loads)
        return parsed.cache(delete_lineage=True)
Ejemplo n.º 9
0
    def jsonl(self, jsonl_file):
        """
        Build a sequence of parsed values from jsonl content.

        Every item yielded by the input is passed to the json module's loads, so each line
        must hold a single valid json value. A str argument is treated as a path and opened
        through the function returned by get_read_function; anything else is used directly
        as the source of lines.

        >>> seq.jsonl('examples/chat_logs.jsonl').first()
        {u'date': u'10/09', u'message': u'hello anyone there?', u'user': u'bob'}

        :param jsonl_file: path or file containing jsonl content
        :return: Sequence wrapping jsonl file
        """
        line_source = jsonl_file
        if isinstance(line_source, str):
            opener = get_read_function(line_source, self.disable_compression)
            line_source = opener(line_source)

        parsed = self(line_source).map(jsonapi.loads)
        return parsed.cache(delete_lineage=True)