Beispiel #1
0
    def __iter__(self):

        # prefer implementation using xlutils.view as dates are automatically
        # converted
        if self.use_view:
            from petl.io import xlutils_view
            source = read_source_from_arg(self.filename)
            with source.open('rb') as source2:
                source3 = source2.read()
                wb = xlutils_view.View(source3, **self.kwargs)
                if self.sheet is None:
                    ws = wb[0]
                else:
                    ws = wb[self.sheet]
                for row in ws:
                    yield tuple(row)
        else:
            import xlrd
            source = read_source_from_arg(self.filename)
            with source.open('rb') as source2:
                source3 = source2.read()
                with xlrd.open_workbook(file_contents=source3,
                                        on_demand=True,
                                        **self.kwargs) as wb:
                    if self.sheet is None:
                        ws = wb.sheet_by_index(0)
                    elif isinstance(self.sheet, int):
                        ws = wb.sheet_by_index(self.sheet)
                    else:
                        ws = wb.sheet_by_name(str(self.sheet))
                    for rownum in xrange(ws.nrows):
                        yield tuple(ws.row_values(rownum))
Beispiel #2
0
def frompickle(source=None):
    """
    Extract a table From data pickled in the given file. The rows in the
    table should have been pickled to the file one at a time. E.g.::

        >>> import petl as etl
        >>> import pickle
        >>> # set up a file to demonstrate with
        ... with open('example.p', 'wb') as f:
        ...     pickle.dump(['foo', 'bar'], f)
        ...     pickle.dump(['a', 1], f)
        ...     pickle.dump(['b', 2], f)
        ...     pickle.dump(['c', 2.5], f)
        ...
        >>> # now demonstrate the use of frompickle()
        ... table1 = etl.frompickle('example.p')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' |   1 |
        +-----+-----+
        | 'b' |   2 |
        +-----+-----+
        | 'c' | 2.5 |
        +-----+-----+


    """

    source = read_source_from_arg(source)
    return PickleView(source)
Beispiel #3
0
def appendxlsx(tbl, filename, sheet=None, write_header=False):
    """
    Appends rows to an existing Excel .xlsx file.
    """

    import openpyxl
    source = read_source_from_arg(filename)
    with source.open('rb') as source2:
        wb = openpyxl.load_workbook(filename=source2, read_only=False)
        if sheet is None:
            ws = wb[wb.sheetnames[0]]
        elif isinstance(sheet, int):
            ws = wb[wb.sheetnames[sheet]]
        else:
            ws = wb[str(sheet)]
        if write_header:
            it = iter(tbl)
            hdr = next(it)
            flds = list(map(text_type, hdr))
            rows = itertools.chain([flds], it)
        else:
            rows = data(tbl)
        for row in rows:
            ws.append(row)
        target = write_source_from_arg(filename)
        with target.open('wb') as target2:
            wb.save(target2)
Beispiel #4
0
def frompickle(source=None):
    """
    Returns a table providing access to the data pickled in the given file. The
    rows in the table should have been pickled to the file one at a time. E.g.::

        >>> import pickle
        >>> # set up a file to demonstrate with
        ... with open('test.dat', 'wb') as f:
        ...     pickle.dump(['foo', 'bar'], f)
        ...     pickle.dump(['a', 1], f)
        ...     pickle.dump(['b', 2], f)
        ...     pickle.dump(['c', 2.5], f)
        ...
        >>> # now demonstrate the use of petl.frompickle
        ... from petl import frompickle, look
        >>> testdat = frompickle('test.dat')
        >>> look(testdat)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | 1     |
        +-------+-------+
        | 'b'   | 2     |
        +-------+-------+
        | 'c'   | 2.5   |
        +-------+-------+

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    """

    source = read_source_from_arg(source)
    return PickleView(source)
Beispiel #5
0
def frompickle(source=None):
    """
    Extract a table From data pickled in the given file. The rows in the
    table should have been pickled to the file one at a time. E.g.::

        >>> import petl as etl
        >>> import pickle
        >>> # set up a file to demonstrate with
        ... with open('example.p', 'wb') as f:
        ...     pickle.dump(['foo', 'bar'], f)
        ...     pickle.dump(['a', 1], f)
        ...     pickle.dump(['b', 2], f)
        ...     pickle.dump(['c', 2.5], f)
        ...
        >>> # now demonstrate the use of frompickle()
        ... table1 = etl.frompickle('example.p')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' |   1 |
        +-----+-----+
        | 'b' |   2 |
        +-----+-----+
        | 'c' | 2.5 |
        +-----+-----+


    """

    source = read_source_from_arg(source)
    return PickleView(source)
Beispiel #6
0
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as the top
    level object, and each member of the array will be treated as a row of data.
    E.g.::

        >>> from petl import fromjson, look
        >>> data = '[{"foo": "a", "bar": 1}, {"foo": "b", "bar": 2}, {"foo": "c", "bar": 2}]'
        >>> with open('example1.json', 'w') as f:
        ...     f.write(data)
        ...
        >>> table1 = fromjson('example1.json')
        >>> look(table1)
        +--------+--------+
        | u'foo' | u'bar' |
        +========+========+
        | u'a'   | 1      |
        +--------+--------+
        | u'b'   | 2      |
        +--------+--------+
        | u'c'   | 2      |
        +--------+--------+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`fromdicts`.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    .. versionadded:: 0.5

    """

    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
Beispiel #7
0
 def __iter__(self):
     import openpyxl
     source = read_source_from_arg(self.filename)
     with source.open('rb') as source2:
         wb = openpyxl.load_workbook(filename=source2,
                                     read_only=self.read_only,
                                     **self.kwargs)
         if self.sheet is None:
             ws = wb[wb.sheetnames[0]]
         elif isinstance(self.sheet, int):
             ws = wb[wb.sheetnames[self.sheet]]
         else:
             ws = wb[str(self.sheet)]
         if self.range_string is not None:
             rows = ws[self.range_string]
         else:
             rows = ws.iter_rows(min_row=self.min_row,
                                 min_col=self.min_col,
                                 max_row=self.max_row,
                                 max_col=self.max_col)
         for row in rows:
             yield tuple(cell.value for cell in row)
         try:
             wb._archive.close()
         except AttributeError:
             # just here in case openpyxl stops exposing an _archive property.
             pass
Beispiel #8
0
def fromfwf(source=None,
            widths=None,
            encoding=None,
            errors='strict',
            header=None,
            skiprows=None,
            skipfooter=None):
    """
    Extract a table (As defined by the petl package) from lines in the given fixed-width file.

        >>> import fromfwf
        >>> # setup a sample file
        ... text = '  18 5 2\\n2018 5 2\\n20180502'
        >>> with open('example.txt', 'w') as f:
        ...    f.write(text)
        ...
        28
        >>> table1 = etl.fromfwf('example.txt', widths=[4, 2, 2])
        >>> table1
        +--------+------+------+
        | lines  |      |      |
        +========+======+======+
        | '  18' | ' 5' | ' 2' |
        +--------+------+------+
        | '2018' | ' 5' | ' 2' |
        +--------+------+------+
        | '2018' | '05' | '02' |
        +--------+------+------+
        >>> # Specify headers for the file
        ... table1 = etl.fromfwf('example.txt', widths=[4, 2, 2], header=['year', 'month', 'day'])
        >>> table1
        +--------+-------+------+
        | year   | month | day  |
        +========+=======+======+
        | '  18' | ' 5'  | ' 2' |
        +--------+-------+------+
        | '2018' | ' 5'  | ' 2' |
        +--------+-------+------+
        | '2018' | '05'  | '02' |
        +--------+-------+------+
    """
    if widths is None:
        raise AttributeError('No field widths provided!')
    if isinstance(widths, (list, tuple)) is False:
        raise AttributeError(
            'Field widths must be a tuple or list of field widths!')
    if isinstance(source, Table):
        source = source
    else:
        source = read_source_from_arg(source)
    return FixedTextView(source,
                         widths=widths,
                         header=header,
                         encoding=encoding,
                         errors=errors,
                         skiprows=skiprows,
                         skipfooter=skipfooter)
Beispiel #9
0
def fromtext(source=None,
             encoding=None,
             errors=None,
             strip=None,
             header=('lines', )):
    """
    Extract a table from lines in the given text file. E.g.::

        >>> import petl as etl
        >>> # setup example file
        ... text = 'a,1\\nb,2\\nc,2\\n'
        >>> with open('example.txt', 'w') as f:
        ...     f.write(text)
        ...
        12
        >>> table1 = etl.fromtext('example.txt')
        >>> table1
        +-------+
        | lines |
        +=======+
        | 'a,1' |
        +-------+
        | 'b,2' |
        +-------+
        | 'c,2' |
        +-------+

        >>> # post-process, e.g., with capture()
        ... table2 = table1.capture('lines', '(.*),(.*)$', ['foo', 'bar'])
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    Note that the strip() function is called on each line, which by default
    will remove leading and trailing whitespace, including the end-of-line
    character - use the `strip` keyword argument to specify alternative
    characters to strip. Set the `strip` argument to `False` to disable this
    behaviour and leave line endings in place.

    """

    source = read_source_from_arg(source)
    return TextView(source,
                    header=header,
                    encoding=encoding,
                    errors=errors,
                    strip=strip)
Beispiel #10
0
def fromucsv(source=None, dialect=csv.excel, encoding='utf-8', **kwargs):
    """
    Returns a table containing unicode data extracted from a delimited file via
    the given encoding. Like :func:`fromcsv` but accepts an additional
    ``encoding`` argument which should be one of the Python supported encodings.
    See also :mod:`codecs`.

    .. versionadded:: 0.19
    """
    source = read_source_from_arg(source)
    return UnicodeCSVView(source=source, dialect=dialect, encoding=encoding,
                          **kwargs)
Beispiel #11
0
def fromutext(source=None, header=(u'lines',), encoding='utf-8', strip=None):
    """
    Construct a table from lines in the given text file via the given encoding.
    Like :func:`fromtext` but accepts an additional ``encoding`` argument
    which should be one of the Python supported encodings. See also
    :mod:`codecs`.

    .. versionadded:: 0.19

    """
    source = read_source_from_arg(source)
    return UnicodeTextView(source, header, encoding=encoding, strip=strip)
Beispiel #12
0
def fromucsv(source=None, dialect=csv.excel, encoding='utf-8', **kwargs):
    """
    Returns a table containing unicode data extracted from a delimited file via
    the given encoding. Like :func:`fromcsv` but accepts an additional
    ``encoding`` argument which should be one of the Python supported encodings.
    See also :mod:`codecs`.

    .. versionadded:: 0.19
    """
    source = read_source_from_arg(source)
    return UnicodeCSVView(source=source, dialect=dialect, encoding=encoding,
                          **kwargs)
Beispiel #13
0
def fromtext(source=None, header=('lines',), strip=None):
    """
    Construct a table from lines in the given text file. E.g.::

        >>> # example data
        ... with open('test.txt', 'w') as f:
        ...     f.write('a\\t1\\n')
        ...     f.write('b\\t2\\n')
        ...     f.write('c\\t3\\n')
        ...
        >>> from petl import fromtext, look
        >>> table1 = fromtext('test.txt')
        >>> look(table1)
        +--------------+
        | 'lines'      |
        +==============+
        | 'a\\t1'      |
        +--------------+
        | 'b\\t2'      |
        +--------------+
        | 'c\\t3'      |
        +--------------+

    The :func:`fromtext` function provides a starting point for custom handling
    of text files. E.g., using :func:`capture`::

        >>> from petl import capture
        >>> table2 = capture(table1, 'lines', '(.*)\\\\t(.*)$', ['foo', 'bar'])
        >>> look(table2)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '3'   |
        +-------+-------+

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    .. versionchanged:: 0.4

    The strip() function is called on each line, which by default will remove
    leading and trailing whitespace, including the end-of-line character - use
    the `strip` keyword argument to specify alternative characters to strip.

    """

    source = read_source_from_arg(source)
    return TextView(source, header, strip=strip)
Beispiel #14
0
def fromtext(source=None, encoding=None, errors='strict', strip=None,
             header=('lines',)):
    """
    Extract a table from lines in the given text file. E.g.::

        >>> import petl as etl
        >>> # setup example file
        ... text = 'a,1\\nb,2\\nc,2\\n'
        >>> with open('example.txt', 'w') as f:
        ...     f.write(text)
        ...
        12
        >>> table1 = etl.fromtext('example.txt')
        >>> table1
        +-------+
        | lines |
        +=======+
        | 'a,1' |
        +-------+
        | 'b,2' |
        +-------+
        | 'c,2' |
        +-------+

        >>> # post-process, e.g., with capture()
        ... table2 = table1.capture('lines', '(.*),(.*)$', ['foo', 'bar'])
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    Note that the strip() function is called on each line, which by default
    will remove leading and trailing whitespace, including the end-of-line
    character - use the `strip` keyword argument to specify alternative
    characters to strip. Set the `strip` argument to `False` to disable this
    behaviour and leave line endings in place.

    """

    source = read_source_from_arg(source)
    return TextView(source, header=header, encoding=encoding,
                    errors=errors, strip=strip)
Beispiel #15
0
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.json', header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' |   1 |
        +-----+-----+
        | 'b' |   2 |
        +-----+-----+
        | 'c' |   2 |
        +-----+-----+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`petl.io.json.fromdicts`.

    .. versionchanged:: 1.1.0

    If no `header` is specified, fields will be discovered by sampling keys
    from the first `sample` objects in `source`. The header will be
    constructed from keys in the order discovered. Note that this
    ordering may not be stable, and therefore it may be advisable to specify
    an explicit `header` or to use another function like
    :func:`petl.transform.headers.sortheader` on the resulting table to
    guarantee stability.

    """

    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
Beispiel #16
0
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.json', header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' |   1 |
        +-----+-----+
        | 'b' |   2 |
        +-----+-----+
        | 'c' |   2 |
        +-----+-----+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`petl.io.json.fromdicts`.

    .. versionchanged:: 1.1.0

    If no `header` is specified, fields will be discovered by sampling keys
    from the first `sample` objects in `source`. The header will be
    constructed from keys in the order discovered. Note that this
    ordering may not be stable, and therefore it may be advisable to specify
    an explicit `header` or to use another function like
    :func:`petl.transform.headers.sortheader` on the resulting table to
    guarantee stability.

    """

    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
Beispiel #17
0
def _load_or_create_workbook(filename, mode, sheet):
    if PY3:
        FileNotFound = FileNotFoundError
    else:
        FileNotFound = IOError

    import openpyxl
    wb = None
    if not (mode == "overwrite" or (mode == "replace" and sheet is None)):
        try:
            source = read_source_from_arg(filename)
            with source.open('rb') as source2:
                wb = openpyxl.load_workbook(filename=source2, read_only=False)
        except FileNotFound:
            wb = None
    if wb is None:
        wb = openpyxl.Workbook(write_only=True)
    return wb
Beispiel #18
0
def fromcsv(source=None, encoding=None, errors='strict', **csvargs):
    """
    Extract a table from a delimited file. E.g.::

        >>> import petl as etl
        >>> import csv
        >>> # set up a CSV file to demonstrate with
        ... table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> with open('example.csv', 'w') as f:
        ...     writer = csv.writer(f)
        ...     writer.writerows(table1)
        ...
        >>> # now demonstrate the use of fromcsv()
        ... table2 = etl.fromcsv('example.csv')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    The `source` argument is the path of the delimited file, all other keyword
    arguments are passed to :func:`csv.reader`. So, e.g., to override the
    delimiter from the default CSV dialect, provide the `delimiter` keyword
    argument.

    Note that all data values are strings, and any intended numeric values will
    need to be converted, see also :func:`petl.transform.conversions.convert`.

    """

    source = read_source_from_arg(source)
    csvargs.setdefault('dialect', 'excel')
    return fromcsv_impl(source=source,
                        encoding=encoding,
                        errors=errors,
                        **csvargs)
Beispiel #19
0
def fromcsv(source=None, encoding=None, errors='strict', header=None, 
            **csvargs):
    """
    Extract a table from a delimited file. E.g.::

        >>> import petl as etl
        >>> import csv
        >>> # set up a CSV file to demonstrate with
        ... table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> with open('example.csv', 'w') as f:
        ...     writer = csv.writer(f)
        ...     writer.writerows(table1)
        ...
        >>> # now demonstrate the use of fromcsv()
        ... table2 = etl.fromcsv('example.csv')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    The `source` argument is the path of the delimited file, all other keyword
    arguments are passed to :func:`csv.reader`. So, e.g., to override the
    delimiter from the default CSV dialect, provide the `delimiter` keyword
    argument.

    Note that all data values are strings, and any intended numeric values will
    need to be converted, see also :func:`petl.transform.conversions.convert`.

    """

    source = read_source_from_arg(source)
    csvargs.setdefault('dialect', 'excel')
    return fromcsv_impl(source=source, encoding=encoding, errors=errors, 
                        header=header, **csvargs)
Beispiel #20
0
def fromcsv(source=None, dialect=csv.excel, **kwargs):
    """
    Wrapper for the standard :func:`csv.reader` function. Returns a table
    providing access to the data in the given delimited file. E.g.::

        >>> import csv
        >>> # set up a CSV file to demonstrate with
        ... with open('test.csv', 'wb') as f:
        ...     writer = csv.writer(f)
        ...     writer.writerow(['foo', 'bar'])
        ...     writer.writerow(['a', 1])
        ...     writer.writerow(['b', 2])
        ...     writer.writerow(['c', 2])
        ...
        >>> # now demonstrate the use of petl.fromcsv
        ... from petl import fromcsv, look
        >>> testcsv = fromcsv('test.csv')
        >>> look(testcsv)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    The `filename` argument is the path of the delimited file, all other keyword
    arguments are passed to :func:`csv.reader`. So, e.g., to override the
    delimiter from the default CSV dialect, provide the `delimiter` keyword
    argument.

    Note that all data values are strings, and any intended numeric values will
    need to be converted, see also :func:`convert`.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    """

    source = read_source_from_arg(source)
    return CSVView(source=source, dialect=dialect, **kwargs)
Beispiel #21
0
def fromcsv(source=None, dialect=csv.excel, **kwargs):
    """
    Wrapper for the standard :func:`csv.reader` function. Returns a table
    providing access to the data in the given delimited file. E.g.::

        >>> import csv
        >>> # set up a CSV file to demonstrate with
        ... with open('test.csv', 'wb') as f:
        ...     writer = csv.writer(f)
        ...     writer.writerow(['foo', 'bar'])
        ...     writer.writerow(['a', 1])
        ...     writer.writerow(['b', 2])
        ...     writer.writerow(['c', 2])
        ...
        >>> # now demonstrate the use of petl.fromcsv
        ... from petl import fromcsv, look
        >>> testcsv = fromcsv('test.csv')
        >>> look(testcsv)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    The `filename` argument is the path of the delimited file, all other keyword
    arguments are passed to :func:`csv.reader`. So, e.g., to override the
    delimiter from the default CSV dialect, provide the `delimiter` keyword
    argument.

    Note that all data values are strings, and any intended numeric values will
    need to be converted, see also :func:`convert`.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    """

    source = read_source_from_arg(source)
    return CSVView(source=source, dialect=dialect, **kwargs)
Beispiel #22
0
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.json')
        >>> table1
        +-----+-----+
        | bar | foo |
        +=====+=====+
        |   1 | 'a' |
        +-----+-----+
        |   2 | 'b' |
        +-----+-----+
        |   2 | 'c' |
        +-----+-----+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`petl.io.json.fromdicts`.

    """

    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
Beispiel #23
0
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.json')
        >>> table1
        +-----+-----+
        | bar | foo |
        +=====+=====+
        |   1 | 'a' |
        +-----+-----+
        |   2 | 'b' |
        +-----+-----+
        |   2 | 'c' |
        +-----+-----+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`petl.io.json.fromdicts`.

    """

    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
Beispiel #24
0
def fromxml(source, *args, **kwargs):
    """
    Extract data from an XML file. E.g.::

        >>> import petl as etl
        >>> # setup a file to demonstrate with
        ... d = '''<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>'''
        >>> with open('example1.xml', 'w') as f:
        ...     f.write(d)
        ...
        212
        >>> table1 = etl.fromxml('example1.xml', 'tr', 'td')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+


    If the data values are stored in an attribute, provide the attribute
    name as an extra positional argument::

        >>> d = '''<table>
        ...     <tr>
        ...         <td v='foo'/><td v='bar'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='a'/><td v='1'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='b'/><td v='2'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='c'/><td v='2'/>
        ...     </tr>
        ... </table>'''
        >>> with open('example2.xml', 'w') as f:
        ...     f.write(d)
        ...
        220
        >>> table2 = etl.fromxml('example2.xml', 'tr', 'td', 'v')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    Data values can also be extracted by providing a mapping of field
    names to element paths::

        >>> d = '''<table>
        ...     <row>
        ...         <foo>a</foo><baz><bar v='1'/><bar v='3'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>b</foo><baz><bar v='2'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>c</foo><baz><bar v='2'/></baz>
        ...     </row>
        ... </table>'''
        >>> with open('example3.xml', 'w') as f:
        ...     f.write(d)
        ...
        223
        >>> table3 = etl.fromxml('example3.xml', 'row',
        ...                      {'foo': 'foo', 'bar': ('baz/bar', 'v')})
        >>> table3
        +------------+-----+
        | bar        | foo |
        +============+=====+
        | ('1', '3') | 'a' |
        +------------+-----+
        | '2'        | 'b' |
        +------------+-----+
        | '2'        | 'c' |
        +------------+-----+

    If `lxml <http://lxml.de/>`_ is installed, full XPath expressions can be
    used.

    Note that the implementation is currently **not** streaming, i.e.,
    the whole document is loaded into memory.

    If multiple elements match a given field, all values are reported as a
    tuple.

    If there is more than one element name used for row values, a tuple
    or list of paths can be provided, e.g.,
    ``fromxml('example.html', './/tr', ('th', 'td'))``.

    Optionally a custom parser can be provided, e.g.::

        >>> from lxml import etree # doctest: +SKIP
        ... my_parser = etree.XMLParser(resolve_entities=False) # doctest: +SKIP
        ... table4 = etl.fromxml('example1.xml', 'tr', 'td', parser=my_parser) # doctest: +SKIP

    """

    source = read_source_from_arg(source)
    return XmlView(source, *args, **kwargs)
Beispiel #25
0
def fromxml(source, *args, **kwargs):
    """
    Extract data from an XML file. E.g.::

        >>> import petl as etl
        >>> # setup a file to demonstrate with
        ... d = '''<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>'''
        >>> with open('example1.xml', 'w') as f:
        ...     f.write(d)
        ...
        212
        >>> table1 = etl.fromxml('example1.xml', 'tr', 'td')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

        >>> # if the data values are stored in an attribute, provide the
        ... # attribute name as an extra positional argument
        ... d = '''<table>
        ...     <tr>
        ...         <td v='foo'/><td v='bar'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='a'/><td v='1'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='b'/><td v='2'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='c'/><td v='2'/>
        ...     </tr>
        ... </table>'''
        >>> with open('example2.xml', 'w') as f:
        ...     f.write(d)
        ...
        220
        >>> table2 = etl.fromxml('example2.xml', 'tr', 'td', 'v')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

        >>> # data values can also be extracted by providing a mapping of field
        ... # names to element paths
        ... d = '''<table>
        ...     <row>
        ...         <foo>a</foo><baz><bar v='1'/><bar v='3'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>b</foo><baz><bar v='2'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>c</foo><baz><bar v='2'/></baz>
        ...     </row>
        ... </table>'''
        >>> with open('example3.xml', 'w') as f:
        ...     f.write(d)
        ...
        223
        >>> table3 = etl.fromxml('example3.xml', 'row',
        ...                      {'foo': 'foo', 'bar': ('baz/bar', 'v')})
        >>> table3
        +------------+-----+
        | bar        | foo |
        +============+=====+
        | ('1', '3') | 'a' |
        +------------+-----+
        | '2'        | 'b' |
        +------------+-----+
        | '2'        | 'c' |
        +------------+-----+

    Note that the implementation is currently *not* streaming, i.e.,
    the whole document is loaded into memory.

    If multiple elements match a given field, all values are reported as a
    tuple.

    If there is more than one element name used for row values, a tuple
    or list of paths can be provided, e.g.,
    ``fromxml('example.html', './/tr', ('th', 'td'))``.

    """

    source = read_source_from_arg(source)
    return XmlView(source, *args, **kwargs)
Beispiel #26
0
def fromxml(source, *args, **kwargs):
    """
    Access data in an XML file. E.g.::

        >>> from petl import fromxml, look
        >>> data = \"""<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>\"""
        >>> with open('example1.xml', 'w') as f:
        ...     f.write(data)
        ...     f.close()
        ...
        >>> table1 = fromxml('example1.xml', 'tr', 'td')
        >>> look(table1)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    If the data values are stored in an attribute, provide the attribute name
    as an extra positional argument, e.g.:

        >>> data = \"""<table>
        ...     <tr>
        ...         <td v='foo'/><td v='bar'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='a'/><td v='1'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='b'/><td v='2'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='c'/><td v='2'/>
        ...     </tr>
        ... </table>\"""
        >>> with open('example2.xml', 'w') as f:
        ...     f.write(data)
        ...     f.close()
        ...
        >>> table2 = fromxml('example2.xml', 'tr', 'td', 'v')
        >>> look(table2)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    Data values can also be extracted by providing a mapping of field names
    to element paths, e.g.::

        >>> data = \"""<table>
        ...     <row>
        ...         <foo>a</foo><baz><bar v='1'/><bar v='3'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>b</foo><baz><bar v='2'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>c</foo><baz><bar v='2'/></baz>
        ...     </row>
        ... </table>\"""
        >>> with open('example3.xml', 'w') as f:
        ...     f.write(data)
        ...     f.close()
        ...
        >>> table3 = fromxml('example3.xml', 'row', {'foo': 'foo', 'bar': ('baz/bar', 'v')})
        >>> look(table3)
        +-------+------------+
        | 'foo' | 'bar'      |
        +=======+============+
        | 'a'   | ('1', '3') |
        +-------+------------+
        | 'b'   | '2'        |
        +-------+------------+
        | 'c'   | '2'        |
        +-------+------------+

    Note that the implementation is currently *not*
    streaming, i.e., the whole document is loaded into memory.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    .. versionadded:: 0.4

    .. versionchanged:: 0.6

    If multiple elements match a given field, all values are reported as a
    tuple.

    .. versionchanged:: 0.25

    If there is more than one element name used for row values, a tuple
    or list of paths can be provided, e.g.,
    ``fromxml('example.html', './/tr', ('th', 'td'))``.

    """

    source = read_source_from_arg(source)
    return XmlView(source, *args, **kwargs)
Beispiel #27
0
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.file1.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.file1.json', header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' |   1 |
        +-----+-----+
        | 'b' |   2 |
        +-----+-----+
        | 'c' |   2 |
        +-----+-----+

    Setting argument `lines` to `True` will enable to
    infer the document as a JSON lines document. For more details about JSON lines
    please visit https://jsonlines.org/.

        >>> import petl as etl
        >>> data_with_jlines = '''{"name": "Gilbert", "wins": [["straight", "7S"], ["one pair", "10H"]]}
        ... {"name": "Alexa", "wins": [["two pair", "4S"], ["two pair", "9S"]]}
        ... {"name": "May", "wins": []}
        ... {"name": "Deloise", "wins": [["three of a kind", "5S"]]}'''
        ...
        >>> with open('example.file2.json', 'w') as f:
        ...     f.write(data_with_jlines)
        ...
        223
        >>> table2 = etl.fromjson('example.file2.json', lines=True)
        >>> table2
        +-----------+-------------------------------------------+
        | name      | wins                                      |
        +===========+===========================================+
        | 'Gilbert' | [['straight', '7S'], ['one pair', '10H']] |
        +-----------+-------------------------------------------+
        | 'Alexa'   | [['two pair', '4S'], ['two pair', '9S']]  |
        +-----------+-------------------------------------------+
        | 'May'     | []                                        |
        +-----------+-------------------------------------------+
        | 'Deloise' | [['three of a kind', '5S']]               |
        +-----------+-------------------------------------------+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`petl.io.json.fromdicts`.

    .. versionchanged:: 1.1.0

    If no `header` is specified, fields will be discovered by sampling keys
    from the first `sample` objects in `source`. The header will be
    constructed from keys in the order discovered. Note that this
    ordering may not be stable, and therefore it may be advisable to specify
    an explicit `header` or to use another function like
    :func:`petl.transform.headers.sortheader` on the resulting table to
    guarantee stability.

    """

    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
Beispiel #28
0
def fromavro(source, limit=None, skip=0, **avro_args):
    """Extract a table from the records of a avro file.

    The `source` argument (string or file-like or fastavro.reader) can either
    be  the path of the file, a file-like input stream or a instance from
    fastavro.reader.

    The `limit` and `skip` arguments can be used to limit the range of rows 
    to extract.

    The `sample` argument (int, optional) defines how many rows are inspected
    for discovering the field types and building a schema for the avro file 
    when the `schema` argument is not passed.

    The rows fields read from file can have scalar values like int, string,
    float, datetime, date and decimal but can also have compound types like 
    enum, :ref:`array <array_schema>`, map, union and record. 
    The fields types can also have recursive structures defined 
    in :ref:`complex schemas <complex_schema>`.

    Also types with :ref:`logical types <logical_schema>` types are read and 
    translated to coresponding python types: long timestamp-millis and 
    long timestamp-micros: datetime.datetime, int date: datetime.date, 
    bytes decimal and fixed decimal: Decimal, int time-millis and 
    long time-micros: datetime.time.

    Example usage for reading files::

        >>> # set up a Avro file to demonstrate with
        ...
        >>> schema1 = {
        ...     'doc': 'Some people records.',
        ...     'name': 'People',
        ...     'namespace': 'test',
        ...     'type': 'record',
        ...     'fields': [
        ...         {'name': 'name', 'type': 'string'},
        ...         {'name': 'friends', 'type': 'int'},
        ...         {'name': 'age', 'type': 'int'},
        ...     ]
        ... }
        ...
        >>> records1 = [
        ...     {'name': 'Bob', 'friends': 42, 'age': 33},
        ...     {'name': 'Jim', 'friends': 13, 'age': 69},
        ...     {'name': 'Joe', 'friends': 86, 'age': 17},
        ...     {'name': 'Ted', 'friends': 23, 'age': 51}
        ... ]
        ...
        >>> import fastavro
        >>> parsed_schema1 = fastavro.parse_schema(schema1)
        >>> with open('example-file-to-read.avro', 'wb') as f1:
        ...     fastavro.writer(f1, parsed_schema1, records1)
        ...
        >>> # now demonstrate the use of fromavro()
        >>> import petl as etl
        >>> tbl1 = etl.fromavro('example-file-to-read.avro')
        >>> tbl1
        +-------+---------+-----+
        | name  | friends | age |
        +=======+=========+=====+
        | 'Bob' |      42 |  33 |
        +-------+---------+-----+
        | 'Jim' |      13 |  69 |
        +-------+---------+-----+
        | 'Joe' |      86 |  17 |
        +-------+---------+-----+
        | 'Ted' |      23 |  51 |
        +-------+---------+-----+

    .. versionadded:: 1.4.0

    """

    source2 = read_source_from_arg(source)
    return AvroView(source=source2, limit=limit, skip=skip, **avro_args)
Beispiel #29
0
def fromxml(source, *args, **kwargs):
    """
    Access data in an XML file. E.g.::

        >>> from petl import fromxml, look
        >>> data = \"""<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>\"""
        >>> with open('example1.xml', 'w') as f:
        ...     f.write(data)
        ...     f.close()
        ...
        >>> table1 = fromxml('example1.xml', 'tr', 'td')
        >>> look(table1)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    If the data values are stored in an attribute, provide the attribute name
    as an extra positional argument, e.g.:

        >>> data = \"""<table>
        ...     <tr>
        ...         <td v='foo'/><td v='bar'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='a'/><td v='1'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='b'/><td v='2'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='c'/><td v='2'/>
        ...     </tr>
        ... </table>\"""
        >>> with open('example2.xml', 'w') as f:
        ...     f.write(data)
        ...     f.close()
        ...
        >>> table2 = fromxml('example2.xml', 'tr', 'td', 'v')
        >>> look(table2)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    Data values can also be extracted by providing a mapping of field names
    to element paths, e.g.::

        >>> data = \"""<table>
        ...     <row>
        ...         <foo>a</foo><baz><bar v='1'/><bar v='3'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>b</foo><baz><bar v='2'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>c</foo><baz><bar v='2'/></baz>
        ...     </row>
        ... </table>\"""
        >>> with open('example3.xml', 'w') as f:
        ...     f.write(data)
        ...     f.close()
        ...
        >>> table3 = fromxml('example3.xml', 'row', {'foo': 'foo', 'bar': ('baz/bar', 'v')})
        >>> look(table3)
        +-------+------------+
        | 'foo' | 'bar'      |
        +=======+============+
        | 'a'   | ('1', '3') |
        +-------+------------+
        | 'b'   | '2'        |
        +-------+------------+
        | 'c'   | '2'        |
        +-------+------------+

    Note that the implementation is currently *not*
    streaming, i.e., the whole document is loaded into memory.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    .. versionadded:: 0.4

    .. versionchanged:: 0.6

    If multiple elements match a given field, all values are reported as a
    tuple.

    .. versionchanged:: 0.25

    If there is more than one element name used for row values, a tuple
    or list of paths can be provided, e.g.,
    ``fromxml('example.html', './/tr', ('th', 'td'))``.

    """

    source = read_source_from_arg(source)
    return XmlView(source, *args, **kwargs)