Beispiel #1
0
def _write_toavro(table,
                  target,
                  mode,
                  schema,
                  sample,
                  codec='deflate',
                  compression_level=None,
                  **avro_args):
    if table is None or len(table) <= 0:
        return
    # build a schema when not defined by user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)

    # fastavro expects a iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    with target.open(mode) as target_file:
        # delay the import of fastavro for not breaking when unused
        import fastavro
        parsed_schema = fastavro.parse_schema(schema)
        # this could raise a error when any value is not of supported tupe
        fastavro.writer(fo=target_file,
                        schema=parsed_schema,
                        records=rows,
                        codec=codec,
                        codec_compression_level=compression_level,
                        **avro_args)
Beispiel #2
0
def appendtextindex(table,
                    index_or_dirname,
                    indexname=None,
                    merge=True,
                    optimize=False):
    """
    Load all rows from `table` into a Whoosh index, adding them to any existing
    data in the index.

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    Raises `ArgumentError` if `index_or_dirname` is neither a string nor a
    `whoosh.index.Index`. If adding or committing documents raises an
    `Exception`, the writer is cancelled and the exception is re-raised. An
    index opened here from a directory path is always closed before
    returning; an index passed in by the caller is left open.

    """
    import whoosh.index

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname,
                                      indexname=indexname,
                                      readonly=False)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    try:
        # create the writer inside the outer try so that an index opened
        # above is still closed when the writer cannot be obtained
        writer = index.writer()
        try:
            for d in dicts(table):
                writer.add_document(**d)
            writer.commit(merge=merge, optimize=optimize)
        except Exception:
            # abandon the pending write before propagating the error
            writer.cancel()
            raise

    finally:
        if needs_closing:
            index.close()
Beispiel #3
0
def appendtextindex(table, index_or_dirname, indexname=None, merge=True,
                    optimize=False):
    """
    Load all rows from `table` into a Whoosh index, adding them to any existing
    data in the index.

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    Raises `ArgumentError` if `index_or_dirname` is neither a string nor a
    `whoosh.index.Index`. If adding or committing documents raises an
    `Exception`, the writer is cancelled and the exception is re-raised. An
    index opened here from a directory path is always closed before
    returning; an index passed in by the caller is left open.

    """
    import whoosh.index

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname, indexname=indexname,
                                      readonly=False)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        # the writer is obtained under the same finally that closes the
        # index, so a failure here no longer leaks an index we opened
        writer = index.writer()
        try:
            for d in dicts(table):
                writer.add_document(**d)
            writer.commit(merge=merge, optimize=optimize)
        except Exception:
            # abandon the pending write before propagating the error
            writer.cancel()
            raise

    finally:
        if needs_closing:
            index.close()
Beispiel #4
0
def _write_toavro(table, target, mode, schema, sample,
                  codec='deflate', compression_level=None, **avro_args):
    if table is None:
        return
    # build a schema when not defined by user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)
    # fastavro expects a iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    target2 = write_source_from_arg(target, mode=mode)
    with target2.open(mode) as target_file:
        # delay the import of fastavro for not breaking when unused
        from fastavro import parse_schema
        from fastavro.write import Writer

        parsed_schema = parse_schema(schema)
        writer = Writer(fo=target_file,
                        schema=parsed_schema,
                        codec=codec,
                        compression_level=compression_level,
                        **avro_args)
        num = 1
        for record in rows:
            try:
                writer.write(record)
                num = num + 1
            except ValueError as verr:
                vmsg = _get_error_details(target, num, verr, record, schema)
                _raise_error(ValueError, vmsg)
            except TypeError as terr:
                tmsg = _get_error_details(target, num, terr, record, schema)
                _raise_error(TypeError, tmsg)
        # finish writing
        writer.flush()
Beispiel #5
0
 def _create_avro_example(test_schema, test_table):
     """Write `test_table` to a new temporary Avro file and return its path."""
     avro_schema = fastavro.parse_schema(test_schema)
     records = dicts(test_table)
     # delete=False: the file is not removed when the handle is closed
     with NamedTemporaryFile(delete=False, mode='wb') as avro_file:
         fastavro.writer(avro_file, avro_schema, records)
     return avro_file.name
Beispiel #6
0
def test_dicts_shortrows():
    """A row shorter than the header is expected to yield None for 'bar'."""
    short_table = (('foo', 'bar'), ('a', 1), ('b', ))
    expected = (
        {'foo': 'a', 'bar': 1},
        {'foo': 'b', 'bar': None},
    )
    ieq(expected, dicts(short_table))
Beispiel #7
0
def test_dicts():
    """Each data row is expected back as a dict keyed by the header fields."""
    header = ('foo', 'bar')
    table = (header, ('a', 1), ('b', 2))
    expected = (
        {'foo': 'a', 'bar': 1},
        {'foo': 'b', 'bar': 2},
    )
    result = dicts(table)
    ieq(expected, result)
Beispiel #8
0
def test_dicts_shortrows():
    """Check `dicts` on a table whose last row lacks a value for 'bar'."""
    header = ('foo', 'bar')
    full_row = ('a', 1)
    short_row = ('b',)
    table = (header, full_row, short_row)

    # the missing 'bar' value is expected to come back as None
    first = {'foo': 'a', 'bar': 1}
    second = {'foo': 'b', 'bar': None}
    ieq((first, second), dicts(table))
Beispiel #9
0
def test_dicts():
    """Check `dicts` on a table with two complete data rows."""
    rows = (('a', 1), ('b', 2))
    table = (('foo', 'bar'),) + rows

    first = {'foo': 'a', 'bar': 1}
    second = {'foo': 'b', 'bar': 2}
    ieq((first, second), dicts(table))
Beispiel #10
0
def totextindex(table, index_or_dirname, schema=None, indexname=None,
                merge=False, optimize=False):
    """
    Load all rows from `table` into a Whoosh index. N.B., this will clear any
    existing data in the index before loading. E.g.::

        >>> import petl as etl
        >>> import datetime
        >>> import os
        >>> # here is the table we want to load into an index
        ... table = (('f0', 'f1', 'f2', 'f3', 'f4'),
        ...          ('AAA', 12, 4.3, True, datetime.datetime.now()),
        ...          ('BBB', 6, 3.4, False, datetime.datetime(1900, 1, 31)),
        ...          ('CCC', 42, 7.8, True, datetime.datetime(2100, 12, 25)))
        >>> # define a schema for the index
        ... from whoosh.fields import *
        >>> schema = Schema(f0=TEXT(stored=True),
        ...                 f1=NUMERIC(int, stored=True),
        ...                 f2=NUMERIC(float, stored=True),
        ...                 f3=BOOLEAN(stored=True),
        ...                 f4=DATETIME(stored=True))
        >>> # load index
        ... dirname = 'example.whoosh'
        >>> if not os.path.exists(dirname):
        ...     os.mkdir(dirname)
        ...
        >>> etl.totextindex(table, dirname, schema=schema)

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    schema
        Index schema to use if creating the index.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    Raises `ArgumentError` if `index_or_dirname` is neither a string nor a
    `whoosh.index.Index`. If adding or committing documents fails for any
    reason, the writer is cancelled and the error is re-raised. An index
    created here from a directory path is always closed before returning;
    an index passed in by the caller is left open.

    """
    import whoosh.index
    import whoosh.writing

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.create_in(dirname, schema,
                                       indexname=indexname)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        # create the writer inside the outer try so that an index created
        # above is still closed when the writer cannot be obtained
        writer = index.writer()
        try:
            for d in dicts(table):
                writer.add_document(**d)
            # the CLEAR merge type is what replaces the existing contents
            writer.commit(merge=merge, optimize=optimize,
                          mergetype=whoosh.writing.CLEAR)
        except BaseException:
            # deliberately broad (same reach as a bare except): cancel the
            # pending write on any failure, including interrupts
            writer.cancel()
            raise

    finally:
        if needs_closing:
            index.close()
Beispiel #11
0
def totextindex(table,
                index_or_dirname,
                schema=None,
                indexname=None,
                merge=False,
                optimize=False):
    """
    Load all rows from `table` into a Whoosh index. N.B., this will clear any
    existing data in the index before loading. E.g.::

        >>> import petl as etl
        >>> import datetime
        >>> import os
        >>> # here is the table we want to load into an index
        ... table = (('f0', 'f1', 'f2', 'f3', 'f4'),
        ...          ('AAA', 12, 4.3, True, datetime.datetime.now()),
        ...          ('BBB', 6, 3.4, False, datetime.datetime(1900, 1, 31)),
        ...          ('CCC', 42, 7.8, True, datetime.datetime(2100, 12, 25)))
        >>> # define a schema for the index
        ... from whoosh.fields import *
        >>> schema = Schema(f0=TEXT(stored=True),
        ...                 f1=NUMERIC(int, stored=True),
        ...                 f2=NUMERIC(float, stored=True),
        ...                 f3=BOOLEAN(stored=True),
        ...                 f4=DATETIME(stored=True))
        >>> # load index
        ... dirname = 'example.whoosh'
        >>> if not os.path.exists(dirname):
        ...     os.mkdir(dirname)
        ...
        >>> etl.totextindex(table, dirname, schema=schema)

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    schema
        Index schema to use if creating the index.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    Raises `ArgumentError` if `index_or_dirname` is neither a string nor a
    `whoosh.index.Index`. If adding or committing documents fails for any
    reason, the writer is cancelled and the error is re-raised. An index
    created here from a directory path is always closed before returning;
    an index passed in by the caller is left open.

    """
    import whoosh.index
    import whoosh.writing

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.create_in(dirname, schema, indexname=indexname)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    try:
        # the writer is obtained under the same finally that closes the
        # index, so a failure here no longer leaks an index we created
        writer = index.writer()
        try:
            for d in dicts(table):
                writer.add_document(**d)
            # the CLEAR merge type is what replaces the existing contents
            writer.commit(merge=merge,
                          optimize=optimize,
                          mergetype=whoosh.writing.CLEAR)
        except BaseException:
            # deliberately broad (same reach as a bare except): cancel the
            # pending write on any failure, including interrupts
            writer.cancel()
            raise

    finally:
        if needs_closing:
            index.close()