Example #1
def test_table_from_pandas_schema():
    # passed schema is source of truth for the columns
    import pandas as pd

    df = pd.DataFrame(
        OrderedDict([('strs', ['', 'foo', 'bar']), ('floats', [4.5, 5,
                                                               None])]))

    # with different but compatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])
    table = pa.Table.from_pandas(df, schema=schema)
    assert pa.types.is_float32(table.column('floats').type)
    assert table.schema.remove_metadata() == schema

    # with different and incompatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
    with pytest.raises((NotImplementedError, TypeError)):
        pa.Table.from_pandas(df, schema=schema)

    # schema has columns not present in data -> error
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()),
                        ('ints', pa.int64())])
    with pytest.raises(KeyError, match='ints'):
        pa.Table.from_pandas(df, schema=schema)

    # data has columns not present in schema -> ignored
    schema = pa.schema([('strs', pa.utf8())])
    table = pa.Table.from_pandas(df, schema=schema)
    assert table.num_columns == 1
    assert table.schema.remove_metadata() == schema
    assert table.column_names == ['strs']
Example #2
def _infer_output_column_type(column: pyarrow.ChunkedArray) -> ColumnType:
    if column.type == pyarrow.utf8() or (hasattr(column.type, "value_type")
                                         and column.type.value_type
                                         == pyarrow.utf8()):
        return ColumnType.Text()
    else:
        return ColumnType.Number()
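
A quick sketch of what that condition treats as text, assuming only pyarrow is installed (ColumnType comes from the surrounding codebase): plain utf8 columns qualify, and so does any type exposing a utf8 value_type, such as dictionary-encoded strings.

import pyarrow

def _looks_like_text(column: pyarrow.ChunkedArray) -> bool:
    # Same condition as _infer_output_column_type above
    return column.type == pyarrow.utf8() or (
        hasattr(column.type, "value_type")
        and column.type.value_type == pyarrow.utf8())

plain = pyarrow.chunked_array([pyarrow.array(["a", "b"])])
dict_encoded = pyarrow.chunked_array(
    [pyarrow.array(["a", "b"]).dictionary_encode()])
numbers = pyarrow.chunked_array([pyarrow.array([1, 2])])

assert _looks_like_text(plain)
assert _looks_like_text(dict_encoded)  # dictionary<int32, utf8> passes
assert not _looks_like_text(numbers)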
Example #3
def test_render_v0_add_retweet_status_screen_name():
    # Migration: what happens when we accumulate tweets and the old stored
    # table does not have a retweeted_status_screen_name column?
    # We should treat those rows as having None in that column.
    input_table = pa.table({
        "screen_name": ["TheTweepyTester", "TheTweepyTester"],
        "created_at":
        pa.array(
            [dt("2016-11-05T21:38:46Z"),
             dt("2016-11-05T21:37:13Z")],
            pa.timestamp("ns"),
        ),
        "text": ["Hello", "testing 1000 https://t.co/3vt8ITRQ3w"],
        "retweet_count": [0, 0],
        "favorite_count": [0, 0],
        "in_reply_to_screen_name":
        pa.array([None, None], pa.utf8()),
        "user_description": ["", ""],
        "source": ["Twitter Web Client", "Tweepy dev"],
        "lang": ["en", "en"],
        "id": [795017539831103489, 795017147651162112],
    })
    with _temp_parquet_file(input_table) as parquet_path:
        _assert_render(
            twitter.FetchResult(parquet_path, []),
            P(accumulate=False),
            input_table.add_column(6, "retweeted_status_screen_name",
                                   pa.array([None, None], pa.utf8())),
            [],
        )
Example #4
def test_render_v0_truncate_fetch_results():
    all_rows = pa.table({
        "screen_name": ["TheTweepyTester", "TheTweepyTester"],
        "created_at":
        pa.array(
            [dt("2016-11-05T21:38:46Z"),
             dt("2016-11-05T21:37:13Z")],
            pa.timestamp("ns"),
        ),
        "text": ["Hello", "testing 1000 https://t.co/3vt8ITRQ3w"],
        "retweet_count": [0, 0],
        "favorite_count": [0, 0],
        "in_reply_to_screen_name":
        pa.array([None, None], pa.utf8()),
        "retweeted_status_screen_name":
        pa.array([None, None], pa.utf8()),
        "user_description": ["", ""],
        "source": ["Twitter Web Client", "Tweepy dev"],
        "lang": ["en", "en"],
        "id": [795017539831103489, 795017147651162112],
    })
    with _temp_parquet_file(all_rows) as parquet_path:
        _assert_render(
            twitter.FetchResult(parquet_path, []),
            P(accumulate=False),
            all_rows.slice(0, 1),
            [],
        )
Example #5
def test_render_v0_recover_after_bug_160258591():
    # https://www.pivotaltracker.com/story/show/160258591
    # 'id', 'retweet_count' and 'favorite_count' had the wrong type after
    # accumulating an empty table. Now the bad data is in our database;
    # let's convert back to the types we want.
    input_table = pa.table({
        "screen_name": ["TheTweepyTester", "TheTweepyTester"],
        "created_at":
        pa.array(
            [dt("2016-11-05T21:38:46Z"),
             dt("2016-11-05T21:37:13Z")],
            pa.timestamp("ns"),
        ),
        "text": ["Hello", "testing 1000 https://t.co/3vt8ITRQ3w"],
        "retweet_count": ["0", "0"],
        "favorite_count": ["0", "0"],
        "in_reply_to_screen_name":
        pa.array([None, None], pa.utf8()),
        "retweeted_status_screen_name":
        pa.array([None, None], pa.utf8()),
        "user_description": ["", ""],
        "source": ["Twitter Web Client", "Tweepy dev"],
        "lang": ["en", "en"],
        "id": ["795017539831103489", "795017147651162112"],
    })
    with _temp_parquet_file(input_table) as parquet_path:
        _assert_render(
            twitter.FetchResult(parquet_path, []),
            P(accumulate=False),
            (input_table.set_column(3, "retweet_count", pa.array([
                0, 0
            ])).set_column(4, "favorite_count", pa.array([0, 0])).set_column(
                10, "id", pa.array([795017539831103489, 795017147651162112]))),
            [],
        )
Example #6
def test_table_from_pydict_arrow_arrays(data, klass):
    data = OrderedDict([('strs', klass(data[0])), ('floats', klass(data[1]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])

    # With arrays as values
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With explicit (matching) schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # with different but compatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])
    table = pa.Table.from_pydict(data, schema=schema)
    assert pa.types.is_float32(table.column('floats').type)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # with different and incompatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
    with pytest.raises((NotImplementedError, TypeError)):
        pa.Table.from_pydict(data, schema=schema)
Example #7
def test_from_arrays_schema(data, klass):
    data = [klass(data[0]), klass(data[1])]
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])

    table = pa.Table.from_arrays(data, schema=schema)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # length of data and schema not matching
    schema = pa.schema([('strs', pa.utf8())])
    with pytest.raises(ValueError):
        pa.Table.from_arrays(data, schema=schema)

    # with different but compatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())])
    table = pa.Table.from_arrays(data, schema=schema)
    assert pa.types.is_float32(table.column('floats').type)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # with different and incompatible schema
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))])
    with pytest.raises((NotImplementedError, TypeError)):
        pa.Table.from_arrays(data, schema=schema)

    # Cannot pass both schema and metadata / names
    with pytest.raises(ValueError):
        pa.Table.from_arrays(data, schema=schema, names=['strs', 'floats'])

    with pytest.raises(ValueError):
        pa.Table.from_arrays(data, schema=schema, metadata={b'foo': b'bar'})
Example #8
    def test_column_types_dict(self):
        # Ask for dict-encoded column types in ConvertOptions
        column_types = [('a', pa.dictionary(pa.int32(), pa.utf8())),
                        ('b', pa.dictionary(pa.int32(), pa.int64())),
                        ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
                        ('d', pa.dictionary(pa.int32(), pa.large_utf8()))]

        opts = ConvertOptions(column_types=dict(column_types))
        rows = (b"a,b,c,d\n"
                b"abc,123456,1.0,zz\n"
                b"defg,123456,0.5,xx\n"
                b"abc,N/A,1.0,xx\n")
        table = self.read_bytes(rows, convert_options=opts)

        schema = pa.schema(column_types)
        expected = {
            'a': ["abc", "defg", "abc"],
            'b': [123456, 123456, None],
            'c': [Decimal("1.00"),
                  Decimal("0.50"),
                  Decimal("1.00")],
            'd': ["zz", "xx", "xx"],
        }
        assert table.schema == schema
        assert table.to_pydict() == expected

        # Unsupported index type
        column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))

        opts = ConvertOptions(column_types=dict(column_types))
        with pytest.raises(NotImplementedError):
            table = self.read_bytes(rows, convert_options=opts)
Example #9
def test_render_undefined_language_is_null():
    # https://blog.twitter.com/developer/en_us/a/2013/introducing-new-metadata-for-tweets.html
    with _temp_tarfile([
            lambda: _temp_json_path_lz4(
                "1088215462867959800.json.lz4",
                Path("tests/files/1_1_one_undefined_lang.json"),
                {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
            )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at":
                pa.array([dt("Wed Jan 23 23:22:39 +0000 2019")],
                         pa.timestamp("ns")),
                "text": ["🤖 https://t.co/FOhOfZT9MZ"],
                "retweet_count": [0],
                "favorite_count": [1],
                "in_reply_to_screen_name":
                pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name":
                pa.nulls(1, pa.utf8()),
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                # "lang" is the key data point we're testing
                "lang":
                pa.nulls(1, pa.utf8()),
                "id": [1088215462867959800],
            }),
            [],
        )
Example #10
def test_large_list_type():
    ty = pa.large_list(pa.utf8())
    assert isinstance(ty, pa.LargeListType)
    assert ty.value_type == pa.utf8()

    with pytest.raises(TypeError):
        pa.large_list(None)
Example #11
def test_large_list_type():
    ty = pa.large_list(pa.utf8())
    assert isinstance(ty, pa.LargeListType)
    assert ty.value_type == pa.utf8()
    assert ty.value_field == pa.field("item", pa.utf8(), nullable=True)

    with pytest.raises(TypeError):
        pa.large_list(None)
Example #12
def test_map_type():
    ty = pa.map_(pa.utf8(), pa.int32())
    assert isinstance(ty, pa.MapType)
    assert ty.key_type == pa.utf8()
    assert ty.item_type == pa.int32()

    with pytest.raises(TypeError):
        pa.map_(None)
    with pytest.raises(TypeError):
        pa.map_(pa.int32(), None)
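
For reference, a hedged sketch of constructing a MapArray with that type; this assumes a pyarrow version recent enough for pa.array to accept lists of (key, value) tuples.

import pyarrow as pa

ty = pa.map_(pa.utf8(), pa.int32())
arr = pa.array([[("a", 1), ("b", 2)], []], type=ty)
assert arr.type == ty
assert arr.to_pylist() == [[("a", 1), ("b", 2)], []]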
Example #13
def test_arrow():
    a = pl.Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Categorical
    assert (pl.from_arrow(
        pa.array([["foo"], ["foo", "bar"]],
                 pa.list_(pa.utf8()))).dtype == pl.List)
Example #14
    def test_has_header_when_n_rows_is_1(self):
        with _temp_csv("A,B") as path:
            assert_csv_result_equals(
                _internal_parse_csv(path, has_header=True),
                ParseCsvResult(
                    pa.table({
                        "A": pa.array([], pa.utf8()),
                        "B": pa.array([], pa.utf8())
                    }),
                    [],
                ),
            )
Example #15
def reencode_dictionary_array(array: pa.Array) -> pa.Array:
    if len(array.indices) <= len(array.dictionary):
        # Groupby often reduces the number of values considerably. Let's shy
        # away from dictionary when it gives us literally nothing.
        return array.cast(pa.utf8())

    used = np.zeros(len(array.dictionary), np.bool_)
    used[array.indices] = True
    if np.all(used):
        return array  # no edit

    return array.cast(pa.utf8()).dictionary_encode()  # TODO optimize
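
A small driver for the helper above, assuming it is in scope along with its numpy and pyarrow imports; it exercises all three return paths.

import pyarrow as pa

# Dictionary no smaller than the data: decode to plain utf8
small = pa.array(["a", "b"]).dictionary_encode()
assert reencode_dictionary_array(small).type == pa.utf8()

# Every dictionary value still used: returned unchanged
full = pa.array(["a", "a", "b"]).dictionary_encode()
assert reencode_dictionary_array(full) is full

# Unused dictionary values (e.g. after a slice): re-encode smaller
sliced = pa.array(["a", "a", "a", "b", "c"]).dictionary_encode().slice(0, 4)
assert len(reencode_dictionary_array(sliced).dictionary) == 2  # "c" dropped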
Example #16
def test_map_type():
    ty = pa.map_(pa.utf8(), pa.int32())
    assert isinstance(ty, pa.MapType)
    assert ty.key_type == pa.utf8()
    assert ty.key_field == pa.field("key", pa.utf8(), nullable=False)
    assert ty.item_type == pa.int32()
    assert ty.item_field == pa.field("value", pa.int32(), nullable=True)

    with pytest.raises(TypeError):
        pa.map_(None)
    with pytest.raises(TypeError):
        pa.map_(pa.int32(), None)
    with pytest.raises(TypeError):
        pa.map_(pa.field("name", pa.string(), nullable=True), pa.int64())
Example #17
def test_is_map():
    m = pa.map_(pa.utf8(), pa.int32())

    assert types.is_map(m)
    assert not types.is_map(pa.int32())

    fields = pa.map_(pa.field('key_name', pa.utf8(), nullable=False),
                     pa.field('value_name', pa.int32()))
    assert types.is_map(fields)

    entries_type = pa.struct([pa.field('key', pa.int8()),
                              pa.field('value', pa.int8())])
    list_type = pa.list_(entries_type)
    assert not types.is_map(list_type)
Example #18
def test_view():
    # ARROW-5992
    arr = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.binary())

    assert arr.view(pa.binary()).equals(expected)
    assert arr.view('binary').equals(expected)
Example #19
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #20
def test_type_ids():
    # These ids must stay fixed: we rely on them internally when
    # parsing from Python.
    for idx, arrow_type in [
        (0, pa.null()),
        (1, pa.bool_()),
        (2, pa.uint8()),
        (3, pa.int8()),
        (4, pa.uint16()),
        (5, pa.int16()),
        (6, pa.uint32()),
        (7, pa.int32()),
        (8, pa.uint64()),
        (9, pa.int64()),
        (10, pa.float16()),
        (11, pa.float32()),
        (12, pa.float64()),
        (13, pa.string()),
        (13, pa.utf8()),
        (14, pa.binary()),
        (16, pa.date32()),
        (17, pa.date64()),
        (18, pa.timestamp("us")),
        (19, pa.time32("s")),
        (20, pa.time64("us")),
        (23, pa.decimal128(8, 1)),
        (34, pa.large_utf8()),
        (35, pa.large_binary()),
    ]:
        assert idx == arrow_type.id
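
The duplicate id 13 above is deliberate: pa.utf8() is an alias for pa.string(), so both construct the same DataType, and the large variants are aliases as well.

import pyarrow as pa

assert pa.utf8() == pa.string()
assert pa.utf8().id == pa.string().id == 13
assert pa.large_utf8() == pa.large_string()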
Example #21
def test_statistics_convert_logical_types(tempdir):
    # ARROW-5166, ARROW-4139

    # (min, max, type)
    cases = [
        (10, 11164359321221007157, pa.uint64()), (10, 4294967295, pa.uint32()),
        ("ähnlich", "öffentlich", pa.utf8()),
        (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0,
                                                       1000), pa.time32('ms')),
        (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0,
                                                       1000), pa.time64('us')),
        (datetime.datetime(2019, 6, 24, 0, 0, 0,
                           1000), datetime.datetime(2019, 6, 25, 0, 0, 0,
                                                    1000), pa.timestamp('ms')),
        (datetime.datetime(2019, 6, 24, 0, 0, 0,
                           1000), datetime.datetime(2019, 6, 25, 0, 0, 0,
                                                    1000), pa.timestamp('us'))
    ]

    for i, (min_val, max_val, typ) in enumerate(cases):
        t = pa.Table.from_arrays([pa.array([min_val, max_val], type=typ)],
                                 ['col'])
        path = str(tempdir / ('example{}.parquet'.format(i)))
        pq.write_table(t, path, version='2.0')
        pf = pq.ParquetFile(path)
        stats = pf.metadata.row_group(0).column(0).statistics
        assert stats.min == min_val
        assert stats.max == max_val
Example #22
def coerce_arrow(array: "pa.Array") -> "pa.Array":
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    # simplest solution is to cast to (large)-string arrays
    # this requires a copy and is expensive
    elif isinstance(array, pa.DictionaryArray):
        if array.dictionary.type == pa.string():
            array = pa.compute.cast(pa.compute.cast(array, pa.utf8()),
                                    pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        array = array.combine_chunks()
    return array
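
A brief usage sketch for coerce_arrow above, assuming pyarrow.compute is importable (the helper calls pa.compute.cast): timezone-aware timestamps come back as date64, and utf8 dictionaries as large_utf8.

import pyarrow as pa
import pyarrow.compute  # noqa: F401 -- coerce_arrow uses pa.compute.cast

ts = pa.array([1_600_000_000_000], pa.timestamp("ms", tz="UTC"))
assert coerce_arrow(ts).type == pa.date64()  # emits the TZ warning

dicts = pa.array(["x", "y", "x"]).dictionary_encode()
assert coerce_arrow(dicts).type == pa.large_utf8()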
Example #23
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #24
def test_header_rows_convert_to_str():
    workbook = xl.Workbook()
    sheet = workbook.add_sheet("X")
    sheet.write(
        0, 0, datetime.date(2020, 1, 25), xl.easyxf(num_format_str="dd-mmm-yyyy")
    )
    sheet.write(0, 1, 123.4213)
    sheet.write(0, 2, 123.4213, xl.easyxf(num_format_str="#.00"))
    # Leave D1 blank
    # It'd be nice to set E1="", but xlwt treats "" as blank
    sheet.write(1, 0, "a")
    sheet.write(1, 1, "b")
    sheet.write(1, 2, "c")
    sheet.write(1, 3, "d")
    with tempfile.NamedTemporaryFile(suffix="-headers.arrow") as header_file:
        # ignore result
        do_convert_data(workbook, header_rows="0-1", header_rows_file=header_file.name)
        with pyarrow.ipc.open_file(header_file.name) as header_reader:
            header_table = header_reader.read_all()
    assert_table_equals(
        header_table,
        pyarrow.table(
            {
                "A": ["25-Jan-2020"],
                "B": ["123.4213"],
                "C": ["123.42"],
                "D": pyarrow.array([None], pyarrow.utf8()),
            }
        ),
    )
Example #25
def _query_conversations(db: sqlite3.Connection) -> pa.Table:
    with contextlib.closing(db.cursor()) as cursor:
        cursor.execute(CONVERSATIONS_SQL)
        table1 = _cursor_to_table(cursor)

    # Extract messages in Python, not in a SQLite UDF, so it's easy to debug
    # the query as described in the README.
    last_message_pattern = re.compile("(?:.*\n\u2063)*(.*)", re.DOTALL)

    def extract_last_message(messages_str: str) -> str:
        r"""Omit all but the final message from a conversation.

        Check doesn't use a JSON Array to delimit separate message texts.
        Instead, it delimits them by '\n\u2063'.
        """
        if messages_str is None:
            return None

        return last_message_pattern.match(messages_str).group(1)

    user_messages_list = table1["user_messages"].to_pylist()
    last_user_message_list = [
        extract_last_message(m) for m in user_messages_list
    ]
    return table1.add_column(
        table1.column_names.index("user_messages") + 1,
        "last_user_message",
        pa.array(last_user_message_list, pa.utf8()),
    )
Example #26
    def _convert_to_parquet(self):
        import pyarrow as pa
        final_schema = []
        PRIMITIVE = {
            'string': pa.utf8(),
            'int': pa.int32(),
            'long': pa.int64(),
            'boolean': pa.bool_(),
            'double': pa.float64(),
            'float': pa.float32()
        }
        for field in self._metadata['fields']:
            name = field['name']
            writer_type = field['type']
            if not isinstance(writer_type, list):
                writer_type = [writer_type]
            comment = {}
            nullable = False
            final_type = None
            for nntype in writer_type:
                if isinstance(nntype, str):
                    if nntype == 'null':
                        nullable = True
                    else:
                        final_type = PRIMITIVE[nntype]
                elif isinstance(nntype, dict):
                    if 'logicalType' in nntype:
                        logical_type = nntype['logicalType']
                        if logical_type == 'decimal':
                            scale = nntype.get('scale', 10)
                            precision = nntype.get('precision', 38)
                            final_type = pa.decimal128(precision, scale)
                        elif logical_type == 'timestamp-millis':
                            final_type = pa.timestamp('ms')
                        elif logical_type == 'timestamp-micros':
                            final_type = pa.timestamp('us')
                            comment['timeunit'] = 'us'
                        elif logical_type == 'date':
                            final_type = pa.date32()
                        else:
                            raise NotImplementedError(
                                'logicalType {} is not implemented'.format(
                                    logical_type))
                    elif 'type' in nntype:
                        if nntype['type'] in PRIMITIVE:
                            final_type = PRIMITIVE[nntype['type']]
                            if 'connect.name' in nntype:
                                comment['connect_type'] = nntype[
                                    'connect.name']
                        else:
                            raise NotImplementedError(
                                'Type {} is not implemented'.format(nntype))
                    else:
                        raise NotImplementedError(
                            'Type {} is not implemented'.format(nntype))
                else:
                    raise NotImplementedError(
                        'Type {} is not implemented'.format(nntype))
            final_schema.append(pa.field(name, final_type, nullable, comment))
        return pa.schema(final_schema)
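
For orientation, an illustrative sketch (field names made up) of the Avro-style metadata the loop above expects in self._metadata['fields']: a union containing 'null' marks a field nullable, and dict entries carry logical or named types.

fields = [
    {'name': 'id', 'type': 'long'},                # required -> pa.int64()
    {'name': 'label', 'type': ['null', 'string']}, # nullable -> pa.utf8()
    {'name': 'amount', 'type': ['null', {
        'logicalType': 'decimal', 'precision': 18, 'scale': 2}]},
    {'name': 'created', 'type': {'logicalType': 'timestamp-millis'}},
]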
Example #27
def case_basic_required(size=1):
    int64 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    float64 = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
    string = [
        "Hello", "bbb", "aa", "", "bbb", "abc", "bbb", "bbb", "def", "aaa"
    ]
    boolean = [True, True, False, False, False, True, True, True, True, True]

    fields = [
        pa.field('int64', pa.int64(), nullable=False),
        pa.field('float64', pa.float64(), nullable=False),
        pa.field('string', pa.utf8(), nullable=False),
        pa.field('bool', pa.bool_(), nullable=False),
        pa.field('date', pa.timestamp('ms'), nullable=False),
        pa.field('uint32', pa.uint32(), nullable=False),
    ]
    schema = pa.schema(fields)

    return {
        "int64": int64 * size,
        "float64": float64 * size,
        "string": string * size,
        "bool": boolean * size,
        "date": int64 * size,
        "uint32": int64 * size,
    }, schema, f"basic_required_{size*10}.parquet"
Example #28
def test_render_retweeted_status_full_text_twitter_api_v1():
    with _temp_tarfile([
            lambda: _temp_json_path_lz4(
                "1105492514289512400.json.lz4",
                Path("tests/files/1_1_one_extended_retweet.json"),
                {"cjw:apiEndpoint": "1.1/statuses/user_timeline.json"},
            )
    ]) as tar_path:
        _assert_render(
            twitter.FetchResult(tar_path, []),
            P(accumulate=False),
            pa.table({
                "screen_name": ["workbenchdata"],
                "created_at":
                pa.array([dt("Tue Mar 12 15:35:29 +0000 2019")],
                         pa.timestamp("ns")),
                "text": [
                    # "text" is the key data point we're testing
                    "RT @JacopoOttaviani: ⚡️ I'm playing with @workbenchdata: absolutely mindblowing. It's like a fusion between ScraperWiki, OpenRefine and Datawrapper. All of it online in the cloud and for free 👉🏽 https://t.co/fleqjI1qCI https://t.co/mmWHJLDjT2 #ddj #dataviz"
                ],
                "retweet_count": [7],
                "favorite_count": [0],
                "in_reply_to_screen_name":
                pa.nulls(1, pa.utf8()),
                "retweeted_status_screen_name": ["JacopoOttaviani"],
                "user_description": [
                    "Scrape, clean and analyze data without code. Create reproducible data workflows that can be shared with others"
                ],
                "source": ["Twitter for iPhone"],
                "lang": ["en"],
                "id": [1105492514289512400],
            }),
            [],
        )
Example #29
def test_dictionary_delta(stream_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [["foo", "foo", None],
            ["foo", "bar", "foo"],  # potential delta
            ["foo", "bar"],
            ["foo", None, "bar", "quux"],  # potential delta
            ["bar", "quux"],  # replacement
            ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data]
    schema = batches[0].schema

    def write_batches():
        with stream_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            for batch in batches:
                writer.write_batch(batch)
            return writer.stats

    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 3
    assert st.num_dictionary_deltas == 0

    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 1
    assert st.num_dictionary_deltas == 2
Example #30
    def test_request_serialized_example(self):
        example = text_format.Parse(
        """
        features {
            feature { key: "x_bytes" value { bytes_list { value: ["ASa8asdf"] }}}
            feature { key: "x" value { bytes_list { value: "JLK7ljk3" }}}
            feature { key: "y" value { int64_list { value: [1, 2] }}}
        }
        """, tf.train.Example())
        
        serialized_example_remote = [example.SerializeToString()]
        record_batch_remote = pa.RecordBatch.from_arrays(
            [
                pa.array([["ASa8asdf"]], type=pa.list_(pa.binary())),
                pa.array([["JLK7ljk3"]], type=pa.list_(pa.utf8())),
                pa.array([[1, 2]], type=pa.list_(pa.int32())),
                pa.array([[4.5, 5, 5.5]], type=pa.list_(pa.float32())),
                serialized_example_remote
            ],
            ['x_bytes', 'x', 'y', 'z', _RECORDBATCH_COLUMN]
        )

        result = list(bsl_util.RecordToJSON(record_batch_remote, True))
        self.assertEqual(result, [{
            'b64': base64.b64encode(example.SerializeToString()).decode()
        }])
Example #31
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")
Example #32
def simple_dicts_table():
    dict_values = pa.array(["foo", "baz", "quux"], type=pa.utf8())
    data = [
        pa.chunked_array([
            pa.DictionaryArray.from_arrays([1, 0, None], dict_values),
            pa.DictionaryArray.from_arrays([2, 1], dict_values)]),
    ]
    return pa.Table.from_arrays(data, names=['some_dicts'])
Example #33
def test_cast_binary_to_utf8():
    binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
    utf8_arr = binary_arr.cast(pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())

    assert utf8_arr.equals(expected)

    non_utf8_values = [(u'mañana').encode('utf-16-le')]
    non_utf8_binary = pa.array(non_utf8_values)
    assert non_utf8_binary.type == pa.binary()
    with pytest.raises(ValueError):
        non_utf8_binary.cast(pa.string())

    non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]),
                                 type=pa.binary())
    # No error
    casted = non_utf8_all_null.cast(pa.string())
    assert casted.null_count == 1
Example #34
def test_table_from_pydict():
    table = pa.Table.from_pydict({})
    assert table.num_columns == 0
    assert table.num_rows == 0
    assert table.schema == pa.schema([])
    assert table.to_pydict() == {}

    # With arrays as values
    data = OrderedDict([('strs', pa.array([u'', u'foo', u'bar'])),
                        ('floats', pa.array([4.5, 5, None]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With chunked arrays as values
    data = OrderedDict([('strs', pa.chunked_array([[u''], [u'foo', u'bar']])),
                        ('floats', pa.chunked_array([[4.5], [5, None]]))])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With lists as values
    data = OrderedDict([('strs', [u'', u'foo', u'bar']),
                        ('floats', [4.5, 5, None])])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema
    assert table.to_pydict() == data

    # With metadata and inferred schema
    metadata = {b'foo': b'bar'}
    schema = schema.add_metadata(metadata)
    table = pa.Table.from_pydict(data, metadata=metadata)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # With explicit schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # Cannot pass both schema and metadata
    with pytest.raises(ValueError):
        pa.Table.from_pydict(data, schema=schema, metadata=metadata)