Example #1
0
def test_convert_from_bool():
    """Check the values produced by converting bool to int, float, bytes and str."""
    cases = [
        (int, [1, 0]),
        (float, [1., 0.]),
        (bytes, [b'True', b'False']),
        (str, ['True', 'False']),
    ]
    for target, expected in cases:
        # A fresh tube is built for every target type.
        converted = tubes.Each([True, False]).to(bool).to(target)
        assert list(converted) == expected
Example #2
0
def test_reading_nota_fileobj():
    """read_fileobj() over a plain str input is expected to raise ValueError."""
    # The second input is a plain str, not a file object.
    sources = [BytesIO(b"Mary had"), "string"]
    tube = tubes.Each(sources).read_fileobj()
    with pytest.raises(ValueError) as err_info:
        list(tube)
    assert err_info.match(r'only accepts objects with a \.read\(\)')
Example #3
0
def test_fuzz_tsv(seed):
    """
    Fuzz TsvRow column extraction against randomly generated rows.

    A random number of rows, each with a random number of tab-free cells,
    is generated from ``seed``.  The columns named by ``get_cols(32)`` are
    read back through ``TsvRow`` with a default of ``'xx'`` and compared
    cell by cell with the values that were generated.
    """
    random.seed(seed)
    # NOTE(review): single-argument randint() matches numpy.random's
    # signature rather than the stdlib's random.randint(a, b) - confirm
    # which module `random` is bound to in this file.
    n_rows = random.randint(30)
    cols_to_read = get_cols(32)
    tsv_rows = []
    expected_rows = []
    for _ in range(n_rows):
        tsv_row = []
        # 'xx' is the default handed to x.get() below, so it is what a
        # column missing from the row is expected to yield.
        expected_row = ['xx'] * len(cols_to_read)
        for col_no in range(random.randint(30)):
            # Redraw until the cell contains no tab; rows are joined with
            # b'\t' below, so a tab inside a cell would split it in two.
            data = '\t'
            while '\t' in data:
                data = rand_chars()
            if col_no in cols_to_read:
                expected_row[cols_to_read.index(col_no)] = data
            tsv_row.append(data.encode("utf8"))
        # A row without cells joins to b''; column 0 is then expected to
        # be the empty string rather than the default.
        if not tsv_row and 0 in cols_to_read:
            expected_row[cols_to_read.index(0)] = ''
        expected_rows.append(tuple(expected_row))
        tsv_rows.append(b'\t'.join(tsv_row))

    slot_tube = tubes.Each(tsv_rows).to(tubes.TsvRow).multi(
        lambda x: [x.get(c, 'xx').to(str) for c in cols_to_read])
    actual_rows = list(slot_tube)

    # Compare the row counts first so that a short result fails with a
    # clear assertion instead of an IndexError in the loop below.
    assert len(expected_rows) == len(actual_rows)

    # With a single requested column each result row is compared as a
    # bare value rather than being indexed.
    single_column = len(cols_to_read) == 1
    for expected_row, actual_row in zip(expected_rows, actual_rows):
        for col_num in range(len(cols_to_read)):
            expected = expected_row[col_num]
            actual = actual_row if single_column else actual_row[col_num]
            assert expected == actual
Example #4
0
def test_to_py_handles_refcount_iter():
    """
    Track the reference count of an object fed through
    ``Each(iter([...])).to_py()`` as the tube is built, iterated and
    torn down.

    Each sys.getrefcount() figure is one higher than the number of
    references named in the trailing comment, because the call to
    getrefcount() itself holds a temporary reference to its argument.
    """
    flag = Flag()
    a = Canary(flag)

    assert sys.getrefcount(a) == 2  # a
    tube = tubes.Each(iter([True, a])).to_py()
    assert sys.getrefcount(tube) == 2  # tube
    assert sys.getrefcount(a) == 3  # a + each_val
    it = iter(tube)
    gc.collect()
    assert sys.getrefcount(tube) == 2  # iter() doesn't keep reference to tube
    assert sys.getrefcount(a) == 3  # a + each_val
    assert next(it) is True
    val = next(it)
    assert val is a
    gc.collect()
    assert sys.getrefcount(a) == 6  # a + each_val + val + iter_cur + topy_cur
    # Dropping the iterator is expected to release two of those references.
    del it
    gc.collect()
    assert sys.getrefcount(a) == 4  # a + each_val + val
    del tube
    gc.collect()
    assert sys.getrefcount(a) == 3  # a + val
    del val
    assert sys.getrefcount(a) == 2  # a
    del a
    gc.collect()
    # presumably Canary sets the flag when it is destroyed - confirm
    # against its definition
    assert flag.is_set
Example #5
0
def test_to_py_handles_refcount_list():
    """
    Track the reference count of an object fed through
    ``Each([...]).to_py()`` as the tube is built, iterated and torn down.

    The sys.getrefcount() value is always one higher than the number of
    references named in the trailing comment, because the call to
    getrefcount() itself needs a reference.
    """
    flag = Flag()
    a = Canary(flag)

    assert sys.getrefcount(a) == 2  # a
    tube = tubes.Each([True, a]).to_py()
    assert sys.getrefcount(tube) == 2  # tube
    assert sys.getrefcount(a) == 3  # a + each_val
    it = iter(tube)
    gc.collect()
    assert sys.getrefcount(tube) == 2  # iter() doesn't keep reference to tube
    assert sys.getrefcount(a) == 3  # a + each_val
    assert next(it) is True
    val = next(it)
    assert val is a
    gc.collect()
    assert sys.getrefcount(a) == 6  # a + each_val + val + iter_cur + topy_cur
    # Dropping the iterator is expected to release two of those references.
    del it
    gc.collect()
    assert sys.getrefcount(a) == 4  # a + each_val + val
    del tube
    gc.collect()
    assert sys.getrefcount(a) == 3  # a + val
    del val
    assert sys.getrefcount(a) == 2  # a
    del a
    gc.collect()
    # presumably Canary sets the flag when it is destroyed - confirm
    # against its definition
    assert flag.is_set
Example #6
0
def test_multi_index_get_on_json_value():
    """Read indexes 0, 2 and 1 (in that order) out of parsed JSON arrays."""
    documents = ["[1,2,3]", "[8,9,10]", '["a", "b", "c"]']
    parsed = tubes.Each(documents).json()
    tube = parsed.multi(lambda row: (row.get(0), row.get(2), row.get(1)))
    expected = [
        (1, 3, 2),
        (8, 10, 9),
        ('a', 'c', 'b'),
    ]
    assert list(tube) == expected
Example #7
0
def test_mixed_types():
    """Build a pyarrow table with int64, double and string columns."""
    fruit = tubes.Each(['apple', 'banana', 'apple']).to(str)
    # Slot 0 of the enumerated tube is emitted twice: once as-is and once
    # converted to float.
    rows = fruit.enumerate().multi(
        lambda row: (row.slot(0), row.slot(0).to(float), row.slot(1)))
    table = rows.to_pyarrow(('index', 'index_double', 'val'))

    assert isinstance(table, pa.Table)
    first, second, third = (table.columns[i] for i in range(3))
    assert str(first.type) == 'int64'
    assert str(second.type) == 'double'
    assert str(third.type) == 'string'

    expected = {
        'index': {0: 0, 1: 1, 2: 2},
        'index_double': {0: 0., 1: 1., 2: 2.},
        'val': {0: 'apple', 1: 'banana', 2: 'apple'},
    }
    assert table.to_pandas().to_dict() == expected
Example #8
0
def test_reading_two_files_small_buffer():
    """Read two file objects with size=2 and check the chunks produced."""
    sources = [BytesIO(b"Mary had"), BytesIO(b'a little lamb')]
    tube = tubes.Each(sources).read_fileobj(size=2).to(str)
    # Chunks from the first source, then from the second; no chunk is
    # expected to span the two sources.
    expected = ['Ma', 'ry', ' h', 'ad']
    expected += ['a ', 'li', 'tt', 'le', ' l', 'am', 'b']
    assert list(tube) == expected
Example #9
0
def test_reading_unicode():
    """read_fileobj() over a text-mode StringIO is expected to raise ValueError."""
    # The second source is a text stream, not a binary one.
    sources = [BytesIO(b"Mary had"), StringIO("string")]
    tube = tubes.Each(sources).read_fileobj()
    with pytest.raises(ValueError) as err_info:
        list(tube)
    assert err_info.match('expects binary')
Example #10
0
def test_fuzz_random_double_to_str(seed, maker):
    """Compare float -> str conversion in tubes with Python's str() on random data."""
    numpy.random.seed(seed)
    array = maker(10240)
    actual = list(tubes.Each(array).to(float).to(str))

    expected = []
    for value in array:
        text = str(value)
        # The expected text has no zero directly after the exponent sign
        # (str() gives e.g. 'e-05', the expectation is 'e-5').
        text = text.replace('e-0', 'e-')
        text = text.replace('e+0', 'e+')
        expected.append(text)

    assert actual == expected
Example #11
0
def test_csv_escaping():
    """Check how CsvRow handles quoted, unquoted and empty-quoted fields."""
    lines = ['a"x","b",""', '"d","e,f",g']
    rows = tubes.Each(lines).to(tubes.CsvRow)
    tube = rows.multi(
        lambda row: (row.get(0, 'xx'), row.get(1, 'xx'), row.get(2, 'xx')))
    expected = [
        (b'a"x"', b'b', b''),
        (b'd', b'e,f', b'g'),
    ]
    assert list(tube) == expected
Example #12
0
def test_csv_quote_escaping():
    """Check how CsvRow handles doubled quotes, inside and outside quoted fields."""
    lines = ['"a""b","""",""""""', '"c""""d",e""f']
    rows = tubes.Each(lines).to(tubes.CsvRow)
    # Only the third column has a default; the second input line has just
    # two fields.
    tube = rows.multi(
        lambda row: (row.get(0), row.get(1), row.get(2, 'x')))
    expected = [
        (b'a"b', b'"', b'""'),
        (b'c""d', b'e""f', b'x'),
    ]
    assert list(tube) == expected
Example #13
0
def test_multi_index_get_on_pyobj():
    """Index Python lists (and None) by position 0..4 with a default of 'X'."""
    inputs = [[1], [1, 1], [1, 2, 1], [1, 3, 3, 1], None]
    tube = tubes.Each(inputs).multi(
        lambda row: tuple([row.get(idx, 'X') for idx in range(5)]))
    expected = [
        (1, 'X', 'X', 'X', 'X'),
        (1, 1, 'X', 'X', 'X'),
        (1, 2, 1, 'X', 'X'),
        (1, 3, 3, 1, 'X'),
        # The None input is expected to come back as None at index 0.
        (None, 'X', 'X', 'X', 'X'),
    ]
    assert list(tube) == expected
Example #14
0
def test_fuzz_csv(seed, do_split):
    """
    Fuzz CSV column extraction against randomly generated rows.

    A random number of rows with a random number of cells is generated
    from ``seed``.  Cells containing a quote, newline or comma are always
    passed through ``csv_escape``; other cells are escaped one time in
    three.  When ``do_split`` is true the rows are joined with newlines
    and parsed with ``.csv(headers=False, skip_empty_rows=False)``;
    otherwise each row is converted individually with ``tubes.CsvRow``.
    The columns named by ``get_cols(32)`` are then read with a default of
    ``'xx'`` and compared cell by cell with the generated values.
    """
    random.seed(seed)
    # NOTE(review): single-argument randint() matches numpy.random's
    # signature rather than the stdlib's random.randint(a, b) - confirm
    # which module `random` is bound to in this file.
    n_rows = random.randint(30)
    cols_to_read = get_cols(32)
    csv_rows = []
    expected_rows = []
    for _ in range(n_rows):
        csv_row = []
        # 'xx' is the default handed to x.get() below, so it is what a
        # column missing from the row is expected to yield.
        expected_row = ['xx'] * len(cols_to_read)
        for col_no in range(random.randint(30)):
            # Redraw until the cell does not end in '\r' (presumably so a
            # trailing '\r' is not mistaken for part of a line ending -
            # confirm).
            data = '\r'
            while data.endswith('\r'):
                data = rand_chars()
            if col_no in cols_to_read:
                expected_row[cols_to_read.index(col_no)] = data
            data = data.encode("utf8")
            must_escape = any(ch in data for ch in (b'"', b'\n', b','))
            # `or` short-circuits, so random.choice() is only drawn for
            # cells that do not strictly need escaping.
            if must_escape or random.choice([False, False, True]):
                data = csv_escape(data)
            csv_row.append(data)
        # A row without cells joins to b''; column 0 is then expected to
        # be the empty string rather than the default.
        if not csv_row and 0 in cols_to_read:
            expected_row[cols_to_read.index(0)] = ''
        expected_rows.append(tuple(expected_row))
        csv_rows.append(b",".join(csv_row))

    if do_split:
        # With no rows at all, feed the tube no input rather than a
        # single empty document.
        tube_input = [b'\n'.join(csv_rows)] if n_rows else []
        slot_tube = tubes.Each(tube_input).csv(headers=False,
                                               skip_empty_rows=False)
    else:
        slot_tube = tubes.Each(csv_rows).to(tubes.CsvRow)

    slot_tube = slot_tube.multi(
        lambda x: [x.get(c, 'xx').to(str) for c in cols_to_read])
    actual_rows = list(slot_tube)

    # Compare the row counts first so that a short result fails with a
    # clear assertion instead of an IndexError in the loop below.
    assert len(expected_rows) == len(actual_rows)

    # With a single requested column each result row is compared as a
    # bare value rather than being indexed.
    single_column = len(cols_to_read) == 1
    for expected_row, actual_row in zip(expected_rows, actual_rows):
        for col_num in range(len(cols_to_read)):
            expected = expected_row[col_num]
            actual = actual_row if single_column else actual_row[col_num]
            assert expected == actual
Example #15
0
def test_passing_json_test_suite_cases(filename):
    """Compare tubes' JSON parse of a test-case file with json.loads()."""
    test_path = path.join(TEST_CASE_DIR, filename)
    raw = read_file(test_path)
    try:
        py_version = json.loads(raw)
    except (ValueError, RecursionError):
        # The stdlib parser could not load this case, so there is no
        # reference value to compare against.
        return
    else:
        parsed = tubes.Each([test_path]).map_files().json()
        assert list(parsed)[0] == py_version
Example #16
0
def test_reading_json_with_multiple_blank_lines():
    """Blank lines filtered with skip_if(tubes.is_blank) never reach json()."""
    SAMPLE = """
[1, 2, 3]



9
"""
    lines = tubes.Each([SAMPLE]).to(bytes).split()
    non_blank = lines.skip_if(tubes.is_blank)
    values = list(non_blank.json())
    assert values == [[1, 2, 3], 9]
Example #17
0
def test_reading_json_with_blank_lines():
    """Blank lines filtered with a lambda calling is_blank() never reach json()."""
    SAMPLE = """{}
[1, 2, 3]

{"a": 2, "b": "c"}

9
"""
    lines = tubes.Each([SAMPLE]).to(bytes).split()
    non_blank = lines.skip_if(lambda line: line.is_blank())
    values = list(non_blank.json())
    expected = [{}, [1, 2, 3], {'a': 2, 'b': 'c'}, 9]
    assert values == expected
Example #18
0
def test_escaped_multi_index_get_on_json():
    """JSON string escapes are decoded when array items are read as str."""
    # Raw strings: the backslash sequences reach the JSON parser verbatim.
    documents = [
        r'["\t","\b","\u1234"]',
        r'["\"","","a"]',
        r'["x", "y\ta\bb\n", "z"]',
    ]
    items = tubes.Each(documents).json().multi(
        lambda row: (row.get(0), row.get(1), row.get(2)))
    tube = items.to(str, str, str)
    expected = [
        ('\t', '\b', '\u1234'),
        ('"', '', 'a'),
        ('x', 'y\ta\bb\n', 'z'),
    ]
    assert list(tube) == expected
Example #19
0
def test_fill_ndarray_mixed_type():
    """
    Fill a structured ndarray from an (index, bytes) tube.

    The 10-character inputs are expected back as their first 5
    characters, stored in an 'S6' field next to an int64 index.
    """
    letters = string.ascii_lowercase
    values = [letter * 10 for letter in letters]
    nd = tubes.Each(values).to(bytes).enumerate().ndarray(None, 5)

    assert nd.shape == (26, )
    expected_fields = {
        '0': (np.dtype('int64'), 0),
        '1': (np.dtype('S6'), 8),
    }
    assert dict(nd.dtype.fields) == expected_fields

    expected = [
        (idx, (letter * 5).encode('ascii'))
        for idx, letter in enumerate(letters)
    ]
    assert [tuple(record) for record in nd] == expected
Example #20
0
 def test_str():
     """Build a pyarrow table with an int64 index and a string column."""
     letters = ['a', 'b', 'c', 'd', 'e']
     tube = tubes.Each(letters).to(str).enumerate()
     table = tube.to_pyarrow(('index', 'val'))

     assert isinstance(table, pa.Table)
     index_col, val_col = table.columns[0], table.columns[1]
     assert str(index_col.type) == 'int64'
     assert str(val_col.type) == 'string'

     expected = {
         'index': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
         'val': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'},
     }
     assert table.to_pandas().to_dict() == expected
Example #21
0
def test_multi_index_pyobj_out_of_order():
    """Index Python lists in a non-sequential order, with index 0 repeated."""
    order = [4, 2, 0, 3, 0, 1]
    inputs = [[1], [1, 1], [1, 2, 1], [1, 3, 3, 1], None]
    tube = tubes.Each(inputs).multi(
        lambda row: tuple([row.get(idx, 'X') for idx in order]))
    expected = [
        ('X', 'X', 1, 'X', 1, 'X'),
        ('X', 'X', 1, 'X', 1, 1),
        ('X', 1, 1, 'X', 1, 2),
        ('X', 3, 1, 1, 1, 3),
        # The None input is expected to come back as None for both index-0 reads.
        ('X', 'X', None, 'X', None, 'X'),
    ]
    assert list(tube) == expected
Example #22
0
def tubes_version():
    """
    Run the tubes pipeline over FILES and return the results as a list.

    Lines are split on newlines, the first SKIP are dropped, the rest
    are parsed as JSON and filtered to those whose 'country_code' equals
    "GB"; the first TAKE matches are passed through make_getters.
    """
    def is_gb(row):
        # '""' is the default used when 'country_code' is absent.
        return row.get('country_code', '""').to(tubes.Utf8).equals("GB")

    lines = tubes.Each(FILES).read_files().split(b'\n').skip(SKIP)
    matches = lines.json().skip_unless(is_gb).first(TAKE)
    return list(matches.multi(make_getters))
Example #23
0
def test_recover_bad_json():
    """After a ValueError on a bad JSON document, iteration continues with the next one."""
    documents = ['[1,2]', '[', '{"a": 1}']
    it = iter(tubes.Each(documents).to(str).json())
    results = []
    while True:
        try:
            item = next(it)
        except StopIteration:
            break
        except ValueError:
            # Record the failure and keep pulling from the same iterator.
            item = 'ERR'
        results.append(item)
    assert results == [[1, 2], 'ERR', {'a': 1}]
Example #24
0
def test_csv_uneven_rows_get_many():
    """Read three columns from CSV rows of differing widths, defaulting to 'xx'."""
    lines = ['a', 'b,c', 'd,e,', 'f,g,h']
    rows = tubes.Each(lines).to(tubes.CsvRow)
    cells = rows.multi(
        lambda row: (row.get(0), row.get(1, 'xx'), row.get(2, 'xx')))
    tube = cells.to(str, str, str)
    expected = [
        ('a', 'xx', 'xx'),
        ('b', 'c', 'xx'),
        # A trailing comma gives an empty third field, not the default.
        ('d', 'e', ''),
        ('f', 'g', 'h'),
    ]
    assert list(tube) == expected
Example #25
0
def test_reading_csv_headers_different_orders():
    """Look up CSV columns by header name when two inputs order their headers differently."""
    first_doc = """a,b,c
1,2,3
4,5,6
"""
    second_doc = """c,a,b
9,7,8
12,10,11
"""
    rows = tubes.Each([first_doc, second_doc]).to(bytes).csv(headers=True)
    by_name = rows.chunk(1).multi(
        lambda row: (row.get('a'), row.get('b'), row.get('c')))
    tube = by_name.to(int, int, int)
    expected = [(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
    assert list(tube) == expected
Example #26
0
def test_recover_bad_csv():
    """Rows before an unterminated quoted field are yielded; the bad row raises ValueError."""
    # The last line opens a quote that is never closed.
    document = 'a,b\n1,2\n3,4\n"x'
    tube = tubes.Each([document]).csv().multi(
        lambda row: (row.get(0), row.get(1)))
    it = iter(tube)
    results = []
    while True:
        try:
            item = next(it)
        except StopIteration:
            break
        except ValueError:
            # Record the failure and keep pulling from the same iterator.
            item = 'ERR'
        results.append(item)
    assert results == [(b'1', b'2'), (b'3', b'4'), 'ERR']
Example #27
0
def test_tsv_uneven_rows_get_many():
    """Read three columns from TSV rows of differing widths, defaulting to 'xx'."""
    lines = ['a', 'b\tc', 'd\te\t', 'f\tg\th']
    rows = tubes.Each(lines).to(tubes.TsvRow)
    cells = rows.multi(
        lambda row: (row.get(0), row.get(1, 'xx'), row.get(2, 'xx')))
    tube = cells.to(str, str, str)
    expected = [
        ('a', 'xx', 'xx'),
        ('b', 'c', 'xx'),
        # A trailing tab gives an empty third field, not the default.
        ('d', 'e', ''),
        ('f', 'g', 'h'),
    ]
    assert list(tube) == expected
Example #28
0
def test_reading_tsv_headers_different_orders():
    """Look up TSV columns by header name when two inputs order their headers differently."""
    first_doc = """a\tb\tc
1\t2\t3
4\t5\t6
"""
    second_doc = """c\ta\tb
9\t7\t8
12\t10\t11
"""
    # The inputs are split into lines explicitly, so tsv() is called with
    # split=False.
    lines = tubes.Each([first_doc, second_doc]).to(bytes).split()
    rows = lines.tsv(headers=True, split=False)
    by_name = rows.chunk(1).multi(
        lambda row: (row.get('a'), row.get('b'), row.get('c')))
    tube = by_name.to(int, int, int)
    expected = [(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
    assert list(tube) == expected
Example #29
0
def test_fuzz_tsv(seed):
    """
    Fuzz TsvRow column extraction against rows generated by get_tsv(seed).

    The columns in ``cols_to_read`` are read with a default of ``'xx'``
    and compared cell by cell with the expected rows.
    """
    tsv_rows, expected_rows, cols_to_read = get_tsv(seed)
    slot_tube = tubes.Each(tsv_rows).to(tubes.TsvRow).multi(
        lambda x: [x.get(c, 'xx').to(str) for c in cols_to_read])
    actual_rows = list(slot_tube)

    # Compare the row counts first so that a short result fails with a
    # clear assertion instead of an IndexError in the loop below.
    assert len(expected_rows) == len(actual_rows)

    # With a single requested column each result row is compared as a
    # bare value rather than being indexed.
    single_column = len(cols_to_read) == 1
    for expected_row, actual_row in zip(expected_rows, actual_rows):
        for col_num in range(len(cols_to_read)):
            expected = expected_row[col_num]
            actual = actual_row if single_column else actual_row[col_num]
            assert expected == actual
Example #30
0
def test_recover_bad_json_with_skip():
    """Pin the current behaviour of skip(2) when a skipped JSON document is invalid."""
    documents = ['[1,2]', '[', '{"a": 1}', '12']
    it = iter(tubes.Each(documents).to(str).json().skip(2))
    results = []
    while True:
        try:
            item = next(it)
        except StopIteration:
            break
        except ValueError:
            # Record the failure and keep pulling from the same iterator.
            item = 'ERR'
        results.append(item)
    # TODO: This /should/ return {"a": 1}, 12
    # but rewinding the stack to the right place is hard
    assert results == ['ERR', 12]