Code Example #1
File: unicode_io.py Project: birdsarah/karl_data
def get_csv_row_writer(stream, dialect=csv.excel, encoding="utf-8", **kwargs):
    """
    Create a csv row writer that encodes rows from unicode.

    Use the returned callable to write rows of unicode data
    to a stream, such as a file opened in write mode,
    in utf-8 (or another) encoding.

    ::

        my_row_data = [
            [u'one', u'two'],
            [u'three', u'four'],
        ]

        with open('myfile.csv', 'wt') as myfile:
            unicode_row_writer = get_csv_row_writer(myfile)
            for row in my_row_data:
                unicode_row_writer(row)
    """
    if is_py3():
        writer = csv.writer(stream, dialect=dialect, **kwargs)
        return writer.writerow

    else:
        queue = StringIO()
        writer = csv.writer(queue, dialect=dialect, **kwargs)
        encoder = codecs.getincrementalencoder(encoding)()
        return partial(_encode_write_row, stream, queue, writer, encoder)
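
For context, here is a hedged reconstruction of the _encode_write_row helper that the py2 branch binds with partial. The real helper is defined elsewhere in unicode_io.py; this sketch is hypothetical and follows the classic py2 UnicodeWriter recipe:

def _encode_write_row(stream, queue, writer, encoder, row):
    # Hypothetical sketch: serialize the row as utf-8 into the queue,
    # re-encode it with the incremental encoder, flush to the real
    # stream, then reset the queue for the next row.
    writer.writerow([unicode(cell).encode("utf-8") for cell in row])
    data = queue.getvalue().decode("utf-8")
    stream.write(encoder.encode(data))
    queue.truncate(0)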
Code Example #2
    def test_default_reader(self):
        """
        Ensure the given items_func, collect_first, is applied to the results.
        """
        import karld
        from karld.run_together import distribute_multi_run_to_runners
        input_path = os.path.join(os.path.dirname(__file__),
                                  "test_data",
                                  "things_kinds")

        test_results = distribute_multi_run_to_runners(
            collect_first,
            in_dir=input_path,
            batch_size=10,
            filter_func=karld.io.is_file_csv)

        list_results = list(test_results)

        self.assertEqual(len(list_results[0]), 10)
        self.assertEqual(len(list_results[1]), 5)

        if is_py3():
            self.assertEqual(
                sorted(chain.from_iterable(list_results)),
                [87, 97, 99, 99, 99, 100, 105,
                 109, 111, 112, 112, 114, 116, 116, 116]
            )
        else:
            self.assertEqual(
                sorted(chain.from_iterable(list_results)),
                ['W', 'a', 'c', 'c', 'c', 'd',
                 'i', 'm', 'o', 'p', 'p', 'r', 't', 't', 't'],
            )
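
The two assertion branches differ because indexing a bytes line yields an int (a byte ordinal) on py3 and a one-character str on py2. A hypothetical sketch of a collect_first-style items_func consistent with those expected values:

def collect_first(lines):
    # Hypothetical: keep the first element of each raw line; this is an
    # int on py3 bytes and a one-character str on py2.
    return [line[0] for line in lines]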
Code Example #3
File: loadump.py Project: johnwlockwood/karl_data
def split_file_output(name, data, out_dir, max_lines=1100,
                      buffering=FILE_BUFFER_SIZE):
    """
    Split an iterable of lines into groups and write each
    group to a shard.

    :param name: Each shard will use this in its name.
    :type name: str
    :param data: Iterable of data to write.
    :type data: iter
    :param out_dir: Path to directory to write the shards.
    :type out_dir: str
    :param max_lines: Max number of lines per shard.
    :type max_lines: int
    :param buffering: Number of bytes to buffer file writes.
    :type buffering: int
    """
    batches = i_batch(max_lines, data)

    if is_py3():
        join_str = b''
    else:
        join_str = ''

    index = count()
    for group in batches:
        file_path = os.path.join(out_dir,
                                 "{0}_{1}".format(next(index), name))
        with open(file_path, 'wb', buffering=buffering) as shard_file:
            shard_file.write(join_str.join(group))
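
A minimal usage sketch (path and data are illustrative): shard 5000 newline-terminated byte strings into files of at most 1100 lines, named 0_data.txt, 1_data.txt, and so on.

lines = (u"{0}\n".format(i).encode("utf-8") for i in range(5000))
split_file_output("data.txt", lines, "/tmp/shards")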
Code Example #4
File: loadump.py Project: johnwlockwood/karl_data
def i_get_csv_data(file_name, *args, **kwargs):
    """A generator for reading a csv file.
    """
    # pop so 'buffering' isn't forwarded to csv_reader/csv.reader below
    buffering = kwargs.pop('buffering', FILE_BUFFER_SIZE)
    read_file_kwargs = dict(buffering=buffering)
    if is_py3():
        read_file_kwargs.update(dict(binary=False))
        read_file_kwargs.update(dict(py3_csv_read=True))

    data = i_read_buffered_file(file_name, **read_file_kwargs)

    for row in csv_reader(data, *args, **kwargs):
        yield row
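
Illustrative usage (the file name is hypothetical): iterate the rows lazily as sequences of unicode strings.

for row in i_get_csv_data("things.csv"):
    print(row)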
Code Example #5
File: loadump.py Project: johnwlockwood/karl_data
def i_read_buffered_file(file_name, buffering=FILE_BUFFER_SIZE, binary=True,
                         py3_csv_read=False, encoding='utf-8'):
    """
    Generator of the lines of a file, with buffering for speed.
    """
    kwargs = dict(buffering=buffering)
    if is_py3():
        if not binary:
            kwargs.update(dict(encoding=encoding))
        if py3_csv_read:
            kwargs.update(dict(newline=''))

    with open(file_name, 'r' + ('b' if binary else 't'), **kwargs) as stream:
        for line in stream:
            yield line
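
Illustrative usage: the default yields raw bytes lines, while binary=False yields decoded text lines on py3; py3_csv_read=True opens the file with newline='' as the py3 csv module expects.

raw_lines = i_read_buffered_file("big.log")
text_lines = i_read_buffered_file("big.log", binary=False)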
Code Example #6
File: merger.py Project: birdsarah/karl_data
def merge(*iterables, **kwargs):
    """Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(\*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    >>> list(merge([[2,1],[2,3],[2,5],[2,7]],
    ...            [[2,0],[2,2],[2,4],[2,8]],
    ...            [[2,5],[2,10],[2,15],[2,20]],
    ...            [], [[2,25]], key=itemgetter(-1)))
    [[2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 5], [2, 7], [2, 8], [2, 10], [2, 15], [2, 20], [2, 25]]

    """
    key = kwargs.get('key')
    _heappop, _heapreplace, _StopIteration = heapq.heappop, heapq.heapreplace, StopIteration
    if is_py3():
        next_method = attrgetter('__next__')
    else:
        next_method = attrgetter('next')

    h = []
    h_append = h.append
    key_is_None = key is None
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            nnext = next_method(it)
            v = nnext()
            h_append([v if key_is_None else key(v), itnum, v, nnext])
        except _StopIteration:
            pass
    heapq.heapify(h)

    while 1:
        try:
            while 1:
                # raises IndexError when h is empty
                k, itnum, v, nnext = s = h[0]
                yield v
                v = nnext()                  # raises StopIteration when exhausted
                s[0] = v if key_is_None else key(v)
                s[2] = v
                _heapreplace(h, s)          # restore heap condition
        except _StopIteration:
            _heappop(h)                     # remove empty iterator
        except IndexError:
            return
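
A simpler usage sketch without a key function; merging is lazy, so arbitrarily long sorted streams can be combined without loading them into memory:

>>> list(merge([1, 3, 5], [2, 4, 6]))
[1, 2, 3, 4, 5, 6]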
Code Example #7
    def test_csv_file_to_file_integration(self):
        """
        Ensure csv_file_to_file writes the combined csv rows to the expected output file.
        """
        from karld.loadump import file_path_and_name
        from karld.run_together import csv_file_to_file

        out_dir = os.path.join(tempfile.gettempdir(),
                               "karld_test_csv_file_to_file")

        prefix = str(datetime.now())

        out_filename = "data_0.csv"
        input_path = os.path.join(os.path.dirname(__file__),
                                  "test_data",
                                  "things_kinds")

        def combiner(items):
            return items

        expected_file = os.path.join(out_dir,
                                     "{}{}".format(prefix, out_filename))

        if os.path.exists(expected_file):
            os.remove(expected_file)

        csv_file_to_file(combiner, prefix, out_dir,
                         file_path_and_name(input_path, "data_0.csv"))

        self.assertTrue(os.path.exists(expected_file))

        expected_data = (b'mushroom,fungus\ntomato,fruit\ntopaz,mineral\n'
                         b'iron,metal\ndr\xc3\xb3\xc5\xbck\xc4\x85,'
                         b'utf-8 sample\napple,fruit\ncheese,dairy\n'
                         b'peach,fruit\ncelery,vegetable\n'.decode('utf-8'))

        if is_py3():
            with open(expected_file, 'rt') as result_file:
                contents = result_file.read()
                self.assertEqual(expected_data, contents)
        else:
            with open(expected_file, 'r') as result_file:
                contents = result_file.read()
                self.assertEqual(expected_data.splitlines(),
                                 contents.decode('utf-8').splitlines())

        if os.path.exists(expected_file):
            os.remove(expected_file)
Code Example #8
File: loadump.py Project: johnwlockwood/karl_data
def i_get_unicode_lines(file_name, encoding='utf-8', **kwargs):
    """
    A generator for reading a text file as unicode lines.

    :param file_name: Path to file.
    :param encoding: Encoding of the file.
    :yields: Lines of the file decoded from encoding to unicode.
    """
    # pop so 'buffering' isn't forwarded to codecs.iterdecode via kwargs
    buffering = kwargs.pop('buffering', FILE_BUFFER_SIZE)
    read_file_kwargs = dict(buffering=buffering, encoding=encoding)
    if is_py3():
        stream = i_read_buffered_text_file(file_name, **read_file_kwargs)
        for line in stream:
            yield line
    else:
        stream = i_read_buffered_binary_file(file_name, **read_file_kwargs)
        for line in codecs.iterdecode(stream, encoding, **kwargs):
            yield line
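
Illustrative usage (file name hypothetical): lines come back as unicode on both py2 and py3.

for line in i_get_unicode_lines("words_utf8.txt"):
    print(line.rstrip())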
Code Example #9
def main():
    """
    From a source of data, shard it to JSON files.
    """
    import pathlib

    import karld
    import karld.io

    if karld.is_py3():
        third = chr
    else:
        third = unichr

    # Your data source
    items = ((x, x + 1, third(x + 10)) for x in range(2000))

    out_dir = pathlib.Path('shard_out_json')

    karld.io.ensure_dir(str(out_dir))

    karld.io.split_file_output_json('big_data.json', items, str(out_dir))
Code Example #10
File: unicode_io.py Project: birdsarah/karl_data
def csv_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """
    Generator that reads serialized unicode csv data.
    Use this if you have a stream of unicode data
    and you want to access the rows of the data
    as sequences of unicode strings.

    Unicode in, unicode out.

    :param unicode_csv_data: An iterable of unicode strings.
    :param dialect: csv dialect
    """
    if is_py3():
        return csv.reader(unicode_csv_data, dialect=dialect, **kwargs)
    else:
        encoded_utf8_data = imap(encode_utf8, unicode_csv_data)

        reader = csv.reader(encoded_utf8_data, dialect=dialect, **kwargs)

        return imap(map_decode_utf8_to_unicode, reader)
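
Illustrative usage (file name hypothetical): feed it an iterable of unicode strings, such as an io.open text stream.

import io

with io.open("cities.csv", encoding="utf-8") as stream:
    for row in csv_unicode_reader(stream):
        print(row)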
Code Example #11
File: unicode_io.py Project: birdsarah/karl_data
def csv_reader(csv_data, dialect=csv.excel, encoding="utf-8", **kwargs):
    """
    Csv row generator that re-encodes csv data of a
    given encoding to unicode.

    Utf-8 data in, unicode out. You may specify a different
    encoding for the incoming data.

    :param csv_data: An iterable of str of the specified encoding.
    :param dialect: csv dialect
    :param encoding: The encoding of the given data.
    """
    if is_py3():
        return csv.reader(csv_data, dialect=dialect, **kwargs)

    reader = csv.reader(
        _utf8_iter_recoder(csv_data, encoding),
        dialect=dialect, **kwargs
    )

    return imap(map_decode_utf8_to_unicode, reader)
Code Example #12
File: loadump.py Project: johnwlockwood/karl_data
def write_as_csv(items, file_name, append=False,
                 line_buffer_size=None, buffering=FILE_BUFFER_SIZE,
                 get_csv_row_writer=get_csv_row_writer):
    """
    Writes out items to a csv file in groups.

    :param items: An iterable collection of collections.
    :param file_name: path to the output file.
    :param append: whether to append or overwrite the file.
    :param line_buffer_size: number of lines to write at a time.
    :param buffering: number of bytes to buffer files
    :type buffering: int
    :param get_csv_row_writer: callable that returns a csv row writer function;
     customize this for non-default options, e.g.:
     `custom_writer = partial(get_csv_row_writer, delimiter="|")`
     `write_as_csv(items, 'my_out_file', get_csv_row_writer=custom_writer)`
    """
    if line_buffer_size is None:
        line_buffer_size = LINE_BUFFER_SIZE
    if append:
        mode = 'a'
    else:
        mode = 'w'

    kwargs = dict(buffering=buffering)
    if is_py3():
        mode += 't'
        kwargs.update(dict(newline=''))
    else:
        mode += 'b'

    with open(file_name, mode, **kwargs) as csv_file:
        write_row = get_csv_row_writer(csv_file)
        batches = i_batch(line_buffer_size, items)
        for batch in batches:
            for row in batch:
                write_row(row)
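
A minimal usage sketch echoing the docstring (file path illustrative): write rows with a pipe delimiter by customizing the row writer factory.

from functools import partial

rows = [(u"apple", u"fruit"), (u"iron", u"metal")]
pipe_writer = partial(get_csv_row_writer, delimiter="|")
write_as_csv(rows, "/tmp/out.csv", get_csv_row_writer=pipe_writer)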
Code Example #13
from operator import methodcaller

try:
    from itertools import imap
except ImportError:
    imap = map

import logging
import re
from collections import OrderedDict

from karld import is_py3


if is_py3():
    unicode = str


NOT_NUMBER_REG = re.compile(r'\D')


str_strip = methodcaller('strip')


def apply_conversion_map(conversion_map, entity):
    """
    returns tuple of conversions
    """
    return tuple([conversion(entity) for key, conversion in conversion_map])
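
Illustrative usage with a hypothetical conversion_map: each pair's callable receives the whole entity and extracts one converted field.

conversion_map = [
    ("name", lambda e: str_strip(e[0])),
    ("qty", lambda e: int(NOT_NUMBER_REG.sub("", e[1]) or 0)),
]
apply_conversion_map(conversion_map, ("  widget ", "12 pcs"))
# -> ('widget', 12)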