def get_csv_row_writer(stream, dialect=csv.excel, encoding="utf-8", **kwargs):
    """
    Create a csv row writer that encodes rows from unicode.

    Use the returned callable to write rows of unicode data
    to a stream, such as a file opened in write mode,
    in utf-8 (or another) encoding.

    ::

        my_row_data = [
            [u'one', u'two'],
            [u'three', u'four'],
        ]

        with open('myfile.csv', 'wt') as myfile:
            unicode_row_writer = get_csv_row_writer(myfile)
            for row in my_row_data:
                unicode_row_writer(row)
    """
    if is_py3():
        writer = csv.writer(stream, dialect=dialect, **kwargs)
        return writer.writerow
    else:
        queue = StringIO()
        writer = csv.writer(queue, dialect=dialect, **kwargs)
        encoder = codecs.getincrementalencoder(encoding)()
        return partial(_encode_write_row, stream, queue, writer, encoder)
def test_default_reader(self):
    """
    Ensure the results have the given items_func, collect_first,
    applied to them.
    """
    import karld
    from karld.run_together import distribute_multi_run_to_runners

    input_path = os.path.join(os.path.dirname(__file__),
                              "test_data",
                              "things_kinds")

    test_results = distribute_multi_run_to_runners(
        collect_first,
        in_dir=input_path,
        batch_size=10,
        filter_func=karld.io.is_file_csv)

    list_results = list(test_results)

    self.assertEqual(len(list_results[0]), 10)
    self.assertEqual(len(list_results[1]), 5)

    if is_py3():
        self.assertEqual(
            sorted(chain.from_iterable(list_results)),
            [87, 97, 99, 99, 99, 100, 105, 109, 111,
             112, 112, 114, 116, 116, 116]
        )
    else:
        self.assertEqual(
            sorted(chain.from_iterable(list_results)),
            ['W', 'a', 'c', 'c', 'c', 'd', 'i', 'm',
             'o', 'p', 'p', 'r', 't', 't', 't'],
        )
def split_file_output(name, data, out_dir, max_lines=1100,
                      buffering=FILE_BUFFER_SIZE):
    """
    Split an iterable of lines into groups and write each to a shard.

    :param name: Each shard will use this in its name.
    :type name: str
    :param data: Iterable of data to write.
    :type data: iter
    :param out_dir: Path to directory to write the shards.
    :type out_dir: str
    :param max_lines: Max number of lines per shard.
    :type max_lines: int
    :param buffering: Number of bytes to buffer files.
    :type buffering: int
    """
    batches = i_batch(max_lines, data)

    if is_py3():
        join_str = b''
    else:
        join_str = ''

    index = count()
    for group in batches:
        file_path = os.path.join(out_dir,
                                 "{0}_{1}".format(next(index), name))
        with open(file_path, 'wb', buffering=buffering) as shard_file:
            shard_file.write(join_str.join(group))
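# Hedged usage sketch for split_file_output (illustration only, not part of
# the library). The shard name "lines.txt" and the directory "shards_out"
# are made up, and the output directory is assumed to already exist.
def example_split_file_output():
    lines = ("line {0}\n".format(n).encode("utf-8") for n in range(2500))
    # With max_lines=1000 this should produce three shards, named
    # 0_lines.txt, 1_lines.txt and 2_lines.txt, inside shards_out.
    split_file_output("lines.txt", lines, "shards_out", max_lines=1000)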
def i_get_csv_data(file_name, *args, **kwargs):
    """A generator for reading a csv file.
    """
    buffering = kwargs.get('buffering', FILE_BUFFER_SIZE)
    read_file_kwargs = dict(buffering=buffering)
    if is_py3():
        read_file_kwargs.update(dict(binary=False))
        read_file_kwargs.update(dict(py3_csv_read=True))

    data = i_read_buffered_file(file_name, **read_file_kwargs)
    for row in csv_reader(data, *args, **kwargs):
        yield row
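# Hedged usage sketch for i_get_csv_data (illustration only). The path
# "things.csv" is hypothetical; rows are yielded lazily as unicode sequences.
def example_i_get_csv_data():
    for row in i_get_csv_data("things.csv"):
        print(row)  # e.g. [u'tomato', u'fruit']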
def i_read_buffered_file(file_name, buffering=FILE_BUFFER_SIZE, binary=True,
                         py3_csv_read=False, encoding='utf-8'):
    """
    Generator of the lines of a file, with buffering for speed.
    """
    kwargs = dict(buffering=buffering)
    if is_py3():
        if not binary:
            kwargs.update(dict(encoding=encoding))
        if py3_csv_read:
            kwargs.update(dict(newline=''))

    with open(file_name, 'r' + ('b' if binary else 't'), **kwargs) as stream:
        for line in stream:
            yield line
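# Hedged usage sketch for i_read_buffered_file (illustration only): counting
# the lines of a hypothetical "big_data.txt" without loading it into memory.
def example_i_read_buffered_file():
    line_count = sum(1 for _ in i_read_buffered_file("big_data.txt"))
    print(line_count)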
def merge(*iterables, **kwargs):
    """Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(\*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    >>> list(merge([[2,1],[2,3],[2,5],[2,7]], [[2,0],[2,2],[2,4],[2,8]],
    ...            [[2,5],[2,10],[2,15],[2,20]], [], [[2,25]],
    ...            key=itemgetter(-1)))
    [[2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 5], [2, 7], [2, 8], [2, 10], [2, 15], [2, 20], [2, 25]]
    """
    key = kwargs.get('key')
    _heappop, _heapreplace, _StopIteration = (heapq.heappop,
                                              heapq.heapreplace,
                                              StopIteration)
    if is_py3():
        next_method = attrgetter('__next__')
    else:
        next_method = attrgetter('next')

    h = []
    h_append = h.append
    key_is_None = key is None
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            nnext = next_method(it)
            v = nnext()
            h_append([v if key_is_None else key(v), itnum, v, nnext])
        except _StopIteration:
            pass
    heapq.heapify(h)

    while 1:
        try:
            while 1:
                # raises IndexError when h is empty
                k, itnum, v, nnext = s = h[0]
                yield v
                v = nnext()  # raises StopIteration when exhausted
                s[0] = v if key_is_None else key(v)
                s[2] = v
                _heapreplace(h, s)  # restore heap condition
        except _StopIteration:
            _heappop(h)  # remove empty iterator
        except IndexError:
            return
def test_csv_file_to_file_integration(self):
    """
    Ensure csv_file_to_file writes the csv data from the input file
    to the expected output file.
    """
    from karld.loadump import file_path_and_name
    from karld.run_together import csv_file_to_file

    out_dir = os.path.join(tempfile.gettempdir(),
                           "karld_test_csv_file_to_file")
    prefix = str(datetime.now())
    out_filename = "data_0.csv"
    input_path = os.path.join(os.path.dirname(__file__),
                              "test_data",
                              "things_kinds")

    def combiner(items):
        return items

    expected_file = os.path.join(out_dir,
                                 "{}{}".format(prefix, out_filename))

    if os.path.exists(expected_file):
        os.remove(expected_file)

    csv_file_to_file(combiner, prefix, out_dir,
                     file_path_and_name(input_path, "data_0.csv"))

    self.assertTrue(os.path.exists(expected_file))

    expected_data = (b'mushroom,fungus\ntomato,fruit\ntopaz,mineral\n'
                     b'iron,metal\ndr\xc3\xb3\xc5\xbck\xc4\x85,'
                     b'utf-8 sample\napple,fruit\ncheese,dairy\n'
                     b'peach,fruit\ncelery,vegetable\n').decode('utf-8')

    if is_py3():
        with open(expected_file, 'rt') as result_file:
            contents = result_file.read()
            self.assertEqual(expected_data, contents)
    else:
        with open(expected_file, 'r') as result_file:
            contents = result_file.read()
            self.assertEqual(expected_data.splitlines(),
                             contents.decode('utf-8').splitlines())

    if os.path.exists(expected_file):
        os.remove(expected_file)
def i_get_unicode_lines(file_name, encoding='utf-8', **kwargs):
    """
    A generator for reading a text file as unicode lines.

    :param file_name: Path to file.
    :param encoding: Encoding of the file.
    :yields: Lines of the file decoded from encoding to unicode.
    """
    buffering = kwargs.get('buffering', FILE_BUFFER_SIZE)
    read_file_kwargs = dict(buffering=buffering, encoding=encoding)

    if is_py3():
        stream = i_read_buffered_text_file(file_name, **read_file_kwargs)
        for line in stream:
            yield line
    else:
        stream = i_read_buffered_binary_file(file_name, **read_file_kwargs)
        for line in codecs.iterdecode(stream, encoding, **kwargs):
            yield line
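# Hedged usage sketch for i_get_unicode_lines (illustration only). The path
# "notes_latin1.txt" and its latin-1 encoding are assumptions; the default
# encoding is utf-8.
def example_i_get_unicode_lines():
    for line in i_get_unicode_lines("notes_latin1.txt", encoding="latin-1"):
        print(line.rstrip())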
def main():
    """
    From a source of data, shard it to json files.
    """
    import pathlib

    if karld.is_py3():
        third = chr
    else:
        third = unichr

    # Your data source
    items = ((x, x + 1, third(x + 10)) for x in range(2000))

    out_dir = pathlib.Path('shard_out_json')

    karld.io.ensure_dir(str(out_dir))

    karld.io.split_file_output_json('big_data.json', items, str(out_dir))
def csv_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """
    Generator that reads serialized unicode csv data.

    Use this if you have a stream of data in unicode
    and you want to access the rows of the data
    as sequences encoded as unicode.

    Unicode in, unicode out.

    :param unicode_csv_data: An iterable of unicode strings.
    :param dialect: csv dialect
    """
    if is_py3():
        return csv.reader(unicode_csv_data, dialect=dialect, **kwargs)
    else:
        encoded_utf8_data = imap(encode_utf8, unicode_csv_data)
        reader = csv.reader(encoded_utf8_data, dialect=dialect, **kwargs)
        return imap(map_decode_utf8_to_unicode, reader)
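# Hedged usage sketch for csv_unicode_reader (illustration only): feeding it
# an in-memory iterable of unicode csv lines rather than a file stream.
def example_csv_unicode_reader():
    unicode_lines = [u'tomato,fruit\n', u'dr\xf3\u017ck\u0105,utf-8 sample\n']
    for row in csv_unicode_reader(unicode_lines):
        print(row)  # each row is a sequence of unicode values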
def csv_reader(csv_data, dialect=csv.excel, encoding="utf-8", **kwargs):
    """
    Csv row generator that re-encodes to unicode from csv data
    with a given encoding.

    Utf-8 data in, unicode out. You may specify a different
    encoding of the incoming data.

    :param csv_data: An iterable of str of the specified encoding.
    :param dialect: csv dialect
    :param encoding: The encoding of the given data.
    """
    if is_py3():
        return csv.reader(csv_data, dialect=dialect, **kwargs)

    reader = csv.reader(
        _utf8_iter_recoder(csv_data, encoding),
        dialect=dialect,
        **kwargs
    )
    return imap(map_decode_utf8_to_unicode, reader)
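# Hedged usage sketch for csv_reader (illustration only): reading rows from
# an in-memory iterable of csv lines (utf-8 str on Python 2, text on Python 3).
def example_csv_reader():
    data = ['tomato,fruit\n', 'iron,metal\n']
    for row in csv_reader(data):
        print(row)  # e.g. [u'tomato', u'fruit']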
def write_as_csv(items, file_name, append=False,
                 line_buffer_size=None, buffering=FILE_BUFFER_SIZE,
                 get_csv_row_writer=get_csv_row_writer):
    """
    Writes out items to a csv file in groups.

    :param items: An iterable collection of collections.
    :param file_name: path to the output file.
    :param append: whether to append or overwrite the file.
    :param line_buffer_size: number of lines to write at a time.
    :param buffering: number of bytes to buffer files
    :type buffering: int
    :param get_csv_row_writer: callable that returns a csv row writer
     function, customize this for non-default options:
     `custom_writer = partial(get_csv_row_writer, delimiter="|");`
     `write_as_csv(items, 'my_out_file', get_csv_row_writer=custom_writer)`
    """
    if line_buffer_size is None:
        line_buffer_size = LINE_BUFFER_SIZE
    if append:
        mode = 'a'
    else:
        mode = 'w'
    kwargs = dict(buffering=buffering)
    if is_py3():
        mode += 't'
        kwargs.update(dict(newline=''))
    else:
        mode += 'b'

    with open(file_name, mode, **kwargs) as csv_file:
        write_row = get_csv_row_writer(csv_file)
        batches = i_batch(line_buffer_size, items)
        for batch in batches:
            for row in batch:
                write_row(row)
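# Hedged usage sketch for write_as_csv (illustration only), following the
# docstring's customization hint. The output file name "things_pipe.csv" is
# made up, and a pipe-delimited row writer is swapped in via functools.partial.
def example_write_as_csv():
    from functools import partial
    items = [(u'tomato', u'fruit'), (u'iron', u'metal')]
    pipe_writer = partial(get_csv_row_writer, delimiter="|")
    write_as_csv(items, "things_pipe.csv", get_csv_row_writer=pipe_writer)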
from operator import methodcaller

try:
    from itertools import imap
except ImportError:
    imap = map

import logging
import re
from collections import OrderedDict

from karld import is_py3

if is_py3():
    unicode = str

NOT_NUMBER_REG = re.compile(r'\D')

str_strip = methodcaller('strip')


def apply_conversion_map(conversion_map, entity):
    """
    Apply each conversion in the conversion map to the entity,
    returning a tuple of the results.
    """
    return tuple([conversion(entity) for key, conversion in conversion_map])
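# Hedged usage sketch for apply_conversion_map (illustration only). The
# conversion map is an iterable of (key, callable) pairs; the field names
# "name" and "age" and the entity dict are hypothetical.
def example_apply_conversion_map():
    conversion_map = (
        ("name", lambda entity: str_strip(unicode(entity["name"]))),
        ("age", lambda entity: int(NOT_NUMBER_REG.sub("", entity["age"]))),
    )
    entity = {"name": "  Ada ", "age": "36 years"}
    print(apply_conversion_map(conversion_map, entity))  # -> (u'Ada', 36)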