def internal_create_csv_loader(
        filename,
        num_threads=0,
        allow_quoted_newlines=False,
        block_size=32768,
        number_only=False,
        no_header=False,
        max_level_name_length=None,
        max_levels=None,
        cat_names=None,
        text_names=None,
        num_names=None):
    """
    Build a ParaText internal C++ CSV loader and parse the CSV file in
    parallel.

    This is an internal entry point and ordinarily should not be called
    directly.

    Parameters
    ----------
    {0}

    Returns
    -------
    loader : a paratext_internal.ColBasedLoader object
        The C++ loader object, with the parsed CSV embedded in the
        parallel worker's scratch space.
    """
    # NOTE(review): the "{0}" placeholder in the docstring is presumably
    # filled in via str.format by code outside this view -- keep it intact
    # and do not add other braces to the docstring. TODO confirm.
    # NOTE(review): a later definition with the same name in this file
    # rebinds this name; confirm whether this variant is still needed.
    # NOTE: block_size is accepted but not referenced in this function.
    csv_loader = pti.ColBasedLoader()
    parse_params = pti.ParseParams()

    parse_params.allow_quoted_newlines = allow_quoted_newlines
    # A non-positive thread count means "choose automatically": use the
    # reported core count, but never fewer than four threads.
    parse_params.num_threads = (
        num_threads if num_threads > 0
        else int(max(pti.get_num_cores(), 4))
    )
    parse_params.number_only = number_only
    parse_params.no_header = no_header

    # The level limits are only written when the caller supplied them, so
    # the ParseParams object otherwise keeps whatever it started with.
    if max_levels is not None:
        parse_params.max_levels = max_levels
    if max_level_name_length is not None:
        parse_params.max_level_name_length = max_level_name_length

    running_py2 = six.PY2

    def as_loader_name(column_name):
        # On Python 2 the column name is encoded to UTF-8 before being
        # handed to the loader; on Python 3 it is passed through as-is.
        if running_py2:
            return column_name.encode("utf-8")
        return column_name

    # Column-type overrides, applied in the order categorical, numeric, text.
    forced_semantics = (
        (cat_names, pti.CATEGORICAL),
        (num_names, pti.NUMERIC),
        (text_names, pti.TEXT),
    )
    for column_names, semantics in forced_semantics:
        if column_names is None:
            continue
        for column_name in column_names:
            csv_loader.force_semantics(as_loader_name(column_name), semantics)

    csv_loader.load(_make_posix_filename(filename), parse_params)
    return csv_loader
def internal_create_csv_loader(
        filename,
        num_threads=0,
        allow_quoted_newlines=False,
        block_size=32768,
        number_only=False,
        no_header=False,
        max_level_name_length=None,
        max_levels=None,
        cat_names=None,
        text_names=None,
        num_names=None,
        in_encoding=None,
        out_encoding=None,
        convert_null_to_space=True):
    """
    Creates a ParaText internal C++ CSV reader object and reads the CSV
    file in parallel.

    This function ordinarily should not be called directly.

    Parameters
    ----------
    {0}

    Returns
    -------
    loader : a paratext_internal.ColBasedLoader object
        Returns a C++ loader object with the parsed CSV embedded in the
        parallel worker's scratch space.

    Raises
    ------
    ValueError
        If in_encoding or out_encoding is anything other than None,
        "utf-8" or "unknown".
    """
    # NOTE(review): the "{0}" placeholder in the docstring is presumably
    # filled in via str.format by code outside this view -- keep it intact
    # and do not add other braces to the docstring. TODO confirm.
    # NOTE: block_size is accepted but not referenced in this function.

    # Validate the encodings before any C++ object is created so that a bad
    # argument fails fast. The one-element tuple keeps the %-formatting
    # correct even if the caller passes a tuple by mistake.
    for encoding in (in_encoding, out_encoding):
        if encoding is not None and encoding not in ("utf-8", "unknown"):
            raise ValueError("invalid encoding: %s" % (encoding,))

    loader = pti.ColBasedLoader()
    params = pti.ParseParams()

    params.allow_quoted_newlines = allow_quoted_newlines
    # A non-positive thread count means "choose automatically": use the
    # reported core count, but never fewer than four threads.
    if num_threads > 0:
        params.num_threads = num_threads
    else:
        params.num_threads = int(max(pti.get_num_cores(), 4))
    params.number_only = number_only
    params.no_header = no_header
    params.convert_null_to_space = convert_null_to_space

    # The level limits are only written when the caller supplied them, so
    # the ParseParams object otherwise keeps whatever it started with.
    if max_levels is not None:
        params.max_levels = max_levels
    if max_level_name_length is not None:
        params.max_level_name_length = max_level_name_length

    is_py2 = six.PY2

    def encode_name(name):
        # On Python 2 the column name is encoded to UTF-8 before being
        # handed to the loader; on Python 3 it is passed through as-is.
        return name.encode("utf-8") if is_py2 else name

    # Column-type overrides, applied in the order categorical, numeric, text.
    forced_semantics = (
        (cat_names, pti.CATEGORICAL),
        (num_names, pti.NUMERIC),
        (text_names, pti.TEXT),
    )
    for names, semantics in forced_semantics:
        if names is None:
            continue
        for name in names:
            loader.force_semantics(encode_name(name), semantics)

    # Only "utf-8" changes the loader; None and "unknown" leave whatever
    # encoding setting the loader already has.
    if in_encoding == "utf-8":
        loader.set_in_encoding(pti.UNICODE_UTF8)
    if out_encoding == "utf-8":
        loader.set_out_encoding(pti.UNICODE_UTF8)

    loader.load(_make_posix_filename(filename), params)
    return loader