Beispiel #1
0
def internal_create_csv_loader(filename,
                               num_threads=0,
                               allow_quoted_newlines=False,
                               block_size=32768,
                               number_only=False,
                               no_header=False,
                               max_level_name_length=None,
                               max_levels=None,
                               cat_names=None,
                               text_names=None,
                               num_names=None):
    """
    Build a ParaText internal C++ CSV loader and have it read the CSV
    file in parallel. Intended for internal use; callers normally should
    not invoke this function directly.

    Parameters
    ----------
    {0}

    Returns
    -------
    loader : a paratext_internal.ColBasedLoader object

         The C++ loader object, with the parsed CSV held in the
         parallel worker's scratch space.
    """
    # NOTE(review): block_size is accepted but never read in this function.
    loader = pti.ColBasedLoader()
    params = pti.ParseParams()

    params.allow_quoted_newlines = allow_quoted_newlines
    # A non-positive thread count selects an automatic value: the core
    # count reported by the extension, but never fewer than four.
    params.num_threads = (num_threads if num_threads > 0
                          else int(max(pti.get_num_cores(), 4)))
    params.number_only = number_only
    params.no_header = no_header

    # The optional limits are only assigned when the caller supplied them.
    if max_levels is not None:
        params.max_levels = max_levels
    if max_level_name_length is not None:
        params.max_level_name_length = max_level_name_length

    # On Python 2 column names are encoded to UTF-8 before being handed to
    # the extension; on Python 3 they are passed through unchanged.
    encode_names = six.PY2

    def as_native(column_name):
        return column_name.encode("utf-8") if encode_names else column_name

    def force_all(column_names, semantics):
        for column_name in column_names:
            loader.force_semantics(as_native(column_name), semantics)

    if cat_names is not None:
        force_all(cat_names, pti.CATEGORICAL)
    if num_names is not None:
        force_all(num_names, pti.NUMERIC)
    if text_names is not None:
        force_all(text_names, pti.TEXT)

    loader.load(_make_posix_filename(filename), params)
    return loader
Beispiel #2
0
def internal_create_csv_loader(filename, num_threads=0,
                               allow_quoted_newlines=False,
                               block_size=32768, number_only=False,
                               no_header=False, max_level_name_length=None,
                               max_levels=None, cat_names=None,
                               text_names=None, num_names=None,
                               in_encoding=None, out_encoding=None,
                               convert_null_to_space=True):
    """
    Creates a ParaText internal C++ CSV reader object and reads the CSV
    file in parallel. This function ordinarily should not be called directly.

    Parameters
    ----------
    {0}

    Returns
    -------
    loader : a paratext_internal.ColBasedLoader object

         Returns a C++ loader object with the parsed CSV embedded in the
         parallel worker's scratch space.

    Raises
    ------
    ValueError
         If ``in_encoding`` or ``out_encoding`` is given and is neither
         "utf-8" nor "unknown".
    """
    # NOTE(review): the docstring keeps its positional placeholder under
    # "Parameters"; presumably it is filled in elsewhere via str.format, so
    # no other braces were added to it -- confirm.

    # Validate the encodings before any native object is created so that a
    # bad argument fails fast. The message uses an explicit "%s" and a
    # one-element tuple: the previous format string had no conversion
    # specifier, which made the "%" operator itself raise TypeError instead
    # of the intended ValueError.
    for encoding in (in_encoding, out_encoding):
        if encoding is not None and encoding not in ("utf-8", "unknown"):
            raise ValueError("invalid encoding: %s" % (encoding,))

    # NOTE(review): block_size is accepted but never read in this function.
    loader = pti.ColBasedLoader()
    params = pti.ParseParams()
    params.allow_quoted_newlines = allow_quoted_newlines
    # A non-positive thread count selects an automatic value: the core
    # count reported by the extension, but never fewer than four.
    if num_threads > 0:
        params.num_threads = num_threads
    else:
        params.num_threads = int(max(pti.get_num_cores(), 4))
    params.number_only = number_only
    params.no_header = no_header
    params.convert_null_to_space = convert_null_to_space
    # The optional limits are only assigned when the caller supplied them.
    if max_levels is not None:
        params.max_levels = max_levels
    if max_level_name_length is not None:
        params.max_level_name_length = max_level_name_length

    # On Python 2 column names are encoded to UTF-8 before being handed to
    # the extension; on Python 3 they are passed through unchanged.
    encode_names = six.PY2

    def force_all(column_names, semantics):
        for column_name in column_names:
            if encode_names:
                column_name = column_name.encode("utf-8")
            loader.force_semantics(column_name, semantics)

    if cat_names is not None:
        force_all(cat_names, pti.CATEGORICAL)
    if num_names is not None:
        force_all(num_names, pti.NUMERIC)
    if text_names is not None:
        force_all(text_names, pti.TEXT)

    # "unknown" (or None) leaves the loader's encoding settings untouched.
    if in_encoding == "utf-8":
        loader.set_in_encoding(pti.UNICODE_UTF8)
    if out_encoding == "utf-8":
        loader.set_out_encoding(pti.UNICODE_UTF8)

    loader.load(_make_posix_filename(filename), params)
    return loader
Beispiel #3
0
def _get_params(num_threads=0, allow_quoted_newlines=False, block_size=32768, number_only=False, no_header=False, max_level_name_length=None, max_levels=None, convert_null_to_space=True):
    """
    Build a ``pti.ParseParams`` object from the given parsing options.

    ``num_threads`` values of zero or below select an automatic thread
    count: the core count reported by ``pti.get_num_cores()``, but never
    fewer than four. ``max_levels`` and ``max_level_name_length`` are only
    assigned when they are not None.

    Returns
    -------
    params : the populated ``pti.ParseParams`` object
    """
    # NOTE(review): block_size is accepted but never read in this function.
    parse_params = pti.ParseParams()
    parse_params.allow_quoted_newlines = allow_quoted_newlines
    parse_params.num_threads = (num_threads if num_threads > 0
                                else int(max(pti.get_num_cores(), 4)))
    parse_params.number_only = number_only
    parse_params.no_header = no_header
    parse_params.convert_null_to_space = convert_null_to_space

    # Optional limits: leave the object's existing values alone unless the
    # caller passed an explicit override.
    optional_limits = (
        ("max_levels", max_levels),
        ("max_level_name_length", max_level_name_length),
    )
    for attribute, value in optional_limits:
        if value is not None:
            setattr(parse_params, attribute, value)
    return parse_params