Beispiel #1
0
    def __init__(
        self,
        source,
        suffix,
        mode="a",
        if_exists=RAISE,
        create=False,
        md5=True,
        **kwargs,
    ):
        """Set up a writable data store backed by a zip file.

        Emits a discontinuation notice for this class before initialising
        the read-only zipped and writable base classes.

        Parameters
        ----------
        source
            path to directory / zip file
        suffix
            only members whose name matches the suffix are considered included
        mode : str
            file opening mode, defaults to append. A falsy value (e.g. None
            or "") also falls back to append.
        if_exists : str
             behaviour when the destination already exists. Valid constants are
             defined in this file as OVERWRITE, SKIP, RAISE, IGNORE (they
             correspond to lower case version of the same word)
        create : bool
            if True, the destination is created
        md5 : bool
            record md5 hexadecimal checksum of data when possible
        **kwargs
            not used by this method other than being recorded, under the
            key "kwargs", in ``self._persistent``
        """
        from cogent3.util.warning import discontinued

        discontinued(
            "class",
            self.__class__.__name__,
            "2021.10.01",
            reason=
            "zips are not efficient for incremental inclusion of files, use a tinydb instead",
        )

        ReadOnlyZippedDataStore.__init__(self,
                                         source=source,
                                         suffix=suffix,
                                         md5=md5)
        WritableDataStoreBase.__init__(self,
                                       if_exists=if_exists,
                                       create=create)

        # Record the constructor arguments explicitly. Using locals() here
        # would also capture the function-scope `discontinued` import, which
        # is not a constructor argument.
        self._persistent = {
            "source": source,
            "suffix": suffix,
            "mode": mode,
            "if_exists": if_exists,
            "create": create,
            "md5": md5,
            "kwargs": kwargs,
        }
        # Honour the caller's mode; `"a" or mode` always evaluated to "a".
        self.mode = mode or "a"
Beispiel #2
0
    def __init__(self, conversion, by_column=True):
        """Configure how fields are converted: per column or per line.

        Parameters
        ----------
        conversion
            stored as ``self.conversion``; must be callable when
            by_column is False
        by_column
            when True (default), conversion is done for each column,
            otherwise it is done on the entire line

        """
        super(ConvertFields, self).__init__()
        discontinued("function", "ConvertFields", "2020.11.1")

        self.conversion = conversion
        self.by_column = by_column

        # column-wise conversion is the default dispatch target
        self._func = self.convert_by_columns
        if self.by_column:
            return

        # line-wise conversion hands the whole line to a single callable
        assert isinstance(
            conversion,
            Callable), "conversion must be callable to convert by line"
        self._func = self.convert_by_line
Beispiel #3
0
def SeparatorFormatParser(
    with_header=True,
    converter=None,
    ignore=None,
    sep=",",
    strip_wspace=True,
    limit=None,
    **kw,
):
    """Build a generator function that parses delimited tabular lines.

    Parameters
    ----------
    with_header
        when True, the first non-empty line is treated as the header. It is
        yielded without being passed to converter or ignore and does not
        count towards limit.
    converter
        a callable that returns a correctly formatted line. If it has a
        ``by_column`` attribute, that value controls whether fields are
        whitespace-stripped (see strip_wspace).
    ignore
        lines for which ignore returns True are ignored. Empty lines
        (as judged by is_empty) are always skipped.
    sep
        the delimiter separating fields.
    strip_wspace
        when True, and conversion is by column, surrounding whitespace is
        stripped from every field
    limit
        exits after this many data lines have been yielded

    Returns
    -------
    A function taking an iterable of lines and yielding a list of fields
    (or the converter's output) per retained line.
    """
    if ignore is None:
        # no filter supplied, so every line is retained
        def ignore(_line):
            return False

    by_column = getattr(converter, "by_column", True)
    discontinued("function", "SeparatorFormatParser", "2020.11.1")

    def parse(lines):
        yielded = 0
        header_seen = False
        for raw in lines:
            if is_empty(raw):
                continue

            fields = raw.strip("\n").split(sep)
            if strip_wspace and by_column:
                fields = [value.strip() for value in fields]

            if with_header and not header_seen:
                # the header bypasses conversion, filtering and the limit
                header_seen = True
                yield fields
                continue

            if converter:
                fields = converter(fields)

            if ignore(fields):
                continue

            yield fields

            yielded += 1
            if limit is not None and yielded >= limit:
                return

    return parse
Beispiel #4
0
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index_name=None,
    legend="",
    column_templates=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """Load a table from a file.

    json and pickle files (judged by suffix) are restored directly;
    everything else is parsed as delimited text, either by the supplied
    reader or by load_delimited.

    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns. The keyword ``delimiter``
        is accepted as a fallback when sep is not given.
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns
        with a numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same type.
        Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied. Takes precedence over a title loaded from the file.
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend. Takes precedence over a legend loaded from the file.
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row
    **kwargs
        ``index`` (deprecated alias of index_name) and ``dtype``
        (discontinued, discarded) are consumed here; anything left is
        passed to load_delimited when no reader is given.

    Raises
    ------
    TypeError
        if filename is not a str or pathlib path
    ValueError
        if rows differ in length from the header and skip_inconsistent
        is False
    """
    import pathlib

    if not isinstance(filename, (str, pathlib.PurePath)):
        raise TypeError(
            "filename must be string or Path, perhaps you want make_table()")

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    if "dtype" in kwargs:
        kwargs.pop("dtype")
        discontinued("argument", "dtype", "2021.04")

    # Always remove "delimiter" from kwargs: if it stayed there when sep was
    # also given, load_delimited would receive the keyword twice.
    delimiter = kwargs.pop("delimiter", None)
    sep = sep or delimiter
    file_format, _ = get_format_suffixes(filename)

    if file_format == "json":
        return load_from_json(filename, (_Table, ))

    if file_format in ("pickle", "pkl"):
        # NOTE(review): unpickling executes arbitrary code; only load pickle
        # files from trusted sources.
        f = open_(filename, mode="rb")
        try:
            loaded_table = pickle.load(f)
        finally:
            # release the handle even when unpickling fails
            f.close()
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if not reader:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, loaded_legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs)
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        # caller supplied values win over what was found in the file
        title = title or loaded_title
        legend = legend or loaded_legend
        data = {column[0]: column[1:] for column in zip(header, *rows)}
    else:
        f = open_(filename, newline=None)
        try:
            # materialise all rows while the file is still open
            data = list(reader(f))
        finally:
            f.close()
        header = data[0]
        data = {column[0]: column[1:] for column in zip(*data)}

    data = {
        key: cast_str_to_array(value, static_type=static_column_types)
        for key, value in data.items()
    }

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        format=format,
    )
Beispiel #5
0
def make_table(
    header=None,
    data=None,
    row_order=None,
    digits=4,
    space=4,
    title="",
    max_width=1e100,
    index_name=None,
    legend="",
    missing_data="",
    column_templates=None,
    dtype=None,
    data_frame=None,
    format="simple",
    **kwargs,
):
    """Construct a table from in-memory data.

    Parameters
    ----------
    header
        column headings
    data
        a 2D dict, list or tuple. If a dict, it must have column
        headings as top level keys, and common row labels as keys in each
        column.
    row_order
        the order in which rows will be pulled from the twoDdict
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend
    missing_data
        replace missing data with this
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    dtype
        discontinued argument. A notice is issued when it is given; the
        value is still passed through to the table constructor.
    data_frame
        a pandas DataFrame, supersedes header/rows
    format
        output format when using str(Table)
    **kwargs
        ``index`` is accepted as a deprecated alias of index_name and
        ``rows``, when present, replaces data.

    Raises
    ------
    TypeError
        if header or data is a str, or data_frame is not a pandas DataFrame
    """
    if any(isinstance(a, str) for a in (header, data)):
        raise TypeError("str type invalid, if its a path use load_table()")

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    # dtype is a named parameter, so it can never turn up in kwargs; test
    # the parameter itself so the discontinuation notice is actually issued
    if dtype is not None:
        discontinued("argument", "dtype", "2021.04")

    data = kwargs.get("rows", data)
    if data_frame is not None:
        from pandas import DataFrame

        if not isinstance(data_frame, DataFrame):
            raise TypeError(f"expecting a DataFrame, got{type(data_frame)}")

        # one numpy array per data frame column, keyed by column label
        data = {c: data_frame[c].to_numpy() for c in data_frame}

    return _Table(
        header=header,
        data=data,
        digits=digits,
        row_order=row_order,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        data_frame=data_frame,
        format=format,
    )