def __init__(
    self,
    source,
    suffix,
    mode="a",
    if_exists=RAISE,
    create=False,
    md5=True,
    **kwargs,
):
    """Writable zipped data store constructor (deprecated).

    Parameters
    ----------
    source
        path to directory / zip file
    suffix
        only members whose name matches the suffix are considered included
    mode : str
        file opening mode, defaults to append
    if_exists : str
        behaviour when the destination already exists. Valid constants are
        defined in this file as OVERWRITE, SKIP, RAISE, IGNORE (they
        correspond to lower case version of the same word)
    create : bool
        if True, the destination is created
    md5 : bool
        record md5 hexadecimal checksum of data when possible
    """
    from cogent3.util.warning import discontinued

    discontinued(
        "class",
        self.__class__.__name__,
        "2021.10.01",
        reason="zips are not efficient for incremental inclusion of files, use a tinydb instead",
    )
    ReadOnlyZippedDataStore.__init__(self, source=source, suffix=suffix, md5=md5)
    WritableDataStoreBase.__init__(self, if_exists=if_exists, create=create)
    # capture the constructor arguments for later re-creation; locals() at
    # this point holds the parameters plus the locally imported `discontinued`
    # (pre-existing quirk, preserved), with `self` filtered out below
    d = locals()
    self._persistent = {k: v for k, v in d.items() if k != "self"}
    # BUG FIX: the original read `"a" or mode`, which always evaluates to
    # "a" and silently ignored the caller's mode despite the documented
    # "defaults to append" contract; honour mode with "a" as the fallback
    self.mode = mode or "a"
def __init__(self, conversion, by_column=True):
    """handles conversions of columns or lines

    Parameters
    ----------
    conversion
        applied per column, or (when by_column is False) a callable applied
        to the entire line
    by_column
        conversion will by done for each column, otherwise done by entire line
    """
    super(ConvertFields, self).__init__()
    discontinued("function", "ConvertFields", "2020.11.1")
    self.conversion = conversion
    self.by_column = by_column
    self._func = self.convert_by_columns
    if not self.by_column:
        # whole-line conversion needs a single callable; raise (rather than
        # assert, which is stripped under `python -O`) so the check always runs
        if not isinstance(conversion, Callable):
            raise TypeError("conversion must be callable to convert by line")
        self._func = self.convert_by_line
def SeparatorFormatParser(
    with_header=True,
    converter=None,
    ignore=None,
    sep=",",
    strip_wspace=True,
    limit=None,
    **kw,
):
    """Returns a parser for a delimited tabular file.

    Parameters
    ----------
    with_header
        when True, first line is taken to be the header. Not passed to
        converter.
    converter
        a callable that returns a correctly formatted line.
    ignore
        lines for which ignore returns True are ignored. White lines are
        always skipped.
    sep
        the delimiter separating fields.
    strip_wspace
        removes redundant white
    limit
        exits after this many lines
    """
    if ignore is None:
        # keep all lines
        def ignore(x):
            return False

    by_column = getattr(converter, "by_column", True)
    discontinued("function", "SeparatorFormatParser", "2020.11.1")

    # renamed from `callable` — the original shadowed the builtin; the name
    # is internal (only the object is returned) so callers are unaffected
    def parser(lines):
        num_lines = 0
        header = None
        for line in lines:
            # blank lines are always skipped, regardless of `ignore`
            if is_empty(line):
                continue

            line = line.strip("\n").split(sep)
            if strip_wspace and by_column:
                line = [field.strip() for field in line]

            # the header row is yielded raw (never passed to converter)
            if with_header and not header:
                header = True
                yield line
                continue

            if converter:
                line = converter(line)

            if ignore(line):
                continue

            yield line
            # limit counts data rows only; the header is not counted
            num_lines += 1
            if limit is not None and num_lines >= limit:
                break

    return parser
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index_name=None,
    legend="",
    column_templates=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """
    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns with a numeric/bool
        data types from the first non-header row. This assumes all
        subsequent entries in that column are of the same type. Default is
        False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row
    """
    import pathlib

    # idiomatic tuple form replaces any(isinstance(filename, t) for t in ...)
    if not isinstance(filename, (str, pathlib.PurePath)):
        raise TypeError("filename must be string or Path, perhaps you want make_table()")

    # deprecated/discontinued keyword arguments
    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    if "dtype" in kwargs:
        kwargs.pop("dtype")
        discontinued("argument", "dtype", "2021.04")

    sep = sep or kwargs.pop("delimiter", None)
    file_format, compress_format = get_format_suffixes(filename)

    if file_format == "json":
        return load_from_json(filename, (_Table,))

    if file_format in ("pickle", "pkl"):
        # try/finally so the handle is closed even if unpickling raises
        f = open_(filename, mode="rb")
        try:
            loaded_table = pickle.load(f)
        finally:
            f.close()
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if not reader:
        # infer the delimiter from the file suffix when not supplied
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        title = title or loaded_title
        data = {column[0]: column[1:] for column in zip(header, *rows)}
    else:
        # try/finally so the handle is closed even if the reader raises
        f = open_(filename, newline=None)
        try:
            data = [row for row in reader(f)]
            header = data[0]
            data = {column[0]: column[1:] for column in zip(*data)}
        finally:
            f.close()

    for key, value in data.items():
        data[key] = cast_str_to_array(value, static_type=static_column_types)

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        format=format,
    )
def make_table(
    header=None,
    data=None,
    row_order=None,
    digits=4,
    space=4,
    title="",
    max_width=1e100,
    index_name=None,
    legend="",
    missing_data="",
    column_templates=None,
    dtype=None,
    data_frame=None,
    format="simple",
    **kwargs,
):
    """
    Parameters
    ----------
    header
        column headings
    data
        a 2D dict, list or tuple. If a dict, it must have column
        headings as top level keys, and common row labels as keys in each
        column.
    row_order
        the order in which rows will be pulled from the twoDdict
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend
    missing_data
        replace missing data with this
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    data_frame
        a pandas DataFrame, supersedes header/rows
    format
        output format when using str(Table)
    """
    # a str argument for header/data almost certainly means a file path
    # was passed to the wrong entry point (f-prefix dropped: no placeholders)
    if isinstance(header, str) or isinstance(data, str):
        raise TypeError("str type invalid, if its a path use load_table()")

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    if "dtype" in kwargs:
        kwargs.pop("dtype")
        discontinued("argument", "dtype", "2021.04")

    # legacy keyword: "rows" was the old name for data
    data = kwargs.get("rows", data)
    if data_frame is not None:
        from pandas import DataFrame

        if not isinstance(data_frame, DataFrame):
            # BUG FIX: original message was missing the space after "got"
            raise TypeError(f"expecting a DataFrame, got {type(data_frame)}")

        data = {c: data_frame[c].to_numpy() for c in data_frame}

    return _Table(
        header=header,
        data=data,
        digits=digits,
        row_order=row_order,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        data_frame=data_frame,
        format=format,
    )