def test_get_format_suffixes(self):
    """correctly return suffixes for compressed etc. formats"""
    a, b = get_format_suffixes("no_suffixes")
    self.assertTrue(a == b == None)
    a, b = get_format_suffixes("suffixes.gz")
    self.assertTrue(a == None and b == "gz")
    a, b = get_format_suffixes("suffixes.abcd")
    self.assertTrue(a == "abcd" and b == None)
    a, b = get_format_suffixes("suffixes.abcd.bz2")
    self.assertTrue(a == "abcd" and b == "bz2")
    a, b = get_format_suffixes("suffixes.zip")
    self.assertTrue(a == None and b == "zip")
def test_get_format_suffixes_pathlib(self):
    """correctly return suffixes for compressed etc. formats from pathlib"""
    Path = pathlib.Path
    a, b = get_format_suffixes(Path("no_suffixes"))
    self.assertTrue(a == b == None)
    a, b = get_format_suffixes(Path("suffixes.gz"))
    self.assertTrue(a == None and b == "gz")
    a, b = get_format_suffixes(Path("suffixes.abcd"))
    self.assertTrue(a == "abcd" and b == None)
    a, b = get_format_suffixes(Path("suffixes.abcd.bz2"))
    self.assertTrue(a == "abcd" and b == "bz2")
    a, b = get_format_suffixes(Path("suffixes.zip"))
    self.assertTrue(a == None and b == "zip")
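# The tests above pin down the contract of get_format_suffixes: the trailing
# suffix is reported as the compression suffix only when it is a recognised
# compression extension, otherwise it is the format suffix. A minimal
# standalone sketch of that behaviour (not the cogent3 implementation; the
# compression set below is an assumption) could look like this:
_COMPRESSED = {"gz", "bz2", "zip", "lzma", "xz"}


def get_format_suffixes_sketch(path):
    """return (format_suffix, compression_suffix); either may be None"""
    suffixes = [s[1:].lower() for s in pathlib.Path(str(path)).suffixes]
    if not suffixes:
        return None, None
    compression = suffixes.pop() if suffixes[-1] in _COMPRESSED else None
    fmt = suffixes.pop() if suffixes else None
    return fmt, compression


assert get_format_suffixes_sketch("suffixes.abcd.bz2") == ("abcd", "bz2")
assert get_format_suffixes_sketch("no_suffixes") == (None, None)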
def _has_other_suffixes(self, path, suffix):
    p = Path(path)
    allowed = {str(suffix).lower(), "log"}
    for f in p.iterdir():
        if get_format_suffixes(str(f))[0] not in allowed:
            return True
    return False
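# Hedged illustration of the directory scan above: a store directory counts as
# "clean" when every file's format suffix is either the store's own suffix or
# "log". The file names and "json" suffix below are made up for the demo.
def _demo_has_other_suffixes():
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        d = Path(tmp)
        (d / "record.json").write_text("{}")
        (d / "run.log").write_text("")
        allowed = {"json", "log"}
        extra = [f for f in d.iterdir() if get_format_suffixes(str(f))[0] not in allowed]
        print(bool(extra))  # False: only allowed suffixes present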
def make_relative_identifier(self, data):
    """returns identifier for a new member relative to source"""
    from cogent3.app.composable import _get_source

    if isinstance(data, DataStoreMember):
        data = data.name
    elif type(data) != str:
        data = _get_source(data)
        if data is None:
            raise ValueError(
                "objects for storage require either a "
                "source or info.source string attribute"
            )

    basename = os.path.basename(data)
    suffix, comp = get_format_suffixes(basename)
    if suffix and comp:
        pattern = f".{suffix}.{comp}$"
    elif suffix:
        pattern = f".{suffix}$"
    elif comp:
        pattern = f".{comp}*$"
    else:
        pattern = None

    if pattern:
        basename = re.sub(pattern, "", basename)

    basename = f"{basename}.{self.suffix}"
    return basename
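# Standalone illustration of the suffix-stripping step above, assuming a data
# store whose self.suffix is "json"; the path and suffixes are hypothetical.
def _demo_make_relative_identifier():
    import os
    import re

    basename = os.path.basename("/data/seqs.fasta.gz")
    stripped = re.sub(r"\.fasta\.gz$", "", basename)  # format and compression suffixes removed
    print(f"{stripped}.json")  # -> seqs.json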
def load_tree(filename, format=None, underscore_unmunge=False):
    """Constructor for tree.

    Parameters
    ----------
    filename : str
        a file path containing a newick or xml formatted tree.
    format : str
        either newick, xml or cogent3 json, default is newick
    underscore_unmunge : bool
        replace underscores with spaces in all names read, i.e. "sp_name"
        becomes "sp name".

    Notes
    -----
    Underscore unmunging is turned off by default, although it is part of
    the Newick format.

    Returns
    -------
    PhyloNode
    """
    file_format, _ = get_format_suffixes(filename)
    if file_format == "json":
        return load_from_json(filename, (TreeNode, PhyloNode))

    with open_(filename) as tfile:
        treestring = tfile.read()
        if format is None and filename.endswith(".xml"):
            format = "xml"

    return make_tree(treestring, format=format, underscore_unmunge=underscore_unmunge)
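# Hedged usage sketch for load_tree: the json branch only triggers on a ".json"
# suffix; anything else is read as text and handed to make_tree. The path and
# tip names below are made up for the demo.
def _demo_load_tree():
    import pathlib
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        path = pathlib.Path(tmp) / "demo.nwk"
        path.write_text("((a,b),c);")
        tree = load_tree(str(path))
        print(tree.get_tip_names())  # ['a', 'b', 'c'] (order may vary)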
def load_aligned_seqs(
    filename,
    format=None,
    array_align=True,
    moltype=None,
    label_to_name=None,
    parser_kw=None,
    info=None,
    **kw,
):
    """
    loads aligned sequences from file

    Parameters
    ----------
    filename : str
        path to sequence file
    format : str
        sequence file format, if not specified tries to guess from the path suffix
    moltype
        the moltype, eg DNA, PROTEIN, 'dna', 'protein'
    array_align : bool
        if True, returns ArrayAlignment, otherwise an annotatable Alignment
    label_to_name
        function for converting original name into another name.
    parser_kw : dict
        optional arguments for the parser

    Returns
    -------
    ``ArrayAlignment`` or ``Alignment`` instance
    """
    file_format, _ = get_format_suffixes(filename)
    if file_format == "json":
        return load_from_json(filename, (Alignment, ArrayAlignment))

    format = format or file_format
    if not format:
        msg = "could not determine file format, set using the format argument"
        raise ValueError(msg)

    parser_kw = parser_kw or {}
    for other_kw in ("constructor_kw", "kw"):
        other_kw = kw.pop(other_kw, None) or {}
        kw.update(other_kw)
    data = list(FromFilenameParser(filename, format, **parser_kw))
    return make_aligned_seqs(
        data,
        array_align=array_align,
        label_to_name=label_to_name,
        moltype=moltype,
        source=filename,
        info=info,
        **kw,
    )
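# Hedged usage sketch for load_aligned_seqs: the ".fasta" suffix supplies the
# format, so no format argument is needed; a ".fasta.gz" path behaves the same
# because only the format suffix is consulted here. Paths and sequences are
# made up for the demo.
def _demo_load_aligned_seqs():
    import pathlib
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        path = pathlib.Path(tmp) / "aln.fasta"
        path.write_text(">s1\nACGT\n>s2\nAC-T\n")
        aln = load_aligned_seqs(str(path), moltype="dna")
        print(type(aln).__name__)  # ArrayAlignment, since array_align defaults to True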
def write(
    self,
    filename,
    mode=None,
    writer=None,
    format=None,
    sep=None,
    compress=None,
    **kwargs,
):
    """Write table to filename in the specified format. If a format is not
    specified, it attempts to use a filename suffix. Note if a sep argument
    is provided, unformatted values are written to file in order to preserve
    numerical accuracy.

    Parameters
    ----------
    mode
        file opening mode
    format
        Valid formats are those of the to_string method plus pickle. Will
        try and guess from filename if not specified.
    writer
        a function for formatting the data for output.
    sep
        a character delimiter for fields.
    compress
        if True, gzips the file and appends .gz to the filename (if not
        already added).
    """
    file_suffix, compress_suffix = get_format_suffixes(filename)
    format = format or file_suffix
    compress = compress or compress_suffix is not None

    mode = mode or {"pickle": "wb"}.get(format, "w")

    if compress:
        if not filename.endswith(".gz"):
            filename = "%s.gz" % filename
        mode = "wt"

    outfile = open_(filename, mode)

    if format is None:
        # try guessing from filename suffix
        if compress:
            index = -2
        else:
            index = -1
        suffix = filename.split(".")
        if len(suffix) > 1:
            format = suffix[index]

    if format == "csv":
        sep = sep or ","
    elif format == "tsv":
        sep = sep or "\t"

    if writer:
        rows = self.tolist()
        rows.insert(0, self.header[:])
        rows = writer(rows, has_header=True)
        outfile.writelines("\n".join(rows))
    elif format == "pickle":
        data = self.__getstate__()
        pickle.dump(data, outfile, protocol=1)
    elif sep is not None and format != "bedgraph":
        writer = csv.writer(outfile, delimiter=sep, lineterminator="\n")
        if self.title:
            writer.writerow([self.title])
        writer.writerow(self.header)
        writer.writerows(self.array)
        if self.legend:
            writer.writerow([self.legend])
    else:
        table = self.to_string(format=format, sep=sep, **kwargs)
        outfile.writelines(table + "\n")
    outfile.close()
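# Hedged usage sketch for write(): a ".tsv.gz" filename selects tab-separated
# output and gzip compression purely from the two suffixes. Assumes a Table
# built with cogent3's make_table; the path and column names are made up, and
# the exact make_table signature may differ between cogent3 versions.
def _demo_table_write():
    import pathlib
    import tempfile

    from cogent3 import make_table

    with tempfile.TemporaryDirectory() as tmp:
        t = make_table(header=["a", "b"], data={"a": [1, 2], "b": [3, 4]})
        t.write(str(pathlib.Path(tmp) / "demo.tsv.gz"))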
def _has_other_suffixes(self, path, suffix):
    allowed = {str(suffix), "log"}
    for f in zipfile.ZipFile(path).namelist():
        if get_format_suffixes(str(f))[0] not in allowed:
            return True
    return False
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    row_ids=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    format="simple",
    **kwargs,
):
    """
    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first row
        returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns with a numeric/bool
        data types from the first non-header row. This assumes all subsequent
        entries in that column are of the same type. Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    row_ids
        if True, the 0'th column is used as row identifiers and keys for
        slicing.
    legend
        table legend
    column_templates
        dict of column headings or a function that will handle the
        formatting.
    dtype
        optional numpy array typecode.
    limit
        exits after this many lines. Only applied for non pickled data file
        types.
    format
        output format when using str(Table)
    """
    sep = sep or kwargs.pop("delimiter", None)
    file_format, compress_format = get_format_suffixes(filename)

    if not (reader or static_column_types):
        if file_format == "pickle":
            f = open_(filename, mode="rb")
            loaded_table = pickle.load(f)
            f.close()
            return _Table(**loaded_table)
        elif file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        title = title or loaded_title
    else:
        f = open_(filename, newline=None)
        if not reader:
            if file_format == "csv":
                sep = sep or ","
            elif file_format == "tsv":
                sep = sep or "\t"
            elif not sep:
                raise ValueError(
                    "static_column_types option requires a value for sep"
                )

            reader = autogen_reader(
                f, sep, limit=limit, with_title=kwargs.get("with_title", False)
            )

        rows = [row for row in reader(f)]
        f.close()
        header = rows.pop(0)

    return make_table(
        header=header,
        rows=rows,
        digits=digits,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        row_ids=row_ids,
        legend=legend,
        format=format,
    )
def get_basename(path):
    """returns a file basename without the suffixes"""
    bn = os.path.basename(path)
    suffix, cmp_suffix = get_format_suffixes(bn)
    if suffix is None:
        # nothing to strip, e.g. a bare name like "README"
        return bn
    rindex = bn.rfind(f".{suffix}")
    return bn[:rindex]
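# Quick check of get_basename as written above; the paths are illustrative.
# Both the format and compression suffixes are removed, and a name without a
# format suffix is returned unchanged.
print(get_basename("/tmp/seqs.fasta.gz"))  # seqs
print(get_basename("/tmp/seqs.fasta"))     # seqs
print(get_basename("README"))              # README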
def _has_other_suffixes(self, path, suffix):
    for f in zipfile.ZipFile(path).namelist():
        if get_format_suffixes(f)[0] != suffix:
            return True
    return False
def _has_other_suffixes(self, path, suffix):
    p = Path(path)
    for f in p.iterdir():
        if get_format_suffixes(str(f))[0] != suffix:
            return True
    return False
def LoadTable(
    filename=None,
    sep=None,
    reader=None,
    header=None,
    rows=None,
    row_order=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    row_ids=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    data_frame=None,
    format="simple",
    **kwargs,
):
    """
    .. deprecated:: 2019.8.30a
        ``LoadTable`` will be removed in ``cogent3`` 2020.1.1. It's replaced
        by ``load_table`` and ``make_table``.
    """
    sep = sep or kwargs.pop("delimiter", None)

    if filename is not None:
        file_format, compress_format = get_format_suffixes(filename)

    if filename is not None and not (reader or static_column_types):
        if file_format == "pickle":
            f = open_(filename, mode="rb")
            loaded_table = pickle.load(f)
            f.close()
            return _Table(**loaded_table)
        elif file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        title = title or loaded_title
    elif filename and (reader or static_column_types):
        f = open_(filename, newline=None)
        if not reader:
            if file_format == "csv":
                sep = sep or ","
            elif file_format == "tsv":
                sep = sep or "\t"
            elif not sep:
                raise ValueError(
                    "static_column_types option requires a value for sep"
                )

            reader = autogen_reader(
                f, sep, limit=limit, with_title=kwargs.get("with_title", False)
            )

        rows = [row for row in reader(f)]
        f.close()
        header = rows.pop(0)

    table = _Table(
        header=header,
        rows=rows,
        digits=digits,
        row_order=row_order,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        row_ids=row_ids,
        legend=legend,
        data_frame=data_frame,
        format=format,
    )

    return table
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index_name=None,
    legend="",
    column_templates=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """
    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first row
        returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns with a numeric/bool
        data types from the first non-header row. This assumes all subsequent
        entries in that column are of the same type. Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys for
        slicing. All column values must be unique.
    legend
        table legend
    column_templates
        dict of column headings or a function that will handle the
        formatting.
    limit
        exits after this many lines. Only applied for non pickled data file
        types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row
    """
    import pathlib

    if not any(isinstance(filename, t) for t in (str, pathlib.PurePath)):
        raise TypeError(
            "filename must be string or Path, perhaps you want make_table()"
        )

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    sep = sep or kwargs.pop("delimiter", None)
    file_format, compress_format = get_format_suffixes(filename)
    if file_format == "json":
        return load_from_json(filename, (_Table,))
    elif file_format in ("pickle", "pkl"):
        f = open_(filename, mode="rb")
        loaded_table = pickle.load(f)
        f.close()
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if reader:
        with open_(filename, newline=None) as f:
            data = [row for row in reader(f)]
            header = data[0]
            data = {column[0]: column[1:] for column in zip(*data)}
    else:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, sep=sep, limit=limit, **kwargs
        )
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        title = title or loaded_title
        data = {column[0]: column[1:] for column in zip(header, *rows)}

    for key, value in data.items():
        data[key] = cast_str_to_array(value, static_type=static_column_types)

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        format=format,
    )
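# Hedged usage sketch for this load_table: a ".csv" suffix sets the separator
# to a comma, a ".tsv" suffix to a tab, and ".json"/".pickle"/".pkl" paths are
# dispatched before any delimited parsing. The file contents are made up, and
# the .shape attribute is assumed from recent cogent3 Table versions.
def _demo_load_table():
    import pathlib
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        path = pathlib.Path(tmp) / "demo.csv"
        path.write_text("a,b\n1,2\n3,4\n")
        t = load_table(str(path))
        print(t.shape)  # (2, 2): two rows, two columns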
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """
    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first row
        returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns with a numeric/bool
        data types from the first non-header row. This assumes all subsequent
        entries in that column are of the same type. Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index
        if True, the 0'th column is used as row identifiers and keys for
        slicing.
    legend
        table legend
    column_templates
        dict of column headings or a function that will handle the
        formatting.
    dtype
        optional numpy array typecode.
    limit
        exits after this many lines. Only applied for non pickled data file
        types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row
    """
    sep = sep or kwargs.pop("delimiter", None)
    file_format, compress_format = get_format_suffixes(filename)
    if file_format in ("pickle", "pkl"):
        f = open_(filename, mode="rb")
        loaded_table = pickle.load(f)
        f.close()
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if not reader:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        title = title or loaded_title
        data = {column[0]: column[1:] for column in zip(header, *rows)}
    else:
        f = open_(filename, newline=None)
        data = [row for row in reader(f)]
        header = data[0]
        data = {column[0]: column[1:] for column in zip(*data)}
        f.close()

    for key, value in data.items():
        data[key] = cast_str_to_array(value, static_type=static_column_types)

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index=index,
        legend=legend,
        format=format,
    )