def load_delimited(
    filename,
    header=True,
    delimiter=",",
    with_title=False,
    with_legend=False,
    limit=None,
):
    """Read a delimited text file into header, rows, title and legend.

    Parameters
    ----------
    filename
        path to the delimited file, handed to ``open_``
    header : bool
        if True, the first row read (after the title, if present) is
        removed from the rows and returned as the header
    delimiter : str
        the character separating fields
    with_title : bool
        if True, the first line of the file is consumed as the title
    with_legend : bool
        if True, the last row read is removed and returned as the legend
    limit : int or None
        maximum number of rows to read after the title. The header row,
        when there is one, is not counted against this limit.

    Returns
    -------
    header, rows, title, legend
        ``header`` is a list of strings (or None), ``rows`` is a list of
        lists of strings, ``title`` and ``legend`` are strings that are
        empty when not requested.

    Notes
    -----
    All row values remain as strings.
    """
    if limit is not None and header:
        # The header is read by the same loop as the data rows, so it gets
        # one extra slot. Without a header there is nothing to compensate
        # for -- bumping the limit then would return one row too many.
        limit += 1

    with open_(filename) as f:
        reader = csv.reader(f, dialect="excel", delimiter=delimiter)
        title = "".join(next(reader)) if with_title else ""
        rows = []
        num_lines = 0
        for row in reader:
            rows.append(row)
            num_lines += 1
            if limit is not None and num_lines >= limit:
                break

    header = rows.pop(0) if header else None
    legend = "".join(rows.pop(-1)) if with_legend else ""
    return header, rows, title, legend
def open(self, identifier):
    """Return an open handle for the file that ``identifier`` refers to.

    Parameters
    ----------
    identifier
        resolved via ``self.get_absolute_identifier`` with
        ``from_relative=False`` before being opened

    Returns
    -------
    the object returned by ``open_`` for the resolved path

    Raises
    ------
    ValueError
        if the resolved path does not exist on disk
    """
    resolved = self.get_absolute_identifier(identifier, from_relative=False)
    if os.path.exists(resolved):
        return open_(resolved)

    raise ValueError(f"path '{resolved}' does not exist")
def load_from_json(filename, classes):
    """Loads objects from json files.

    Parameters
    ----------
    filename
        name of the json file
    classes
        A series of the Cogent3 types, for example:
        (Alignment, ArrayAlignment)

    Returns
    -------
    The result of ``deserialise_object`` applied to the json content.

    Raises
    ------
    TypeError
        if the file is a record flagged as not completed, if the json has
        no 'type' key, or if its type is not one of ``classes``
    """
    assert all(
        isinstance(klass, type) for klass in classes
    ), "classes should be a series of Cogent3 types, for example: (Alignment, ArrayAlignment)"

    with open_(filename) as f:
        content = json.load(f)

    try:
        _, data, completed = load_record_from_json(content)
    except (KeyError, TypeError):
        # content could not be unpacked as a record, so treat it as the
        # serialised object itself
        data = content
    else:
        # This check must live outside the ``try``: raised inside it, the
        # TypeError was caught by the handler above and replaced by a
        # misleading "does not contain 'type' key" error further down.
        if not completed:
            raise TypeError("json file is a record for type NotCompleted.")

    type_ = data.get("type", None)
    if type_ is None:
        raise TypeError("json does not contain 'type' key")

    valid_types = {get_object_provenance(klass) for klass in classes}
    if type_ not in valid_types:
        raise TypeError(
            f"Invalid data type: {type_} is not one of {valid_types}")

    return deserialise_object(data)
def MinimalFastaParser(
    infile, strict=True, label_to_name=str, finder=FastaFinder, label_characters=">"
):
    """Yields successive sequences from infile as (label, seq) tuples.

    Parameters
    ----------
    infile
        something ``open_`` can open, or an already iterable source of
        lines. If ``open_`` raises TypeError / AttributeError the value is
        used as given and is not closed by this function.
    strict : bool
        if True (default), raises RecordError when a label or sequence is
        missing; if False, such records are skipped
    label_to_name : callable
        applied to the stripped label text to produce the yielded label
    finder : callable
        splits ``infile`` into records (sequences of lines)
    label_characters : str
        characters accepted as the first character of a label line

    Yields
    ------
    label, seq
        the converted label and the record's remaining lines joined
        together

    Raises
    ------
    RecordError
        in strict mode, for a record without a label line or without any
        sequence lines
    """
    try:
        infile = open_(infile)
        close_at_end = True
    except (TypeError, AttributeError):
        close_at_end = False

    # try/finally so that a handle opened here is released even when a
    # RecordError is raised or the caller abandons the generator early
    try:
        for rec in finder(infile):
            # first line must be a label line
            if rec[0][0] not in label_characters:
                if strict:
                    raise RecordError(f"Found Fasta record without label line: {rec}")
                continue
            # record must have at least one sequence
            if len(rec) < 2:
                if strict:
                    raise RecordError(f"Found label line without sequences: {rec}")
                continue

            label = label_to_name(rec[0][1:].strip())
            seq = "".join(rec[1:])
            yield label, seq
    finally:
        if close_at_end:
            infile.close()
def load_tree(filename, format=None, underscore_unmunge=False):
    """Constructor for tree.

    Parameters
    ----------
    filename : str or path-like
        a file path containing a newick or xml formatted tree, or a
        cogent3 json file
    format : str
        passed to ``make_tree``. When None and the file name ends with
        ".xml", "xml" is used.
    underscore_unmunge : bool
        replace underscores with spaces in all names read, i.e. "sp_name"
        becomes "sp name".

    Notes
    -----
    Underscore unmunging is turned off by default, although it is part
    of the Newick format.

    Files whose format suffix is "json" are loaded with
    ``load_from_json`` and must hold a TreeNode or PhyloNode.

    Returns
    -------
    The result of ``load_from_json`` for json files, otherwise the result
    of ``make_tree``.
    """
    file_format, _ = get_format_suffixes(filename)
    if file_format == "json":
        return load_from_json(filename, (TreeNode, PhyloNode))

    with open_(filename) as tfile:
        treestring = tfile.read()

    # str() keeps this working for pathlib.Path, which has no .endswith
    if format is None and str(filename).endswith(".xml"):
        format = "xml"

    return make_tree(treestring, format=format, underscore_unmunge=underscore_unmunge)
def FromFilenameParser(filename, format=None, **kw):
    """Open a sequence file and delegate parsing to ``FromFileParser``.

    Parameters
    ----------
    filename
        name of the sequence alignment file
    format
        the multiple sequence file format; resolved through
        ``format_from_filename(filename, format)``
    **kw
        forwarded unchanged to ``FromFileParser``

    Returns
    -------
    Whatever ``FromFileParser`` returns for the opened file.

    Notes
    -----
    The file is opened in text mode with ``newline=None``. The handle is
    handed over to ``FromFileParser`` and is not closed here.
    """
    resolved_format = format_from_filename(filename, format)
    handle = open_(filename, newline=None, mode="rt")
    return FromFileParser(handle, resolved_format, **kw)
def load_delimited(
    filename,
    header=True,
    sep=",",
    delimiter=None,
    with_title=False,
    with_legend=False,
    limit=None,
):
    """
    basic processing of tabular data

    Parameters
    ----------
    filename: Path
        path to delimited file, handed to ``open_``
    header: bool
        whether the first line of the file (after the title, if present)
        is a header
    sep: str
        the character separating columns
    delimiter: str
        deprecated alias of ``sep``; when given it takes precedence
    with_title: bool
        whether the first line of the file is a title
    with_legend: bool
        whether the last line read is a legend
    limit: int
        maximum number of lines to read from the file, not counting the
        header line

    Returns
    -------
    header, rows, title, legend

    Notes
    -----
    All row values remain as strings.
    """
    if delimiter:
        sep = delimiter
        deprecated("argument", "delimiter", "sep", "2022.1")

    limited = limit is not None
    if limited and header:
        # the header line is read by the loop below but must not count
        limit += 1

    rows = []
    with open_(filename) as infile:
        records = csv.reader(infile, dialect="excel", delimiter=sep)
        if with_title:
            title = "".join(next(records))
        else:
            title = ""

        for count, record in enumerate(records, start=1):
            rows.append(record)
            if limited and count >= limit:
                break

    if header:
        header = rows.pop(0)
    else:
        header = None

    if with_legend:
        legend = "".join(rows.pop(-1))
    else:
        legend = ""

    return header, rows, title, legend
def gff_parser(f):
    """Yield the records produced by ``gff2_parser`` for ``f``.

    Parameters
    ----------
    f
        a string path, a ``Path`` (converted to a string), or a
        ``StringIO`` instance

    Raises
    ------
    TypeError
        if ``f`` is none of the accepted types
    """
    source = str(f) if isinstance(f, Path) else f

    if isinstance(source, str):
        # opened here so the handle is closed once parsing finishes
        with open_(source) as infile:
            yield from gff2_parser(infile)
        return

    if not isinstance(source, StringIO):
        raise TypeError

    yield from gff2_parser(source)
def deserialise_object(data):
    """
    deserialises from json

    Parameters
    ----------
    data
        path to json file, json string or a dict

    Returns
    -------
    If the loaded dict has no "type" key it is returned as is. Otherwise
    the deserialiser selected from the value of "type" is applied to it.

    Raises
    ------
    NotImplementedError
        if no deserialiser matches the "type" value
    """
    if path_exists(data):
        with open_(data) as infile:
            data = json.load(infile)

    if type(data) is str:
        data = json.loads(data)

    type_ = data.get("type", None)
    if type_ is None:
        return data

    # Case-sensitive markers, checked in this order; the first marker that
    # occurs inside the type name selects the deserialiser.
    by_marker = (
        ("core.sequence", deserialise_seq),
        ("core.alignment", deserialise_seq_collections),
        ("core.tree", deserialise_tree),
        ("evolve.substitution_model", deserialise_substitution_model),
        ("evolve.ns_substitution_model", deserialise_substitution_model),
        ("evolve.parameter_controller", deserialise_likelihood_function),
        ("core.moltype", deserialise_moltype),
        ("core.alphabet", deserialise_alphabet),
        ("app.result", deserialise_result),
    )
    for marker, handler in by_marker:
        if marker in type_:
            return handler(data)

    # the remaining checks ignore case
    lowered = type_.lower()
    if "notcompleted" in lowered:
        return deserialise_not_completed(data)

    is_tabular = (
        lowered.endswith("table")
        or "dictarray" in lowered
        or "distancematrix" in lowered
    )
    if is_tabular:
        return deserialise_tabular(data)

    msg = "deserialising '%s' from json" % type_
    raise NotImplementedError(msg)
def __call__(self, lines):
    """a generator that yields individual lines processed according to the
    provided conditions

    Parameters
    ----------
    lines: path or iterable
        If a str or pathlib.Path that exists on disk, handles file open
        and close. Will expand user component (i.e. '~/') of path.
        Anything else is iterated over as given.

    Yields
    ------
    list of str
        the separated, whitespace-stripped fields of each non-empty line,
        restricted to ``self.columns`` when that is set

    Notes
    -----
    Elements within a row are strings.

    When ``self.with_header`` is set, the first non-empty line is yielded
    as the header: it bypasses ``self.condition`` and does not count
    towards ``self.limit``.
    """
    input_from_path = False
    if isinstance(lines, (str, pathlib.Path)):
        path = pathlib.Path(lines).expanduser()
        input_from_path = path.exists()

    if input_from_path:
        lines = open_(path)

    num_lines = 0
    header_seen = False
    match = not self.negate
    # try/finally so a file opened here is closed even if the consumer
    # stops iterating early or self.condition raises
    try:
        for line in lines:
            if is_empty(line):
                continue

            line = [e.strip() for e in line.split(self.sep)]
            if not header_seen and self.with_header:
                header_seen = True
                if self.columns:
                    self._column_names_to_indices(line)
                    line = [line[i] for i in self.columns]
                yield line
                continue

            if self.columns:
                line = [line[i] for i in self.columns]

            if self.condition and self.condition(line) != match:
                continue

            yield line

            num_lines += 1
            if self.limit is not None and num_lines >= self.limit:
                break
    finally:
        if input_from_path:
            lines.close()
def load_classifier(path):
    '''returns the classifier, feature params and scaler from a pickle file

    The unpickled object is indexed with the keys 'feature_params' and
    'classifier'; 'scaler' is optional and defaults to None.

    Raises ValueError if either required key is missing.
    '''
    # NOTE: unpickling can execute arbitrary code, only use trusted files
    with open_(path, 'rb') as infile:
        payload = pickle.load(infile)

    try:
        feature_params = payload['feature_params']
        scaler = payload.get('scaler', None)
        classifier = payload['classifier']
    except KeyError:
        raise ValueError('pickle formatted file does not contain classifier')

    return classifier, feature_params, scaler
def MinimalNexusAlignParser(align_path):
    """Yield (label, seq) pairs from the data/characters block of a nexus file.

    Parameters
    ----------
    align_path
        a string path (opened with ``open_``), an open file-like object,
        or a list of the lines of a nexus file. Note that a list has its
        first element removed.

    Yields
    ------
    label, seq
        each label with all of its sequence fragments joined together

    Raises
    ------
    ValueError
        if the first line does not start with "#nexus", or no lines were
        collected from a data/characters block
    RecordError
        if lines were collected but no 'matrix' line was found
    """
    opened_here = isinstance(align_path, str)
    infile = open_(align_path) if opened_here else align_path

    isblock = re.compile(r"begin\s+(data|characters)").search
    inblock = False
    block = []
    index = None
    finished_reading = False
    try:
        try:
            line = infile.readline()
        except AttributeError:
            # guessing it's a list of strings from a nexus file
            line = infile.pop(0)

        if not line.lower().startswith("#nexus"):
            raise ValueError("not a nexus file")

        for line in infile:
            lowered = line.lower()
            if isblock(lowered):
                inblock = True
            elif inblock and lowered.startswith("end;"):
                break
            elif inblock:
                line = line.strip()
                if line.lower().startswith("matrix"):
                    # remember where the sequence lines start
                    index = len(block)
                elif not line.startswith(";"):
                    block.append(line)
        finished_reading = True
    finally:
        # After a successful read the source is always closed if it can be.
        # On failure only a handle opened by this function is closed, so it
        # does not leak; a caller-supplied object is left untouched.
        if (finished_reading or opened_here) and hasattr(infile, "close"):
            infile.close()

    if not block:
        raise ValueError("not found DATA or CHARACTER block")
    if index is None:
        raise RecordError("malformed block, no 'matrix' line")

    seqs = defaultdict(list)
    for line in block[index:]:
        if not line or (line.startswith("[") and line.endswith("]")):
            # blank or comment line
            continue

        fields = line.split()
        seqs[fields[0]].append("".join(fields[1:]))

    for name, fragments in seqs.items():
        yield name, "".join(fragments)
def write(self, path, format="tsv", sep="\t"):
    """
    writes a flattened version to path

    Parameters
    ----------
    path : str
        destination, opened for writing with ``open_``
    format
        passed to ``self.to_string``, defaults to 'tsv'
    sep : str
        field separator passed to ``self.to_string``, defaults to a tab
    """
    flattened = self.to_string(format=format, sep=sep)
    with open_(path, "w") as outfile:
        outfile.write(flattened)
def save_to_filename(alignment, filename, format, **kw):
    """Write an alignment to a file, removing the file if writing fails.

    Parameters
    ----------
    alignment
        to be written
    filename
        name of the sequence alignment file
    format
        the multiple sequence file format
    **kw
        forwarded to ``write_alignment_to_file``

    Raises
    ------
    FileFormatError
        if ``format`` is None
    Exception
        anything raised while writing is re-raised after the partial file
        has been removed (best effort)
    """
    if format is None:
        raise FileFormatError("format not known")

    f = open_(filename, "wt")
    try:
        # the with block closes the handle before any cleanup below runs
        with f:
            write_alignment_to_file(f, alignment, format, **kw)
    except Exception:
        # Best-effort removal of the incomplete output. The file is already
        # closed here, which matters on platforms that refuse to delete an
        # open file. Only filesystem errors are suppressed.
        try:
            os.unlink(filename)
        except OSError:
            pass
        raise
def gff_parser(f):
    """parses a gff file

    Parameters
    -----------
    f
        accepts string path or pathlib.Path or file-like object
        (e.g. StringIO)

    Yields
    ------
    the records produced by ``_gff_parser``

    Notes
    -----
    Paths are opened (and closed) here; any other object is passed to
    ``_gff_parser`` as given.
    """
    source = str(f) if isinstance(f, Path) else f

    if not isinstance(source, str):
        yield from _gff_parser(source)
        return

    # parsing is delegated to a separate function so that the file is
    # closed correctly once the records have been consumed
    with open_(source) as infile:
        yield from _gff_parser(infile)
def load_tree(filename, format=None, underscore_unmunge=False):
    """Constructor for tree.

    Parameters
    ----------
    filename : str or path-like
        a file containing a newick or xml formatted tree.
    format : str
        passed to ``make_tree``. When None and the file name ends with
        ".xml", "xml" is used.
    underscore_unmunge : bool
        passed to ``make_tree``

    Notes
    -----
    Underscore unmunging is turned off by default, although it is part
    of the Newick format. Set ``underscore_unmunge=True`` to replace
    underscores with spaces in all names read.

    Returns
    -------
    The tree built by ``make_tree`` from the file's content.
    """
    with open_(filename) as tfile:
        treestring = tfile.read()

    # str() keeps this working for pathlib.Path, which has no .endswith
    if format is None and str(filename).endswith(".xml"):
        format = "xml"

    return make_tree(treestring, format=format, underscore_unmunge=underscore_unmunge)
def __call__(self, lines):
    """a generator that yields individual lines processed according to the
    provided conditions

    Parameters
    ----------
    lines: path or iterable
        If a str or pathlib.Path that exists on disk, the file is opened
        and closed here. Anything else is iterated over as given.

    Yields
    ------
    list of str
        the separated, whitespace-stripped fields of each non-empty line,
        restricted to ``self.columns`` when that is set

    Notes
    -----
    When ``self.with_header`` is set, the first non-empty line is yielded
    as the header: it bypasses ``self.condition`` and does not count
    towards ``self.limit``.
    """
    input_from_path = False
    if isinstance(lines, (str, pathlib.Path)):
        path = pathlib.Path(lines)
        input_from_path = path.exists()

    if input_from_path:
        lines = open_(path)

    num_lines = 0
    header_seen = False
    match = not self.negate
    # try/finally so a file opened here is closed even if the consumer
    # stops iterating early or self.condition raises
    try:
        for line in lines:
            if is_empty(line):
                continue

            line = [e.strip() for e in line.split(self.sep)]
            if not header_seen and self.with_header:
                header_seen = True
                if self.columns:
                    self._column_names_to_indices(line)
                    line = [line[i] for i in self.columns]
                yield line
                continue

            if self.columns:
                line = [line[i] for i in self.columns]

            if self.condition and self.condition(line) != match:
                continue

            yield line

            num_lines += 1
            if self.limit is not None and num_lines >= self.limit:
                break
    finally:
        if input_from_path:
            lines.close()
def load(self, data):
    """returns sequences

    Parameters
    ----------
    data
        file path, cogent3 sequence collection / alignment, or raw data
        accepted by ``make_aligned_seqs`` / ``make_unaligned_seqs``

    Returns
    -------
    The loaded sequences. They are degapped unless "aligned" is among
    ``self._output_types``.
    """
    if isinstance(data, str):
        # keep the path under its own name: ``data`` is rebound to the
        # parsed records, and info.path must record where they came from
        path = data
        with open_(path) as infile:
            data = dict(self._parser(infile))
        seqs = self.klass(data=data, moltype=self.moltype)
        seqs.info.path = path
    elif isinstance(data, SequenceCollection):
        # already a collection, use it as is (previously left ``seqs``
        # unassigned, which failed with UnboundLocalError below)
        seqs = data
    elif self.aligned:
        seqs = make_aligned_seqs(data, moltype=self.moltype)
    else:
        seqs = make_unaligned_seqs(data, moltype=self.moltype)

    if not (self._output_types & {"aligned"}):
        seqs = seqs.degap()

    return seqs
def write(
    self,
    filename,
    mode=None,
    writer=None,
    format=None,
    sep=None,
    compress=None,
    **kwargs,
):
    """Write table to filename in the specified format.

    If a format is not specified, it attempts to use a filename suffix.
    Note if a sep argument is provided, unformatted values are written to
    file in order to preserve numerical accuracy.

    Parameters
    ----------
    filename
        destination path
    mode
        file opening mode. Defaults to "wb" for pickle and "w" otherwise.
        Ignored when compressing.
    writer
        a function for formatting the data for output. Called with the
        rows (header first) and ``has_header=True``; its results are
        written joined by newlines.
    format
        Valid formats are those of the to_string method plus pickle.
        Will try and guess from filename if not specified.
    sep
        a character delimiter for fields. Defaults to "," for csv and a
        tab for tsv.
    compress
        if True, gzips the file and appends .gz to the filename
        (if not already added).
    **kwargs
        forwarded to ``self.to_string`` when that is used for output
    """
    file_suffix, compress_suffix = get_format_suffixes(filename)
    format = format or file_suffix
    compress = compress or compress_suffix is not None

    mode = mode or {"pickle": "wb"}.get(format, "w")

    if compress:
        if not filename.endswith(".gz"):
            filename = "%s.gz" % filename
        # pickle.dump emits bytes, so a compressed pickle must still be
        # opened in binary mode; everything else is written as text
        mode = "wb" if format == "pickle" else "wt"

    # the with block guarantees the handle is closed if writing fails
    with open_(filename, mode) as outfile:
        if format is None:
            # try guessing from filename suffix, skipping the compression
            # suffix when there is one
            index = -2 if compress else -1
            suffix = filename.split(".")
            if len(suffix) > 1:
                format = suffix[index]

        if format == "csv":
            sep = sep or ","
        elif format == "tsv":
            sep = sep or "\t"

        if writer:
            rows = self.tolist()
            rows.insert(0, self.header[:])
            rows = writer(rows, has_header=True)
            outfile.write("\n".join(rows))
        elif format == "pickle":
            data = self.__getstate__()
            pickle.dump(data, outfile, protocol=1)
        elif sep is not None and format != "bedgraph":
            csv_writer = csv.writer(outfile, delimiter=sep, lineterminator="\n")
            if self.title:
                csv_writer.writerow([self.title])
            csv_writer.writerow(self.header)
            csv_writer.writerows(self.array)
            if self.legend:
                csv_writer.writerow([self.legend])
        else:
            table = self.to_string(format=format, sep=sep, **kwargs)
            outfile.write(table + "\n")
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """
    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns with a
        numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same
        type. Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied; when empty, the title loaded from the file is used
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index
        if True, the 0'th column is used as row identifiers and keys
        for slicing.
    legend
        table legend. NOTE: for delimited files read without a reader
        this is replaced by the legend returned from ``load_delimited``.
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    dtype
        optional numpy array typecode.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row
    **kwargs
        forwarded to ``load_delimited``. A "delimiter" entry is treated
        as an alias of ``sep``.

    Raises
    ------
    ValueError
        if rows differ in length and ``skip_inconsistent`` is False
    """
    # Always take "delimiter" out of kwargs. If it is only popped when sep
    # is empty, a caller giving both makes load_delimited receive the
    # keyword twice and fail with a TypeError.
    delimiter = kwargs.pop("delimiter", None)
    sep = sep or delimiter
    file_format, _ = get_format_suffixes(filename)

    if file_format in ("pickle", "pkl"):
        # NOTE: unpickling can execute arbitrary code, only use trusted files
        with open_(filename, mode="rb") as f:
            loaded_table = pickle.load(f)
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if reader:
        with open_(filename, newline=None) as f:
            data = list(reader(f))
        header = data[0]
        data = {column[0]: column[1:] for column in zip(*data)}
    else:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        title = title or loaded_title
        # transpose rows into {column heading: column values}
        data = {column[0]: column[1:] for column in zip(header, *rows)}

    data = {
        key: cast_str_to_array(value, static_type=static_column_types)
        for key, value in data.items()
    }

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index=index,
        legend=legend,
        format=format,
    )
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index_name=None,
    legend="",
    column_templates=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """
    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns with a
        numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same
        type. Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied; when empty, the title loaded from the file is used
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend. NOTE: for delimited files read without a reader
        this is replaced by the legend returned from ``load_delimited``.
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row
    **kwargs
        forwarded to ``load_delimited``. The deprecated "index" entry is
        used as ``index_name``; "delimiter" is used when ``sep`` is empty.

    Raises
    ------
    TypeError
        if filename is neither a string nor a path
    ValueError
        if rows differ in length and ``skip_inconsistent`` is False
    """
    import pathlib

    if not isinstance(filename, (str, pathlib.PurePath)):
        raise TypeError(
            "filename must be string or Path, perhaps you want make_table()")

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    sep = sep or kwargs.pop("delimiter", None)
    file_format, compress_format = get_format_suffixes(filename)

    if file_format == "json":
        return load_from_json(filename, (_Table, ))

    if file_format in ("pickle", "pkl"):
        # with block: the handle is released even if unpickling fails.
        # NOTE: unpickling can execute arbitrary code, only use trusted files
        with open_(filename, mode="rb") as f:
            loaded_table = pickle.load(f)
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if reader:
        with open_(filename, newline=None) as f:
            data = list(reader(f))
        header = data[0]
        data = {column[0]: column[1:] for column in zip(*data)}
    else:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, sep=sep, limit=limit, **kwargs
        )
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        title = title or loaded_title
        # transpose rows into {column heading: column values}
        data = {column[0]: column[1:] for column in zip(header, *rows)}

    data = {
        key: cast_str_to_array(value, static_type=static_column_types)
        for key, value in data.items()
    }

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        format=format,
    )
def LoadTable(
    filename=None,
    sep=None,
    reader=None,
    header=None,
    rows=None,
    row_order=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    row_ids=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    data_frame=None,
    format="simple",
    **kwargs,
):
    """
    .. deprecated:: 2019.8.30a
        ``LoadTable`` will be removed in ``cogent3`` 2020.1.1. It's replaced by
        ``load_table`` and ``make_table``.

    Notes
    -----
    With a filename, the header and rows are read from that file (via
    unpickling, ``load_delimited``, or the supplied / auto-generated
    reader). Without one, the given ``header`` and ``rows`` are used.

    Raises
    ------
    ValueError
        if ``static_column_types`` is used without a reader and no
        separator can be determined
    """
    # Always take "delimiter" out of kwargs so it can never reach
    # load_delimited a second time through **kwargs.
    delimiter = kwargs.pop("delimiter", None)
    sep = sep or delimiter

    if filename is not None:
        file_format, _ = get_format_suffixes(filename)

    if filename is not None and not (reader or static_column_types):
        if file_format == "pickle":
            # NOTE: unpickling can execute arbitrary code, only use
            # trusted files
            with open_(filename, mode="rb") as f:
                loaded_table = pickle.load(f)
            return _Table(**loaded_table)

        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        title = title or loaded_title
    elif filename and (reader or static_column_types):
        # with block: the handle is closed on every exit path, including
        # the ValueError below which previously left it open
        with open_(filename, newline=None) as f:
            if not reader:
                if file_format == "csv":
                    sep = sep or ","
                elif file_format == "tsv":
                    sep = sep or "\t"
                elif not sep:
                    raise ValueError(
                        "static_column_types option requires a value " "for sep"
                    )

                reader = autogen_reader(
                    f, sep, limit=limit, with_title=kwargs.get("with_title", False)
                )

            rows = list(reader(f))
        # the reader is expected to return the header as the first row
        header = rows.pop(0)

    return _Table(
        header=header,
        rows=rows,
        digits=digits,
        row_order=row_order,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        row_ids=row_ids,
        legend=legend,
        data_frame=data_frame,
        format=format,
    )
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    row_ids=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    format="simple",
    **kwargs,
):
    """
    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns with a
        numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same
        type. Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied; when empty, the title loaded from the file is used
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    row_ids
        if True, the 0'th column is used as row identifiers and keys
        for slicing.
    legend
        table legend. NOTE: for delimited files read by
        ``load_delimited`` this is replaced by the legend it returns.
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    dtype
        optional numpy array typecode.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    **kwargs
        forwarded to ``load_delimited``. A "delimiter" entry is treated
        as an alias of ``sep``.

    Raises
    ------
    ValueError
        if ``static_column_types`` is used without a reader and no
        separator can be determined
    """
    # Always take "delimiter" out of kwargs so it can never reach
    # load_delimited a second time through **kwargs.
    delimiter = kwargs.pop("delimiter", None)
    sep = sep or delimiter
    file_format, _ = get_format_suffixes(filename)

    if not (reader or static_column_types):
        if file_format == "pickle":
            # NOTE: unpickling can execute arbitrary code, only use
            # trusted files
            with open_(filename, mode="rb") as f:
                loaded_table = pickle.load(f)
            return _Table(**loaded_table)

        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        title = title or loaded_title
    else:
        # with block: the handle is closed on every exit path, including
        # the ValueError below which previously left it open
        with open_(filename, newline=None) as f:
            if not reader:
                if file_format == "csv":
                    sep = sep or ","
                elif file_format == "tsv":
                    sep = sep or "\t"
                elif not sep:
                    raise ValueError("static_column_types option requires a value "
                                     "for sep")

                reader = autogen_reader(
                    f, sep, limit=limit, with_title=kwargs.get("with_title", False)
                )

            rows = list(reader(f))
        # the reader is expected to return the header as the first row
        header = rows.pop(0)

    return make_table(
        header=header,
        rows=rows,
        digits=digits,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        row_ids=row_ids,
        legend=legend,
        format=format,
    )
def dump_json(path, data):
    '''writes data to path as json

    The file is opened in text mode ('wt') with ``open_``.
    '''
    with open_(path, mode='wt') as sink:
        json.dump(data, sink)
def load_json(path):
    '''returns the object decoded from the json file at path'''
    with open_(path) as source:
        return json.load(source)