Beispiel #1
0
 def test_get_format_suffixes(self):
     """correctly return suffixes for compressed etc.. formats"""
     # file name -> expected (format suffix, compression suffix)
     expected = {
         "no_suffixes": (None, None),
         "suffixes.gz": (None, "gz"),
         "suffixes.abcd": ("abcd", None),
         "suffixes.abcd.bz2": ("abcd", "bz2"),
         "suffixes.zip": (None, "zip"),
     }
     for name, suffixes in expected.items():
         # subTest reports which file name failed, assertEqual shows the
         # actual values instead of a bare "False is not true"
         with self.subTest(name=name):
             self.assertEqual(tuple(get_format_suffixes(name)), suffixes)
Beispiel #2
0
 def test_get_format_suffixes_pathlib(self):
     """correctly return suffixes for compressed etc.. formats from pathlib"""
     # file name -> expected (format suffix, compression suffix)
     expected = {
         "no_suffixes": (None, None),
         "suffixes.gz": (None, "gz"),
         "suffixes.abcd": ("abcd", None),
         "suffixes.abcd.bz2": ("abcd", "bz2"),
         "suffixes.zip": (None, "zip"),
     }
     for name, suffixes in expected.items():
         # subTest reports which file name failed, assertEqual shows the
         # actual values instead of a bare "False is not true"
         with self.subTest(name=name):
             got = get_format_suffixes(pathlib.Path(name))
             self.assertEqual(tuple(got), suffixes)
Beispiel #3
0
 def _has_other_suffixes(self, path, suffix):
     """Return True if any entry in directory ``path`` has a format suffix
     other than ``suffix`` (lower-cased) or "log", otherwise False."""
     directory = Path(path)
     permitted = {str(suffix).lower(), "log"}
     return any(
         get_format_suffixes(str(entry))[0] not in permitted
         for entry in directory.iterdir()
     )
Beispiel #4
0
    def make_relative_identifier(self, data):
        """returns identifier for a new member relative to source

        Parameters
        ----------
        data
            a DataStoreMember (its ``name`` is used), a string path, or any
            other object from which ``_get_source`` can obtain a source

        Returns
        -------
        str
            the basename of the source with its trailing format and / or
            compression suffix replaced by ``self.suffix``

        Raises
        ------
        ValueError
            if ``data`` is not a string or DataStoreMember and no source
            can be obtained from it
        """
        from cogent3.app.composable import _get_source

        if isinstance(data, DataStoreMember):
            data = data.name
        elif not isinstance(data, str):
            data = _get_source(data)
            if data is None:
                raise ValueError("objects for storage require either a "
                                 "source or info.source string attribute")
        basename = os.path.basename(data)
        suffix, comp = get_format_suffixes(basename)
        # escape each part so the dot and any regex metacharacters in the
        # suffix are matched literally
        trailing = "".join(
            rf"\.{re.escape(part)}" for part in (suffix, comp) if part
        )
        if trailing:
            basename = re.sub(f"{trailing}$", "", basename)
        return f"{basename}.{self.suffix}"
Beispiel #5
0
def load_tree(filename, format=None, underscore_unmunge=False):
    """Constructor for tree.

    Parameters
    ----------
    filename : str
        a file path containing a newick or xml formatted tree.
    format : str
        either newick, xml or cogent3 json. If not specified, "xml" is used
        when the file format suffix is xml, otherwise None is passed on to
        ``make_tree``.
    underscore_unmunge : bool
        replace underscores with spaces in all names read, i.e. "sp_name"
        becomes "sp name".

    Notes
    -----
    Underscore unmunging is turned off by default, although it is part
    of the Newick format.

    A file whose format suffix is json is loaded via ``load_from_json``
    irrespective of the ``format`` argument.

    Returns
    -------
    PhyloNode
    """
    file_format, _ = get_format_suffixes(filename)
    if file_format == "json":
        return load_from_json(filename, (TreeNode, PhyloNode))

    with open_(filename) as tfile:
        treestring = tfile.read()

    # rely on the parsed suffix rather than str.endswith so that path
    # objects and compressed files (e.g. tree.xml.gz) are recognised
    if format is None and file_format == "xml":
        format = "xml"

    return make_tree(treestring,
                     format=format,
                     underscore_unmunge=underscore_unmunge)
Beispiel #6
0
def load_aligned_seqs(
    filename,
    format=None,
    array_align=True,
    moltype=None,
    label_to_name=None,
    parser_kw=None,
    info=None,
    **kw,
):
    """
    Read an alignment from a sequence file.

    Parameters
    ----------
    filename : str
        path to sequence file
    format : str
        sequence file format, taken from the path suffix when not given
    array_align : bool
        if True, returns ArrayAlignment, otherwise an annotatable Alignment
    moltype
        the moltype, eg DNA, PROTEIN, 'dna', 'protein'
    label_to_name
        function for converting original name into another name.
    parser_kw : dict
        optional arguments for the parser
    info
        passed through to ``make_aligned_seqs``
    kw
        passed through to ``make_aligned_seqs``; the contents of any
        "constructor_kw" or "kw" dict entries are merged into it first

    Returns
    -------
    ``ArrayAlignment`` or ``Alignment`` instance

    Raises
    ------
    ValueError
        if no format is given and none can be taken from the path suffix
    """
    suffix, _ = get_format_suffixes(filename)
    if suffix == "json":
        return load_from_json(filename, (Alignment, ArrayAlignment))

    format = format or suffix
    if not format:
        raise ValueError(
            "could not determined file format, set using the format argument"
        )

    parser_kw = parser_kw or {}
    # flatten nested keyword dicts into the top level keyword arguments
    for nested_key in ("constructor_kw", "kw"):
        nested = kw.pop(nested_key, None) or {}
        kw.update(nested)

    records = list(FromFilenameParser(filename, format, **parser_kw))
    return make_aligned_seqs(
        records,
        array_align=array_align,
        label_to_name=label_to_name,
        moltype=moltype,
        source=filename,
        info=info,
        **kw,
    )
Beispiel #7
0
    def write(
        self,
        filename,
        mode=None,
        writer=None,
        format=None,
        sep=None,
        compress=None,
        **kwargs,
    ):
        """Write table to filename in the specified format. If a format is not
        specified, it attempts to use a filename suffix. Note if a sep argument
        is provided, unformatted values are written to file in order to
        preserve numerical accuracy.

        Parameters
        ----------
        filename
            path to write to
        mode
            file opening mode. Ignored when the output is compressed, in
            which case "wb" is used for pickle and "wt" otherwise.
        format
            Valid formats are those of the to_string method plus
            pickle. Will try and guess from filename if not specified.
        writer
            a function for formatting the data for output.
        sep
            a character delimiter for fields.
        compress
            if True, the file is compressed. ".gz" is appended to the
            filename unless it already carries a compression suffix.
        kwargs
            passed to the to_string method

        """
        filename = str(filename)
        file_suffix, compress_suffix = get_format_suffixes(filename)
        format = format or file_suffix
        compress = compress or compress_suffix is not None

        mode = mode or {"pickle": "wb"}.get(format, "w")

        if compress:
            # only append .gz when the name has no compression suffix at
            # all, otherwise x.tsv.bz2 would become x.tsv.bz2.gz
            if compress_suffix is None and not filename.endswith(".gz"):
                filename = f"{filename}.gz"
            # pickle.dump writes bytes, everything else writes text
            mode = "wb" if format == "pickle" else "wt"

        if format is None:
            # try guessing from filename suffix
            index = -2 if compress else -1
            suffix = filename.split(".")
            if len(suffix) > 1:
                format = suffix[index]

        if format == "csv":
            sep = sep or ","
        elif format == "tsv":
            sep = sep or "\t"

        outfile = open_(filename, mode)
        # try / finally so the handle is released if writing fails
        try:
            if writer:
                rows = self.tolist()
                rows.insert(0, self.header[:])
                rows = writer(rows, has_header=True)
                outfile.write("\n".join(rows))
            elif format == "pickle":
                data = self.__getstate__()
                pickle.dump(data, outfile, protocol=1)
            elif sep is not None and format != "bedgraph":
                csv_writer = csv.writer(outfile,
                                        delimiter=sep,
                                        lineterminator="\n")
                if self.title:
                    csv_writer.writerow([self.title])
                csv_writer.writerow(self.header)
                csv_writer.writerows(self.array)
                if self.legend:
                    csv_writer.writerow([self.legend])
            else:
                table = self.to_string(format=format, sep=sep, **kwargs)
                outfile.write(table + "\n")
        finally:
            outfile.close()
Beispiel #8
0
 def _has_other_suffixes(self, path, suffix):
     """Return True if any member of the zip archive at ``path`` has a
     format suffix other than ``suffix`` or "log", otherwise False."""
     allowed = {str(suffix), "log"}
     # the context manager closes the archive handle, which was
     # previously left open
     with zipfile.ZipFile(path) as archive:
         return any(
             get_format_suffixes(str(name))[0] not in allowed
             for name in archive.namelist()
         )
Beispiel #9
0
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    row_ids=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    format="simple",
    **kwargs,
):
    """Load a table from file.

    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns. Defaults to "," for a
        csv suffix and a tab for a tsv suffix. A "delimiter" keyword
        argument is used if sep is not given.
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns
        with a numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same type.
        Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    row_ids
        if True, the 0'th column is used as row identifiers and keys
        for slicing.
    legend
        table legend
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    dtype
        optional numpy array typecode.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    kwargs
        passed to ``load_delimited`` when neither reader nor
        static_column_types is set

    Raises
    ------
    ValueError
        if static_column_types is set without a reader and no separator
        can be established
    """
    sep = sep or kwargs.pop("delimiter", None)
    file_format, _ = get_format_suffixes(filename)

    if not (reader or static_column_types):
        if file_format == "pickle":
            # NOTE: unpickling can execute arbitrary code, only load
            # files from trusted sources
            with open_(filename, mode="rb") as f:
                loaded_table = pickle.load(f)
            return _Table(**loaded_table)
        elif file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(filename,
                                                            delimiter=sep,
                                                            limit=limit,
                                                            **kwargs)
        title = title or loaded_title
    else:
        # the with block closes the file even if the separator check or
        # the reader raises
        with open_(filename, newline=None) as f:
            if not reader:
                if file_format == "csv":
                    sep = sep or ","
                elif file_format == "tsv":
                    sep = sep or "\t"
                elif not sep:
                    raise ValueError("static_column_types option requires "
                                     "a value for sep")

                reader = autogen_reader(f,
                                        sep,
                                        limit=limit,
                                        with_title=kwargs.get(
                                            "with_title", False))

            rows = list(reader(f))
        header = rows.pop(0)

    return make_table(
        header=header,
        rows=rows,
        digits=digits,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        row_ids=row_ids,
        legend=legend,
        format=format,
    )
Beispiel #10
0
def get_basename(path):
    '''returns a file basename without the suffixes

    The name is cut at the format suffix reported by
    ``get_format_suffixes``, or at the compression suffix when there is no
    format suffix. A name without any recognised suffix is returned
    unchanged.
    '''
    bn = os.path.basename(path)
    suffix, cmp_suffix = get_format_suffixes(bn)
    leading = suffix or cmp_suffix
    if leading is None:
        # nothing to strip; previously rfind('.None') returned -1 and the
        # final character of the name was dropped
        return bn
    rindex = bn.rfind(f'.{leading}')
    return bn if rindex == -1 else bn[:rindex]
Beispiel #11
0
 def _has_other_suffixes(self, path, suffix):
     """Return True if any member of the zip archive at ``path`` has a
     format suffix different from ``suffix``, otherwise False."""
     # the context manager closes the archive handle, which was
     # previously left open
     with zipfile.ZipFile(path) as archive:
         return any(
             get_format_suffixes(name)[0] != suffix
             for name in archive.namelist()
         )
Beispiel #12
0
 def _has_other_suffixes(self, path, suffix):
     """Return True if any entry in directory ``path`` has a format suffix
     different from ``suffix``, otherwise False."""
     directory = Path(path)
     return any(
         get_format_suffixes(str(entry))[0] != suffix
         for entry in directory.iterdir()
     )
Beispiel #13
0
def LoadTable(
    filename=None,
    sep=None,
    reader=None,
    header=None,
    rows=None,
    row_order=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    row_ids=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    data_frame=None,
    format="simple",
    **kwargs,
):
    """
    .. deprecated:: 2019.8.30a

        ``LoadTable`` will be removed in ``cogent3`` 2020.1.1. It's replaced by
        ``load_table`` and ``make_table``.

    Notes
    -----
    When ``filename`` is given, header and rows are read from that file
    (replacing the ``header`` and ``rows`` arguments); a pickle suffix
    returns the unpickled table directly. Without ``filename`` the table is
    built from the supplied ``header`` / ``rows`` / ``data_frame``.

    Raises
    ------
    ValueError
        if static_column_types is set without a reader and no separator
        can be established
    """
    sep = sep or kwargs.pop("delimiter", None)
    if filename is not None:
        file_format, _ = get_format_suffixes(filename)

    if filename is not None and not (reader or static_column_types):
        if file_format == "pickle":
            # NOTE: unpickling can execute arbitrary code, only load
            # files from trusted sources
            with open_(filename, mode="rb") as f:
                loaded_table = pickle.load(f)
            return _Table(**loaded_table)
        elif file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        title = title or loaded_title
    elif filename and (reader or static_column_types):
        # the with block closes the file even if the separator check or
        # the reader raises
        with open_(filename, newline=None) as f:
            if not reader:
                if file_format == "csv":
                    sep = sep or ","
                elif file_format == "tsv":
                    sep = sep or "\t"
                elif not sep:
                    raise ValueError(
                        "static_column_types option requires a value for sep"
                    )

                reader = autogen_reader(
                    f, sep, limit=limit, with_title=kwargs.get("with_title", False)
                )

            rows = list(reader(f))
        header = rows.pop(0)

    return _Table(
        header=header,
        rows=rows,
        digits=digits,
        row_order=row_order,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        row_ids=row_ids,
        legend=legend,
        data_frame=data_frame,
        format=format,
    )
Beispiel #14
0
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index_name=None,
    legend="",
    column_templates=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """Load a table from file.

    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns. Defaults to "," for a
        csv suffix and a tab for a tsv suffix. A "delimiter" keyword
        argument is used if sep is not given.
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns
        with a numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same type.
        Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row

    Raises
    ------
    TypeError
        if filename is not a string or path object
    ValueError
        if rows differ in length from the header and skip_inconsistent
        is False
    """
    import pathlib

    if not isinstance(filename, (str, pathlib.PurePath)):
        raise TypeError(
            "filename must be string or Path, perhaps you want make_table()")

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    sep = sep or kwargs.pop("delimiter", None)
    file_format, _ = get_format_suffixes(filename)

    if file_format == "json":
        return load_from_json(filename, (_Table, ))
    elif file_format in ("pickle", "pkl"):
        # NOTE: unpickling can execute arbitrary code, only load files
        # from trusted sources. The with block closes the file even if
        # unpickling fails.
        with open_(filename, mode="rb") as f:
            loaded_table = pickle.load(f)
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if reader:
        with open_(filename, newline=None) as f:
            data = list(reader(f))
        header = data[0]
        # transpose rows into {column name: column values}
        data = {column[0]: column[1:] for column in zip(*data)}
    else:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(filename,
                                                            sep=sep,
                                                            limit=limit,
                                                            **kwargs)
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        title = title or loaded_title
        # transpose rows into {column name: column values}
        data = {column[0]: column[1:] for column in zip(header, *rows)}

    for key, value in data.items():
        data[key] = cast_str_to_array(value, static_type=static_column_types)

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        format=format,
    )
Beispiel #15
0
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """Load a table from file.

    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns. Defaults to "," for a
        csv suffix and a tab for a tsv suffix. A "delimiter" keyword
        argument is used if sep is not given.
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns
        with a numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same type.
        Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index
        if True, the 0'th column is used as row identifiers and keys
        for slicing.
    legend
        table legend
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    dtype
        optional numpy array typecode.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row

    Raises
    ------
    ValueError
        if rows differ in length from the header and skip_inconsistent
        is False
    """
    sep = sep or kwargs.pop("delimiter", None)
    file_format, _ = get_format_suffixes(filename)

    if file_format in ("pickle", "pkl"):
        # NOTE: unpickling can execute arbitrary code, only load files
        # from trusted sources. The with block closes the file even if
        # unpickling fails.
        with open_(filename, mode="rb") as f:
            loaded_table = pickle.load(f)
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if not reader:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, legend = load_delimited(
            filename, delimiter=sep, limit=limit, **kwargs
        )
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        title = title or loaded_title
        # transpose rows into {column name: column values}
        data = {column[0]: column[1:] for column in zip(header, *rows)}
    else:
        # the with block closes the file even if the reader raises
        with open_(filename, newline=None) as f:
            data = list(reader(f))
        header = data[0]
        # transpose rows into {column name: column values}
        data = {column[0]: column[1:] for column in zip(*data)}

    for key, value in data.items():
        data[key] = cast_str_to_array(value, static_type=static_column_types)

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        dtype=dtype,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index=index,
        legend=legend,
        format=format,
    )