Exemple #1
0
def LoadTree(
    filename=None,
    treestring=None,
    tip_names=None,
    format=None,
    underscore_unmunge=False,
):
    """
    Deprecated entry point that forwards to ``load_tree`` or ``make_tree``.

    A truthy ``filename`` is handed to ``load_tree`` together with ``format``
    and ``underscore_unmunge``. Otherwise ``treestring``, ``tip_names``,
    ``format`` and ``underscore_unmunge`` are handed to ``make_tree``. A
    deprecation notice naming the replacement is issued in both cases.

    .. deprecated:: 2019.8.30a

        ``LoadTree`` will be removed in ``cogent3`` 2020.1.1. It's replaced by
        ``load_tree`` and ``make_tree``.
    """
    from cogent3.util.warning import deprecated

    if not filename:
        # nothing to read from disk, so build the tree from the in-memory inputs
        deprecated("function", "LoadTree", "make_tree", "2020.1.1", 1)
        return make_tree(
            treestring=treestring,
            tip_names=tip_names,
            format=format,
            underscore_unmunge=underscore_unmunge,
        )

    deprecated("function", "LoadTree", "load_tree", "2020.1.1", 1)
    return load_tree(
        filename,
        format=format,
        underscore_unmunge=underscore_unmunge,
    )
Exemple #2
0
def load_delimited(
    filename,
    header=True,
    sep=",",
    delimiter=None,
    with_title=False,
    with_legend=False,
    limit=None,
):
    """
    basic processing of tabular data

    Parameters
    ----------
    filename: Path
        path to delimited file (can begin with ~)
    header: bool
        whether the first line of the file (after the title, if present) is a header
    sep: str
        the character separating columns
    delimiter: str
        deprecated synonym for ``sep``. When given it replaces ``sep`` and a
        deprecation notice is issued.
    with_title: bool
        whether the first line of the file is a title
    with_legend: bool
        whether the last line read from the file is a legend
    limit: int
        maximum number of lines to read from the file, not counting the title
        and header lines. A value of zero or less reads no data lines.

    Returns
    -------
    header, rows, title, legend

    ``header`` is None if ``header`` was False, ``title`` and ``legend`` are
    empty strings when not requested (``legend`` is also empty when no line
    is left to take it from).

    Notes
    -----
    All row values remain as strings.
    """
    if delimiter:
        sep = delimiter
        deprecated("argument", "delimiter", "sep", "2022.1")

    if limit is None:
        max_lines = None
    else:
        # the header line is read through the same loop but must not count
        # towards the limit; a non-positive limit means "no data lines"
        max_lines = max(limit, 0) + (1 if header else 0)

    with open_(filename) as f:
        reader = csv.reader(f, dialect="excel", delimiter=sep)
        title = "".join(next(reader)) if with_title else ""
        rows = []
        # skip the loop entirely when nothing may be read, otherwise one row
        # would be appended before the limit is first checked
        if max_lines is None or max_lines > 0:
            for row in reader:
                rows.append(row)
                if max_lines is not None and len(rows) >= max_lines:
                    break

    header = rows.pop(0) if header else None
    legend = "".join(rows.pop(-1)) if with_legend and rows else ""
    return header, rows, title, legend
Exemple #3
0
def LoadTable(
    filename=None,
    sep=None,
    reader=None,
    header=None,
    rows=None,
    row_order=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    row_ids=None,
    legend="",
    column_templates=None,
    dtype=None,
    static_column_types=False,
    limit=None,
    data_frame=None,
    format="simple",
    **kwargs,
):
    """
    Deprecated entry point that forwards to ``load_table`` or ``make_table``.

    Every named argument, plus any extra keyword arguments, is forwarded.
    A truthy ``filename`` selects ``load_table``. Otherwise the file related
    arguments (``filename``, ``sep``, ``reader``, ``static_column_types``,
    ``limit``) are dropped and the remainder goes to ``make_table``.

    .. deprecated:: 2019.8.30a

        ``LoadTable`` will be removed in ``cogent3`` 2020.1.1. It's replaced by
        ``load_table`` and ``make_table``.
    """
    # snapshot the call arguments before any other local name is bound
    call_args = dict(locals())
    from cogent3.util.warning import deprecated

    # merge the extra keyword arguments in as ordinary entries
    call_args.update(call_args.pop("kwargs", {}))

    if not filename:
        deprecated("function", "LoadTable", "make_table", "2020.1.1", 1)
        file_only = ("filename", "sep", "reader", "static_column_types", "limit")
        table_args = {
            key: value for key, value in call_args.items() if key not in file_only
        }
        return make_table(**table_args)

    deprecated("function", "LoadTable", "load_table", "2020.1.1", 1)
    return load_table(**call_args)
Exemple #4
0
def LoadSeqs(
    filename=None,
    format=None,
    data=None,
    moltype=None,
    name=None,
    aligned=True,
    label_to_name=None,
    parser_kw=None,
    constructor_kw=None,
    array_align=True,
    **kw,
):
    """
    Deprecated entry point that forwards to one of the four replacements.

    The replacement is chosen from ``filename`` and ``aligned``:

    - filename and aligned: ``load_aligned_seqs``
    - filename only: ``load_unaligned_seqs``
    - aligned only: ``make_aligned_seqs``
    - neither: ``make_unaligned_seqs``

    Arguments that do not apply to the chosen replacement are removed before
    the call. Extra keyword arguments are forwarded as individual keyword
    arguments.

    .. deprecated:: 2019.8.30a

        ``LoadSeqs`` will be removed in ``cogent3`` 2020.1.1. It's replaced by
        ``load_unaligned_seqs``, ``load_aligned_seqs``, ``make_unaligned_seqs``
        and ``make_aligned_seqs``.
    """
    # take a private copy straight away: the mapping returned by locals() can
    # be refreshed later (e.g. under a tracer) and would then also contain
    # names bound further down in this function
    kwargs = dict(locals())
    # expand the extras so they arrive as real keyword arguments rather than
    # as a single argument called "kw"
    kwargs.update(kwargs.pop("kw"))
    from cogent3.util.warning import deprecated

    if filename and aligned:
        new_name, new_func = "load_aligned_seqs", load_aligned_seqs
        unused = ("aligned", "data")
    elif filename:
        new_name, new_func = "load_unaligned_seqs", load_unaligned_seqs
        unused = ("aligned", "data", "array_align")
    elif aligned:
        new_name, new_func = "make_aligned_seqs", make_aligned_seqs
        unused = ("filename", "format", "aligned", "parser_kw")
    else:
        new_name, new_func = "make_unaligned_seqs", make_unaligned_seqs
        unused = ("filename", "format", "aligned", "array_align", "parser_kw")

    deprecated("function", "LoadSeqs", new_name, "2020.1.1", 1)
    for key in unused:
        del kwargs[key]
    return new_func(**kwargs)
Exemple #5
0
def clustal_from_alignment(aln, interleave_len=None, wrap=None):
    """
    Return the sequences of ``aln`` as a Clustal formatted string.

    Parameters
    ----------
    aln
        can be an Alignment object or a dict
    interleave_len
        deprecated synonym for ``wrap``. It is used only when ``wrap`` was
        not given (or equals 60).
    wrap
        sequence line width.  Only available if sequences are
        aligned.

    Returns
    -------
    Returns a string in Clustal format. An empty string is returned if
    ``aln`` is empty.

    Raises
    ------
    ValueError
        if ``wrap`` is zero or negative, or if the sequences are not all the
        same length
    """
    if interleave_len is not None:
        from cogent3.util.warning import deprecated

        deprecated("argument", "interleave_len", "wrap", "2021.6")
        # wrap defaults to None, so that is the "not given" case. The
        # comparison with 60 is kept for backward compatibility
        # (presumably 60 was an earlier default -- TODO confirm).
        if wrap is None or wrap == 60:
            wrap = interleave_len

    if not aln:
        return ""

    if wrap is not None and wrap <= 0:
        # a non-positive width would never advance through the alignment
        raise ValueError(f"wrap must be a positive integer, got {wrap!r}")

    # get seq output order
    try:
        order = aln.RowOrder
    except AttributeError:
        # no RowOrder attribute (e.g. a dict): fall back to sorted keys
        order = sorted(aln.keys())

    seqs = SequenceCollection(aln)
    clustal_list = ["CLUSTAL\n"]

    if seqs.is_ragged():
        raise ValueError(
            "Sequences in alignment are not all the same length. "
            "Cannot generate Clustal format.")

    aln_len = seqs.seq_len

    # every sequence starts in the same column: longest label plus 4 spaces
    max_spaces = max(len(label) for label in seqs.names) + 4

    # Get ordered seqs
    ordered_seqs = [seqs.named_seqs[label] for label in order]

    if wrap is not None:
        # one group of lines per window of ``wrap`` columns, groups are
        # separated by an empty line
        for start in range(0, aln_len, wrap):
            clustal_list.extend([
                "%s%s%s" % (
                    x,
                    " " * (max_spaces - len(x)),
                    y[start:start + wrap],
                ) for x, y in zip(order, ordered_seqs)
            ])
            clustal_list.append("")
    else:
        clustal_list.extend([
            "%s%s%s" % (x, " " * (max_spaces - len(x)), y)
            for x, y in zip(order, ordered_seqs)
        ])
        clustal_list.append("")

    return "\n".join(clustal_list)
Exemple #6
0
def load_table(
    filename,
    sep=None,
    reader=None,
    digits=4,
    space=4,
    title="",
    missing_data="",
    max_width=1e100,
    index_name=None,
    legend="",
    column_templates=None,
    static_column_types=False,
    limit=None,
    format="simple",
    skip_inconsistent=False,
    **kwargs,
):
    """
    Load tabular data from a file and return it as a table.

    Parameters
    ----------
    filename
        path to file containing a tabular data
    sep
        the delimiting character between columns
    reader
        a parser for reading filename. This approach assumes the first
        row returned by the reader will be the header row.
    static_column_types
        if True, and reader is None, identifies columns
        with a numeric/bool data types from the first non-header row.
        This assumes all subsequent entries in that column are of the same type.
        Default is False.
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied. If empty, the title read from the file (if any) is used.
    missing_data
        character assigned if a row has no entry for a column
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend. If empty, the legend read from the file (if any) is
        used.
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    limit
        exits after this many lines. Only applied for non pickled data
        file types.
    format
        output format when using str(Table)
    skip_inconsistent
        skips rows that have different length to header row
    kwargs
        ``index`` is a deprecated synonym for ``index_name`` and
        ``delimiter`` is used for ``sep`` when ``sep`` is not given. Remaining
        entries are passed to ``load_delimited`` when no ``reader`` is
        provided.

    Raises
    ------
    TypeError
        if filename is not a string or path
    ValueError
        if rows have differing numbers of fields and ``skip_inconsistent``
        is False
    """
    import pathlib

    if not isinstance(filename, (str, pathlib.PurePath)):
        raise TypeError(
            "filename must be string or Path, perhaps you want make_table()")

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    sep = sep or kwargs.pop("delimiter", None)
    file_format, compress_format = get_format_suffixes(filename)

    if file_format == "json":
        return load_from_json(filename, (_Table, ))
    elif file_format in ("pickle", "pkl"):
        # NOTE: unpickling can execute arbitrary code, only load trusted files
        # the with block closes the file even if unpickling fails
        with open_(filename, mode="rb") as f:
            loaded_table = pickle.load(f)
        r = _Table()
        r.__setstate__(loaded_table)
        return r

    if reader:
        with open_(filename, newline=None) as f:
            data = list(reader(f))
            header = data[0]
            # transpose the rows into columns keyed by their header value
            data = {column[0]: column[1:] for column in zip(*data)}
    else:
        if file_format == "csv":
            sep = sep or ","
        elif file_format == "tsv":
            sep = sep or "\t"

        header, rows, loaded_title, loaded_legend = load_delimited(
            filename, sep=sep, limit=limit, **kwargs)
        if skip_inconsistent:
            num_fields = len(header)
            rows = [r for r in rows if len(r) == num_fields]
        else:
            lengths = set(map(len, [header] + rows))
            if len(lengths) != 1:
                msg = f"inconsistent number of fields {lengths}"
                raise ValueError(msg)

        # values given by the caller win over those found in the file
        title = title or loaded_title
        legend = legend or loaded_legend
        data = {column[0]: column[1:] for column in zip(header, *rows)}

    data = {
        key: cast_str_to_array(value, static_type=static_column_types)
        for key, value in data.items()
    }

    return make_table(
        header=header,
        data=data,
        digits=digits,
        title=title,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        format=format,
    )
Exemple #7
0
def make_table(
    header=None,
    data=None,
    row_order=None,
    digits=4,
    space=4,
    title="",
    max_width=1e100,
    index_name=None,
    legend="",
    missing_data="",
    column_templates=None,
    data_frame=None,
    format="simple",
    **kwargs,
):
    """
    Construct a table from in-memory data.

    Parameters
    ----------
    header
        column headings
    data
        a 2D dict, list or tuple. If a dict, it must have column
        headings as top level keys, and common row labels as keys in each
        column.
    row_order
        the order in which rows will be pulled from the twoDdict
    digits
        floating point resolution
    space
        number of spaces between columns or a string
    title
        as implied
    max_width
        maximum column width for printing
    index_name
        column name with values to be used as row identifiers and keys
        for slicing. All column values must be unique.
    legend
        table legend
    missing_data
        replace missing data with this
    column_templates
        dict of column headings
        or a function that will handle the formatting.
    data_frame
        a pandas DataFrame, supersedes header/rows
    format
        output format when using str(Table)
    kwargs
        ``rows`` is accepted as an alternative to ``data`` and ``index`` is a
        deprecated synonym for ``index_name``. Other entries are ignored.

    Raises
    ------
    TypeError
        if header or data is a string, or data_frame is not a DataFrame
    """
    if any(isinstance(a, str) for a in (header, data)):
        raise TypeError("str type invalid, if its a path use load_table()")

    if "index" in kwargs:
        deprecated("argument", "index", "index_name", "2021.11")
        index_name = kwargs.pop("index", index_name)

    # only let ``rows`` replace ``data`` when it actually carries a value, an
    # explicit rows=None must not discard the provided data
    rows = kwargs.get("rows")
    if rows is not None:
        data = rows

    if data_frame is not None:
        from pandas import DataFrame

        if not isinstance(data_frame, DataFrame):
            raise TypeError(f"expecting a DataFrame, got {type(data_frame)}")

        data = {c: data_frame[c].to_numpy() for c in data_frame}

    return _Table(
        header=header,
        data=data,
        digits=digits,
        row_order=row_order,
        title=title,
        column_templates=column_templates,
        space=space,
        missing_data=missing_data,
        max_width=max_width,
        index_name=index_name,
        legend=legend,
        data_frame=data_frame,
        format=format,
    )