Ejemplo n.º 1
0
def save_enriched_motifs(df, fname: str) -> None:
    """
    Write enriched motifs to disk in the format implied by the file name.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.
    The format is selected from the suffixes of ``fname``. Delimited text
    output writes the dataframe itself; every other format first converts
    the dataframe to regulons via ``df2regulons``.

    :param df: The dataframe of enriched motifs to save.
    :param fname: The name of the file to write; its suffixes select the format.
    :return: Nothing.
    :raises ValueError: If the suffixes of ``fname`` match none of the
        supported formats.
    """
    suffixes = PurePath(fname).suffixes

    # Delimited text: dump the dataframe directly with the matching separator.
    if is_valid_suffix(suffixes, 'ctx'):
        separator = suffixes_to_separator(suffixes)
        df.to_csv(fname, sep=separator)
        return

    # All remaining formats operate on regulons rather than on the dataframe.
    regulons = df2regulons(df)

    if '.json' in suffixes:
        # JSON keeps only the regulon name and the names of its target genes.
        targets_by_name = {}
        for regulon in regulons:
            targets_by_name[regulon.name] = list(regulon.gene2weight.keys())
        with openfile(fname, 'w') as handle:
            handle.write(json.dumps(targets_by_name))
        return

    if '.dat' in suffixes:
        with openfile(fname, 'wb') as handle:
            pickle.dump(regulons, handle)
        return

    if '.gmt' in suffixes:
        GeneSignature.to_gmt(fname, regulons)
        return

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        save_to_yaml(regulons, fname)
        return

    raise ValueError("Unknown file format \"{}\".".format(fname))
Ejemplo n.º 2
0
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load genes signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).
    The format is selected from the suffixes of ``fname``.

    NOTE: DAT files are read with ``pickle.load``, which can execute arbitrary
    code while loading. Only load DAT files that come from a trusted source.

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: If the suffixes of ``fname`` match none of the
        supported formats.
    """
    # PurePath.suffixes is a list (e.g. ['.csv', '.gz']), so individual
    # suffixes must be tested with ``in`` rather than compared with ``==``.
    extension = PurePath(fname).suffixes
    if is_valid_suffix(extension, 'ctx'):
        # csv/tsv
        return df2regulons(
            load_motifs(fname, sep=suffixes_to_separator(extension)))
    elif is_valid_suffix(extension, 'ctx_yaml'):
        return load_from_yaml(fname)
    elif '.gmt' in extension:
        # The same guessed separator is used between fields and between genes.
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=sep,
                                      gene_separator=sep)
    elif '.dat' in extension:
        with openfile(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
Ejemplo n.º 3
0
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from disk in the format implied by the file name.

    The suffixes of ``fname`` select between YAML, DAT (pickled) and GMT.

    Loading from YAML is extremely slow, so it is a candidate for a
    performance improvement. Options considered were switching to JSON or
    using a CLoader
    (https://stackoverflow.com/questions/27743711/can-i-speedup-yaml);
    the alternative chosen in the end is binary pickling.

    :param fname: The name of the file that contains the modules.
    :return: The modules loaded from the file.
    :raises ValueError: If the suffixes of ``fname`` match none of the
        supported formats.
    """
    suffixes = PurePath(fname).suffixes

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)

    if '.dat' in suffixes:
        with openfile(fname, 'rb') as handle:
            modules = pickle.load(handle)
        return modules

    if '.gmt' in suffixes:
        return GeneSignature.from_gmt(fname)

    raise ValueError("Unknown file format for \"{}\".".format(fname))
Ejemplo n.º 4
0
def guess_separator(fname: str) -> str:
    """
    Guess the field separator used in a delimited text file.

    Tab, semicolon and comma are tried in that order. The first separator
    that splits every non-blank, non-comment line (comment lines start with
    ``#``) into at least three fields is returned.

    :param fname: The name of the file to inspect.
    :return: The guessed separator: ``'\\t'``, ``';'`` or ``','``.
    :raises ValueError: If no candidate separator fits, or if the file has no
        content lines at all (empty, or only blank/comment lines).
    """
    with openfile(fname, 'r') as f:
        lines = f.readlines()

    # Lines may be returned as bytes (e.g. for a gzipped file); decode those.
    lines = [
        line.decode() if isinstance(line, (bytes, bytearray)) else line
        for line in lines
    ]

    # Filter once up front instead of once per candidate separator.
    content_lines = []
    for line in lines:
        stripped = line.strip()
        if stripped and not stripped.startswith('#'):
            content_lines.append(line)

    # Without content lines there is nothing to base a guess on; fall through
    # to the explicit error below instead of failing inside min() on an
    # empty sequence.
    if content_lines:
        for sep in ('\t', ';', ','):
            if all(len(line.split(sep)) >= 3 for line in content_lines):
                return sep
    raise ValueError("Unknown file format \"{}\".".format(fname))