Ejemplo n.º 1
0
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Read gene signatures from a file on disk.

    The loader is selected from the suffixes of the file name. Recognised
    formats are enriched motifs tables (CSV/TSV), YAML, GMT and DAT (pickled).

    :param fname: The name of the file that contains the signatures.
    :return: The gene signatures read from the file.
    :raises ValueError: If the file name matches none of the supported formats.
    """
    suffixes = PurePath(fname).suffixes

    # Enriched motifs table (csv/tsv): the separator follows from the suffixes.
    if is_valid_suffix(suffixes, 'ctx'):
        motifs = load_motifs(fname, sep=suffixes_to_separator(suffixes))
        return df2regulons(motifs)

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)

    if '.gmt' in suffixes:
        # One guessed separator is used both between fields and between genes.
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(
            fname, field_separator=separator, gene_separator=separator)

    if '.dat' in suffixes:
        with openfile(fname, 'rb') as handle:
            return pickle.load(handle)

    raise ValueError("Unknown file format \"{}\".".format(fname))
Ejemplo n.º 2
0
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Read gene signatures from a file on disk.

    The loader is selected from the lower-cased last extension of the file
    name. Recognised formats are enriched motifs tables (any extension listed
    in FILE_EXTENSION2SEPARATOR), YAML, GMT and DAT (pickled).

    :param fname: The name of the file that contains the signatures.
    :return: The gene signatures read from the file.
    :raises ValueError: If the extension matches none of the supported formats.
    """
    _, raw_suffix = os.path.splitext(fname)
    suffix = raw_suffix.lower()

    # Delimited enriched motifs table: the extension determines the separator.
    if suffix in FILE_EXTENSION2SEPARATOR:
        motifs = load_motifs(fname, sep=FILE_EXTENSION2SEPARATOR[suffix])
        return df2regulons(motifs)

    if suffix in ('.yaml', '.yml'):
        return load_from_yaml(fname)

    if suffix.endswith('.gmt'):
        # One guessed separator is used both between fields and between genes.
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(
            fname, field_separator=separator, gene_separator=separator)

    if suffix == '.dat':
        with open(fname, 'rb') as handle:
            return pickle.load(handle)

    raise ValueError("Unknown file format \"{}\".".format(fname))
Ejemplo n.º 3
0
def _load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from a YAML file or from a binary pickle (DAT) file.

    Loading from YAML is extremely slow. Switching to JSON or using a CLoader
    (https://stackoverflow.com/questions/27743711/can-i-speedup-yaml) were
    considered as improvements; binary pickling is the alternative that was
    chosen in the end.

    For any other file name an error is logged and the process exits with
    status 1.

    :param fname: The name of the file that contains the modules.
    :return: The modules read from the file.
    """
    if fname.endswith(('.yaml', '.yml')):
        return load_from_yaml(fname)

    if fname.endswith('.dat'):
        with open(fname, 'rb') as handle:
            return pickle.load(handle)

    LOGGER.error("Unknown file format for \"{}\".".format(fname))
    sys.exit(1)
Ejemplo n.º 4
0
def run(args):
    """
    Load modules, run prune2df against the ranking databases and write a CSV.

    Settings are read from the configuration file ``args.config_filename``.
    ``args.input`` and ``args.output``, when set, take precedence over the
    ``modules`` and ``output`` entries of the configuration.

    :param args: Parsed command-line arguments. The attributes used are
        ``config_filename``, ``input``, ``output`` and ``num_workers``.
    """
    # Set logging level.
    logging_debug_opt = False
    LOGGER.addHandler(create_logging_handler(logging_debug_opt))
    LOGGER.setLevel(logging.DEBUG)

    LOGGER.info("Using configuration {}.".format(args.config_filename))
    cfg = ConfigParser()
    cfg.read(args.config_filename)
    data = cfg['data']
    parameters = cfg['parameters']

    in_fname = args.input if args.input else data['modules']
    LOGGER.info("Loading modules from {}.".format(in_fname))
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    if in_fname.endswith(('.yaml', '.yml')):
        modules = load_from_yaml(in_fname)
    else:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(in_fname, 'rb') as f:
            modules = pickle.load(f)
    # Filter out modules with too few genes.
    min_genes = int(parameters['min_genes'])
    modules = [module for module in modules if len(module) >= min_genes]

    LOGGER.info("Loading databases.")

    def name(fname):
        # Database name = file name without directory and last extension.
        return os.path.splitext(os.path.basename(fname))[0]

    # The 'databases' entry is a ';'-separated list of glob patterns.
    db_fnames = list(mapcat(glob.glob, data['databases'].split(";")))
    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = data['motif_annotations']
    mode = parameters['mode']
    with ProgressBar() if mode == "dask_multiprocessing" else NoProgressBar():
        # ConfigParser returns every value as a string, so all numeric
        # parameters (including the chunk size) are converted explicitly.
        df = prune2df(dbs,
                      modules,
                      motif_annotations_fname,
                      rank_threshold=int(parameters['rank_threshold']),
                      auc_threshold=float(parameters['auc_threshold']),
                      nes_threshold=float(parameters['nes_threshold']),
                      client_or_address=mode,
                      module_chunksize=int(parameters['chunk_size']),
                      num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    df.to_csv(args.output if args.output else parameters['output'])
Ejemplo n.º 5
0
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from a YAML, DAT (pickled) or GMT file.

    Loading from YAML is extremely slow. Switching to JSON or using a CLoader
    (https://stackoverflow.com/questions/27743711/can-i-speedup-yaml) were
    considered as improvements; binary pickling is the alternative that was
    chosen in the end.

    :param fname: The name of the file that contains the modules.
    :return: The modules read from the file.
    :raises ValueError: If the file name matches none of the supported formats.
    """
    suffixes = PurePath(fname).suffixes

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)

    if '.dat' in suffixes:
        with openfile(fname, 'rb') as handle:
            return pickle.load(handle)

    if '.gmt' in suffixes:
        return GeneSignature.from_gmt(fname)

    raise ValueError("Unknown file format for \"{}\".".format(fname))
Ejemplo n.º 6
0
    N_SAMPLES = ex_matrix.shape[0]  # Full dataset
    print(N_SAMPLES)

    # Read in the pickled modules.
    with open(MODULES_BIN_FNAME, "rb") as f:
        modules = pickle.load(f)
    print("LOADED modules, type:")

    print(type(modules))
    # print(modules)  # disabled: long list

    # ------------- Phase II: Prune modules for targets with cis regulatory footprints (aka RcisTarget) ----
    # NOTE(review): no pruning happens in this fragment; the regulons are
    # loaded from REGULONS_BIN_FNAME with load_from_yaml instead.

    print("STARTING PHASE II")

    regulons = load_from_yaml(REGULONS_BIN_FNAME)
    print("LOADED regulons, type:")
    print(type(regulons))
    # print(regulons)

    # ------------- Phase III: Cellular regulon enrichment matrix (aka AUCell) ----------------

    print("STARTING PHASE III")

    # ex_matrix is passed as-is; per the original note it was already
    # transposed earlier and must not be transposed again.
    # NOTE(review): an earlier note said the keyword should be num_cores, yet
    # the call passes num_workers -- confirm against the aucell signature in use.
    auc_mtx = aucell(
        ex_matrix, regulons, num_workers=nCores
    )
    # Write the AUC matrix as a tab-separated file.
    auc_mtx.to_csv(AUC_FNAME, sep='\t')
    print("DEFINED auc_mtx")