Ejemplo n.º 1
0
def save_enriched_motifs(df, fname: str) -> None:
    """
    Write enriched motifs to disk.

    The output format is picked from the suffixes of ``fname``. When the
    suffixes are accepted as ``'ctx'`` by ``is_valid_suffix`` the dataframe
    itself is written via ``to_csv``; otherwise the dataframe is first
    converted to regulons, which are then stored as JSON, DAT (pickle), GMT
    or YAML.

    :param df: The dataframe with the enriched motifs.
    :param fname: The name of the file to write to.
    :return: None.
    :raises ValueError: If the suffixes match none of the supported formats.
    """
    suffixes = PurePath(fname).suffixes

    # Tabular output: the dataframe is written unchanged.
    if is_valid_suffix(suffixes, 'ctx'):
        df.to_csv(fname, sep=suffixes_to_separator(suffixes))
        return

    # Every remaining format stores regulons rather than the raw table.
    regulons = df2regulons(df)
    if '.json' in suffixes:
        targets_by_name = {}
        for regulon in regulons:
            targets_by_name[regulon.name] = list(regulon.gene2weight.keys())
        with openfile(fname, 'w') as handle:
            handle.write(json.dumps(targets_by_name))
    elif '.dat' in suffixes:
        with openfile(fname, 'wb') as handle:
            pickle.dump(regulons, handle)
    elif '.gmt' in suffixes:
        GeneSignature.to_gmt(fname, regulons)
    elif is_valid_suffix(suffixes, 'ctx_yaml'):
        save_to_yaml(regulons, fname)
    else:
        raise ValueError('Unknown file format "{}".'.format(fname))
Ejemplo n.º 2
0
def test_intersection3():
    """Intersecting two weighted signatures leaves only TP53, with weight 0.8."""
    left = GeneSignature(name="test1", gene2weight=dict(TP53=0.8, SOX4=0.75))
    right = GeneSignature(name="test1", gene2weight=dict(TP53=0.3, SOX2=0.60))

    common = left.intersection(right)

    assert len(common) == 1
    assert 'TP53' in common
    assert common.gene2weight['TP53'] == 0.8
Ejemplo n.º 3
0
def save_enriched_motifs(df, fname: str) -> None:
    """
    Write enriched motifs to disk.

    The output format is picked from the lower-cased file extension of
    ``fname``. Extensions listed in ``FILE_EXTENSION2SEPARATOR`` write the
    dataframe itself via ``to_csv``; otherwise the dataframe is first
    converted to regulons, which are then stored as JSON, DAT (pickle), GMT
    or YAML.

    :param df: The dataframe with the enriched motifs.
    :param fname: The name of the file to write to.
    :return: None.
    :raises ValueError: If the extension matches none of the supported formats.
    """
    _, ext = os.path.splitext(fname)
    ext = ext.lower()

    # Tabular output: the dataframe is written unchanged.
    if ext in FILE_EXTENSION2SEPARATOR:
        df.to_csv(fname, sep=FILE_EXTENSION2SEPARATOR[ext])
        return

    # Every remaining format stores regulons rather than the raw table.
    regulons = df2regulons(df)
    if ext == '.json':
        targets_by_name = {}
        for regulon in regulons:
            targets_by_name[regulon.name] = list(regulon.gene2weight.keys())
        with open(fname, 'w') as handle:
            handle.write(json.dumps(targets_by_name))
    elif ext == '.dat':
        with open(fname, 'wb') as handle:
            pickle.dump(regulons, handle)
    elif ext == '.gmt':
        GeneSignature.to_gmt(fname, regulons)
    elif ext in ('.yaml', '.yml'):
        save_to_yaml(regulons, fname)
    else:
        raise ValueError('Unknown file format "{}".'.format(fname))
Ejemplo n.º 4
0
def test_diff3():
    """The difference of two weighted signatures leaves only SOX4, with weight 0.75."""
    left = GeneSignature(name="test1", gene2weight=dict(TP53=0.8, SOX4=0.75))
    right = GeneSignature(name="test1", gene2weight=dict(TP53=0.3, SOX2=0.60))

    remainder = left.difference(right)

    assert 'SOX4' in remainder
    assert remainder.gene2weight['SOX4'] == 0.75
    assert len(remainder) == 1
Ejemplo n.º 5
0
def test_union1():
    """The union of two gene-list signatures contains all three distinct genes."""
    left = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4'])
    right = GeneSignature(name="test1", gene2weight=['TP53', 'SOX2'])

    combined = left.union(right)

    for gene in ('TP53', 'SOX4', 'SOX2'):
        assert gene in combined
    assert len(combined) == 3
Ejemplo n.º 6
0
def test_rename():
    """rename() yields a signature with the new name and the same genes and weights."""
    weights = {'TP53': 0.5, 'SOX4': 0.75}
    original = GeneSignature(name="test1", gene2weight=weights)

    renamed = original.rename('test2')

    for gene in ('TP53', 'SOX4'):
        assert gene in renamed
    assert renamed.name == 'test2'
    assert len(renamed) == 2
    assert renamed.gene2weight['TP53'] == 0.5
    assert renamed.gene2weight['SOX4'] == 0.75
Ejemplo n.º 7
0
def test_head():
    """head(n) yields a signature of length n holding the expected weights."""
    signature = GeneSignature(name="test1", gene2weight=dict(TP53=0.8, SOX4=0.75))

    top_one = signature.head(1)
    assert top_one['TP53'] == 0.8
    assert len(top_one) == 1

    top_two = signature.head(2)
    assert top_two['TP53'] == 0.8
    assert top_two['SOX4'] == 0.75
    assert len(top_two) == 2
Ejemplo n.º 8
0
def test_union3():
    """The union of two weighted signatures holds three genes with the expected weights."""
    left = GeneSignature(name="test1", gene2weight=dict(TP53=0.8, SOX4=0.75))
    right = GeneSignature(name="test1", gene2weight=dict(TP53=0.3, SOX2=0.60))

    combined = left.union(right)

    # Membership and weight are checked gene by gene, in this order.
    expected_weights = (('TP53', 0.8), ('SOX4', 0.75), ('SOX2', 0.6))
    for gene, weight in expected_weights:
        assert gene in combined
        assert combined.gene2weight[gene] == weight
    assert len(combined) == 3
Ejemplo n.º 9
0
def test_noweights():
    """noweights() sets weights to 1.0 on the result, leaves the source unchanged and keeps the Regulon type."""
    weights = {'TP53': 0.8, 'SOX4': 0.75}

    signature = GeneSignature(name="test1", gene2weight=weights)
    unweighted_signature = signature.noweights()
    assert signature['TP53'] == 0.8
    assert unweighted_signature['TP53'] == 1.0

    regulon = Regulon(
        name='TP53 regulon',
        gene2weight={'TP53': 0.8, 'SOX4': 0.75},
        transcription_factor="TP53",
        gene2occurrence={"TP53": 1},
    )
    unweighted_regulon = regulon.noweights()
    assert regulon['TP53'] == 0.8
    assert unweighted_regulon['TP53'] == 1.0
    assert isinstance(unweighted_regulon, Regulon)
Ejemplo n.º 10
0
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Read gene signatures from a file.

    The loader is chosen from the lower-cased file extension of ``fname``:
    extensions listed in ``FILE_EXTENSION2SEPARATOR`` are read as enriched
    motifs and converted to regulons, and YAML, GMT and DAT (pickled) files
    are handled by their dedicated loaders.

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: If the extension matches none of the supported formats.
    """
    _, ext = os.path.splitext(fname)
    ext = ext.lower()

    if ext in FILE_EXTENSION2SEPARATOR:
        motifs = load_motifs(fname, sep=FILE_EXTENSION2SEPARATOR[ext])
        return df2regulons(motifs)
    if ext in ('.yaml', '.yml'):
        return load_from_yaml(fname)
    if ext.endswith('.gmt'):
        # The same guessed separator is used between fields and between genes.
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=separator,
                                      gene_separator=separator)
    if ext == '.dat':
        with open(fname, 'rb') as handle:
            return pickle.load(handle)
    raise ValueError('Unknown file format "{}".'.format(fname))
Ejemplo n.º 11
0
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load genes signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).
    The format is selected from the list of suffixes of ``fname``.

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: If the suffixes match none of the supported formats.
    """
    extension = PurePath(fname).suffixes
    if is_valid_suffix(extension, 'ctx'):
        # csv/tsv
        return df2regulons(
            load_motifs(fname, sep=suffixes_to_separator(extension)))
    elif is_valid_suffix(extension, 'ctx_yaml'):
        return load_from_yaml(fname)
    elif '.gmt' in extension:
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=sep,
                                      gene_separator=sep)
    elif '.dat' in extension:
        # ``extension`` is a list of suffixes, so the test must be membership:
        # comparing the list to the string '.dat' is always False and made
        # this branch unreachable.
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with openfile(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
Ejemplo n.º 12
0
def gs():
    """Return the first gene signature read from the test GMT file."""
    signatures = GeneSignature.from_gmt(
        TEST_SIGNATURE_FNAME,
        NOMENCLATURE,
        field_separator="\t",
        gene_separator="\t",
    )
    return signatures[0]
Ejemplo n.º 13
0
def test_load_gmt():
    """Loading the test GMT file yields 189 signatures; the first is checked in detail."""
    signatures = GeneSignature.from_gmt(fname=TEST_SIGNATURE_FNAME,
                                        field_separator='\t',
                                        gene_separator='\t')
    # http://software.broadinstitute.org/gsea/msigdb/collections.jsp#C6
    assert len(signatures) == 189

    first = signatures[0]
    assert first.name == "GLI1_UP.V1_DN"
    assert "COPZ1" in first
    assert len(first) == 29
Ejemplo n.º 14
0
def aucell_command(args):
    """
    Calculate regulon enrichment (as AUC values) for cells.

    Loads the expression matrix, obtains the regulons from
    ``args.regulons_fname`` (how they are read depends on the file name's
    ending), runs AUCell and writes the resulting matrix to ``args.output``
    as CSV.

    :param args: The parsed command line arguments.
    """
    LOGGER.info("Loading expression matrix.")
    ex_mtx = _load_expression_matrix(args)

    regulons_path = args.regulons_fname.name
    # str.endswith accepts a tuple, so all table extensions are tested at once.
    if regulons_path.endswith(tuple(FILE_EXTENSION2SEPARATOR.keys())):
        LOGGER.info("Creating regulons.")
        regulons = _df2regulons(regulons_path, args.nomenclature)
    elif regulons_path.endswith('.gmt'):
        LOGGER.info("Loading regulons.")
        regulons = GeneSignature.from_gmt(regulons_path,
                                          args.nomenclature,
                                          field_separator='\t',
                                          gene_separator='\t')
    else:
        LOGGER.info("Loading regulons.")
        regulons = _load_modules(regulons_path)

    LOGGER.info("Calculating enrichment.")
    # Weights are ignored unless the user explicitly asked for them.
    ignore_weights = args.weights != 'yes'
    auc_heatmap = aucell(ex_mtx,
                         regulons,
                         auc_threshold=args.auc_threshold,
                         noweights=ignore_weights,
                         num_cores=args.num_workers)

    LOGGER.info("Writing results to file.")
    auc_heatmap.to_csv(args.output)
Ejemplo n.º 15
0
def test_init2():
    """A signature can be built from a list of (gene, weight) tuples."""
    pairs = [('TP53', 0.5), ('SOX4', 0.75)]
    signature = GeneSignature(name="test1", gene2weight=pairs)

    for gene in ('TP53', 'SOX4'):
        assert gene in signature
    assert signature.name == 'test1'
    assert len(signature) == 2
    assert signature.gene2weight['TP53'] == 0.5
    assert signature.gene2weight['SOX4'] == 0.75
Ejemplo n.º 16
0
def test_init3():
    """A signature can be built from a gene-to-weight dictionary."""
    weights = {'TP53': 0.5, 'SOX4': 0.75}
    signature = GeneSignature(name="test1", gene2weight=weights)

    for gene in ('TP53', 'SOX4'):
        assert gene in signature
    assert signature.name == 'test1'
    assert len(signature) == 2
    assert signature.gene2weight['TP53'] == 0.5
    assert signature.gene2weight['SOX4'] == 0.75
Ejemplo n.º 17
0
def gmt2regions(gmt_fname, db_fname, delineation_code, fraction):
    """
    Convert every gene signature of a GMT file and write the results to stdout.

    For each signature one comma-separated line is written: the signature name
    followed by the ``genes`` of the converted signature.

    :param gmt_fname: The name of the GMT file with the gene signatures.
    :param db_fname: The name of the region ranking database file.
    :param delineation_code: Key into ``CODE2DELINEATION`` selecting the delineation.
    :param fraction: Passed through unchanged to ``convert``.
    """
    db = RegionRankingDatabase(fname=db_fname, name=os.path.basename(db_fname))
    signatures = GeneSignature.from_gmt(gmt_fname)
    delineation = CODE2DELINEATION[delineation_code]
    for signature in signatures:
        regions = convert(signature, db, delineation, fraction).genes
        # sys.stdout is a file object, not a callable: calling it raised a
        # TypeError. Write through it instead, terminating each record with a
        # newline so that signatures end up on separate lines.
        sys.stdout.write(signature.name + ',' + ','.join(regions) + '\n')
Ejemplo n.º 18
0
def test_init1():
    """A signature built from a plain gene list gives every gene weight 1.0."""
    genes = ['TP53', 'SOX4']
    signature = GeneSignature(name="test1", gene2weight=genes)

    for gene in ('TP53', 'SOX4'):
        assert gene in signature
    assert signature.name == 'test1'
    assert len(signature) == 2
    assert signature.gene2weight['TP53'] == 1.0
    assert signature.gene2weight['SOX4'] == 1.0
Ejemplo n.º 19
0
def test_aucell_mismatch(exp_matrix, gs):
    """Run aucell with a signature of 100 made-up gene names prepended to ``gs``."""
    percentiles = derive_auc_threshold(exp_matrix)

    # Gene names FAKE0 .. FAKE99.
    fake_genes = ["FAKE{}".format(idx) for idx in range(100)]
    unmatched = GeneSignature(name="test", gene2weight=fake_genes)
    signatures = [unmatched] + gs

    aucs_mtx = aucell(exp_matrix,
                      signatures,
                      auc_threshold=percentiles[0.01],
                      num_workers=1)
    print(aucs_mtx.head())
Ejemplo n.º 20
0
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from a YAML, DAT (pickled) or GMT file.

    The format is selected from the list of suffixes of ``fname``.

    :param fname: The name of the file that contains the modules.
    :return: The loaded modules.
    :raises ValueError: If the suffixes match none of the supported formats.
    """
    # Loading from YAML is extremely slow, which makes it a candidate for a
    # performance improvement. Options are switching to JSON or using a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    # The alternative that was chosen in the end is binary pickling.
    suffixes = PurePath(fname).suffixes

    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)
    if '.dat' in suffixes:
        with openfile(fname, 'rb') as handle:
            return pickle.load(handle)
    if '.gmt' in suffixes:
        return GeneSignature.from_gmt(fname)
    raise ValueError('Unknown file format for "{}".'.format(fname))
Ejemplo n.º 21
0
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load modules from a YAML, DAT (pickled) or GMT file.

    The format is selected from the ending of ``fname``.

    :param fname: The name of the file that contains the modules.
    :return: The loaded modules.
    :raises ValueError: If the file name has none of the supported endings.
    """
    # Loading from YAML is extremely slow, which makes it a candidate for a
    # performance improvement. Options are switching to JSON or using a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    # The alternative that was chosen in the end is binary pickling.
    if fname.endswith(('.yaml', '.yml')):
        return load_from_yaml(fname)
    if fname.endswith('.dat'):
        with open(fname, 'rb') as handle:
            return pickle.load(handle)
    if fname.endswith('.gmt'):
        # The same guessed separator is used between fields and between genes.
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=separator,
                                      gene_separator=separator)
    raise ValueError('Unknown file format for "{}".'.format(fname))
Ejemplo n.º 22
0
 def calculate_regulon_enrichment(self):
     """
     Calculate regulon enrichment per cell using AUCell.

     Each regulon is turned into a GeneSignature whose gene weights come from
     ``get_regulon_gene_data`` for the key ``auc_regulon_weights_key``.

     :return: The AUC matrix, with rows selected by the index of ``self.ex_mtx``.
     """
     message = "Using {} to weight the genes when running AUCell."
     print(message.format(self.auc_regulon_weights_key))

     # One signature per regulon, weighted by the configured key.
     regulon_signatures = [
         GeneSignature(
             name=regulon.name,
             gene2weight=self.get_regulon_gene_data(
                 regulon, self.auc_regulon_weights_key),
         )
         for regulon in self.regulons
     ]

     # (n_cells x n_regulons)
     enrichment_mtx = aucell(self.ex_mtx,
                             regulon_signatures,
                             num_workers=self.num_workers)
     return enrichment_mtx.loc[self.ex_mtx.index]
Ejemplo n.º 23
0
    def signatures():
        """
        Yield one GeneSignature per gene signature file.

        Every path in ``gene_sig_file_paths`` is read as a headerless
        tab-separated table. A file whose base name (without extension) is
        ``regulons`` is skipped. When weights are used, rows are kept only if
        their weight exceeds ``weight_threshold``, and a signature left empty
        by that filter is skipped.

        :raises Exception: If a file has 0 columns or more than 2 columns.
        """
        for path in gene_sig_file_paths:
            table = pd.read_csv(path, sep='\t', header=None, index_col=None)
            regulon, _ = os.path.splitext(ntpath.basename(path))

            # The regulon frequency file is not a signature.
            if regulon == 'regulons':
                continue

            # Sanity checks on the number of columns.
            n_columns = len(table.columns)
            if n_columns == 0:
                raise Exception(
                    f"{path} has 0 columns. Requires .tsv with 1 or 2 columns. First column should be genes (required), second (optional) are weight for the given genes."
                )
            if n_columns > 2:
                raise Exception(
                    f"{path} has more than 2 columns. Requires .tsv with 1 or 2 columns. First column should be genes, second (optional) are weight for the given genes."
                )

            if n_columns == 1 or noweights:
                # Genes only: the first column is used as is.
                gene2weight = table[0]
            else:
                # Two columns (genes, weights) with weights enabled: keep the
                # rows whose weight is above the threshold.
                table = table[table[1] > weight_threshold]
                if not len(table.index):
                    if show_warnings:
                        warnings.warn(
                            "{0} is empty after apply filter with weight_threshold > {1}"
                            .format(regulon, weight_threshold))
                    continue
                gene2weight = list(map(tuple, table.values))
            yield GeneSignature(name=regulon, gene2weight=gene2weight)
Ejemplo n.º 24
0
    def doGeneSetEnrichment(self, request, context):
        """
        Run an AUCell gene set enrichment and stream progress updates.

        This is a generator: before each stage it yields the value returned by
        ``gse.update_state(...)``. The final update (step 4) carries the
        computed AUC values. Rankings are created and saved when
        ``gse.has_AUCell_rankings()`` is false, and loaded from the existing
        ranking file otherwise.

        :param request: The request; ``geneSetFilePath`` and ``loomFilePath`` are read from it.
        :param context: Not used by this method.
        :return: A generator of the values returned by ``gse.update_state``.
        """
        gene_set_file_path = os.path.join(self.dfh.get_gene_sets_dir(),
                                          request.geneSetFilePath)
        loom = self.lfh.get_loom(loom_file_path=request.loomFilePath)
        gse = _gse.GeneSetEnrichment(scope=self,
                                     method="AUCell",
                                     loom=loom,
                                     gene_set_file_path=gene_set_file_path,
                                     annotation='')

        # Running AUCell...
        yield gse.update_state(step=-1,
                               status_code=200,
                               status_message="Running AUCell...",
                               values=None)
        time.sleep(1)

        # Reading gene set...
        yield gse.update_state(step=0,
                               status_code=200,
                               status_message="Reading the gene set...",
                               values=None)
        with open(gse.gene_set_file_path, 'r') as f:
            # Skip first line because it contains the name of the signature
            gs = GeneSignature(name='Gene Signature #1',
                               gene2weight=[
                                   line.strip() for idx, line in enumerate(f)
                                   if idx > 0
                               ])
        time.sleep(1)

        if not gse.has_AUCell_rankings():
            # Creating the matrix as DataFrame...
            yield gse.update_state(step=1,
                                   status_code=200,
                                   status_message="Creating the matrix...",
                                   values=None)
            loom = self.lfh.get_loom(loom_file_path=request.loomFilePath)
            dgem = np.transpose(loom.get_connection()[:, :])
            ex_mtx = pd.DataFrame(data=dgem,
                                  index=loom.get_ca_attr_by_name("CellID"),
                                  columns=loom.get_genes())
            # Creating the rankings...
            start_time = time.time()
            yield gse.update_state(step=2.1,
                                   status_code=200,
                                   status_message="Creating the rankings...",
                                   values=None)
            rnk_mtx = create_rankings(ex_mtx=ex_mtx)
            # Saving the rankings...
            yield gse.update_state(step=2.2,
                                   status_code=200,
                                   status_message="Saving the rankings...",
                                   values=None)
            # DataFrame.as_matrix() was removed in pandas 1.0; .values returns
            # the same array and is available in old and new pandas alike.
            lp.create(gse.get_AUCell_ranking_filepath(), rnk_mtx.values,
                      {"CellID": loom.get_cell_ids()},
                      {"Gene": loom.get_genes()})
            print("Debug: %s seconds elapsed ---" % (time.time() - start_time))
        else:
            # Load the rankings...
            yield gse.update_state(
                step=2,
                status_code=200,
                status_message="Rankings exists: loading...",
                values=None)
            rnk_loom = self.lfh.get_loom_connection(
                gse.get_AUCell_ranking_filepath())
            rnk_mtx = pd.DataFrame(data=rnk_loom[:, :],
                                   index=rnk_loom.ra.CellID,
                                   columns=rnk_loom.ca.Gene)

        # Calculating AUCell enrichment...
        start_time = time.time()
        yield gse.update_state(
            step=3,
            status_code=200,
            status_message="Calculating AUCell enrichment...",
            values=None)
        # Only the "AUC" column of the enrichment result is reported back.
        aucs = enrichment(rnk_mtx, gs).loc[:, "AUC"].values

        print("Debug: %s seconds elapsed ---" % (time.time() - start_time))
        yield gse.update_state(step=4,
                               status_code=200,
                               status_message=gse.get_method() +
                               " enrichment done!",
                               values=aucs)
0
def test_diff1():
    """The difference of two gene-list signatures leaves only SOX4."""
    left = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4'])
    right = GeneSignature(name="test1", gene2weight=['TP53', 'SOX2'])

    remainder = left.difference(right)

    assert 'SOX4' in remainder
    assert len(remainder) == 1
Ejemplo n.º 26
0
def test_add():
    """add() yields a signature that contains the added gene."""
    signatures = GeneSignature.from_gmt(fname=TEST_SIGNATURE_FNAME,
                                        field_separator='\t',
                                        gene_separator='\t')
    first = signatures[0]
    extended = first.add("MEF2")
    assert "MEF2" in extended
Ejemplo n.º 27
0
def test_immut():
    """Assigning to the name or to a gene weight of a signature raises."""
    signature = GeneSignature(name="test1", gene2weight=dict(TP53=0.5, SOX4=0.75))

    # Rebinding an attribute is rejected.
    with pytest.raises(attr.exceptions.FrozenInstanceError):
        signature.name = 'rename'

    # Item assignment on the weight mapping is rejected.
    with pytest.raises(TypeError):
        signature.gene2weight['TP53'] = 0.6
Ejemplo n.º 28
0
def test_dict():
    """Indexing a signature by gene name returns that gene's weight."""
    signature = GeneSignature(name="test1", gene2weight=dict(TP53=0.5, SOX4=0.75))

    assert signature['TP53'] == 0.5
    assert signature['SOX4'] == 0.75
Ejemplo n.º 29
0
def test_genes():
    """The genes attribute equals the tuple ('SOX4', 'TP53')."""
    signature = GeneSignature(name="test1", gene2weight=dict(TP53=0.5, SOX4=0.75))
    expected = ('SOX4', 'TP53')
    assert signature.genes == expected