Ejemplo n.º 1
0
def cell_cycle_correction(data, annoFile, annoDB='custom', terms_to_regress=None):
    """Regress cell-cycle related factors out of an expression table.

    The table is loaded together with the gene-set annotation, a slalom
    factor-analysis model is trained (Gaussian noise, 3 dense hidden factors,
    ``minGenes=10``) and the selected annotation terms are regressed out.

    Args:
        data: Expression table. It is transposed before being handed to
            ``load_txt`` and is indexed by gene name on its column axis, so
            it is presumably a cells x genes pandas DataFrame (confirm
            against the caller).
        annoFile: Path of the annotation file; must exist.
        annoDB: Annotation database identifier forwarded to ``load_txt``.
        terms_to_regress: Optional iterable of term names to regress out.
            Defaults to the four cell-cycle terms used historically.

    Returns:
        A copy of ``data`` in which the columns of every gene that belongs to
        at least one annotation term are replaced by the corrected values.

    Raises:
        FileNotFoundError: If ``annoFile`` does not exist.
    """
    if not os.path.exists(annoFile):
        # FileNotFoundError is still caught by ``except Exception`` callers,
        # but tells them precisely what went wrong and for which path.
        raise FileNotFoundError(
            "Annotation file not found: {}".format(annoFile))

    if terms_to_regress is None:
        terms_to_regress = [
            'M phase',
            'Dna replication',
            'Chromosome segregation',
            'M phase of mitotic cell cycle',
        ]

    data_slalom = load_txt(df=data.T, annoFiles=annoFile, annoDBs=annoDB)

    # I: indicator matrix that assigns genes to pathways
    # (if loaded from the hdf file change to I = data['IMSigDB'])
    I = data_slalom['I']
    # Y: log expression values
    Y = data_slalom['Y']
    # terms: the names of the terms
    terms = data_slalom['terms']
    # gene_ids: the ids of the genes in Y
    gene_ids = data_slalom['genes']

    print("Loaded {:d} cells, {:d} genes".format(Y.shape[0], Y.shape[1]))
    print("Annotation: {:d} terms".format(len(terms)))

    # Initialize FA instance, here using a Gaussian noise model and fitting
    # 3 dense hidden factors.
    FA = slalom.initFA(Y, terms, I, gene_ids=gene_ids, noise='gauss',
                       nHidden=3, minGenes=10)

    # Model training
    FA.train()

    # Print diagnostics
    FA.printDiagnostics()

    corrected_data = FA.regressOut(terms=list(terms_to_regress))

    # Only genes with at least one annotation (non-zero row sum in I) are
    # overwritten; every other column keeps its original values.
    full_matrix = data.copy()
    annotated_genes = np.array(gene_ids)[np.sum(I, axis=1) != 0]
    full_matrix[annotated_genes] = corrected_data

    return full_matrix
Ejemplo n.º 2
0
def factor_analysis(norm_df, id_to_name, tsne, annoFile, annoDB):
    """Fit a slalom factor model and plot the first annotated factor.

    The columns of ``norm_df`` are relabelled through ``id_to_name``, the
    table is loaded together with the annotation, a slalom model is trained
    (Gaussian noise, 3 dense hidden factors, ``minGenes=15``) and the factor
    of the first annotation term is plotted with ``plot_gene_expression``.
    Finally the full factor matrix and its shape are printed.

    Args:
        norm_df: Expression table whose column labels are gene ids; it is
            transposed before loading, so it is presumably cells x genes
            (confirm against the caller).
        id_to_name: Mapping from every gene id in ``norm_df.columns`` to the
            name used in the annotation; a missing id raises ``KeyError``.
        tsne: Object whose ``index`` is used to label the rows of the factor
            table and which is passed on to ``plot_gene_expression``.
        annoFile: Path of the annotation file; must exist.
        annoDB: Annotation database identifier forwarded to ``load_txt_2``.

    Returns:
        None. Results are printed and shown as a plot.

    Raises:
        FileNotFoundError: If ``annoFile`` does not exist.
    """
    if not os.path.exists(annoFile):
        # Still caught by ``except Exception`` callers, but more specific.
        raise FileNotFoundError(
            "Annotation file not found: {}".format(annoFile))

    # Work on a copy so the caller's column labels are left untouched.
    named_df = norm_df.copy()
    named_df.columns = np.array(
        [id_to_name[gene_id] for gene_id in named_df.columns.values])

    data = load_txt_2(data=named_df.T, annoFiles=annoFile, annoDBs=annoDB)

    # I: indicator matrix that assigns genes to pathways
    # (if loaded from the hdf file change to I = data['IMSigDB'])
    I = data['I']
    # Y: log expression values
    Y = data['Y']
    # terms: the names of the terms
    terms = data['terms']
    # gene_ids: the ids of the genes in Y
    gene_ids = data['genes']

    print("Loaded {:d} cells, {:d} genes".format(Y.shape[0], Y.shape[1]))
    print("Annotation: {:d} terms".format(len(terms)))

    # Initialize FA instance, here using a Gaussian noise model and fitting
    # 3 dense hidden factors.
    FA = slalom.initFA(Y,
                       terms,
                       I,
                       gene_ids=gene_ids,
                       noise='gauss',
                       nHidden=3,
                       minGenes=15)

    # Model training
    FA.train()

    # Print diagnostics
    FA.printDiagnostics()

    # Only the first annotation term is extracted and plotted.
    first_term = terms[0]
    print(first_term)

    X = FA.getX(terms=[first_term])

    # One column per plotted factor, rows labelled like the embedding.
    df_factors = pd.DataFrame(X, columns=[first_term])
    df_factors.index = tsne.index
    factors = [first_term]

    plot_gene_expression(df_factors, tsne, factors)
    plt.show()

    # Get all factors; analogous getters are implemented for relevance and
    # weights (see the slalom docs). Fetch once and reuse for both prints.
    all_factors = FA.getX()
    print(all_factors)
    print(all_factors.shape)
Ejemplo n.º 3
0
import time
import slalom as sl
import numpy as np
from anndata import read_h5ad

# Benchmark script: load the annotated count matrix, fit a slalom model and
# report how long initialisation plus training took.
print('reading')
ad = read_h5ad('kang_count.h5ad')

print('initializing')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
t0 = time.perf_counter()

# Gaussian noise model with 3 dense hidden factors and no sparse hidden
# factors; pre-training is switched off.
FA = sl.initFA(ad.X, ad.uns['terms'], ad.varm['I'], list(ad.var_names), noise='gauss', nHidden=3, nHiddenSparse=0,
               minGenes=12, pruneGenes=True, do_preTrain=False)

print('training')

FA.train()

t1 = time.perf_counter()

# Elapsed seconds for model initialisation and training together.
print(t1 - t0)

# Persist the trained model.
sl.saveFA(FA, 'FA_kang_3_hidden.hdf5', saveF=True)
Ejemplo n.º 4
0
# Report how many annotation terms were loaded.
print("Annotation: {:d} terms".format(len(data['terms'])))

# Pull the model inputs out of the loaded data.
#   I        - indicator matrix that assigns genes to pathways
#              (if loaded from the hdf file use data['IMSigDB'] instead)
#   Y        - log expression values
I, Y = data['I'], data['Y']
#   terms    - the names of the annotation terms
#   gene_ids - the ids of the genes in Y; this data keeps them under
#              'sym_names' ('genes' is not the right key here)
terms, gene_ids = data['terms'], data['sym_names']

# Set up the FA instance: Gaussian noise model, 3 dense hidden factors.
print('Initialising')
FA = slalom.initFA(
    Y,
    terms,
    I,
    gene_ids=gene_ids,
    noise='gauss',
    nHidden=3,
    minGenes=15,
)

# Fit the model.
print('Training')
FA.train()

# Show the fit diagnostics.
FA.printDiagnostics()

# Plot the results.
fig = plotRelevance(FA)

# Fetch the factors for selected terms; analogous getters are implemented for
# relevance and weights (see docs). Term names in this annotation are written
# in upper case with underscores ('G2m checkpoint' / 'P53 pathway' do not match).
terms_to_plot = ['G2M_CHECKPOINT', 'P53_PATHWAY']
X = FA.getX(terms=terms_to_plot)