def map_gene_sets_to_probes(exp, block, base_dir, base_filename, ann_gene_sets, src_gene_sets):
    """Translate gene sets keyed by Entrez gene IDs into sets keyed by probe IDs.

    Uses the platform annotation (`ann_gene_sets`, probe -> entrez lists) to build
    an inverse entrez -> probes mapping, then rewrites every set in
    `src_gene_sets` in terms of probes. Sets with no mapped probes are dropped.

    TODO: working units check

    @type ann_gene_sets: GeneSets
    @type src_gene_sets: GeneSets
    @rtype: ([GeneSets], dict)
    """
    # Invert the probe -> entrez-list annotation into entrez -> probe-list.
    entrez_to_probes = transpose_dict_list(ann_gene_sets.get_gs().genes)

    result_sets = GeneSets(base_dir, base_filename)
    result_sets.metadata["org"] = src_gene_sets.metadata["org"]
    result_sets.metadata["gene_units"] = GeneUnits.PROBE_ID
    result_sets.metadata["set_units"] = src_gene_sets.metadata["set_units"]

    src_gs = src_gene_sets.get_gs()
    gs = GS()
    for set_name, entrez_ids in src_gs.genes.iteritems():
        probes = set()
        for entrez_id in entrez_ids:
            probes.update(entrez_to_probes.get(entrez_id, []))
        # Only keep sets that actually mapped to at least one probe.
        if probes:
            gs.genes[set_name] = list(probes)
            gs.description[set_name] = src_gs.description[set_name]

    result_sets.store_gs(gs)
    return [result_sets], {}
def filter_gs_by_genes(src_gs, allowed_genes):
    """Return a copy of *src_gs* restricted to the genes in *allowed_genes*.

    Sets that become empty after filtering are dropped entirely.

    @type src_gs: environment.structures.GS
    @type allowed_genes: list of strings
    @param allowed_genes: gene units in allowed_genes and src_gs should be the same
    @rtype: environment.structures.GS
    """
    allowed = set(allowed_genes)
    gs = GS()
    for k, gene_set in src_gs.genes.iteritems():
        to_preserve = set(gene_set).intersection(allowed)
        if to_preserve:
            gs.genes[k] = list(to_preserve)
            # BUG FIX: previously assigned the *whole* description dict
            # (src_gs.description) instead of the entry for this set key.
            gs.description[k] = src_gs.description.get(k, "")
    return gs
def threshold_task(
        exp, block,
        es,
        T,
        base_filename,
):
    """Compute co-modules from matrix H (the assay data of *es*) by threshold T.

    Delegates the actual thresholding to
    ``wrappers.snmnmf.evaluation.EnrichmentInGeneSets.getGeneSet``.

    @type es: ExpressionSet
    @param T: threshold passed through to getGeneSet
    @rtype: ([GeneSets], dict)
    """
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    # Cleanup: removed leftover debug print of the full data frame and the
    # dead commented-out z-score implementation it superseded.
    H = es.get_assay_data_frame()

    from wrappers.snmnmf.evaluation import EnrichmentInGeneSets
    # NOTE(review): z = 1 is a magic constant handed to EnrichmentInGeneSets;
    # its meaning is not visible here — confirm against that class.
    z = 1
    x = EnrichmentInGeneSets(z)
    result = x.getGeneSet(H, T)

    gene_sets = GeneSets(exp.get_data_folder(), base_filename)
    # The same mapping serves as both the gene lists and the descriptions.
    gs = GS(result, result)
    gene_sets.store_gs(gs)
    return [gene_sets], {}
def pattern_filter_task(exp, block, m_rna_es, mi_rna_es, gene_sets, metric, n_best, base_filename):
    """Filter comodule gene sets, keeping the *n_best* by *metric*.

    @type m_rna_es: ExpressionSet
    @type mi_rna_es: ExpressionSet
    @type gene_sets: GeneSets
    @type metric: metric
    @type n_best: int
    @rtype: ([GeneSets], dict)
    """
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    expression = m_rna_es.get_assay_data_frame()
    pheno = m_rna_es.get_pheno_data_frame()
    labels = pheno['User_class'].values

    comodules = gene_sets.get_gs(conv=False).genes
    filtered = pattern_filter(comodules.values(), expression, labels, n_best, metric)
    # Re-key the surviving sets with sequential integer ids.
    indexed = dict(enumerate(filtered))

    gs = GS(indexed, indexed)
    out_sets = GeneSets(exp.get_data_folder(), base_filename)
    out_sets.store_gs(gs)
    return [out_sets], {}
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO GSE SOFT file into an ExpressionSet + PlatformAnnotation.

    Reads the PLATFORM record (soft[2]) to build a probe -> Entrez-IDs mapping,
    the SAMPLE records (soft[3:]) to build the assay matrix, and the sample
    attributes to build the phenotype data frame.

    @param source_file: object with a .filepath pointing at a .soft.gz file
    @rtype: ([ExpressionSet, PlatformAnnotation], dict)
    """
    # TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit; narrowed to Exception.
    except Exception:
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"
    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    entrez_idx = pl[0].index('ENTREZ_GENE_ID')  # TODO bug here

    # Probe id -> list of Entrez gene ids ("///"-separated in the SOFT table).
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = row[entrez_idx].split(" /// ")

    platform_annotation = PlatformAnnotation(
        "TODO:GET NAME FROM SOFT",
        base_dir=exp.get_data_folder(),
        base_filename="%s_annotation" % block.uuid
    )
    platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    # Assay matrix: one column per sample (GEO accession), one row per probe.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict([(row[id_ref_idx], row[value_idx])
                         for row in soft[i].table_rows[1:]]))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df)

    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we try to guess sub columns: attributes whose value is a list of
    # "name:value" strings with consistent names across samples become dict-like
    # columns; otherwise they are expanded positionally.
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    # Ensure the user-class column exists even when the SOFT file has none.
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""
    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set, platform_annotation], {}
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO GSE SOFT file into an ExpressionSet (RefSeq/miRNA variant).

    Finds a RefSeq-like (or, failing that, miRNA-like) column in the PLATFORM
    table, keeps only the first identifier per probe, and stores the assay
    matrix with probes remapped via `map_probes_to_refseqs`.

    @param source_file: object with a .filepath pointing at a .soft.gz file
    @rtype: ([ExpressionSet], dict)
    """
    # TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit; narrowed to Exception.
    except Exception:
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"
    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    # Prefer a RefSeq-named column; fall back to a miRNA-named one.
    refseq_idx = [i for i, item in enumerate(pl[0])
                  if re.search('.*refseq.*', item, re.IGNORECASE)]
    if refseq_idx == []:
        refseq_idx = [i for i, item in enumerate(pl[0])
                      if re.search('.*mirna.*', item, re.IGNORECASE)][0]
    else:
        refseq_idx = refseq_idx[0]

    # Probe id -> single-element list with the first identifier of the cell.
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = [row[refseq_idx].split(" /// ")[0]]

    if settings.CELERY_DEBUG:
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    # Assay matrix: one column per sample, rows remapped from probe ids.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict(map_probes_to_refseqs(
                probe_to_genes_GS.genes,
                [(row[id_ref_idx], row[value_idx])
                 for row in soft[i].table_rows[1:]])))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # NOTE: stored transposed (samples as rows), unlike the Entrez variant.
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we try to guess sub columns: attributes whose value is a list of
    # "name:value" strings with consistent names across samples become dict-like
    # columns; otherwise they are expanded positionally.
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    # Ensure the user-class column exists even when the SOFT file has none.
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""
    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO GSE SOFT file into an ExpressionSet (RefSeq/miRNA variant).

    Behaviorally a duplicate of the previous RefSeq variant: selects a
    RefSeq-like (or miRNA-like) platform column, keeps the first identifier
    per probe, and stores the transposed assay matrix.

    @param source_file: object with a .filepath pointing at a .soft.gz file
    @rtype: ([ExpressionSet], dict)
    """
    # TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit; narrowed to Exception.
    except Exception:
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"
    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    # Prefer a RefSeq-named column; fall back to a miRNA-named one.
    refseq_idx = [
        i for i, item in enumerate(pl[0])
        if re.search('.*refseq.*', item, re.IGNORECASE)
    ]
    if refseq_idx == []:
        refseq_idx = [
            i for i, item in enumerate(pl[0])
            if re.search('.*mirna.*', item, re.IGNORECASE)
        ][0]
    else:
        refseq_idx = refseq_idx[0]

    # Probe id -> single-element list with the first identifier of the cell.
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    if settings.CELERY_DEBUG:
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    # Assay matrix: one column per sample, rows remapped from probe ids.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(
        dict([(soft[i].entity_attributes['Sample_geo_accession'],
               Series(dict(map_probes_to_refseqs(
                   probe_to_genes_GS.genes,
                   [(row[id_ref_idx], row[value_idx])
                    for row in soft[i].table_rows[1:]]))))
              for i in range(3, len(soft))]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # NOTE: stored transposed (samples as rows), unlike the Entrez variant.
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we try to guess sub columns: attributes whose value is a list of
    # "name:value" strings with consistent names across samples become dict-like
    # columns; otherwise they are expanded positionally.
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)
    # Ensure the user-class column exists even when the SOFT file has none.
    if expression_set.pheno_metadata[
            "user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""
    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
def pattern_search(
        exp, block,
        m_rna_es, mi_rna_es,
        gene2gene, miRNA2gene,
        # gene_platform,
        # miRNA_platform,
        radius, min_imp,
        number_of_genes, metric,
        base_filename):
    """Run a differential pattern search over the mRNA interaction network.

    Builds a sparse gene-gene network from `gene2gene`, z-scores the mRNA
    expression data, runs DifferentialPatternSearcher seeded with
    `number_of_genes` random network nodes, and stores the resulting patterns
    as a GeneSets object.

    @type m_rna_es: ExpressionSet
    @type mi_rna_es: ExpressionSet
    @type gene2gene: BinaryInteraction
    @type miRNA2gene: BinaryInteraction
    @type radius: int
    @type min_imp: double
    """
    AllUpdated(exp.pk, comment=u"Initializing data...", silent=False,
               mode=NotifyMode.INFO).send()
    exp.log(block.uuid, "Initializing data...")

    mData = m_rna_es.get_assay_data_frame()
    gene_platform = list(mData.columns)
    AllUpdated(exp.pk, comment=u"Transforming interaction matrix",
               silent=False, mode=NotifyMode.INFO).send()
    gene2gene = gene2gene.get_matrix_for_platform(exp, gene_platform,
                                                  symmetrize=True,
                                                  identifiers=False)
    AllUpdated(exp.pk, comment=u"Transforming interaction matrix done",
               silent=False, mode=NotifyMode.INFO).send()

    # TODO fix pattern search
    # if miRNA2gene is not None:
    #     miRNA2gene = miRNA2gene.load_matrix().T
    #     miRNA2gene = sp.coo_matrix(miRNA2gene.values)
    # if mi_rna_es is not None:
    #     miData = mi_rna_es.get_assay_data_frame()
    #     mir2gene = miRNA2gene
    #     mir2gene = sp.coo_matrix(mir2gene.values).T
    #     nw = mergeNetworks(gene2gene, mir2gene)
    # else:
    #     gene2gene = gene2gene.load_matrix()
    #     nw = sparse_df_to_saprse_matrix(gene2gene)

    nw = gene2gene.tocsr()
    # data = mData.ix[1:]
    data = mData
    # First column is used as the index; remaining columns are z-scored.
    data.set_index(data.columns[0], inplace=True, drop=True)
    data = zscore(data)
    pheno = m_rna_es.get_pheno_data_frame()
    classes = pheno['User_class'].values

    if settings.CELERY_DEBUG:
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    exp.log(block.uuid, "Data ready. Running Pattern Search")
    # Random seed genes drawn from nodes present in the sparse network.
    seeds = np.random.choice(np.unique(nw.indices), number_of_genes,
                             replace=False)
    # Initialize the searcher object.
    # NOTE(review): the `metric` parameter is not forwarded to the searcher —
    # a stray "metric=metric," sits inside the comment above in the original
    # source, which looks like an accidentally commented-out argument; confirm
    # against DifferentialPatternSearcher's signature.
    searcher = DifferentialPatternSearcher(nw,
                                           radius=radius,
                                           min_improve=min_imp,
                                           seeds=seeds,
                                           base_dir="orig_interactions/",
                                           verbose=True)
    # Run the actual search.
    res = searcher.search(data, classes)
    exp.log(block.uuid, "Pattern search finished.")

    # res is a list of pattern objects; map each pattern's gene indices back
    # to platform identifiers.
    comodule_set = map(
        lambda pattern: [gene_platform[gene] for gene in pattern.genes], res)

    # cs = ComoduleSet(exp.get_data_folder(), base_filename)
    gene_sets = GeneSets(exp.get_data_folder(),
                         "%s_ps_gene_sets" % str(block.uuid))
    result = {key: value for key, value in enumerate(comodule_set)}
    gs = GS(result, result)
    gene_sets.store_gs(gs)
    # self.set_out_var("gene_sets", gene_sets)

    # result = {key: value for key, value in enumerate(comodule_set)}
    # cs.store_set(result)
    # exp.log(block.uuid, "ComoduleSet stored.")
    return [gene_sets], {}