Beispiel #1
0
def map_gene_sets_to_probes(exp, block,
                            base_dir, base_filename, ann_gene_sets, src_gene_sets):
    """Translate gene sets from Entrez gene ids into probe-id space.

    Builds the reverse mapping (entrez id -> probe ids) from the platform
    annotation and rewrites every set in ``src_gene_sets`` in terms of
    probes.  Sets with no mappable probe are dropped.

    TODO: working units check

    @type ann_gene_sets: GeneSets
    @type src_gene_sets: GeneSets

    @rtype: ([GeneSets], dict) -- block task result convention
    """
    # Invert the annotation mapping: entrez id -> list of probe ids.
    entrez_to_probes = transpose_dict_list(ann_gene_sets.get_gs().genes)

    result_gene_sets = GeneSets(base_dir, base_filename)
    result_gene_sets.metadata["org"] = src_gene_sets.metadata["org"]
    result_gene_sets.metadata["gene_units"] = GeneUnits.PROBE_ID
    result_gene_sets.metadata["set_units"] = src_gene_sets.metadata["set_units"]

    src_gs = src_gene_sets.get_gs()
    translated = GS()
    for set_name, entrez_ids in src_gs.genes.iteritems():
        probes = set()
        for entrez_id in entrez_ids:
            probes.update(entrez_to_probes.get(entrez_id, []))
        if not probes:
            # Nothing mapped for this set -- skip it entirely.
            continue
        translated.genes[set_name] = list(probes)
        translated.description[set_name] = src_gs.description[set_name]

    result_gene_sets.store_gs(translated)
    return [result_gene_sets], {}
Beispiel #2
0
def filter_gs_by_genes(src_gs, allowed_genes):
    """Restrict every gene set in ``src_gs`` to the allowed genes.

    Sets that become empty after filtering are dropped from the result.

    @type src_gs: environment.structures.GS

    @type allowed_genes: list of strings
    @param allowed_genes: gene units in allowed_genes and src should be the same

    @rtype: environment.structures.GS
    """

    allowed = set(allowed_genes)
    gs = GS()
    for k, gene_set in src_gs.genes.iteritems():
        to_preserve = set(gene_set).intersection(allowed)
        if to_preserve:
            gs.genes[k] = list(to_preserve)
            # Bug fix: copy this set's description entry; previously the
            # whole description dict was assigned to every surviving key.
            gs.description[k] = src_gs.description[k]
    return gs
Beispiel #3
0
def threshold_task(
    exp,
    block,
    es,
    T,
    base_filename,
):

    # def removeTemporaryNegativeFeatures(S, indicator_string = 'negative_feature___'):
    #     """Remove elements starting with the indicator_string and remove possible duplicates."""
    #     return S.apply(lambda list_element: set([s.replace(indicator_string, '')  for s in list_element]))
    """Computes co-comodules from matrix H by given threshold T.

    @param es: expression set whose assay data frame is matrix H
    @param T: threshold forwarded to EnrichmentInGeneSets.getGeneSet
    @return: ([gene_sets], {}) -- block task result convention
    """
    if settings.CELERY_DEBUG:
        # Attach to a remote PyCharm debugger when Celery debug is enabled.
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    H = es.get_assay_data_frame()
    print(H)
    # mu = np.mean(H, axis = 1)
    # sigma = np.std(H, axis = 1)
    # Z = H.apply(lambda z: (z-mu)/sigma, axis = 0)
    # S = []
    # S.append(removeTemporaryNegativeFeatures(Z.apply(lambda x: Z.columns[x >= T].tolist(), axis = 1)))
    # S = pd.DataFrame(S)
    # S = S.apply(lambda x: set.union(*x))
    # result = pd.DataFrame(S)
    from wrappers.snmnmf.evaluation import EnrichmentInGeneSets
    z = 1
    x = EnrichmentInGeneSets(z)
    # NOTE(review): semantics of getGeneSet(H, T) are defined in
    # wrappers.snmnmf.evaluation -- presumably thresholding H at T.
    result = x.getGeneSet(H, T)

    # Persist the result; GS takes (genes, description) -- the same mapping
    # is used for both here.
    gene_sets = GeneSets(exp.get_data_folder(), base_filename)
    gs = GS(result, result)
    gene_sets.store_gs(gs)

    # cs = GeneSets(exp.get_data_folder(), base_filename)
    # cs.store_set(result)
    return [gene_sets], {}
Beispiel #4
0
def pattern_filter_task(exp, block, m_rna_es, mi_rna_es, gene_sets, metric,
                        n_best, base_filename):
    """Filter comodule gene sets, keeping the n_best by the given metric.

        @type m_rna_es: ExpressionSet
        @type mi_rna_es: ExpressionSet
        @type gene_sets: GeneSets
        @type metric: metric
        @type n_best: int
    """
    if settings.CELERY_DEBUG:
        # Attach to a remote PyCharm debugger when Celery debug is enabled.
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    # Expression values and class labels for the mRNA expression set.
    assay_df = m_rna_es.get_assay_data_frame()
    pheno_df = m_rna_es.get_pheno_data_frame()
    class_labels = pheno_df['User_class'].values

    comodules = gene_sets.get_gs(conv=False).genes

    filtered = pattern_filter(comodules.values(), assay_df, class_labels,
                              n_best, metric)

    # Re-key the surviving sets by their rank position.
    ranked = dict(enumerate(filtered))
    out_gene_sets = GeneSets(exp.get_data_folder(), base_filename)
    out_gene_sets.store_gs(GS(ranked, ranked))

    return [out_gene_sets], {}
Beispiel #5
0
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO SOFT (GSE) file into an ExpressionSet plus
    a platform annotation mapping probes to Entrez gene ids.

    Assumes the record layout soft[2] == PLATFORM and soft[3:] == samples.

    @param source_file: object with a .filepath to a gzipped SOFT file
    @return: ([expression_set, platform_annotation], {})
    """
    #TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except:
        # NOTE(review): bare except hides the original parse error;
        # consider logging/chaining it.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    # Platform table: locate probe ID and Entrez gene id columns.
    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    entrez_idx = pl[0].index('ENTREZ_GENE_ID')

    #TODO bug here
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        # Multiple gene ids per probe are separated by " /// " in GEO tables.
        probe_to_genes_GS.genes[row[id_idx]] = row[entrez_idx].split(" /// ")

    platform_annotation = PlatformAnnotation(
        "TODO:GET NAME FROM SOFT",
        base_dir=exp.get_data_folder(),
        base_filename="%s_annotation" % block.uuid
    )

    platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    # Assay matrix: one column per sample (GSM accession), rows keyed by
    # the probe ID_REF, values from the VALUE column.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict([(row[id_ref_idx], row[value_idx]) for row in soft[i].table_rows[1:]]))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df)

    # Phenotype: one attribute record per sample.
    raw_factors = [soft[i].entity_attributes
               for i in range(3, len(soft))]
    pheno_index = []

    # Here we trying to guess sub columns
    # A multi-valued attribute becomes a "dict" column when every sample
    # shares the same "name: value" sub-keys, otherwise a positional "list".
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            # Scalar attribute -- no sub-column handling needed.
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                # Candidate "name: value" pairs -- only treat as dict if the
                # sub-keys are identical across all samples.
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        # Index pheno rows by GSM accession, falling back to the position.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    # Positional sub-columns: <name>_1, <name>_2, ...
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    # Named sub-columns: <name>_<sub_name> = <sub_value>
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value

            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    # Guarantee the user-class column exists, even if empty.
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set, platform_annotation], {}
Beispiel #6
0
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO SOFT (GSE) file into an ExpressionSet.

    Variant that maps probes to a RefSeq-like (or, failing that,
    miRNA-like) identifier column auto-detected from the platform header.
    Assumes the record layout soft[2] == PLATFORM and soft[3:] == samples.

    @param source_file: object with a .filepath to a gzipped SOFT file
    @return: ([expression_set], {})
    """
    #TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except:
        # NOTE(review): bare except hides the original parse error.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    # Platform table: locate probe ID and identifier columns.
    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    # entrez_idx = pl[0].index('ENTREZ_GENE_ID')
    refseq_idx = [i for i, item in enumerate(pl[0]) if re.search('.*refseq.*', item, re.IGNORECASE)]
    if refseq_idx == []:
        # No RefSeq column -- fall back to a miRNA column.
        # NOTE(review): raises IndexError if neither column exists.
        refseq_idx = [i for i, item in enumerate(pl[0]) if re.search('.*mirna.*', item, re.IGNORECASE)][0]
    else:
        refseq_idx = refseq_idx[0]


    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        # Keep only the first identifier when several are listed (" /// ").
        probe_to_genes_GS.genes[row[id_idx]] = [row[refseq_idx].split(" /// ")[0]]

    # platform_annotation = PlatformAnnotation(
    #     "TODO:GET NAME FROM SOFT",
    #     base_dir=exp.get_data_folder(),
    #     base_filename="%s_annotation" % block.uuid
    # )
    #
    # platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    # platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    # platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    if settings.CELERY_DEBUG:
        # Attach to a remote PyCharm debugger when Celery debug is enabled.
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    # Assay matrix: one column per sample (GSM accession); probe ids are
    # remapped to the detected identifiers via map_probes_to_refseqs.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict(
                map_probes_to_refseqs(probe_to_genes_GS.genes, [(row[id_ref_idx], row[value_idx]) for row in soft[i].table_rows[1:]])
            ))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Stored transposed: samples as rows, features as columns.
    expression_set.store_assay_data_frame(assay_df.T)

    # Phenotype: one attribute record per sample.
    raw_factors = [soft[i].entity_attributes
               for i in range(3, len(soft))]
    pheno_index = []

    # Here we trying to guess sub columns
    # A multi-valued attribute becomes a "dict" column when every sample
    # shares the same "name: value" sub-keys, otherwise a positional "list".
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            # Scalar attribute -- no sub-column handling needed.
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        # Index pheno rows by GSM accession, falling back to the position.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    # Positional sub-columns: <name>_1, <name>_2, ...
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    # Named sub-columns: <name>_<sub_name> = <sub_value>
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value

            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    # Guarantee the user-class column exists, even if empty.
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
Beispiel #7
0
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO SOFT (GSE) file into an ExpressionSet.

    Reformatted variant of the RefSeq/miRNA mapping version: probes are
    mapped to an identifier column auto-detected from the platform header.
    Assumes the record layout soft[2] == PLATFORM and soft[3:] == samples.

    @param source_file: object with a .filepath to a gzipped SOFT file
    @return: ([expression_set], {})
    """
    #TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except:
        # NOTE(review): bare except hides the original parse error.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    # Platform table: locate probe ID and identifier columns.
    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    # entrez_idx = pl[0].index('ENTREZ_GENE_ID')
    refseq_idx = [
        i for i, item in enumerate(pl[0])
        if re.search('.*refseq.*', item, re.IGNORECASE)
    ]
    if refseq_idx == []:
        # No RefSeq column -- fall back to a miRNA column.
        # NOTE(review): raises IndexError if neither column exists.
        refseq_idx = [
            i for i, item in enumerate(pl[0])
            if re.search('.*mirna.*', item, re.IGNORECASE)
        ][0]
    else:
        refseq_idx = refseq_idx[0]

    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        # Keep only the first identifier when several are listed (" /// ").
        probe_to_genes_GS.genes[row[id_idx]] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    # platform_annotation = PlatformAnnotation(
    #     "TODO:GET NAME FROM SOFT",
    #     base_dir=exp.get_data_folder(),
    #     base_filename="%s_annotation" % block.uuid
    # )
    #
    # platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    # platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    # platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    if settings.CELERY_DEBUG:
        # Attach to a remote PyCharm debugger when Celery debug is enabled.
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    # Assay matrix: one column per sample (GSM accession); probe ids are
    # remapped to the detected identifiers via map_probes_to_refseqs.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(
        dict([(soft[i].entity_attributes['Sample_geo_accession'],
               Series(
                   dict(
                       map_probes_to_refseqs(
                           probe_to_genes_GS.genes,
                           [(row[id_ref_idx], row[value_idx])
                            for row in soft[i].table_rows[1:]]))))
              for i in range(3, len(soft))]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Stored transposed: samples as rows, features as columns.
    expression_set.store_assay_data_frame(assay_df.T)

    # Phenotype: one attribute record per sample.
    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we trying to guess sub columns
    # A multi-valued attribute becomes a "dict" column when every sample
    # shares the same "name: value" sub-keys, otherwise a positional "list".
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            # Scalar attribute -- no sub-column handling needed.
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        # Index pheno rows by GSM accession, falling back to the position.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    # Positional sub-columns: <name>_1, <name>_2, ...
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" %
                                     (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    # Named sub-columns: <name>_<sub_name> = <sub_value>
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" %
                                     (col_name, sub_name)] = sub_value

            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)
    # Guarantee the user-class column exists, even if empty.
    if expression_set.pheno_metadata[
            "user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
Beispiel #8
0
def pattern_search(
        exp,
        block,
        m_rna_es,
        mi_rna_es,
        gene2gene,
        miRNA2gene,
        # gene_platform,
        # miRNA_platform,
        radius,
        min_imp,
        number_of_genes,
        metric,
        base_filename):
    """
        Run a differential pattern search over the gene interaction network
        and store the found patterns as GeneSets.

        @type m_rna_es: ExpressionSet
        @type mi_rna_es: ExpressionSet
        @type gene2gene: BinaryInteraction
        @type miRNA2gene: BinaryInteraction
        @type radius: int
        @type min_imp: double
        @return: ([gene_sets], {}) -- block task result convention
    """

    AllUpdated(exp.pk,
               comment=u"Initializing data...",
               silent=False,
               mode=NotifyMode.INFO).send()

    exp.log(block.uuid, "Initializing data...")

    mData = m_rna_es.get_assay_data_frame()
    # The platform is the list of assay columns (gene identifiers).
    gene_platform = list(mData.columns)
    AllUpdated(exp.pk,
               comment=u"Transforming interaction matrix",
               silent=False,
               mode=NotifyMode.INFO).send()

    # Project the interaction matrix onto this platform's genes.
    gene2gene = gene2gene.get_matrix_for_platform(exp,
                                                  gene_platform,
                                                  symmetrize=True,
                                                  identifiers=False)

    AllUpdated(exp.pk,
               comment=u"Transforming interaction matrix done",
               silent=False,
               mode=NotifyMode.INFO).send()

    # TODO fix pattern search
    # if miRNA2gene is not None:
    #     miRNA2gene = miRNA2gene.load_matrix().T
    #     miRNA2gene = sp.coo_matrix(miRNA2gene.values)
    # if mi_rna_es is not None:
    #     miData = mi_rna_es.get_assay_data_frame()
    #     mir2gene = miRNA2gene
    #     mir2gene = sp.coo_matrix(mir2gene.values).T
    #     nw = mergeNetworks(gene2gene, mir2gene)
    # else:
    # gene2gene = gene2gene.load_matrix()
    # nw = sparse_df_to_saprse_matrix(gene2gene)
    nw = gene2gene.tocsr()
    # data = mData.ix[1:]
    data = mData
    # Use the first column as the row index (dropped from the data).
    data.set_index(data.columns[0], inplace=True, drop=True)

    # Z-score normalization of the expression data.
    data = zscore(data)
    pheno = m_rna_es.get_pheno_data_frame()
    classes = pheno['User_class'].values
    if settings.CELERY_DEBUG:
        # Attach to a remote PyCharm debugger when Celery debug is enabled.
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    exp.log(block.uuid, "Data ready. Running Pattern Search")
    # Random sample of distinct network nodes used as search seeds.
    seeds = np.random.choice(np.unique(nw.indices),
                             number_of_genes,
                             replace=False)
    # Initialize the searcher object.
    # NOTE(review): `metric` is accepted but never forwarded to the
    # searcher; the original comment hints metric=metric was intended.
    searcher = DifferentialPatternSearcher(nw,
                                           radius=radius,
                                           min_improve=min_imp,
                                           seeds=seeds,
                                           base_dir="orig_interactions/",
                                           verbose=True)

    # The search itself.
    res = searcher.search(data, classes)
    exp.log(block.uuid, "Pattern search finished.")

    # res is a list of patterns; map node indices back to gene identifiers.
    comodule_set = map(
        lambda pattern: [gene_platform[gene] for gene in pattern.genes], res)

    # cs = ComoduleSet(exp.get_data_folder(), base_filename)
    gene_sets = GeneSets(exp.get_data_folder(),
                         "%s_ps_gene_sets" % str(block.uuid))
    # Re-key the patterns by their position.
    result = {key: value for key, value in enumerate(comodule_set)}
    gs = GS(result, result)
    gene_sets.store_gs(gs)

    # self.set_out_var("gene_sets", gene_sets)
    # result = {key: value for key, value in enumerate(comodule_set)}
    # cs.store_set(result)
    # exp.log(block.uuid, "ComoduleSet stored.")

    return [gene_sets], {}