Ejemplo n.º 1
0
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO SOFT file and store it as an ExpressionSet.

    The platform table (third parsed entity) is used to map every probe ID
    to the first identifier listed in the first column whose name contains
    "refseq" (or, when there is none, "mirna").  Every entity from the
    fourth one on is treated as a sample: its table feeds the assay data
    frame and its attributes feed the phenotype data frame.

    :param exp: experiment object; only ``get_data_folder()`` is used.
    :param block: workflow block; only ``uuid`` is used, to name the files.
    :param source_file: object whose ``filepath`` is the gzipped SOFT file.
    :return: ``([expression_set], {})``.
    :raises RuntimeError: if the source file cannot be opened or parsed.
    :raises AssertionError: if the third parsed entity is not a PLATFORM.
    :raises IndexError: if the platform header has neither a refseq nor a
        mirna column, or if there are no sample entities.
    :raises ValueError: if the ``ID``, ``ID_REF`` or ``VALUE`` header is
        missing from the respective table.
    """
    # TODO: now we assume that we get GSE file
    try:
        # Close the gzip handle deterministically instead of leaking it.
        with gzip.open(source_file.filepath) as soft_handle:
            soft = list(parse_geo(soft_handle))
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer converted into a RuntimeError.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM", \
        "Expected a PLATFORM entity at position 2 of the SOFT file"

    # --- Platform table: probe ID -> first annotated identifier ------------
    pl = soft[2].table_rows
    platform_header = pl[0]
    id_idx = platform_header.index('ID')
    # entrez_idx = pl[0].index('ENTREZ_GENE_ID')
    annotation_cols = [
        i for i, item in enumerate(platform_header)
        if re.search('.*refseq.*', item, re.IGNORECASE)
    ]
    if not annotation_cols:
        # No RefSeq column: fall back to a miRNA column.
        annotation_cols = [
            i for i, item in enumerate(platform_header)
            if re.search('.*mirna.*', item, re.IGNORECASE)
        ]
    if not annotation_cols:
        # Same exception type as the original ``[...][0]``, but with a message.
        raise IndexError(
            "Platform table has neither a refseq nor a mirna column")
    refseq_idx = annotation_cols[0]

    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_id = row[id_idx]
        probe_to_genes_GS.description[probe_id] = ""
        # Only the first identifier of a " /// "-separated cell is kept.
        probe_to_genes_GS.genes[probe_id] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    # platform_annotation = PlatformAnnotation(
    #     "TODO:GET NAME FROM SOFT",
    #     base_dir=exp.get_data_folder(),
    #     base_filename="%s_annotation" % block.uuid
    # )
    #
    # platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    # platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    # platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    if settings.CELERY_DEBUG:
        # NOTE(review): developer-only hook that attaches to a PyCharm remote
        # debugger on localhost:6901; the egg path below is machine-specific.
        import sys
        debug_egg = '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        if debug_egg not in sys.path:
            # Avoid growing sys.path by one entry on every call.
            sys.path.append(debug_egg)
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    # --- Sample tables: assay data frame -----------------------------------
    samples = soft[3:]
    # Column positions are taken from the first sample and reused for all.
    sample_header = soft[3].table_rows[0]
    id_ref_idx = sample_header.index("ID_REF")
    value_idx = sample_header.index("VALUE")

    assay_columns = {}
    for sample in samples:
        accession = sample.entity_attributes['Sample_geo_accession']
        probe_values = [
            (row[id_ref_idx], row[value_idx])
            for row in sample.table_rows[1:]
        ]
        assay_columns[accession] = Series(dict(
            map_probes_to_refseqs(probe_to_genes_GS.genes, probe_values)
        ))
    assay_df = DataFrame(assay_columns)

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Stored transposed: one row per sample.
    expression_set.store_assay_data_frame(assay_df.T)

    # --- Sample attributes: phenotype data frame ---------------------------
    raw_factors = [sample.entity_attributes for sample in samples]

    def as_values(value):
        # Wrap a bare string so that iterating it yields the whole value
        # rather than its individual characters.
        if isinstance(value, (str, unicode)):
            return [value]
        return value

    # Here we are trying to guess sub columns, using the first sample as the
    # template.  A multi-valued attribute becomes "dict" when every sample
    # carries the same "name:" prefixes in the same order, otherwise "list".
    pheno_complex_columns_def = {}
    for col_name, col in raw_factors[0].iteritems():
        if isinstance(col, (str, unicode)):
            continue
        kind = "list"
        if all(":" in sub_col for sub_col in col):
            sub_col_name_sets = set(
                tuple(
                    sub_col.split(":")[0]
                    # A sample lacking the attribute counts as "different"
                    # instead of raising KeyError.
                    for sub_col in as_values(row.get(col_name, []))
                )
                for row in raw_factors
            )
            if len(sub_col_name_sets) == 1:
                kind = "dict"
        pheno_complex_columns_def[col_name] = kind

    factors = []
    pheno_index = []
    for idx, factor in enumerate(raw_factors):
        # Fall back to the positional index when the accession is missing.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            kind = pheno_complex_columns_def.get(col_name)
            if kind == "list":
                for sub_idx, sub_col in enumerate(as_values(col)):
                    fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
            elif kind == "dict":
                for sub_col in as_values(col):
                    # partition() tolerates an entry without ":" (empty
                    # value) where tuple-unpacking split() would raise.
                    sub_name, _, sub_value = sub_col.partition(":")
                    fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    class_title = expression_set.pheno_metadata["user_class_title"]
    if class_title not in pheno_df.columns:
        # Make sure the user-class column exists, initially empty.
        pheno_df[class_title] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
Ejemplo n.º 2
0
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO SOFT file into an ExpressionSet and annotation.

    The platform table (third parsed entity) provides the probe ID to
    ``ENTREZ_GENE_ID`` mapping, which is stored as a PlatformAnnotation.
    Every entity from the fourth one on is treated as a sample: its table
    feeds the assay data frame and its attributes feed the phenotype data
    frame.

    :param exp: experiment object; only ``get_data_folder()`` is used.
    :param block: workflow block; only ``uuid`` is used, to name the files.
    :param source_file: object whose ``filepath`` is the gzipped SOFT file.
    :return: ``([expression_set, platform_annotation], {})``.
    :raises RuntimeError: if the source file cannot be opened or parsed.
    :raises AssertionError: if the third parsed entity is not a PLATFORM.
    :raises ValueError: if the ``ID``, ``ENTREZ_GENE_ID``, ``ID_REF`` or
        ``VALUE`` header is missing from the respective table.
    :raises IndexError: if there are no sample entities.
    """
    # TODO: now we assume that we get GSE file
    try:
        # Close the gzip handle deterministically instead of leaking it.
        with gzip.open(source_file.filepath) as soft_handle:
            soft = list(parse_geo(soft_handle))
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer converted into a RuntimeError.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM", \
        "Expected a PLATFORM entity at position 2 of the SOFT file"

    # --- Platform table: probe ID -> Entrez gene IDs ------------------------
    pl = soft[2].table_rows
    platform_header = pl[0]
    id_idx = platform_header.index('ID')
    entrez_idx = platform_header.index('ENTREZ_GENE_ID')

    # TODO bug here
    # NOTE(review): the original author flagged a bug in this section without
    # describing it; it has not been identified, so the logic is unchanged.
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_id = row[id_idx]
        probe_to_genes_GS.description[probe_id] = ""
        # A cell may list several gene IDs separated by " /// ".
        probe_to_genes_GS.genes[probe_id] = row[entrez_idx].split(" /// ")

    platform_annotation = PlatformAnnotation(
        "TODO:GET NAME FROM SOFT",
        base_dir=exp.get_data_folder(),
        base_filename="%s_annotation" % block.uuid
    )

    gene_sets = platform_annotation.gene_sets
    gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    gene_sets.store_gs(probe_to_genes_GS)

    # --- Sample tables: assay data frame -----------------------------------
    samples = soft[3:]
    # Column positions are taken from the first sample and reused for all.
    sample_header = soft[3].table_rows[0]
    id_ref_idx = sample_header.index("ID_REF")
    value_idx = sample_header.index("VALUE")

    assay_columns = {}
    for sample in samples:
        accession = sample.entity_attributes['Sample_geo_accession']
        assay_columns[accession] = Series(dict(
            (row[id_ref_idx], row[value_idx])
            for row in sample.table_rows[1:]
        ))
    assay_df = DataFrame(assay_columns)

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df)

    # --- Sample attributes: phenotype data frame ---------------------------
    raw_factors = [sample.entity_attributes for sample in samples]

    def as_values(value):
        # Wrap a bare string so that iterating it yields the whole value
        # rather than its individual characters.
        if isinstance(value, (str, unicode)):
            return [value]
        return value

    # Here we are trying to guess sub columns, using the first sample as the
    # template.  A multi-valued attribute becomes "dict" when every sample
    # carries the same "name:" prefixes in the same order, otherwise "list".
    pheno_complex_columns_def = {}
    for col_name, col in raw_factors[0].iteritems():
        if isinstance(col, (str, unicode)):
            continue
        kind = "list"
        if all(":" in sub_col for sub_col in col):
            sub_col_name_sets = set(
                tuple(
                    sub_col.split(":")[0]
                    # A sample lacking the attribute counts as "different"
                    # instead of raising KeyError.
                    for sub_col in as_values(row.get(col_name, []))
                )
                for row in raw_factors
            )
            if len(sub_col_name_sets) == 1:
                kind = "dict"
        pheno_complex_columns_def[col_name] = kind

    factors = []
    pheno_index = []
    for idx, factor in enumerate(raw_factors):
        # Fall back to the positional index when the accession is missing.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            kind = pheno_complex_columns_def.get(col_name)
            if kind == "list":
                for sub_idx, sub_col in enumerate(as_values(col)):
                    fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
            elif kind == "dict":
                for sub_col in as_values(col):
                    # partition() tolerates an entry without ":" (empty
                    # value) where tuple-unpacking split() would raise.
                    sub_name, _, sub_value = sub_col.partition(":")
                    fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    class_title = expression_set.pheno_metadata["user_class_title"]
    if class_title not in pheno_df.columns:
        # Make sure the user-class column exists, initially empty.
        pheno_df[class_title] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set, platform_annotation], {}
Ejemplo n.º 3
0
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO SOFT file and store it as an ExpressionSet.

    The platform table (third parsed entity) is used to map every probe ID
    to the first identifier listed in the first column whose name contains
    "refseq" (or, when there is none, "mirna").  Every entity from the
    fourth one on is treated as a sample: its table feeds the assay data
    frame and its attributes feed the phenotype data frame.

    :param exp: experiment object; only ``get_data_folder()`` is used.
    :param block: workflow block; only ``uuid`` is used, to name the files.
    :param source_file: object whose ``filepath`` is the gzipped SOFT file.
    :return: ``([expression_set], {})``.
    :raises RuntimeError: if the source file cannot be opened or parsed.
    :raises AssertionError: if the third parsed entity is not a PLATFORM.
    :raises IndexError: if the platform header has neither a refseq nor a
        mirna column, or if there are no sample entities.
    :raises ValueError: if the ``ID``, ``ID_REF`` or ``VALUE`` header is
        missing from the respective table.
    """
    # TODO: now we assume that we get GSE file
    try:
        # Close the gzip handle deterministically instead of leaking it.
        with gzip.open(source_file.filepath) as soft_handle:
            soft = list(parse_geo(soft_handle))
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer converted into a RuntimeError.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM", \
        "Expected a PLATFORM entity at position 2 of the SOFT file"

    # --- Platform table: probe ID -> first annotated identifier ------------
    pl = soft[2].table_rows
    platform_header = pl[0]
    id_idx = platform_header.index('ID')
    # entrez_idx = pl[0].index('ENTREZ_GENE_ID')
    annotation_cols = [
        i for i, item in enumerate(platform_header)
        if re.search('.*refseq.*', item, re.IGNORECASE)
    ]
    if not annotation_cols:
        # No RefSeq column: fall back to a miRNA column.
        annotation_cols = [
            i for i, item in enumerate(platform_header)
            if re.search('.*mirna.*', item, re.IGNORECASE)
        ]
    if not annotation_cols:
        # Same exception type as the original ``[...][0]``, but with a message.
        raise IndexError(
            "Platform table has neither a refseq nor a mirna column")
    refseq_idx = annotation_cols[0]

    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_id = row[id_idx]
        probe_to_genes_GS.description[probe_id] = ""
        # Only the first identifier of a " /// "-separated cell is kept.
        probe_to_genes_GS.genes[probe_id] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    # platform_annotation = PlatformAnnotation(
    #     "TODO:GET NAME FROM SOFT",
    #     base_dir=exp.get_data_folder(),
    #     base_filename="%s_annotation" % block.uuid
    # )
    #
    # platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    # platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    # platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    if settings.CELERY_DEBUG:
        # NOTE(review): developer-only hook that attaches to a PyCharm remote
        # debugger on localhost:6901; the egg path below is machine-specific.
        import sys
        debug_egg = '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        if debug_egg not in sys.path:
            # Avoid growing sys.path by one entry on every call.
            sys.path.append(debug_egg)
        import pydevd
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    # --- Sample tables: assay data frame -----------------------------------
    samples = soft[3:]
    # Column positions are taken from the first sample and reused for all.
    sample_header = soft[3].table_rows[0]
    id_ref_idx = sample_header.index("ID_REF")
    value_idx = sample_header.index("VALUE")

    assay_columns = {}
    for sample in samples:
        accession = sample.entity_attributes['Sample_geo_accession']
        probe_values = [
            (row[id_ref_idx], row[value_idx])
            for row in sample.table_rows[1:]
        ]
        assay_columns[accession] = Series(dict(
            map_probes_to_refseqs(probe_to_genes_GS.genes, probe_values)
        ))
    assay_df = DataFrame(assay_columns)

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Stored transposed: one row per sample.
    expression_set.store_assay_data_frame(assay_df.T)

    # --- Sample attributes: phenotype data frame ---------------------------
    raw_factors = [sample.entity_attributes for sample in samples]

    def as_values(value):
        # Wrap a bare string so that iterating it yields the whole value
        # rather than its individual characters.
        if isinstance(value, (str, unicode)):
            return [value]
        return value

    # Here we are trying to guess sub columns, using the first sample as the
    # template.  A multi-valued attribute becomes "dict" when every sample
    # carries the same "name:" prefixes in the same order, otherwise "list".
    pheno_complex_columns_def = {}
    for col_name, col in raw_factors[0].iteritems():
        if isinstance(col, (str, unicode)):
            continue
        kind = "list"
        if all(":" in sub_col for sub_col in col):
            sub_col_name_sets = set(
                tuple(
                    sub_col.split(":")[0]
                    # A sample lacking the attribute counts as "different"
                    # instead of raising KeyError.
                    for sub_col in as_values(row.get(col_name, []))
                )
                for row in raw_factors
            )
            if len(sub_col_name_sets) == 1:
                kind = "dict"
        pheno_complex_columns_def[col_name] = kind

    factors = []
    pheno_index = []
    for idx, factor in enumerate(raw_factors):
        # Fall back to the positional index when the accession is missing.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            kind = pheno_complex_columns_def.get(col_name)
            if kind == "list":
                for sub_idx, sub_col in enumerate(as_values(col)):
                    fixed_factor["%s_%s" %
                                 (col_name, sub_idx + 1)] = sub_col
            elif kind == "dict":
                for sub_col in as_values(col):
                    # partition() tolerates an entry without ":" (empty
                    # value) where tuple-unpacking split() would raise.
                    sub_name, _, sub_value = sub_col.partition(":")
                    fixed_factor["%s_%s" %
                                 (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)
    class_title = expression_set.pheno_metadata["user_class_title"]
    if class_title not in pheno_df.columns:
        # Make sure the user-class column exists, initially empty.
        pheno_df[class_title] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}