Example #1
    def process_upload(self, exp, *args, **kwargs):
        """
            @param exp: Experiment
        """
        self.clean_errors()

        assay_df = pd.DataFrame.from_csv(self.es_matrix.get_file())

        es = ExpressionSet(base_dir=exp.get_data_folder(),
                           base_filename="%s_annotation" % self.uuid)

        pheno_df = pd.DataFrame.from_csv(self.pheno_matrix.get_file())
        # set_index returns a new frame, so keep the result
        pheno_df = pheno_df.set_index(pheno_df.columns[0])

        user_class_title = es.pheno_metadata["user_class_title"]
        if user_class_title not in pheno_df.columns:
            pheno_df[es.pheno_metadata["user_class_title"]] = ""

        es.store_assay_data_frame(assay_df)
        es.store_pheno_data_frame(pheno_df)

        if self.working_unit:
            es.working_unit = self.working_unit

        self.set_out_var("expression_set", es)

        exp.store_block(self)

        self.do_action("success", exp)
Example #2
    def process_upload(self, exp, *args, **kwargs):
        """
            @param exp: Experiment
        """
        self.clean_errors()

        assay_df = pd.DataFrame.from_csv(self.es_matrix.get_file())

        es = ExpressionSet(base_dir=exp.get_data_folder(),
                           base_filename="%s_annotation" % self.uuid)

        pheno_df = pd.DataFrame.from_csv(self.pheno_matrix.get_file())
        # set_index returns a new frame, so keep the result
        pheno_df = pheno_df.set_index(pheno_df.columns[0])

        user_class_title = es.pheno_metadata["user_class_title"]
        if user_class_title not in pheno_df.columns:
            pheno_df[es.pheno_metadata["user_class_title"]] = ""

        # if the matrix is oriented genes x samples (GxS), transpose it
        if self.es_matrix_ori == "GxS":
            assay_df = assay_df.T

        es.store_assay_data_frame(assay_df)
        es.store_pheno_data_frame(pheno_df)

        if self.working_unit:
            es.working_unit = self.working_unit

        self.set_out_var("expression_set", es)

        exp.store_block(self)

        self.do_action("success", exp)
Example #3
    def process_upload(self, exp, *args, **kwargs):
        """
            @param exp: Experiment
        """
        # TODO: move to celery
        self.clean_errors()
        seq = []
        sep = getattr(self, "csv_sep", " ")
        try:
            if len(self.pheno_matrices) != len(self.es_matrices):
                raise RuntimeError(
                    "Different number of phenotypes and expression sets")

            self.labels = es_matrix_names = sorted(self.es_matrices)
            pheno_matrix_names = sorted(self.pheno_matrices)
            self.pheno_by_es_names = {
                es_name: pheno_name
                for es_name, pheno_name in zip(es_matrix_names,
                                               pheno_matrix_names)
            }
            for es_name, pheno_name in self.pheno_by_es_names.iteritems():
                es_ufw = self.es_matrices[es_name]
                es_df = es_ufw.get_as_data_frame(sep)

                pheno_ufw = self.pheno_matrices[pheno_name]
                pheno_df = pheno_ufw.get_as_data_frame(sep)

                es_sample_names = sorted(es_df.columns.tolist())
                pheno_sample_names = sorted(pheno_df.index.tolist())
                if es_sample_names != pheno_sample_names:
                    raise RuntimeError("Couldn't match `%s` and `%s` due to "
                                       "different sample name sets" %
                                       (es_name, pheno_name))

                es = ExpressionSet(base_dir=exp.get_data_folder(),
                                   base_filename="%s_%s" %
                                   (self.uuid, es_name))
                es.store_assay_data_frame(es_df)
                es.store_pheno_data_frame(pheno_df)

                es.pheno_metadata["user_class_title"] = pheno_df.columns[0]
                seq.append({"es": es, "__label__": es_name})

            self.seq = seq
            exp.store_block(self)
            self.do_action("processing_done", exp, seq)
        except Exception as e:
            log.exception(e)
            self.errors.append(e)
            self.do_action("error_on_processing", exp, e)
Example #4
    def process_upload(self, exp, *args, **kwargs):
        """
            @param exp: Experiment
        """
        # TODO: move to celery
        self.clean_errors()
        seq = []
        sep = getattr(self, "csv_sep", " ")
        try:
            if len(self.pheno_matrices) != len(self.es_matrices):
                raise RuntimeError("Different number of phenotypes and expression sets")

            self.labels = es_matrix_names = sorted(self.es_matrices)
            pheno_matrix_names = sorted(self.pheno_matrices)
            self.pheno_by_es_names = {
                es_name: pheno_name for
                es_name, pheno_name
                in zip(es_matrix_names, pheno_matrix_names)
            }
            for es_name, pheno_name in self.pheno_by_es_names.iteritems():
                es_ufw = self.es_matrices[es_name]
                es_df = es_ufw.get_as_data_frame(sep)

                pheno_ufw = self.pheno_matrices[pheno_name]
                pheno_df = pheno_ufw.get_as_data_frame(sep)

                es_sample_names = sorted(es_df.columns.tolist())
                pheno_sample_names = sorted(pheno_df.index.tolist())
                if es_sample_names != pheno_sample_names:
                    raise RuntimeError("Couldn't match `%s` and `%s` due to "
                                       "different sample name sets" % (es_name, pheno_name))

                es = ExpressionSet(
                    base_dir=exp.get_data_folder(),
                    base_filename="%s_%s" % (self.uuid, es_name)
                )
                es.store_assay_data_frame(es_df)
                es.store_pheno_data_frame(pheno_df)

                es.pheno_metadata["user_class_title"] = pheno_df.columns[0]
                seq.append({"es": es, "__label__": es_name})

            self.seq = seq
            exp.store_block(self)
            self.do_action("processing_done", exp, seq)
        except Exception as e:
            log.exception(e)
            self.errors.append(e)
            self.do_action("error_on_processing", exp, e)
Example #5
    def process_upload(self, exp, *args, **kwargs):
        """
            @param exp: Experiment
        """
        # TODO: move to celery
        self.clean_errors()
        sep = getattr(self, "csv_sep", " ")

        try:
            if not self.pheno_matrix:
                self.warnings.append(Exception("Phenotype is undefined"))
                pheno_df = None
            else:
                pheno_df = self.pheno_matrix.get_as_data_frame(sep)
                # set_index returns a new frame, so keep the result
                pheno_df = pheno_df.set_index(pheno_df.columns[0])

                # TODO: find a better solution: add an empty column for the user class assignment
                pheno_df[ExpressionSet(None, None).pheno_metadata["user_class_title"]] = ""

            if self.m_rna_matrix is not None:
                m_rna_assay_df = self.m_rna_matrix.get_as_data_frame(sep)

                m_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                        base_filename="%s_m_rna_es" % self.uuid)
                m_rna_es.store_assay_data_frame(m_rna_assay_df)
                m_rna_es.store_pheno_data_frame(pheno_df)
                m_rna_es.working_unit = self.m_rna_unit

                self.set_out_var("m_rna_es", m_rna_es)

                # TODO: fetch GPL annotation if GPL id was provided

            if self.mi_rna_matrix is not None:
                mi_rna_assay_df = self.mi_rna_matrix.get_as_data_frame(sep)

                mi_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                        base_filename="%s_mi_rna_es" % self.uuid)
                mi_rna_es.store_assay_data_frame(mi_rna_assay_df)
                mi_rna_es.store_pheno_data_frame(pheno_df)

                self.set_out_var("mi_rna_es", mi_rna_es)

            if self.methyl_matrix is not None:

                methyl_assay_df = self.methyl_matrix.get_as_data_frame(sep)

                methyl_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                          base_filename="%s_methyl_es" % self.uuid)
                methyl_es.store_assay_data_frame(methyl_assay_df)
                methyl_es.store_pheno_data_frame(pheno_df)

                self.set_out_var("methyl_es", methyl_es)

            self.do_action("success", exp)
        except Exception as e:
            ex_type, ex, tb = sys.exc_info()
            traceback.print_tb(tb)
            self.do_action("error", exp, e)
Example #6
def preprocess_soft(exp, block, source_file):
    # TODO: for now we assume the input is a GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except Exception:
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    entrez_idx = pl[0].index('ENTREZ_GENE_ID')

    #TODO bug here
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = row[entrez_idx].split(" /// ")

    platform_annotation = PlatformAnnotation(
        "TODO:GET NAME FROM SOFT",
        base_dir=exp.get_data_folder(),
        base_filename="%s_annotation" % block.uuid
    )

    platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict([(row[id_ref_idx], row[value_idx]) for row in soft[i].table_rows[1:]]))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df)

    raw_factors = [soft[i].entity_attributes
               for i in range(3, len(soft))]
    pheno_index = []

    # Here we try to guess the sub-columns
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub-columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value

            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set, platform_annotation], {}
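
The middle of this function guesses how to flatten the multi-valued attributes of a GEO SOFT sample record: if every entry of a list-valued field looks like "key: value" and the keys agree across all samples, the field is expanded into named sub-columns ("dict"), otherwise into positional ones ("list"). A self-contained sketch of that heuristic follows; the toy records are made up for illustration.

def guess_sub_columns(records):
    # records: list of dicts mapping attribute name -> str or list of str,
    # one dict per sample, as produced by the SOFT parser above.
    layout = {}
    for name, value in records[0].items():
        if isinstance(value, str):
            continue  # scalar attribute, nothing to flatten
        if all(":" in item for item in value):
            key_sets = set(
                tuple(item.split(":")[0] for item in rec[name]) for rec in records
            )
            layout[name] = "dict" if len(key_sets) == 1 else "list"
        else:
            layout[name] = "list"
    return layout

records = [
    {"title": "GSM1", "characteristics": ["age: 61", "sex: F"]},
    {"title": "GSM2", "characteristics": ["age: 48", "sex: M"]},
]
print(guess_sub_columns(records))  # {'characteristics': 'dict'}
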
Example #7
def preprocess_soft(exp, block, source_file):
    # TODO: for now we assume the input is a GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except Exception:
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    # entrez_idx = pl[0].index('ENTREZ_GENE_ID')
    refseq_idx = [i for i, item in enumerate(pl[0]) if re.search('.*refseq.*', item, re.IGNORECASE)]
    if refseq_idx == []:
        refseq_idx = [i for i, item in enumerate(pl[0]) if re.search('.*mirna.*', item, re.IGNORECASE)][0]
    else:
        refseq_idx = refseq_idx[0]


    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = [row[refseq_idx].split(" /// ")[0]]

    # platform_annotation = PlatformAnnotation(
    #     "TODO:GET NAME FROM SOFT",
    #     base_dir=exp.get_data_folder(),
    #     base_filename="%s_annotation" % block.uuid
    # )
    #
    # platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    # platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    # platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    if settings.CELERY_DEBUG:
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict(
                map_probes_to_refseqs(probe_to_genes_GS.genes, [(row[id_ref_idx], row[value_idx]) for row in soft[i].table_rows[1:]])
            ))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [soft[i].entity_attributes
               for i in range(3, len(soft))]
    pheno_index = []

    # Here we try to guess the sub-columns
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub-columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value

            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
Example #8
def preprocess_soft(exp, block, source_file):
    # TODO: for now we assume the input is a GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except Exception:
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    # entrez_idx = pl[0].index('ENTREZ_GENE_ID')
    refseq_idx = [
        i for i, item in enumerate(pl[0])
        if re.search('.*refseq.*', item, re.IGNORECASE)
    ]
    if refseq_idx == []:
        refseq_idx = [
            i for i, item in enumerate(pl[0])
            if re.search('.*mirna.*', item, re.IGNORECASE)
        ][0]
    else:
        refseq_idx = refseq_idx[0]

    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    # platform_annotation = PlatformAnnotation(
    #     "TODO:GET NAME FROM SOFT",
    #     base_dir=exp.get_data_folder(),
    #     base_filename="%s_annotation" % block.uuid
    # )
    #
    # platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    # platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    # platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    if settings.CELERY_DEBUG:
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(
        dict([(soft[i].entity_attributes['Sample_geo_accession'],
               Series(
                   dict(
                       map_probes_to_refseqs(
                           probe_to_genes_GS.genes,
                           [(row[id_ref_idx], row[value_idx])
                            for row in soft[i].table_rows[1:]]))))
              for i in range(3, len(soft))]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we try to guess the sub-columns
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub-columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" %
                                     (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" %
                                     (col_name, sub_name)] = sub_value

            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features

    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)
    if expression_set.pheno_metadata[
            "user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
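
`map_probes_to_refseqs` is a project helper whose source is not shown here; judging from the call above, it takes the probe -> [RefSeq] mapping built earlier plus a list of (probe, value) pairs and re-keys the values by RefSeq accession. The stand-in below encodes that assumed behaviour purely as an illustration, not the project's actual implementation.

def map_probes_to_refseqs(probe_to_refseqs, probe_value_pairs):
    # Assumed behaviour: replace each probe ID with its first mapped RefSeq
    # accession, dropping probes that have no mapping.
    out = []
    for probe, value in probe_value_pairs:
        refseqs = probe_to_refseqs.get(probe)
        if refseqs:
            out.append((refseqs[0], value))
    return out

print(map_probes_to_refseqs({"p1": ["NM_000001"]}, [("p1", 7.2), ("p2", 3.1)]))
# [('NM_000001', 7.2)]
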
Example #9
    def process_upload(self, exp, *args, **kwargs):
        """
            @param exp: Experiment
        """
        # TODO: move to celery
        self.clean_errors()
        sep = getattr(self, "csv_sep", " ")

        try:
            if not self.pheno_matrix:
                self.warnings.append(Exception("Phenotype is undefined"))
                pheno_df = None
            else:
                pheno_df = self.pheno_matrix.get_as_data_frame(sep)
                # set_index returns a new frame, so keep the result
                pheno_df = pheno_df.set_index(pheno_df.columns[0])

                # TODO: find a better solution: add an empty column for the user class assignment
                pheno_df[ExpressionSet(
                    None, None).pheno_metadata["user_class_title"]] = ""

            if self.m_rna_matrix is not None:
                m_rna_assay_df = self.m_rna_matrix.get_as_data_frame(sep)

                m_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                         base_filename="%s_m_rna_es" %
                                         self.uuid)
                m_rna_es.store_assay_data_frame(m_rna_assay_df)
                m_rna_es.store_pheno_data_frame(pheno_df)
                m_rna_es.working_unit = self.m_rna_unit

                self.set_out_var("m_rna_es", m_rna_es)

                # TODO: fetch GPL annotation if GPL id was provided

            if self.mi_rna_matrix is not None:
                mi_rna_assay_df = self.mi_rna_matrix.get_as_data_frame(sep)

                mi_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                          base_filename="%s_mi_rna_es" %
                                          self.uuid)
                mi_rna_es.store_assay_data_frame(mi_rna_assay_df)
                mi_rna_es.store_pheno_data_frame(pheno_df)

                self.set_out_var("mi_rna_es", mi_rna_es)

            if self.methyl_matrix is not None:

                methyl_assay_df = self.methyl_matrix.get_as_data_frame(sep)

                methyl_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                          base_filename="%s_methyl_es" %
                                          self.uuid)
                methyl_es.store_assay_data_frame(methyl_assay_df)
                methyl_es.store_pheno_data_frame(pheno_df)

                self.set_out_var("methyl_es", methyl_es)

            self.do_action("success", exp)
        except Exception as e:
            ex_type, ex, tb = sys.exc_info()
            traceback.print_tb(tb)
            self.do_action("error", exp, e)