Esempio n. 1
0
def dataset_download(gds_id, samples=None, transpose=False, callback=None):
    """Fetch a GDS dataset and return it as an annotated table.

    Args:
        gds_id: Dataset identifier; the file ``'<gds_id>.tab'`` is updated
            and loaded through ``local_files``.
        samples: Optional mapping of column name to the list of values
            passed to ``FilterStringList`` for that column. When given, the
            table is filtered with these conditions and a discrete
            ``'class'`` variable is appended whose label is the
            ``'|'``-joined combination of each row's values in the listed
            columns.
        transpose: When true, the table is transposed with ``'sample_id'``
            as the feature-names column and ``'genes'`` as the meta
            attribute name.
        callback: Optional zero-argument callable. It is forwarded to
            ``local_files.update`` and additionally invoked once after
            loading and once right before returning.

    Returns:
        The resulting table with ``table.attributes`` replaced by the
        annotation dict built here (tax id and gene-location annotations).
    """
    file_name = '{}.tab'.format(gds_id)
    local_files.update(file_name, extract=True, callback=callback)

    table = Table(local_files.localpath_download(file_name))
    title = table.name
    gds_info = local_files.info(file_name)
    table_annotations = {TableAnnotation.tax_id: gds_info['taxid']}

    if callback:
        callback()

    if samples is not None:
        filters = [
            table_filter.FilterStringList(sample, sample_types)
            for sample, sample_types in samples.items()
        ]
        table = table_filter.Values(filters)(table)

        # One column per requested sample variable, in mapping order.
        column_values = [
            table.get_column_view(table.domain[meta_var])[0]
            for meta_var in samples
        ]

        # Per-row label: the row's values in those columns joined by '|'.
        class_values = ['|'.join(parts) for parts in zip(*column_values)]

        # Sort the distinct labels so that the value order of the class
        # variable (and therefore the stored indices) is the same on every
        # run; iterating a bare set of strings is not order-stable across
        # interpreter invocations.
        _class_values = sorted(set(class_values))
        map_class_values = {
            value: index
            for index, value in enumerate(_class_values)
        }
        class_var = DiscreteVariable(name='class', values=_class_values)
        _domain = Domain(table.domain.attributes,
                         table.domain.class_vars + (class_var, ),
                         table.domain.metas)

        table = table.transform(_domain)
        # Write the label indices straight into the new class column.
        col, _ = table.get_column_view(class_var)
        col[:] = [map_class_values[class_val] for class_val in class_values]

    if transpose:
        table = Table.transpose(table,
                                feature_names_column='sample_id',
                                meta_attr_name='genes')
        table.name = title  # table name is lost after transpose
        # The orientation flips, so the "genes as attribute names" flag is
        # inverted and the gene id is recorded under the column key.
        table_annotations[TableAnnotation.gene_as_attr_name] = not gds_info[
            TableAnnotation.gene_as_attr_name]
        table_annotations[TableAnnotation.gene_id_column] = gds_info[
            TableAnnotation.gene_id_attribute]
    else:
        table_annotations[TableAnnotation.gene_as_attr_name] = gds_info[
            TableAnnotation.gene_as_attr_name]
        table_annotations[TableAnnotation.gene_id_attribute] = gds_info[
            TableAnnotation.gene_id_attribute]

    if callback:
        callback()

    table.attributes = table_annotations
    return table
Esempio n. 2
0
def dataset_download(gds_id, samples=None, transpose=False, callback=None):
    """Fetch a GDS dataset and return it as an annotated table.

    Args:
        gds_id: Dataset identifier; the file ``'<gds_id>.tab'`` is obtained
            through ``local_files.localpath_download``.
        samples: Optional mapping of column name to the list of values
            passed to ``FilterStringList`` for that column. When given, the
            table is filtered with these conditions and a discrete
            ``'class'`` variable is appended whose label is the
            ``'|'``-joined combination of each row's values in the listed
            columns.
        transpose: When true, the table is transposed with ``'sample_id'``
            as the feature-names column and ``'genes'`` as the meta
            attribute name, and the ``'Entrez ID'`` meta column is rebuilt
            as a string variable.
        callback: Optional zero-argument callable. It is forwarded to the
            download call and additionally invoked once after loading and
            once right before returning.

    Returns:
        The resulting table with ``table.attributes`` replaced by the
        annotation dict built here (tax id and gene-location annotations).
    """
    file_name = '{}.tab'.format(gds_id)
    file_path = local_files.localpath_download(file_name, extract=True, callback=callback)

    table = Table(file_path)
    title = table.name
    gds_info = local_files.info(file_name)
    table_annotations = {TableAnnotation.tax_id: gds_info['taxid']}

    if callback:
        callback()

    if samples is not None:
        filters = [table_filter.FilterStringList(sample, sample_types) for sample, sample_types in samples.items()]
        table = table_filter.Values(filters)(table)

        # One column per requested sample variable, in mapping order.
        column_values = [table.get_column_view(table.domain[meta_var])[0] for meta_var in samples]

        # Per-row label: the row's values in those columns joined by '|'.
        class_values = ['|'.join(parts) for parts in zip(*column_values)]

        # Sort the distinct labels so that the value order of the class
        # variable (and therefore the stored indices) is the same on every
        # run; iterating a bare set of strings is not order-stable across
        # interpreter invocations.
        _class_values = sorted(set(class_values))
        map_class_values = {value: index for index, value in enumerate(_class_values)}
        class_var = DiscreteVariable(name='class', values=_class_values)
        _domain = Domain(table.domain.attributes, table.domain.class_vars + (class_var,), table.domain.metas)

        table = table.transform(_domain)
        # Write the label indices straight into the new class column.
        col, _ = table.get_column_view(class_var)
        col[:] = [map_class_values[class_val] for class_val in class_values]

    if transpose:
        table = Table.transpose(table, feature_names_column='sample_id', meta_attr_name='genes')

        # When transposing a table, variable.attributes get picked up as numerical values instead of strings.
        # We need to convert from Continuous to StringVariable
        # (missing ids, i.e. NaN, are written as '?').
        _genes = [
            [str(int(gene)) if not np.isnan(gene) else '?']
            for gene in table.get_column_view('Entrez ID')[0].astype(np.float64)
        ]
        new_var = StringVariable('Entrez ID')
        metas = [var for var in table.domain.metas if var.name != 'Entrez ID'] + [new_var]
        new_domain = Domain(table.domain.attributes, table.domain.class_vars, metas)
        table = table.transform(new_domain)
        table[:, new_var] = _genes

        # table name is lost after transpose
        table.name = title

        # The orientation flips, so the "genes as attribute names" flag is
        # inverted and the gene id is recorded under the column key.
        table_annotations[TableAnnotation.gene_as_attr_name] = not gds_info[TableAnnotation.gene_as_attr_name]
        table_annotations[TableAnnotation.gene_id_column] = gds_info[TableAnnotation.gene_id_attribute]
    else:
        table_annotations[TableAnnotation.gene_as_attr_name] = gds_info[TableAnnotation.gene_as_attr_name]
        table_annotations[TableAnnotation.gene_id_attribute] = gds_info[TableAnnotation.gene_id_attribute]

    if callback:
        callback()

    table.attributes = table_annotations
    return table
Esempio n. 3
0
    def commit(self):
        """Send the part of the input data that matches the selected rows.

        Reads the current selection from ``self.data_view``, remembers the
        selected source-model rows in ``self.selected_rows`` and, when there
        is both a selection and ``self.input_genes``, sends the matching
        subset of ``self.input_data`` on ``Outputs.matched_genes``:

        * ``self.use_attr_names`` true: keep only the attribute columns whose
          ``gene_id_attribute`` annotation is one of the selected genes;
        * otherwise: filter rows on ``self.gene_id_column`` with a
          string-list filter.

        Nothing is sent when the view has no selection model, nothing is
        selected, or there are no input genes.
        """
        selection_model = self.data_view.selectionModel()
        if not selection_model:
            return

        selection = selection_model.selectedRows(self.COUNT)
        self.selected_rows = [self.filter_proxy_model.mapToSource(sel).row() for sel in selection]

        if not (selection and self.input_genes):
            return

        # Each selected index carries a set of genes under Qt.UserRole;
        # the output is their union. Keep it as a set so the membership
        # test below is O(1) per column.
        genes = [model_index.data(Qt.UserRole) for model_index in selection]
        output_genes = set.union(*genes)
        self.num_of_sel_genes = len(output_genes)
        self.update_info_box()

        if self.use_attr_names:
            selected = [
                column
                for column in self.input_data.domain.attributes
                if self.gene_id_attribute in column.attributes
                and str(column.attributes[self.gene_id_attribute]) in output_genes
            ]

            domain = Domain(selected, self.input_data.domain.class_vars, self.input_data.domain.metas)
            new_data = self.input_data.from_table(domain, self.input_data)
            self.Outputs.matched_genes.send(new_data)
        else:
            # create filter from selected column for genes
            only_known = table_filter.FilterStringList(self.gene_id_column, list(output_genes))
            # apply filter to the data
            data_table = table_filter.Values([only_known])(self.input_data)

            self.Outputs.matched_genes.send(data_table)
Esempio n. 4
0
    def __apply_filters(self, data_table):
        """Apply the configured id/unknown-gene options to ``data_table``.

        Returns a ``(data_table, gene_id)`` tuple. ``gene_id`` is ``NCBI_ID``
        when that name is in the table's domain or is a key in any domain
        variable's ``attributes``, otherwise ``None``; when
        ``self.include_entrez_id`` is set, both values come from
        ``self.__handle_ids`` instead.

        When ``self.filter_unknown`` is set, the table is reduced to the
        genes reported by ``self.gene_matcher.get_known_genes()``: by
        attribute name when ``self.use_attr_names`` is true, otherwise by a
        string-list filter on ``self.selected_gene_col``.
        """
        # Only the existence of an NCBI_ID annotation matters, so stop at
        # the first variable that carries one.
        has_ncbi_attribute = any(
            NCBI_ID in attr.attributes for attr in data_table.domain[:]
        )

        gene_id = NCBI_ID if NCBI_ID in data_table.domain or has_ncbi_attribute else None

        if self.include_entrez_id:
            data_table, gene_id = self.__handle_ids(data_table)

        if not self.filter_unknown:
            return data_table, gene_id

        known_input_genes = [
            gene.input_name
            for gene in self.gene_matcher.get_known_genes()
        ]

        if self.use_attr_names:
            # Set lookup avoids rescanning the whole gene list per attribute.
            known_names = set(known_input_genes)
            temp_domain = Domain(
                [
                    attr for attr in data_table.domain.attributes
                    if attr.name in known_names
                ],
                metas=data_table.domain.metas,
                class_vars=data_table.domain.class_vars,
            )
            data_table = data_table.transform(temp_domain)
        else:
            # create filter from selected column for genes
            only_known = table_filter.FilterStringList(
                self.selected_gene_col, known_input_genes)
            # apply filter to the data
            data_table = table_filter.Values([only_known])(data_table)

        return data_table, gene_id
Esempio n. 5
0
    def test_filter_string_list_case_insensitive_data(self):
        # Filter the last column for 'donec' with case sensitivity off.
        condition = filter.FilterStringList(-1, ['donec'], case_sensitive=False)
        values_filter = filter.Values(conditions=[condition])
        filtered_data = values_filter(self.table)

        # Expected rows: those of the raw data whose first field is 'Donec'.
        expected_values = ['Donec']
        correct_data = []
        for row in self.data:
            if row[0] in expected_values:
                correct_data.append(SqlRowInstance(filtered_data.domain, row))

        self.assertEqual(len(filtered_data), len(correct_data))
        self.assertSequenceEqual(filtered_data, correct_data)
Esempio n. 6
0
    def test_filter_string_list(self):
        # Filter the last column for the values 'et' and 'in'.
        condition = filter.FilterStringList(-1, ['et', 'in'])
        values_filter = filter.Values(conditions=[condition])
        filtered_data = values_filter(self.table)

        # Expected rows: those of the raw data whose first field is one of
        # the accepted values.
        expected_values = ['et', 'in']
        correct_data = []
        for row in self.data:
            if row[0] in expected_values:
                correct_data.append(SqlRowInstance(filtered_data.domain, row))

        self.assertEqual(len(filtered_data), len(correct_data))
        self.assertSequenceEqual(filtered_data, correct_data)
Esempio n. 7
0
    def commit(self):
        """Build the annotated output table and send both widget outputs.

        The selected genes are read from the table view's selection (entrez
        column); when nothing is selected,
        ``self.table_model.get_filtered_genes()`` is used instead. Target ids
        come from ``self.get_target_ids()``, where ``'?'`` marks an unknown
        gene.

        When at least one gene is known:

        * genes in rows (``self.use_attr_names`` false): the target ids are
          written to a ``self.target_database`` meta column, the table is
          restricted to the selected rows (if any match) and, with
          ``self.exclude_unmatched``, to rows with a known id;
        * genes in columns: every matched attribute gets a
          ``self.target_database`` annotation (optionally renamed to its
          symbol), and the attributes are restricted to the selected ones
          (if any match) and, with ``self.exclude_unmatched``, to those
          with a known id.

        ``Outputs.data_table`` and ``Outputs.gene_matcher_results`` are each
        sent exactly once; both receive ``None`` when there are no known
        genes.
        """
        selection = self.table_view.selectionModel().selectedRows(
            self.table_model.entrez_column_index)

        selected_genes = [row.data() for row in selection]
        if not selected_genes:
            selected_genes = self.table_model.get_filtered_genes()

        gene_ids = self.get_target_ids()
        known_genes = [gid for gid in gene_ids if gid != '?']

        table = None
        gm_table = None
        if known_genes:
            # Genes are in rows (we have a column with genes).
            if not self.use_attr_names:

                # Reuse the target column when it already exists, otherwise
                # append a new string meta for it.
                if self.target_database in self.input_data.domain:
                    gene_var = self.input_data.domain[self.target_database]
                    metas = self.input_data.domain.metas
                else:
                    gene_var = StringVariable(self.target_database)
                    metas = self.input_data.domain.metas + (gene_var, )

                domain = Domain(self.input_data.domain.attributes,
                                self.input_data.domain.class_vars, metas)

                table = self.input_data.transform(domain)
                col, _ = table.get_column_view(gene_var)
                col[:] = gene_ids

                # filter selected rows
                selected_genes_set = set(selected_genes)
                selected_rows = [
                    row_index for row_index, row in enumerate(table)
                    if str(row[gene_var]) in selected_genes_set
                ]

                # handle table attributes
                table.attributes[TAX_ID] = self.get_selected_organism()
                table.attributes[GENE_AS_ATTRIBUTE_NAME] = False
                table.attributes[GENE_ID_COLUMN] = self.target_database
                table = table[selected_rows] if selected_rows else table

                if self.exclude_unmatched:
                    # create filter from selected column for genes
                    only_known = table_filter.FilterStringList(
                        gene_var, known_genes)
                    # apply filter to the data
                    table = table_filter.Values([only_known])(table)

                # The table is sent once, at the end of the method, together
                # with the gene matcher results.

            # genes are in columns (genes are features).
            else:
                domain = self.input_data.domain.copy()
                table = self.input_data.transform(domain)

                for gene in self.gene_matcher.genes:
                    if gene.input_identifier in table.domain:

                        table.domain[gene.input_identifier].attributes[
                            self.target_database] = (str(gene.gene_id)
                                                     if gene.gene_id else '?')

                        if self.replace_id_with_symbol:
                            try:
                                table.domain[gene.input_identifier].name = str(
                                    gene.symbol)
                            except AttributeError:
                                # TODO: missing gene symbol, need to handle this?
                                pass

                # filter selected columns
                selected_genes_set = set(selected_genes)
                selected = [
                    column for column in table.domain.attributes
                    if self.target_database in column.attributes
                    and str(column.attributes[
                        self.target_database]) in selected_genes_set
                ]

                output_attrs = table.domain.attributes

                if selected:
                    output_attrs = selected

                if self.exclude_unmatched:
                    known_genes_set = set(known_genes)
                    # .get(): a column without the target annotation is
                    # treated as unmatched instead of raising KeyError.
                    output_attrs = [
                        col for col in output_attrs
                        if col.attributes.get(
                            self.target_database) in known_genes_set
                    ]

                domain = Domain(output_attrs, table.domain.class_vars,
                                table.domain.metas)

                table = table.from_table(domain, table)

                # handle table attributes
                table.attributes[TAX_ID] = self.get_selected_organism()
                table.attributes[GENE_AS_ATTRIBUTE_NAME] = True
                table.attributes[GENE_ID_ATTRIBUTE] = self.target_database

            gm_table = self.gene_matcher.to_data_table(
                selected_genes=selected_genes if selected_genes else None)

        self.Outputs.data_table.send(table)
        self.Outputs.gene_matcher_results.send(gm_table)