def get_enriched_pathways(self, genes, reference=None, prob=statistics.Binomial(), callback=None):
    """
    Compute pathway enrichment for the given genes.

    Returns a dictionary keyed by pathway id whose values are
    ``(list_of_genes, p_value, num_of_reference_genes)`` tuples.

    :param genes: Sized, re-iterable collection of query genes.
    :param reference: Genes to use as the reference set; when None the
        keys of ``self.genes`` are used.
    :param prob: Object whose ``p_value`` method is called as
        ``p_value(n_mapped, n_reference, n_reference_mapped, n_genes)``.
    :param callback: Optional progress callable; it receives values in
        the range 0-50 during the pathway lookup and 50-100 while the
        genes are assigned to pathways.
    """
    reference = set(self.genes.keys() if reference is None else reference)

    n_genes = len(genes)
    milestones = progress_bar_milestones(n_genes, 100)
    pathways_db = KEGGPathways()

    # First pass: look up the pathways of every gene individually.
    pathways_for_gene = []
    for index, gene in enumerate(genes):
        pathways_for_gene.append(self.pathways([gene]))
        if callback and index in milestones:
            callback(index * 50.0 / n_genes)

    # Pre-cache all pathway entries in one go for speed.
    pathways_db.pre_cache(
        [pid for gene_pathways in pathways_for_gene for pid in gene_pathways])

    # Second pass: group the query genes by pathway, skipping pathways
    # whose entry has no genes.
    genes_by_pathway = defaultdict(list)
    for index, (gene, gene_pathways) in enumerate(
            zip(genes, pathways_for_gene)):
        for pathway_id in gene_pathways:
            if pathways_db.get_entry(pathway_id).gene:
                genes_by_pathway[pathway_id].append(gene)
        if callback and index in milestones:
            callback(50.0 + index * 50.0 / n_genes)

    n_reference = len(reference)
    enriched = {}
    for pathway_id, mapped_genes in genes_by_pathway.items():
        pathway_genes = pathways_db.get_entry(pathway_id).gene or []
        n_reference_mapped = len(reference.intersection(pathway_genes))
        p_value = prob.p_value(
            len(mapped_genes), n_reference, n_reference_mapped, n_genes)
        enriched[pathway_id] = (mapped_genes, p_value, n_reference_mapped)
    return enriched
def get_enriched_terms(
    self,
    genes,
    reference=None,
    evidence_codes=None,
    slims_only=False,
    aspect=None,
    prob=statistics.Binomial(),
    use_fdr=True,
    progress_callback=None,
):
    """
    Return a dictionary of enriched terms, with tuples of
    (list_of_genes, p_value, reference_count) for items and term
    ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

    :param genes: List of genes
    :param reference: Iterable of genes (if None all genes included in
        the annotations will be used). It is converted to a set, so a
        list, tuple or any other iterable is accepted.
    :param evidence_codes: List of evidence codes to consider; when
        empty or None all keys of ``evidence_dict`` are used.
    :param slims_only: If `True` return only slim terms.
    :param aspect: Which aspects to use. Use all by default;
        one of Process (biological process),
        Function (molecular function) or Component (cellular component)
    :param prob: Object whose ``p_value`` method is called as
        ``p_value(n_mapped_genes, n_reference, n_mapped_reference,
        n_genes)``.
    :param use_fdr: If `True`, the p-values are sorted in ascending
        order and replaced by the values ``statistics.FDR`` returns.
    :param progress_callback: Optional callable that receives the
        progress as a percentage (float).
    """
    all_genes = set(genes)

    if aspect is None:
        aspects_set = {'Process', 'Component', 'Function'}
    elif isinstance(aspect, str):
        aspects_set = {aspect}
    else:
        aspects_set = aspect

    if reference is None:
        reference = self.genes()
    # `reference` is documented as a list, but set operations are applied
    # to it below; normalise it once so any iterable works and
    # len(reference) counts distinct genes.
    reference = set(reference)

    evidence_codes = set(evidence_codes or evidence_dict.keys())

    # GO ids annotated to the query genes, unique and in first-seen order.
    annotated_terms = list(dict.fromkeys(
        ann.go_id
        for gene in genes
        for ann in self.gene_annotations[gene]
        if ann.evidence in evidence_codes and ann.aspect in aspects_set
    ))

    ref_annotations = {
        ann
        for gene in reference
        for ann in self.gene_annotations[gene]
        if ann.evidence in evidence_codes and ann.aspect in aspects_set
    }

    self._ensure_ontology()

    if slims_only and not self.ontology.slims_subset:
        warnings.warn(
            "Unspecified slims subset in the ontology! "
            "Using 'goslim_generic' subset", UserWarning)
        self.ontology.set_slims_subset('goslim_generic')

    filtered_terms = [
        term for term in annotated_terms if term in self.ontology]

    if len(annotated_terms) != len(filtered_terms):
        term_diff = set(annotated_terms) - set(filtered_terms)
        warnings.warn(
            "%s terms in the annotations were not found in the "
            "ontology." % ",".join(map(repr, term_diff)),
            UserWarning,
        )

    terms = self.ontology.extract_super_graph(filtered_terms)
    res = {}

    milestones = progress_bar_milestones(len(terms), 100)

    for i, term in enumerate(terms):
        if slims_only and term not in self.ontology.slims_subset:
            continue

        all_annotations = self.get_annotations_by_go_id(term).intersection(
            ref_annotations)
        all_annotated_genes = {ann.gene_id for ann in all_annotations}
        mapped_genes = all_genes.intersection(all_annotated_genes)
        mapped_reference_genes = reference.intersection(all_annotated_genes)

        res[term] = (
            list(mapped_genes),
            prob.p_value(len(mapped_genes), len(reference),
                         len(mapped_reference_genes), len(genes)),
            len(mapped_reference_genes),
        )

        if progress_callback and i in milestones:
            progress_callback(100.0 * i / len(terms))

    if use_fdr:
        # Sort by raw p-value, then pair each term with its adjusted value.
        ranked = sorted(res.items(), key=lambda item: item[1][1])
        adjusted = statistics.FDR([p for _, (_, p, _) in ranked])
        res = {
            term_id: (term_genes, p, ref_count)
            for (term_id, (term_genes, _, ref_count)), p
            in zip(ranked, adjusted)
        }
    return res
def __init__(self, parent=None):
    """
    Initialise widget state and build the GUI.

    Creates the "Input", "Filter" and "Select" control tabs, the two
    tree views in the main area (the GO term tree and the table of
    significant terms), and collects the annotation files available
    remotely and locally into ``self.available_annotations``.

    :param parent: Optional parent, forwarded to the base class.
    """
    # The original call was super().__init__(self, parent), which passed
    # `self` twice (once implicitly, once as the first positional arg).
    super().__init__(parent)

    self.input_data = None
    self.ref_data = None
    self.ontology = None
    self.annotations = None
    self.loaded_annotation_code = None
    self.treeStructRootKey = None
    self.probFunctions = [statistics.Binomial(), statistics.Hypergeometric()]
    self.selectedTerms = []
    self.selectionChanging = 0
    self.__state = State.Ready
    self.__scheduletimer = QTimer(self, singleShot=True)
    self.__scheduletimer.timeout.connect(self.__update)

    #############
    # GUI
    #############
    self.tabs = gui.tabWidget(self.controlArea)

    # Input tab
    self.inputTab = gui.createTabPage(self.tabs, "Input")
    box = gui.widgetBox(self.inputTab, "Info")
    self.infoLabel = gui.widgetLabel(box, "No data on input\n")

    gui.button(box, self, "Ontology/Annotation Info",
               callback=self.ShowInfo,
               tooltip="Show information on loaded ontology and annotations")

    self.referenceRadioBox = gui.radioButtonsInBox(
        self.inputTab, self, "useReferenceDataset",
        ["Entire genome", "Reference set (input)"],
        tooltips=["Use entire genome for reference",
                  "Use genes from Reference Examples input signal as reference"],
        box="Reference",
        callback=self.__invalidate)

    # The reference-set option starts disabled.
    self.referenceRadioBox.buttons[1].setDisabled(True)
    gui.radioButtonsInBox(
        self.inputTab, self, "aspectIndex",
        ["Biological process", "Cellular component", "Molecular function"],
        box="Aspect",
        callback=self.__invalidate)

    # Filter tab
    self.filterTab = gui.createTabPage(self.tabs, "Filter")
    box = gui.widgetBox(self.filterTab, "Filter GO Term Nodes")

    gui.checkBox(box, self, "filterByNumOfInstances", "Genes",
                 callback=self.FilterAndDisplayGraph,
                 tooltip="Filter by number of input genes mapped to a term")
    ibox = gui.indentedBox(box)
    gui.spin(ibox, self, 'minNumOfInstances', 1, 100,
             step=1, label='#:', labelWidth=15,
             callback=self.FilterAndDisplayGraph,
             callbackOnReturn=True,
             tooltip="Min. number of input genes mapped to a term")

    gui.checkBox(box, self, "filterByPValue_nofdr", "p-value",
                 callback=self.FilterAndDisplayGraph,
                 tooltip="Filter by term p-value")

    gui.doubleSpin(gui.indentedBox(box), self, 'maxPValue_nofdr', 1e-8, 1,
                   step=1e-8, label='p:', labelWidth=15,
                   callback=self.FilterAndDisplayGraph,
                   callbackOnReturn=True,
                   tooltip="Max term p-value")

    # use filterByPValue for FDR, as it was the default in prior versions
    gui.checkBox(box, self, "filterByPValue", "FDR",
                 callback=self.FilterAndDisplayGraph,
                 tooltip="Filter by term FDR")
    gui.doubleSpin(gui.indentedBox(box), self, 'maxPValue', 1e-8, 1,
                   step=1e-8, label='p:', labelWidth=15,
                   callback=self.FilterAndDisplayGraph,
                   callbackOnReturn=True,
                   tooltip="Max term p-value")

    box = gui.widgetBox(box, "Significance test")

    gui.radioButtonsInBox(box, self, "probFunc", ["Binomial", "Hypergeometric"],
                          tooltips=["Use binomial distribution test",
                                    "Use hypergeometric distribution test"],
                          callback=self.__invalidate)  # TODO: only update the p values
    box = gui.widgetBox(self.filterTab, "Evidence codes in annotation",
                        addSpace=True)
    self.evidenceCheckBoxDict = {}
    for etype in go.evidenceTypesOrdered:
        ecb = QCheckBox(
            etype, toolTip=go.evidenceTypes[etype],
            checked=self.useEvidenceType[etype])
        ecb.toggled.connect(self.__on_evidenceChanged)
        box.layout().addWidget(ecb)
        self.evidenceCheckBoxDict[etype] = ecb

    # Select tab
    self.selectTab = gui.createTabPage(self.tabs, "Select")
    gui.radioButtonsInBox(
        self.selectTab, self, "selectionDirectAnnotation",
        ["Directly or Indirectly", "Directly"],
        box="Annotated genes",
        callback=self.ExampleSelection)

    box = gui.widgetBox(self.selectTab, "Output", addSpace=True)
    gui.radioButtonsInBox(
        box, self, "selectionDisjoint",
        btnLabels=["All selected genes",
                   "Term-specific genes",
                   "Common term genes"],
        tooltips=["Outputs genes annotated to all selected GO terms",
                  "Outputs genes that appear in only one of selected GO terms",
                  "Outputs genes common to all selected GO terms"],
        callback=self.ExampleSelection)

    # ListView for DAG, and table for significant GOIDs
    self.DAGcolumns = ['GO term', 'Cluster', 'Reference',
                       'p-value', 'FDR', 'Genes', 'Enrichment']
    # Derive the column indices from the header list instead of
    # hard-coding them, so they stay correct if columns are reordered.
    p_value_column = self.DAGcolumns.index('p-value')
    enrichment_column = self.DAGcolumns.index('Enrichment')

    self.splitter = QSplitter(Qt.Vertical, self.mainArea)
    self.mainArea.layout().addWidget(self.splitter)

    # list view
    self.listView = GOTreeWidget(self.splitter)
    self.listView.setSelectionMode(QTreeView.ExtendedSelection)
    self.listView.setAllColumnsShowFocus(True)
    self.listView.setColumnCount(len(self.DAGcolumns))
    self.listView.setHeaderLabels(self.DAGcolumns)

    self.listView.header().setSectionsClickable(True)
    self.listView.header().setSortIndicatorShown(True)
    self.listView.header().setSortIndicator(p_value_column, Qt.AscendingOrder)
    self.listView.setSortingEnabled(True)
    self.listView.setItemDelegateForColumn(
        enrichment_column, EnrichmentColumnItemDelegate(self))
    self.listView.setRootIsDecorated(True)

    self.listView.itemSelectionChanged.connect(self.ViewSelectionChanged)

    # table of significant GO terms
    self.sigTerms = QTreeWidget(self.splitter)
    self.sigTerms.setColumnCount(len(self.DAGcolumns))
    self.sigTerms.setHeaderLabels(self.DAGcolumns)
    self.sigTerms.setSortingEnabled(True)
    self.sigTerms.setSelectionMode(QTreeView.ExtendedSelection)
    self.sigTerms.header().setSortIndicator(p_value_column, Qt.AscendingOrder)
    self.sigTerms.setItemDelegateForColumn(
        enrichment_column, EnrichmentColumnItemDelegate(self))

    self.sigTerms.itemSelectionChanged.connect(self.TableSelectionChanged)

    self.sigTableTermsSorted = []
    self.graph = {}
    self.originalGraph = None

    self.inputTab.layout().addStretch(1)
    self.filterTab.layout().addStretch(1)
    self.selectTab.layout().addStretch(1)

    class AnnotationSlot(SimpleNamespace):
        taxid = ...  # type: str
        name = ...  # type: str
        filename = ...  # type:str

        @staticmethod
        def parse_tax_id(f_name):
            # The tax id is the second dot-separated field of the file name.
            return f_name.split('.')[1]

    try:
        remote_files = serverfiles.ServerFiles().listfiles(DOMAIN)
    except (ConnectTimeout, RequestException, ConnectionError):
        # TODO: Warn user about failed connection to the remote server
        remote_files = []

    # Merge remote and local listings, drop duplicates and the ontology
    # file, and parse the tax id only once per annotation file.
    available_annotations = []
    for _, annotation_file in set(remote_files + serverfiles.listfiles(DOMAIN)):
        if annotation_file == FILENAME_ONTOLOGY:
            continue
        tax_id = AnnotationSlot.parse_tax_id(annotation_file)
        available_annotations.append(
            AnnotationSlot(
                taxid=tax_id,
                name=taxonomy.common_taxid_to_name(tax_id),
                filename=FILENAME_ANNOTATION.format(tax_id),
            )
        )
    self.available_annotations = available_annotations

    self._executor = ThreadExecutor()
def assign_annotations(items_sets, available_annotations, data,
                       p_value_fun=PFUN_BINOMIAL,
                       scoring=SCORING_EXP_RATIO):
    """
    The function gets a set of attributes (e.g. genes) for each cell and
    attributes for each annotation. It returns the annotations
    significant for each cell.

    Parameters
    ----------
    items_sets : list of sets
        Set of most important attributes for each item.
    available_annotations : Orange.data.Table
        Available annotations (e.g. cell types)
    data : Orange.data.Table
        Tabular data with gene expressions - we need that to compute
        scores. Its ``attributes`` must contain the ``TAX_ID`` key.
    p_value_fun : str, optional (default=PFUN_BINOMIAL)
        A function that calculates p-value. It can be either
        PFUN_BINOMIAL that uses statistics.Binomial().p_value or
        PFUN_HYPERGEOMETRIC that uses hypergeom.sf.
    scoring : str, optional (default=SCORING_EXP_RATIO)
        Type of scoring

    Returns
    -------
    Orange.data.Table
        Annotation probabilities
    Orange.data.Table
        Annotation fdrs
    """
    assert TAX_ID in data.attributes, "The input table needs to have a " \
                                      "tax_id attribute"
    tax_id = data.attributes[TAX_ID]

    # select function for p-value
    if p_value_fun == PFUN_HYPERGEOMETRIC:
        def p_fun(x, n, m, k):
            # sf(x) is P(X > x), so x - 1 is passed to obtain P(X >= x)
            return hypergeom.sf(x - 1, n, m, k)
    else:
        p_fun = statistics.Binomial().p_value

    # retrieve number of genes for organism
    N = len(GeneInfo(tax_id))

    grouped_annotations_items, genes_celltypes = \
        AnnotateSamples._group_marker_attributes(
            available_annotations,
            [d.attributes.get("Entrez ID") for d in data.domain.attributes])

    def hg_cell(item_attributes):
        # drawn balls - expressed for item; constant across annotations
        k = len(item_attributes)
        p_values = []
        scores = []
        for _, attributes in grouped_annotations_items:
            x = len(item_attributes & attributes)
            m = len(attributes)  # marked balls - items for a process

            # avoid the heavy computation when the intersection is small
            p_values.append(p_fun(x, N, m, k) if x > 2 else 1)

            if scoring == SCORING_EXP_RATIO:
                scores.append(x / (m + 1e-16))

        fdrs = statistics.FDR(p_values)
        if scoring in (SCORING_LOG_FDR, SCORING_LOG_PVALUE):
            scores = AnnotateSamples._scores_fdr(
                fdrs if scoring == SCORING_LOG_FDR else p_values)

        return scores, fdrs

    domain = Domain(
        [ContinuousVariable(ct[0]) for ct in grouped_annotations_items])

    prob_fdrs = [hg_cell(its) for its in items_sets]
    if prob_fdrs:
        probs, fdrs = zip(*prob_fdrs)
    else:
        # With no items zip(*[]) yields nothing and the unpacking would
        # raise ValueError; use empty matrices with one column per
        # annotation instead.
        probs = fdrs = np.empty((0, len(domain.attributes)))

    if scoring == SCORING_MARKERS_SUM:
        probs = AnnotateSamples._scores_markers_sum(data, genes_celltypes)

    probs_table = Table(domain, np.array(probs))
    fdrs_table = Table(domain, np.array(fdrs))

    return probs_table, fdrs_table