def _run(
    data: Table,
    group_by_attrs: List[Variable],
    aggregations: Dict[Variable, Set[str]],
    result: Result,
    state: TaskState,
) -> Result:
    """
    Group ``data`` by ``group_by_attrs`` and aggregate the groups.

    Parameters
    ----------
    data
        Table whose rows are grouped.
    group_by_attrs
        Variables passed to ``data.groupby``.
    aggregations
        For each variable, the names (keys of ``AGGREGATIONS``) of the
        aggregations to compute.
    result
        Result holder. An existing ``result.group_by`` is reused; the
        aggregated table is stored in ``result.result_table``.
    state
        State used to report status, progress and partial results and to
        check whether an interruption was requested.

    Returns
    -------
    The same ``result`` object with ``result_table`` filled in.
    """

    def report(fraction):
        state.set_progress_value(fraction * 100)
        if state.is_interruption_requested():
            # raising from the callback aborts the running aggregation
            raise Exception

    state.set_status("Aggregating")

    # rows are grouped only when no previous grouping is available
    if result.group_by is None:
        result.group_by = data.groupby(group_by_attrs)
    state.set_partial_result(result)

    # pair every requested aggregation name with its function, in the order
    # given by AGGREGATIONS_ORD
    ordered = {}
    for var, names in aggregations.items():
        sorted_names = sorted(names, key=AGGREGATIONS_ORD.index)
        ordered[var] = [
            (name, AGGREGATIONS[name].function) for name in sorted_names
        ]

    aggregate_callback = wrap_callback(report, 0.2, 1)
    result.result_table = result.group_by.aggregate(ordered, aggregate_callback)
    return result
def add_task_to_dispose_queue(task: TaskState):
    """
    Transfer ownership of ``task`` to Qt and delete it after completion.

    The task is re-parented to the ``QApplication`` instance and its
    ``deleteLater`` slot is connected to ``task.watcher.finished``. The task
    must not have a parent when this is called, and all other signals from
    the task should already be disconnected.
    """
    assert task.parent() is None
    task.setParent(QApplication.instance())
    finished = task.watcher.finished
    finished.connect(task.deleteLater)
def worker(data: Table, learner, state: TaskState):
    """
    Fit a sequence of models, each on one attribute fewer than the previous.

    The first model uses all attributes of ``data``. In every following
    step, all subsets of the last model's attributes with one attribute
    removed are fitted, and the candidate with the largest ``log2p`` is
    appended to the trace. The loop ends when the trace is as long as the
    number of attributes in ``data.domain``.

    Parameters
    ----------
    data
        Table with attributes and the survival endpoints.
    learner
        Callable that fits a model on a table; the model must provide
        ``ll_ratio_log2p()`` and ``domain``.
    state
        State used to report progress.

    Returns
    -------
    List of ``Result(log2p, model)`` objects, one per step.
    """
    # No need to check for irregularities, this is done in widget
    time_var, event_var = get_survival_endpoints(data.domain)
    endpoint_names = [time_var.name, event_var.name]

    def fit_cox_models(attrs_combinations):
        fitted = []
        for attrs in attrs_combinations:
            model = learner(data[:, attrs + endpoint_names])
            fitted.append(Result(model.ll_ratio_log2p(), model))
        return fitted

    all_attributes = list(data.domain.attributes)
    progress_steps = iter(np.linspace(0, 100, len(all_attributes)))

    _trace = fit_cox_models([all_attributes])
    while len(_trace) != len(data.domain.attributes):
        remaining = list(_trace[-1].model.domain.attributes)
        if len(remaining) > 1:
            subset_size = len(remaining) - 1
            candidates = [
                list(subset)
                for subset in itertools.combinations(remaining, subset_size)
            ]
        else:
            candidates = [remaining]

        best = max(fit_cox_models(candidates), key=lambda res: res.log2p)
        _trace.append(best)
        state.set_progress_value(next(progress_steps))
    return _trace
def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
    """
    Compute the requested statistics and report each as a partial result.

    Nothing is returned; every statistic is delivered through
    ``state.set_partial_result`` as ``(statistics id, pattern, result)``.

    Parameters
    ----------
    corpus
        The corpus on which the statistics are computed.
    statistics
        Pairs of (statistics id, string pattern) to compute.
    state
        State used to report progress and partial results.
    """
    # one tick per corpus element for every statistic
    n_ticks = len(corpus) * len(statistics)
    ticks = iter(np.linspace(0, 100, n_ticks))

    def advance():
        state.set_progress_value(next(ticks))

    for stat_id, pattern in statistics:
        compute = STATISTICS_FUNCTIONS[stat_id]
        outcome = compute(corpus, pattern, advance)
        if outcome is not None:
            # append the object that can recompute this statistic
            outcome = outcome + (ComputeValue(compute, pattern),)
        state.set_partial_result((stat_id, pattern, outcome))
def _prepare_dir_and_save_images(paths_queue, dir_name, target_size,
                                 previously_saved, state: TaskState):
    """
    Prepare the target directory and save the queued images one by one.

    Parameters
    ----------
    paths_queue
        Queue of ``(from_path, to_path)`` pairs; items are removed with
        ``popleft`` as they are saved.
    dir_name
        Directory that is cleaned before saving when the process starts
        from scratch.
    target_size
        Passed through to ``_save_an_image``.
    previously_saved : int
        Number of images saved in a previous run. It is non-zero when the
        process is resumed, in which case the directory is not cleaned.
    state
        State used to report progress and partial results and to check for
        interruption.

    Returns
    -------
    ``Result`` whose ``paths`` holds the pairs that have not been saved yet.
    """
    result = Result(paths=paths_queue)
    if previously_saved == 0:
        _clean_dir(dir_name)

    total = len(paths_queue) + previously_saved
    image_loader = ImageLoader()
    while result.paths:
        source, destination = result.paths.popleft()
        _save_an_image(image_loader, source, destination, target_size)

        remaining_share = len(result.paths) / total
        state.set_progress_value((1 - remaining_share) * 100)
        state.set_partial_result(result)
        if state.is_interruption_requested():
            break
    return result
def _load_corpus(path: str, data: Table, state: TaskState) -> Corpus:
    """
    Create a corpus from ``data`` or, failing that, from the file at ``path``.

    ``data`` takes precedence when it is truthy. A corpus read from a file
    is named after the file's base name without the extension. ``None`` is
    returned when neither ``data`` nor ``path`` is given.
    """
    state.set_status("Loading")
    if data:
        return Corpus.from_table(data.domain, data)
    if not path:
        return None

    corpus = Corpus.from_file(path)
    stem, _extension = os.path.splitext(os.path.basename(path))
    corpus.name = stem
    return corpus
def run_embedding(
    images: Table,
    file_paths_attr: Variable,
    embedder_name: str,
    state: TaskState,
) -> Result:
    """
    Embed the images and fall back to "squeezenet" on a connection error.

    Parameters
    ----------
    images
        Data table with images to embed.
    file_paths_attr
        The column of the table that holds the image paths.
    embedder_name
        The name of the selected embedder.
    state
        State object used for interruption checks, progress and the
        partial result that announces the fallback.

    Returns
    -------
    The object that holds embedded images, skipped images, and the number
    of skipped images.
    """
    embedder = ImageEmbedder(model=embedder_name)

    all_paths = images[:, file_paths_attr].metas.flatten()
    unknown_mask = all_paths == file_paths_attr.Unknown
    n_valid = all_paths[~unknown_mask].size

    # one progress tick per image with a known path
    ticks = iter(np.linspace(0.0, 100.0, n_valid))

    def advance(success=True):
        if state.is_interruption_requested():
            embedder.set_canceled()
        if success:
            state.set_progress_value(next(ticks))

    def embed():
        # looks up the current `embedder`, so the fallback is used after
        # it has been rebound below
        return embedder(images, col=file_paths_attr, callback=advance)

    try:
        emb, skip, n_skip = embed()
    except EmbeddingConnectionError:
        # recompute ticks to go from the current state to 100
        ticks = iter(np.linspace(next(ticks), 100.0, n_valid))

        state.set_partial_result("squeezenet")
        embedder = ImageEmbedder(model="squeezenet")
        emb, skip, n_skip = embed()

    return Result(embedding=emb, skip_images=skip, num_skipped=n_skip)
def compute_secondary_clusters(embedding: Table, result: Result,
                               state: TaskState):
    """
    Fill ``result.clusters.secondary_table`` and report it.

    When there are cluster groups and an embedding, the third item of each
    group is passed as that group's hull to ``cluster_additional_points``
    (named "hull" after the local variable in the code). Otherwise the
    secondary table is set to ``None``. In both cases the result is sent as
    the partial result ``("secondary_clusters", result)``.
    """
    if result.clusters.groups and embedding:
        state.set_status("Finding secondary clusters...")

        hulls = {}
        for key, group in result.clusters.groups.items():
            hulls[key] = group[2]

        clusters = result.clusters.table
        # keep the falsy `clusters` value itself when there is no table
        domain = clusters.domain["Clusters"] if clusters else clusters
        result.clusters.secondary_table = cluster_additional_points(
            embedding, hulls, domain
        )
    else:
        result.clusters.secondary_table = None

    state.set_partial_result(("secondary_clusters", result))
def _run(
    corpus: Corpus,
    words: List[str],
    scoring_methods: List[str],
    aggregation: str,
    additional_params: dict,
    state: TaskState,
) -> None:
    """
    Score documents against words with every selected scoring method.

    Each method's aggregated scores are reported as the partial result
    ``(method, aggregation, scores)``.

    Parameters
    ----------
    corpus
        Corpus of documents.
    words
        List of words used for scoring.
    scoring_methods
        Keys of ``SCORING_METHODS`` to score documents with.
    aggregation
        Key of ``AGGREGATIONS``; applied along axis 1 of the scores.
    additional_params
        Additional parameters for the scorers (e.g. embedding needs the text
        language). Only those named in a scorer's signature are passed on.
    state
        TaskState for reporting progress and giving partial results.

    Raises
    ------
    Exception
        When no words remain after preprocessing, or (from the progress
        callback) when an interruption was requested.
    """

    def callback(i: float) -> None:
        state.set_progress_value(i * 100)
        if state.is_interruption_requested():
            raise Exception

    # progress is split evenly between preprocessing and each method
    share = 1 / (len(scoring_methods) + 1)

    words = _preprocess_words(corpus, words, wrap_callback(callback, end=share))
    if len(words) == 0:
        raise Exception(
            "Empty word list after preprocessing. Please provide a valid set of words."
        )

    for position, method_name in enumerate(scoring_methods, start=1):
        scorer = SCORING_METHODS[method_name][1]
        accepted = signature(scorer).parameters
        extra = {}
        for key, value in additional_params.items():
            if key in accepted:
                extra[key] = value

        method_callback = wrap_callback(
            callback, start=position * share, end=(position + 1) * share
        )
        scores = scorer(corpus, words, method_callback, **extra)
        scores = AGGREGATIONS[aggregation](scores, axis=1)
        state.set_partial_result((method_name, aggregation, scores))
def run_gene_matcher(gene_matcher: GeneMatcher, state: TaskState):
    """
    Run ``gene_matcher.match_genes()`` while reporting progress.

    A callback is installed as ``gene_matcher._progress_callback``. Every
    call advances the progress by one step out of
    ``len(gene_matcher.genes)``.
    """
    total = len(gene_matcher.genes)
    calls = [0]  # mutable cell shared with the callback

    def callback():
        calls[0] += 1
        state.set_progress_value(100 * (calls[0] / total))

    state.set_status("Working ...")
    gene_matcher._progress_callback = callback
    gene_matcher.match_genes()
def run(gene_sets: GeneSets, selected_gene_sets: List[Tuple[str, ...]], genes,
        state: TaskState) -> Results:
    """
    Build one row of model items for every selected gene set that overlaps
    ``genes``.

    Parameters
    ----------
    gene_sets
        Collection of gene sets; it is processed in sorted order.
    selected_gene_sets
        Hierarchies of the gene sets to include.
    genes
        Genes that are intersected with each gene set (``gene_set.genes &
        genes``).
    state
        State used to report status and progress and to check for
        interruption.

    Returns
    -------
    ``Results`` whose ``items`` is a list of
    ``[count, genes, category, term]`` rows. ``items`` is not set when
    ``genes`` is empty or when the task is interrupted.
    """
    results = Results()
    items = []
    steps = len(gene_sets)

    if not genes:
        return results

    state.set_status('Calculating...')

    # Report progress roughly ten times per run. The interval has to be a
    # positive integer: a fractional interval (steps / 10) makes the modulo
    # test fail for almost every step.
    report_every = max(1, steps // 10)

    for step, gene_set in enumerate(sorted(gene_sets), start=1):
        if step % report_every == 0 or step == steps:
            state.set_progress_value(100 * step / steps)

        if gene_set.hierarchy not in selected_gene_sets:
            continue

        if state.is_interruption_requested():
            return results

        matched_set = gene_set.genes & genes
        if len(matched_set) > 0:
            category_column = QStandardItem()
            term_column = QStandardItem()
            count_column = QStandardItem()
            genes_column = QStandardItem()

            category_column.setData(", ".join(gene_set.hierarchy), Qt.DisplayRole)
            term_column.setData(gene_set.name, Qt.DisplayRole)
            term_column.setData(gene_set.name, Qt.ToolTipRole)

            # there were some cases when the link string was not empty but
            # also not valid (e.g. "_")
            if gene_set.link and urlparse(gene_set.link).scheme:
                term_column.setData(gene_set.link, LinkRole)
                term_column.setForeground(QColor(Qt.blue))

            count_column.setData(matched_set, Qt.UserRole)
            count_column.setData(len(matched_set), Qt.DisplayRole)

            genes_column.setData(len(gene_set.genes), Qt.DisplayRole)
            # store genes to get them on output on selection
            genes_column.setData(set(gene_set.genes), Qt.UserRole)

            items.append([count_column, genes_column, category_column, term_column])

    results.items = items
    return results
def compute_clusters(embedding: Table, result: Result, state: TaskState):
    """
    Fill ``result.clusters`` from ``annotate_projection`` and report it.

    When both the scores table and the embedding are available, the first
    three items returned by ``annotate_projection`` are stored as the
    clusters' ``table``, ``groups`` and ``epsilon``. A previously stored
    epsilon is passed on as ``eps``. Otherwise ``table`` and ``groups`` are
    reset to ``None``. In both cases ``("clusters", result)`` is sent as a
    partial result.
    """
    if result.scores.table and embedding:
        state.set_status("Finding clusters...")

        epsilon = result.clusters.epsilon
        kwargs = {} if epsilon is None else {"eps": epsilon}
        annotated = annotate_projection(result.scores.table, embedding, **kwargs)

        result.clusters.table = annotated[0]
        result.clusters.groups = annotated[1]
        result.clusters.epsilon = annotated[2]
    else:
        result.clusters.table = None
        result.clusters.groups = None

    state.set_partial_result(("clusters", result))
def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
    """
    Count the words of the corpus for the word cloud widget.

    The function is called in a separate thread. Progress is reported on
    the 0-100 scale: 0 at the start, 50 after the bag-of-words lookup and
    100 at the end.

    Parameters
    ----------
    data
        Corpus with the data.
    state
        State used to report status and progress.

    Returns
    -------
    The counts as a ``Counter`` and a boolean that tells whether the counts
    were retrieved on a bag-of-words basis.
    """
    state.set_status("Calculating...")
    state.set_progress_value(0)
    bow_counts = _bow_words(data)
    # halfway: was 0.5, which is half a percent on the 0-100 scale
    state.set_progress_value(50)
    if bow_counts:
        corpus_counter = Counter(bow_counts)
    else:
        # no bag-of-words counts: count the n-grams of every document
        corpus_counter = Counter(w for doc in data.ngrams for w in doc)
    state.set_progress_value(100)
    return corpus_counter, bool(bow_counts)
def run(data: Orange.data.Table, metric: distance, normalized_dist: bool,
        axis: int, state: TaskState) -> Orange.misc.DistMatrix:
    """
    Compute distances on ``data`` with ``metric``.

    Parameters
    ----------
    data
        Input table; ``None`` is returned when it is ``None``.
    metric
        Callable distance; called with ``axis=1 - axis``, ``impute=True``
        and a progress callback.
    normalized_dist
        Request normalization; honoured only when
        ``metric.supports_normalization`` is true.
    axis
        Axis chosen by the caller; its complement is passed to ``metric``.
    state
        State used to report status and progress and to check for
        interruption.

    Raises
    ------
    InterruptException
        From the progress callback, when an interruption was requested.
    """
    if data is None:
        return None

    def report(i: float) -> bool:
        state.set_progress_value(i)
        if state.is_interruption_requested():
            raise InterruptException

    state.set_status("Calculating...")

    kwargs = dict(axis=1 - axis, impute=True, callback=report)
    normalize = metric.supports_normalization and normalized_dist
    if normalize:
        kwargs["normalize"] = True
    return metric(data, **kwargs)
def run_vizrank(compute_score: Callable, states: Iterator,
                scores: List, task: TaskState):
    """
    Score the states one by one and report every step as a partial result.

    Parameters
    ----------
    compute_score
        Callable that returns a score for a state, or ``None`` to skip it.
    states
        Iterator over the states to evaluate.
    scores
        Sorted list of already known scores; it is copied, not modified.
    task
        State used for interruption checks and partial results.

    Returns
    -------
    ``Result`` with a queue of ``QueuedScore`` items and the sorted scores.
    """
    res = Result(queue=Queue(), scores=None)
    known_scores = scores.copy()

    def do_work(st, next_st):
        try:
            value = compute_score(st)
            if value is not None:
                index = bisect_left(known_scores, value)
                queued = QueuedScore(position=index, score=value,
                                     state=st, next_state=next_st)
                res.queue.put_nowait(queued)
                known_scores.insert(index, value)
        except Exception:  # ignore current state in case of any problem
            pass
        res.scores = known_scores.copy()
        task.set_partial_result(res)

    current = None
    upcoming = next(states)
    try:
        while not task.is_interruption_requested():
            # each state is processed together with its successor, so the
            # iterator is always read one state ahead
            current = copy.copy(upcoming)
            upcoming = copy.copy(next(states))
            do_work(current, upcoming)
    except StopIteration:
        # the iterator is exhausted: the last state has no successor
        do_work(current, None)
    return res
def run_vizrank(compute_score: Callable, states: Iterator,
                scores: List, task: TaskState):
    """
    Evaluate all states and publish the growing list of sorted scores.

    Every state is scored with ``compute_score``. A score that is not
    ``None`` is inserted into a sorted copy of ``scores`` and queued
    together with its position, the state and the following state. After
    each state the result is sent through ``task.set_partial_result``.
    Errors raised while scoring or queueing a state are ignored. The
    function returns early when an interruption is requested.

    Returns
    -------
    The ``Result`` holding the queue and the latest copy of the scores.
    """
    res = Result(queue=Queue(), scores=None)
    sorted_scores = scores.copy()

    def do_work(st, next_st):
        try:
            score = compute_score(st)
            if score is not None:
                position = bisect_left(sorted_scores, score)
                res.queue.put_nowait(
                    QueuedScore(position=position, score=score,
                                state=st, next_state=next_st)
                )
                sorted_scores.insert(position, score)
        except Exception:  # ignore current state in case of any problem
            pass
        res.scores = sorted_scores.copy()
        task.set_partial_result(res)

    this_state = None
    following_state = next(states)
    try:
        while not task.is_interruption_requested():
            this_state = copy.copy(following_state)
            following_state = copy.copy(next(states))
            do_work(this_state, following_state)
    except StopIteration:
        # no more states: process the last one without a successor
        do_work(this_state, None)
    return res
def run_download_task(gds_id: str, samples: DefaultDict[str, list],
                      transpose: bool, state: TaskState):
    """
    Download a dataset with ``dataset_download`` and report progress.

    Every call of the progress callback advances the progress by one step
    out of a hard-coded total of 102 steps.

    Returns
    -------
    ``Result`` with the downloaded data stored in ``gds_dataset``.
    """
    res = Result()
    total_steps = 102
    calls = [0]  # mutable cell shared with the callback

    def callback():
        calls[0] += 1
        state.set_progress_value(100 * (calls[0] / total_steps))

    state.set_status("Downloading...")
    dataset = dataset_download(
        gds_id, samples, transpose=transpose, callback=callback
    )
    res.gds_dataset = dataset
    return res
def run_mds(matrix: DistMatrix, max_iter: int, step_size: int, init_type: int,
            embedding: np.ndarray, state: TaskState):
    """
    Run MDS in chunks of ``step_size`` iterations, reporting each chunk.

    Every chunk is initialised with the embedding of the previous one. The
    stress is divided by the sum of the row norms of the embedding when
    that sum is positive. The function returns when ``max_iter`` iterations
    are done, the stress is zero, the stress improves by less than
    ``mds.params["eps"]``, or an interruption is requested.

    Parameters
    ----------
    matrix
        Precomputed dissimilarities.
    max_iter
        Total number of iterations allowed.
    step_size
        Maximal number of iterations per chunk.
    init_type
        Passed to ``MDS`` as ``init_type``.
    embedding
        Initial embedding, passed to ``MDS`` as ``init_data``.
    state
        State used to report status, progress and partial results and to
        check for interruption.

    Returns
    -------
    ``Result`` whose ``embedding`` is the latest embedding.
    """
    res = Result(embedding=embedding)

    iterations_done = 0
    init = embedding
    state.set_status("Running...")
    # `np.float` was an alias of the builtin `float` that NumPy has removed
    oldstress = np.finfo(float).max

    while True:
        step_iter = min(max_iter - iterations_done, step_size)
        mds = MDS(
            dissimilarity="precomputed", n_components=2,
            n_init=1, max_iter=step_iter,
            init_type=init_type, init_data=init
        )

        mdsfit = mds(matrix)
        iterations_done += step_iter

        embedding, stress = mdsfit.embedding_, mdsfit.stress_
        emb_norm = np.sqrt(np.sum(embedding ** 2, axis=1)).sum()
        if emb_norm > 0:
            stress /= emb_norm

        res.embedding = embedding
        state.set_partial_result(res)
        state.set_progress_value(100 * iterations_done / max_iter)

        if iterations_done >= max_iter or stress == 0 or \
                (oldstress - stress) < mds.params["eps"]:
            return res

        # continue the next chunk from the current embedding
        init = embedding
        oldstress = stress
        if state.is_interruption_requested():
            return res
def worker(table: Table, covariates: List, time_var: str, event_var: str,
           state: TaskState):
    """
    Process the covariates in a pool of processes and correct the p-values.

    The table is converted to a data frame and split into batches, each of
    which contains the time and event columns. With more than 50
    covariates there is one interleaved batch per CPU; otherwise there is
    one batch per covariate. Batches are handled by ``batch_to_process`` in
    a process pool. While the pool works, progress advances each time an
    item arrives on the shared queue (presumably one item per processed
    covariate -- verify against ``batch_to_process``). Waiting stops after
    3 seconds without an item or when the progress steps run out.

    Returns
    -------
    Tuple ``(covariate_names, results)``: the first column of the stacked
    batch results, and the remaining columns as floats with a column of
    corrected p-values (``fdrcorrection`` applied to the last column)
    appended.
    """
    with multiprocessing.Manager() as _manager:
        _queue = _manager.Queue()
        n_processes = cpu_count()

        frame = table_to_frame(table, include_metas=False)
        frame = frame.astype({event_var: np.float64})
        endpoints = [time_var, event_var]

        if len(covariates) > 50:
            chunks = [
                covariates[offset::n_processes]
                for offset in range(n_processes)
            ]
            batches = (frame[endpoints + chunk] for chunk in chunks)
        else:
            batches = (frame[endpoints + [cov]] for cov in covariates)

        progress_steps = iter(np.linspace(0, 100, len(covariates)))
        process_batch = partial(batch_to_process, _queue, time_var, event_var)

        with multiprocessing.Pool(processes=n_processes) as pool:
            async_result = pool.map_async(process_batch, batches)

            while True:
                try:
                    state.set_progress_value(next(progress_steps))
                    _queue.get(timeout=3)
                except (queue.Empty, StopIteration):
                    break

            stacked = np.vstack(async_result.get())
            covariate_names = stacked[:, 0]
            stats = stacked[:, 1:].astype(float)

            _, pvals_corrected = fdrcorrection(stats[:, -1], is_sorted=False)
            corrected_column = pvals_corrected.reshape(
                pvals_corrected.shape[0], -1
            )
            stats = np.hstack((stats, corrected_column))
            return covariate_names, stats
def run(data: Table, desc, use_values, task: TaskState) -> Result:
    """
    Construct new variables from ``desc`` and add them to ``data``.

    Primitive variables are appended to the attributes and all others to
    the metas. While the data is transformed, ``mask_exceptions`` of every
    new variable's ``compute_value`` is switched off; it is switched back
    on afterwards, even if the transformation fails.

    Returns
    -------
    ``Result(data, attrs, metas)`` with the transformed table and the new
    attribute and meta variables.

    Raises
    ------
    CancelledError
        When an interruption is requested before or right after the
        variables are constructed.
    """
    if task.is_interruption_requested():
        raise CancelledError  # pragma: no cover
    new_variables = construct_variables(desc, data, use_values)
    # Explicit cancellation point after `construct_variables` which can
    # already run `compute_value`.
    if task.is_interruption_requested():
        raise CancelledError  # pragma: no cover

    attrs, metas = [], []
    for var in new_variables:
        (attrs if var.is_primitive() else metas).append(var)

    domain = data.domain
    new_domain = Orange.data.Domain(
        domain.attributes + tuple(attrs),
        domain.class_vars,
        metas=domain.metas + tuple(metas),
    )

    def set_masking(enabled):
        for variable in new_variables:
            variable.compute_value.mask_exceptions = enabled

    try:
        set_masking(False)
        data = data.transform(new_domain)
    finally:
        set_masking(True)
    return Result(data, attrs, metas)
def __submit(self, testfunc):
    # type: (Callable[[Callable[[float], None]], Results]) -> None
    """
    Start evaluating ``testfunc`` on the executor.

    MUST not be called while an evaluation is pending or running; cancel
    the existing task first.

    Parameters
    ----------
    testfunc : Callable[[Callable[float]], Results])
        A callable that accepts a ``callback`` keyword argument and returns
        a Results instance. The callback is given the finished fraction and
        raises ``UserInterrupt`` once an interruption is requested.
    """
    assert self.__state != State.Running

    # Setup the task
    task = TaskState()

    def report_progress(finished):
        if task.is_interruption_requested():
            raise UserInterrupt()
        task.set_progress_value(100 * finished)

    bound_testfunc = partial(testfunc, callback=report_progress)
    task.start(self.__executor, bound_testfunc)

    task.progress_changed.connect(self.setProgressValue)
    task.watcher.finished.connect(self.__task_complete)

    # outputs are no longer valid while the new evaluation runs
    self.Outputs.evaluations_results.invalidate()
    self.Outputs.predictions.invalidate()

    self.progressBarInit()
    self.setStatusMessage("Running")

    self.__state = State.Running
    self.__task = task
def run(selected_data_transformed: Table, data: Table, result: Result,
        state: TaskState) -> None:
    """
    Fill ``result`` with words, their p-values and FDR values.

    The words are the attribute names of ``selected_data_transformed``.
    The p-values come from ``hypergeom_p_values`` on the ``X`` of both
    tables, and the FDR values from ``FDR`` applied to those p-values.
    Status is updated before each stage.
    """
    state.set_status("Listing words")
    attributes = selected_data_transformed.domain.attributes
    result.words = [attribute.name for attribute in attributes]

    state.set_status("Computing p-values")
    result.p_values = hypergeom_p_values(
        data.X,
        selected_data_transformed.X,
        callback=state.set_progress_value,
    )

    state.set_status("Computing FDR values")
    result.fdr_values = FDR(result.p_values)
def run(data: Table, embedding: Optional[np.ndarray], state: TaskState):
    """
    Simulate a long calculation that produces a two-column embedding.

    In every step the row means of ``data.X`` are paired with random values.
    The computation is repeated ``steps`` times per step on purpose, to
    waste time. After each step the embedding is sent as a partial result,
    unless an interruption was requested, in which case the function
    returns immediately.

    Returns
    -------
    ``Result`` holding the most recently reported embedding.
    """
    res = Result(embedding=embedding)

    # simulate wasteful calculation (increase 'steps')
    steps = 10
    state.set_status("Calculating...")

    for step in range(1, steps + 1):
        for _ in range(steps):
            row_means = np.array(np.mean(data.X, axis=1))
            if row_means.ndim == 2:
                row_means = row_means.ravel()
            noise = np.random.rand(len(row_means))
            embedding = np.vstack((row_means, noise)).T

        if step % (steps / 10) == 0:
            state.set_progress_value(100 * step / steps)

        if state.is_interruption_requested():
            return res

        res.embedding = embedding
        state.set_partial_result(res)
    return res
def run_freeviz(data: Table, projector: FreeViz, state: TaskState):
    """
    Apply the projector repeatedly until the anchors stop changing.

    After every application the projection and the projector are sent as a
    partial result, and the new anchors become the projector's ``initial``
    value. The loop ends when the anchors are close to those of the previous
    iteration (``rtol=1e-5``, ``atol=1e-4``) or when an interruption is
    requested.

    Returns
    -------
    ``Result`` with the projector and the last projection.
    """
    res = Result(projector=projector, projection=None)
    iteration = 0
    previous_anchors = res.projector.components_.T
    state.set_status("Calculating...")

    while True:
        # Needs a copy because projection should not be modified inplace.
        # If it is modified inplace, the widget and the thread hold a
        # reference to the same object. When the thread is interrupted it
        # is still modifying the object, but the widget receives it
        # (the modified object) with a delay.
        projection = res.projector(data)
        res.projection = projection.copy()

        anchors = res.projector.components_.T
        res.projector.initial = anchors
        state.set_partial_result(res)

        converged = np.allclose(previous_anchors, anchors, rtol=1e-5, atol=1e-4)
        if converged:
            return res
        previous_anchors = anchors

        iteration += 1
        state.set_progress_value(100 * iteration / MAX_ITERATIONS)
        if state.is_interruption_requested():
            return res
def run(
    gene_sets: GeneSets, selected_gene_sets: List[Tuple[str, ...]], genes,
    state: TaskState, reference_genes=None
) -> Results:
    """
    Compute enrichment of the selected gene sets and build model rows.

    Parameters
    ----------
    gene_sets
        Collection of gene sets; it is processed in sorted order.
    selected_gene_sets
        Hierarchies of the gene sets to include.
    genes
        Query genes; only those also present in ``reference_genes`` are
        used.
    state
        State used to report status and progress and to check for
        interruption.
    reference_genes
        Reference genes; ``None`` is treated as an empty list.

    Returns
    -------
    ``Results`` whose ``items`` is a list of rows ``[count, reference,
    p-value, fdr, enrichment, genes, category, term]``. The FDR item is
    created empty. ``items`` is not set when ``genes`` is empty or when the
    task is interrupted.
    """
    results = Results()
    rows = []
    step, steps = 0, len(gene_sets)

    def set_progress():
        nonlocal step
        step += 1
        state.set_progress_value(100 * (step / steps))

    if not genes:
        return results

    if reference_genes is None:
        reference_genes = []

    state.set_status('Calculating...')

    display, tooltip, user = Qt.DisplayRole, Qt.ToolTipRole, Qt.UserRole

    for gene_set in sorted(gene_sets):
        set_progress()

        if gene_set.hierarchy not in selected_gene_sets:
            continue

        if state.is_interruption_requested():
            return results

        enrichment = gene_set.set_enrichment(
            reference_genes, genes.intersection(reference_genes)
        )
        if len(enrichment.query) <= 0:
            continue

        category_column = QStandardItem()
        term_column = QStandardItem()
        count_column = QStandardItem()
        genes_column = QStandardItem()
        ref_column = QStandardItem()
        pval_column = QStandardItem()
        fdr_column = QStandardItem()
        enrichment_column = QStandardItem()

        category_column.setData(", ".join(gene_set.hierarchy), display)
        term_column.setData(gene_set.name, display)
        term_column.setData(gene_set.name, tooltip)

        # there were some cases when the link string was not empty but
        # also not valid (e.g. "_")
        if gene_set.link and urlparse(gene_set.link).scheme:
            term_column.setData(gene_set.link, LinkRole)
            term_column.setForeground(QColor(Qt.blue))

        count_column.setData(len(enrichment.query), display)
        count_column.setData(set(enrichment.query), user)

        genes_column.setData(len(gene_set.genes), display)
        # store genes to get them on output on selection
        genes_column.setData(set(gene_set.genes), user)

        ref_column.setData(len(enrichment.reference), display)

        pval_column.setData(enrichment.p_value, display)
        pval_column.setData(enrichment.p_value, tooltip)

        enrichment_column.setData(enrichment.enrichment_score, display)
        enrichment_column.setData(enrichment.enrichment_score, tooltip)

        rows.append(
            [
                count_column,
                ref_column,
                pval_column,
                fdr_column,
                enrichment_column,
                genes_column,
                category_column,
                term_column,
            ]
        )

    results.items = rows
    return results
def run_vizrank(compute_score: Callable, iterate_states: Callable,
                saved_state: Optional[Iterable], scores: List,
                progress: int, state_count: int, task: TaskState):
    """
    Score all states and publish results at a throttled rate.

    Parameters
    ----------
    compute_score
        Callable that returns a score for a state, or ``None`` to skip it.
    iterate_states
        Callable that returns an iterator over states, starting from
        ``saved_state``.
    saved_state
        State from which the iteration is resumed.
    scores
        Sorted list of already known scores; it is copied, not modified.
    progress
        Number of states already processed; used to compute the progress.
    state_count
        Total number of states; values below 1 are treated as 1.
    task
        State used for status, progress, interruption checks and partial
        results.

    Returns
    -------
    ``Result`` with a queue of ``QueuedScore`` items and the sorted scores.
    """
    task.set_status("Getting combinations...")
    task.set_progress_value(0.1)
    states = iterate_states(saved_state)

    task.set_status("Getting scores...")
    res = Result(queue=Queue(), scores=None)
    sorted_scores = scores.copy()
    may_publish = True

    def do_work(st, next_st):
        try:
            score = compute_score(st)
            if score is not None:
                position = bisect_left(sorted_scores, score)
                res.queue.put_nowait(
                    QueuedScore(position=position, score=score,
                                state=st, next_state=next_st)
                )
                sorted_scores.insert(position, score)
        except Exception:  # ignore current state in case of any problem
            pass
        res.scores = sorted_scores.copy()

    def allow_publishing():
        nonlocal may_publish
        may_publish = True

    total = max(1, state_count)
    current = None
    upcoming = next(states)
    try:
        while not task.is_interruption_requested():
            task.set_progress_value(int(progress * 100 / total))
            progress += 1

            current = copy.copy(upcoming)
            upcoming = copy.copy(next(states))
            do_work(current, upcoming)

            # for simple scores (e.g. correlations widget) and many feature
            # combinations, the 'partial_result_ready' signal (emitted by
            # invoking 'task.set_partial_result') was emitted too frequently
            # for a longer period of time and therefore causing the widget
            # being unresponsive
            if may_publish:
                task.set_partial_result(res)
                may_publish = False
                Timer(0.01, allow_publishing).start()
    except StopIteration:
        # the last state has no successor; always publish the final result
        do_work(current, None)
        task.set_partial_result(res)
    return res
def runner(self, state: TaskState) -> Table:
    """
    Return the normalized expression table of the selected collection.

    When ``self.data_table`` is empty, the expressions, sample metadata and
    (optionally) QC data of the collection are downloaded, converted to a
    table, matched against known genes and cached in ``self.data_table``.
    The table is then normalized with ``self.normalize``.

    Parameters
    ----------
    state
        State used to report status and progress and to check for
        interruption.

    Returns
    -------
    The normalized table.

    Raises
    ------
    Exception
        From the download progress callback, when an interruption was
        requested.
    """
    exp_type = self.data_output_options.expression_type[self.exp_type].type
    exp_source = self.data_output_options.expression_sources[self.exp_source]
    proc_slug = self.data_output_options.process[self.proc_slug].slug
    collection_id = self.selected_collection_id
    table = self.data_table

    progress_steps_download = iter(np.linspace(0, 50, 2))

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    if not table:
        collection = self.res.get_collection_by_id(collection_id)
        coll_table = resdk.tables.RNATables(
            collection,
            expression_source=exp_source,
            expression_process_slug=proc_slug,
            # the download is mapped to the first half of the progress bar
            progress_callable=wrap_callback(callback, end=0.5),
        )
        species = coll_table._data[0].output['species']
        sample = coll_table._samples[0]

        state.set_status('Downloading ...')
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
            df_exp = df_exp.rename(index=coll_table.readable_index)
            df_metas = coll_table.meta
            df_metas = df_metas.rename(index=coll_table.readable_index)
            df_qc = None
            if self.append_qc_data:
                # TODO: check if there is a way to detect if collection
                #       table contains QC data
                try:
                    df_qc = coll_table.qc
                    df_qc = df_qc.rename(index=coll_table.readable_index)
                except ValueError:
                    pass
        finally:
            # close the loop even when a download fails or is interrupted
            loop.close()

        state.set_status('To data table ...')

        # field names that appear in more than one section
        duplicates = {
            item
            for item, count in Counter(
                [label.split('.')[1] for label in df_metas.columns.to_list() if '.' in label]
            ).items()
            if count > 1
        }

        # what happens if there is more nested sections?
        section_name_to_label = {
            section['name']: section['label'] for section in sample.descriptor_schema.schema
        }

        column_labels = {}
        for field_schema, _fields, path in iterate_schema(
            sample.descriptor, sample.descriptor_schema.schema, path=''
        ):
            path = path[1:]  # this is ugly, but cant go around it
            if path not in df_metas.columns:
                continue
            label = field_schema['label']
            section_name, field_name = path.split('.')
            # duplicated field names are prefixed with their section label
            column_labels[path] = (
                label
                if field_name not in duplicates
                else f'{section_name_to_label[section_name]} - {label}'
            )

        df_exp = df_exp.reset_index(drop=True)
        df_metas = df_metas.astype('object')
        df_metas = df_metas.fillna(np.nan)
        df_metas = df_metas.replace('nan', np.nan)
        df_metas = df_metas.rename(columns=column_labels)
        if df_qc is not None:
            df_metas = pd.merge(df_metas, df_qc, left_index=True, right_index=True)

        xym, domain_metas = vars_from_df(df_metas)
        x, _, m = xym
        x_metas = np.hstack((x, m))
        attrs = [ContinuousVariable(col) for col in df_exp.columns]
        metas = domain_metas.attributes + domain_metas.metas
        domain = Domain(attrs, metas=metas)
        table = Table(domain, df_exp.to_numpy(), metas=x_metas)
        # NOTE(review): the first value of linspace(0, 50, 2) is 0, so this
        # sets the progress back to 0 after the download -- confirm intent
        state.set_progress_value(next(progress_steps_download))

        state.set_status('Matching genes ...')
        progress_steps_gm = iter(np.linspace(50, 99, len(coll_table.gene_ids)))

        def gm_callback():
            state.set_progress_value(next(progress_steps_gm))

        tax_id = species_name_to_taxid(species)
        gm = GeneMatcher(tax_id, progress_callback=gm_callback)
        table = gm.match_table_attributes(table, rename=True)
        table.attributes[TableAnnotation.tax_id] = tax_id
        table.attributes[TableAnnotation.gene_as_attr_name] = True
        table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
        # cache the table so that it is not downloaded again
        self.data_table = table

    state.set_status('Normalizing ...')
    table = self.normalize(table)
    state.set_progress_value(100)

    return table
def compute_scores(
    data: Table,
    genes: Table,
    p_threshold: float,
    p_value_fun: str,
    scoring: str,
    start: float,
    end: float,
    result: Result,
    state: TaskState,
):
    """
    Fill ``result.scores`` step by step and report it as partial results.

    Without data or genes all score fields are reset to ``None``. Otherwise
    missing z-values and missing annotations/p-values are computed and the
    annotations are filtered by ``p_threshold`` into ``result.scores.table``.
    The function returns early, without the final partial result, when an
    interruption is requested after either of the first two stages.

    Parameters
    ----------
    data, genes
        Input tables; both must be truthy for anything to be computed.
    p_threshold
        Threshold passed to ``filter_annotations``.
    p_value_fun, scoring
        Passed to ``assign_annotations``.
    start, end
        Bounds of the progress range; only their difference scales the
        reported progress values.
    result
        Result holder whose ``scores`` is updated in place.
    state
        State used for status, progress, partial results and interruption
        checks.
    """
    scores = result.scores

    if not data or not genes:
        scores.z_vals = None
        scores.annotations = None
        scores.p_vals = None
        scores.table = None
        state.set_partial_result(("scores", result))
        return

    state.set_status("Computing scores...")
    weights = np.array([15, 75, 10]) * (end - start) / 100

    # stage 1: z-values, computed only when not available yet
    if not scores.z_vals:
        scores.z_vals = AnnotateSamplesMeta.mann_whitney_test(data)
        state.set_partial_result(("scores", result))
    state.set_progress_value(weights[0])
    if state.is_interruption_requested():
        return

    # stage 2: annotations and p-values
    if not scores.annotations or not scores.p_vals:
        annotations, p_values = AnnotateSamplesMeta.assign_annotations(
            scores.z_vals, genes, data,
            p_value_fun=p_value_fun, scoring=scoring
        )
        scores.annotations = annotations
        scores.p_vals = p_values
        state.set_partial_result(("scores", result))
    state.set_progress_value(weights[1])
    if state.is_interruption_requested():
        return

    # stage 3: keep the annotations that pass the threshold
    scores.table = AnnotateSamplesMeta.filter_annotations(
        scores.annotations, scores.p_vals, p_threshold=p_threshold
    )
    state.set_partial_result(("scores", result))
def runner(
    res: ResolweAPI,
    data_objects: List[Data],
    options: DataOutputOptions,
    exp_type: int,
    proc_type: int,
    input_annotation: int,
    state: TaskState,
) -> Table:
    """
    Download expressions of the matching data objects into one table.

    Data objects are filtered by process type, by source, species and build
    of the selected input annotation and, unless the expression type is
    'rc', by expression type. The expressions of each remaining object are
    downloaded and concatenated, sample metadata are added as string metas,
    and the genes are matched.

    Parameters
    ----------
    res
        API object used to download the expressions.
    data_objects
        Candidate data objects.
    options
        Available expression types, processes and input annotations.
    exp_type, proc_type, input_annotation
        Indices into the corresponding lists in ``options``.
    state
        State used to report status and progress.

    Returns
    -------
    Table with expressions, metadata and gene annotations.

    Raises
    ------
    ResolweDataObjectsNotFound
        When no data object passes the filters.
    """
    data_frames = []
    metadata = defaultdict(list)

    def parse_sample_descriptor(sample: Sample) -> None:
        general = sample.descriptor.get('general', {})

        for label in SAMPLE_DESCRIPTOR_LABELS:
            metadata[label].append([general.get(label, '')])

        metadata['sample_name'].append([sample.name])

    exp_type = file_output_field = options.expression[exp_type].type
    proc_type = options.process[proc_type].type
    annotation = options.input_annotation[input_annotation]
    source = annotation.source
    species = annotation.species
    build = annotation.build

    def matches_annotation(obj):
        output = obj.output
        return (
            output['source'] == source
            and output['species'] == species
            and output['build'] == build
        )

    # apply filters
    data_objects = [obj for obj in data_objects if obj.process.type == proc_type]
    data_objects = [obj for obj in data_objects if matches_annotation(obj)]
    if exp_type != 'rc':
        file_output_field = 'exp'
        data_objects = [
            obj for obj in data_objects if obj.output['exp_type'] == exp_type
        ]

    if not data_objects:
        raise ResolweDataObjectsNotFound

    # one step per data object plus three post-processing steps
    step, steps = 0, len(data_objects) + 3

    def set_progress():
        nonlocal step
        step += 1
        state.set_progress_value(100 * (step / steps))

    state.set_status('Downloading ...')
    for data_object in data_objects:
        set_progress()
        parse_sample_descriptor(data_object.sample)
        metadata['expression_type'].append([exp_type.upper()])

        file_name = data_object.output[file_output_field]['file']
        response = res.get_expressions(data_object.id, file_name)

        with io.BytesIO(response.content) as buffer:
            # expressions to data frame
            frame = pd.read_csv(buffer, sep='\t', compression='gzip')
            frame = frame.set_index('Gene').T.reset_index(drop=True)
            data_frames.append(frame)

    state.set_status('Concatenating samples ...')
    df = pd.concat(data_frames, axis=0)

    state.set_status('To data table ...')
    table = table_from_frame(df)
    set_progress()

    state.set_status('Adding metadata ...')
    metas = [StringVariable(label) for label in metadata]
    domain = Domain(table.domain.attributes, table.domain.class_vars, metas)
    table = table.transform(domain)

    for key, value in metadata.items():
        table[:, key] = value
    set_progress()

    state.set_status('Matching genes ...')
    tax_id = species_name_to_taxid(species)
    gm = GeneMatcher(tax_id)
    table = gm.match_table_attributes(table, rename=True)
    table.attributes[TableAnnotation.tax_id] = tax_id
    table.attributes[TableAnnotation.gene_as_attr_name] = True
    table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
    set_progress()

    return table
def worker(self, state: TaskState):
    """
    Publish the result of ``self.update_frame()`` about ten times a second.

    Each result is sent through ``state.set_partial_result``. The loop ends,
    and the function returns ``None``, once an interruption is requested on
    ``state``; without this check the worker could never be stopped.
    """
    while not state.is_interruption_requested():
        state.set_partial_result(self.update_frame())
        time.sleep(1 / 10)