def run_mds(matrix: DistMatrix, max_iter: int, step_size: int, init_type: int,
            embedding: np.ndarray, state: TaskState):
    """
    Run MDS in chunks of at most ``step_size`` iterations.

    After every chunk the current embedding is published as a partial
    result and the progress is updated, so the caller can follow the
    optimization and interrupt it.

    Parameters
    ----------
    matrix
        Distance matrix passed to the fitted ``MDS`` object.
    max_iter
        Total number of iterations allowed over all chunks.
    step_size
        Upper bound on the number of iterations in a single chunk.
    init_type
        Forwarded to ``MDS`` as ``init_type``.
    embedding
        Initial embedding; used as ``init_data`` for the first chunk.
    state
        Task state used for status, progress, partial results and
        interruption requests.

    Returns
    -------
    Result
        Result whose ``embedding`` holds the last computed embedding.
        Returned when the iteration budget is spent, the normalized stress
        is zero, the stress improvement drops below ``mds.params["eps"]``,
        or an interruption is requested.
    """
    res = Result(embedding=embedding)

    iterations_done = 0
    init = embedding
    state.set_status("Running...")
    # `np.float` was only an alias of the builtin `float` and no longer
    # exists in current NumPy releases; the builtin gives the same finfo.
    oldstress = np.finfo(float).max

    while True:
        step_iter = min(max_iter - iterations_done, step_size)
        mds = MDS(
            dissimilarity="precomputed", n_components=2,
            n_init=1, max_iter=step_iter,
            init_type=init_type, init_data=init
        )

        mdsfit = mds(matrix)
        iterations_done += step_iter

        embedding, stress = mdsfit.embedding_, mdsfit.stress_
        # Normalize the stress by the summed row norms of the embedding.
        emb_norm = np.sqrt(np.sum(embedding ** 2, axis=1)).sum()
        if emb_norm > 0:
            stress /= emb_norm

        res.embedding = embedding
        state.set_partial_result(res)
        state.set_progress_value(100 * iterations_done / max_iter)
        if iterations_done >= max_iter or stress == 0 or \
                (oldstress - stress) < mds.params["eps"]:
            return res
        # Continue the next chunk from where this one stopped.
        init = embedding
        oldstress = stress
        if state.is_interruption_requested():
            return res
def _prepare_dir_and_save_images(
        paths_queue, dir_name, target_size, previously_saved,
        state: TaskState):
    """
    Prepare the target directory and save the queued images one by one.

    Each queue item is a ``(from_path, to_path)`` pair. After every saved
    image the progress and a partial result are reported, and the function
    stops early when an interruption is requested.

    Parameters
    ----------
    paths_queue
        Queue of ``(from_path, to_path)`` pairs still to be saved.
    dir_name
        Directory that is cleaned when the process starts from scratch.
    target_size
        Forwarded to the image saving function.
    previously_saved : int
        Number of images saved in the previous process. It is non-zero
        when the process is resumed.
    state
        Task state used for progress, partial results and interruption.

    Returns
    -------
    Result
        Result whose ``paths`` holds the pairs that were not saved yet.
    """
    res = Result(paths=paths_queue)

    # Only a fresh run (nothing saved before) starts from a clean directory.
    if previously_saved == 0:
        _clean_dir(dir_name)

    total = len(paths_queue) + previously_saved
    loader = ImageLoader()

    while res.paths:
        source, destination = res.paths.popleft()
        _save_an_image(loader, source, destination, target_size)

        remaining_share = len(res.paths) / total
        state.set_progress_value((1 - remaining_share) * 100)
        state.set_partial_result(res)
        if state.is_interruption_requested():
            break

    return res
def worker(data: Table, learner, state: TaskState):
    """
    Build a trace of models by repeatedly dropping one attribute.

    The first entry is fitted on all attributes. Each following step fits
    a model for every subset that leaves one attribute out of the previous
    best model and keeps the one with the highest ``log2p``. The loop ends
    when the trace has as many entries as ``data`` has attributes.

    Parameters
    ----------
    data
        Table with attributes and the survival endpoints.
    learner
        Callable that fits a model on a table; the model must provide
        ``ll_ratio_log2p()`` and ``domain``.
    state
        Task state used to report progress.

    Returns
    -------
    list of Result
        One ``Result(log2p, model)`` per step, in the order computed.
    """
    # Irregularities in the data are already checked by the widget.
    time_var, event_var = get_survival_endpoints(data.domain)
    endpoint_names = [time_var.name, event_var.name]

    def fit_cox_models(attrs_combinations):
        fitted = []
        for subset in attrs_combinations:
            model = learner(data[:, subset + endpoint_names])
            fitted.append(Result(model.ll_ratio_log2p(), model))
        return fitted

    all_attrs = list(data.domain.attributes)
    progress_steps = iter(np.linspace(0, 100, len(all_attrs)))

    _trace = fit_cox_models([all_attrs])
    while len(_trace) != len(data.domain.attributes):
        current = list(_trace[-1].model.domain.attributes)

        if len(current) > 1:
            # Every subset with exactly one attribute left out.
            candidates = [
                list(subset)
                for subset in itertools.combinations(current, len(current) - 1)
            ]
        else:
            candidates = [current]

        best = max(fit_cox_models(candidates), key=lambda res: res.log2p)
        _trace.append(best)
        state.set_progress_value(next(progress_steps))

    return _trace
def run_vizrank(compute_score: Callable, iterate_states: Callable,
                saved_state: Optional[Iterable], scores: List,
                progress: int, state_count: int, task: TaskState):
    """
    Score the states produced by ``iterate_states`` and queue the results.

    Every scored state is put on ``res.queue`` together with its insertion
    position in the sorted score list, and ``res.scores`` is refreshed with
    a copy of that list. Partial results are emitted at most once per
    timer interval.

    Parameters
    ----------
    compute_score
        Callable returning a score for a state, or None to skip the state.
    iterate_states
        Callable that returns an iterator over states, starting from
        ``saved_state``.
    saved_state
        State passed to ``iterate_states``.
    scores
        Sorted list of already known scores; it is copied, not modified.
    progress
        Number of states processed so far.
    state_count
        Total number of states, used to compute the progress percentage.
    task
        Task state used for status, progress, partial results and
        interruption.

    Returns
    -------
    Result
        Result with the ``queue`` of new scores and the ``scores`` list.
    """
    task.set_status("Getting combinations...")
    task.set_progress_value(0.1)
    states = iterate_states(saved_state)

    task.set_status("Getting scores...")
    res = Result(queue=Queue(), scores=None)
    scores = scores.copy()
    may_emit_partial = True

    def do_work(st, next_st):
        try:
            score = compute_score(st)
            if score is not None:
                pos = bisect_left(scores, score)
                queued = QueuedScore(
                    position=pos, score=score, state=st, next_state=next_st
                )
                res.queue.put_nowait(queued)
                scores.insert(pos, score)
        except Exception:  # pylint: disable=broad-except
            # deliberately ignore the current state in case of any problem
            pass
        res.scores = scores.copy()

    def reset_flag():
        nonlocal may_emit_partial
        may_emit_partial = True

    current = None
    upcoming = next(states)
    try:
        while not task.is_interruption_requested():
            percent = int(progress * 100 / max(1, state_count))
            task.set_progress_value(percent)
            progress += 1

            current = copy.copy(upcoming)
            # Raises StopIteration once `current` is the last state; it is
            # then scored in the handler below.
            upcoming = copy.copy(next(states))
            do_work(current, upcoming)

            # for simple scores (e.g. correlations widget) and many feature
            # combinations, the 'partial_result_ready' signal (emitted by
            # invoking 'task.set_partial_result') was emitted too frequently
            # for a longer period of time and therefore causing the widget
            # being unresponsive
            if may_emit_partial:
                task.set_partial_result(res)
                may_emit_partial = False
                Timer(0.01, reset_flag).start()
    except StopIteration:
        do_work(current, None)
        task.set_partial_result(res)
    return res
def run(gene_sets: GeneSets, selected_gene_sets: List[Tuple[str, ...]],
        genes, state: TaskState) -> Results:
    """
    Build table rows for the selected gene sets that overlap ``genes``.

    Parameters
    ----------
    gene_sets
        Collection of gene sets; iterated in sorted order.
    selected_gene_sets
        Hierarchies of the gene sets that should be considered.
    genes
        Genes to match against each gene set (intersected with
        ``gene_set.genes``).
    state
        Task state used for status, progress and interruption.

    Returns
    -------
    Results
        When ``genes`` is empty or the task is interrupted, the object is
        returned without ``items`` being assigned. Otherwise ``items`` is a
        list of rows ``[count, genes, category, term]`` of
        ``QStandardItem`` objects, one row per gene set with at least one
        matched gene.
    """
    results = Results()
    items = []
    steps = len(gene_sets)

    if not genes:
        return results

    state.set_status('Calculating...')

    # Report progress roughly every 10 %. An integer interval is used on
    # purpose: a float modulus (`step % (steps / 10)`) only hits zero
    # reliably when `steps` is a multiple of 10.
    report_every = max(1, steps // 10)

    for step, gene_set in enumerate(sorted(gene_sets), start=1):
        if step % report_every == 0:
            state.set_progress_value(100 * step / steps)

        if gene_set.hierarchy not in selected_gene_sets:
            continue

        if state.is_interruption_requested():
            return results

        matched_set = gene_set.genes & genes
        if len(matched_set) > 0:
            category_column = QStandardItem()
            term_column = QStandardItem()
            count_column = QStandardItem()
            genes_column = QStandardItem()

            category_column.setData(
                ", ".join(gene_set.hierarchy), Qt.DisplayRole)
            term_column.setData(gene_set.name, Qt.DisplayRole)
            term_column.setData(gene_set.name, Qt.ToolTipRole)

            # there were some cases when the link string was not empty but
            # also not valid (e.g. "_")
            if gene_set.link and urlparse(gene_set.link).scheme:
                term_column.setData(gene_set.link, LinkRole)
                term_column.setForeground(QColor(Qt.blue))

            count_column.setData(matched_set, Qt.UserRole)
            count_column.setData(len(matched_set), Qt.DisplayRole)

            genes_column.setData(len(gene_set.genes), Qt.DisplayRole)
            # store genes to get them on output on selection
            genes_column.setData(set(gene_set.genes), Qt.UserRole)

            items.append(
                [count_column, genes_column, category_column, term_column])

    results.items = items
    return results
def compute_scores(
    data: Table,
    genes: Table,
    p_threshold: float,
    p_value_fun: str,
    scoring: str,
    start: float,
    end: float,
    result: Result,
    state: TaskState,
):
    """
    Fill ``result.scores`` with z-values, annotations, p-values and table.

    Stages whose output is already present in ``result.scores`` are
    skipped. A ``("scores", result)`` partial result is emitted after each
    computed stage and at the end. When ``data`` or ``genes`` is empty all
    score fields are reset to None. Nothing is returned; on interruption
    the function returns before the remaining stages.

    Parameters
    ----------
    data, genes
        Input tables forwarded to ``AnnotateSamplesMeta``.
    p_threshold
        Threshold forwarded to ``filter_annotations``.
    p_value_fun, scoring
        Options forwarded to ``assign_annotations``.
    start, end
        Progress range; only its width scales the reported values.
    result
        Object whose ``scores`` attribute is updated in place.
    state
        Task state used for status, progress, partial results and
        interruption.
    """
    scores = result.scores
    partial = ("scores", result)

    if not data or not genes:
        scores.z_vals = None
        scores.annotations = None
        scores.p_vals = None
        scores.table = None
        state.set_partial_result(partial)
        return

    state.set_status("Computing scores...")
    # NOTE(review): progress is set to the weight of a single stage, not to
    # `start` plus the cumulative weight - confirm this is intended.
    weights = np.array([15, 75, 10]) * (end - start) / 100

    if not scores.z_vals:
        scores.z_vals = AnnotateSamplesMeta.mann_whitney_test(data)
        state.set_partial_result(partial)
    state.set_progress_value(weights[0])
    if state.is_interruption_requested():
        return

    if not scores.annotations or not scores.p_vals:
        annotations, p_values = AnnotateSamplesMeta.assign_annotations(
            scores.z_vals, genes, data,
            p_value_fun=p_value_fun, scoring=scoring,
        )
        scores.annotations = annotations
        scores.p_vals = p_values
        state.set_partial_result(partial)
    state.set_progress_value(weights[1])
    if state.is_interruption_requested():
        return

    scores.table = AnnotateSamplesMeta.filter_annotations(
        scores.annotations, scores.p_vals, p_threshold=p_threshold,
    )
    state.set_partial_result(partial)
def worker(table: Table, covariates: List, time_var: str, event_var: str,
           state: TaskState):
    """
    Run ``batch_to_process`` over covariate batches in a process pool.

    With more than 50 covariates they are split into one strided batch per
    CPU; otherwise every covariate is its own batch. Each batch is a data
    frame with the time and event columns plus the batch's covariates.
    While the pool works, progress advances one step per item received on
    the shared queue.

    Parameters
    ----------
    table
        Table converted to a data frame (metas excluded).
    covariates
        Names of the covariate columns.
    time_var, event_var
        Names of the time and event columns; the event column is cast to
        float64.
    state
        Task state used to report progress.

    Returns
    -------
    tuple
        ``(covariate_names, results)``: the first column of the stacked
        pool output, and the remaining columns as floats with the
        FDR-corrected values of the last column appended.
    """
    with multiprocessing.Manager() as _manager:
        _queue = _manager.Queue()
        n_procs = cpu_count()

        df = table_to_frame(table, include_metas=False)
        df = df.astype({event_var: np.float64})

        base_columns = [time_var, event_var]
        if len(covariates) > 50:
            strided = [covariates[offset::n_procs] for offset in range(n_procs)]
            batches = (df[base_columns + chunk] for chunk in strided)
        else:
            batches = (df[base_columns + [name]] for name in covariates)

        progress_steps = iter(np.linspace(0, 100, len(covariates)))

        with multiprocessing.Pool(processes=n_procs) as pool:
            process_batch = partial(
                batch_to_process, _queue, time_var, event_var)
            async_result = pool.map_async(process_batch, batches)

            # One progress step per queue item; stop once the steps are
            # used up or nothing arrives on the queue within the timeout.
            for value in progress_steps:
                state.set_progress_value(value)
                try:
                    _queue.get(timeout=3)
                except queue.Empty:
                    break

            stacked_result = np.vstack(async_result.get())
            covariate_names = stacked_result[:, 0]
            stats = stacked_result[:, 1:].astype(float)

            _, pvals_corrected = fdrcorrection(stats[:, -1], is_sorted=False)
            corrected_column = pvals_corrected.reshape(
                pvals_corrected.shape[0], -1)
            stats = np.hstack((stats, corrected_column))
            return covariate_names, stats
def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
    """
    Count words for the word cloud widget.

    This function implements the counting process of the word cloud widget
    and is called in a separate thread. Bag-of-words counts are used when
    ``_bow_words`` returns any; otherwise the corpus ngrams are counted.

    Parameters
    ----------
    data
        Corpus with the data
    state
        State used to report status and progress.

    Returns
    -------
    Counts as a counter, and a boolean that tells whether the counts were
    retrieved on a bag-of-words basis.
    """
    state.set_status("Calculating...")
    state.set_progress_value(0)
    bow_counts = _bow_words(data)
    # Progress is reported as a percentage (0-100), consistent with the
    # other task workers; 0.5 and 1 would show as 0.5 % and 1 %.
    state.set_progress_value(50)
    if bow_counts:
        corpus_counter = Counter(bow_counts)
    else:
        corpus_counter = Counter(w for doc in data.ngrams for w in doc)
    state.set_progress_value(100)
    return corpus_counter, bool(bow_counts)
def run(data: Table, embedding: Optional[np.ndarray], state: TaskState):
    """
    Compute a demo embedding: row means of ``data.X`` against random values.

    The embedding is deliberately recomputed many times to simulate an
    expensive task. After every outer step the progress is updated and the
    latest embedding is published as a partial result, unless an
    interruption was requested.

    Parameters
    ----------
    data
        Table whose ``X`` matrix is averaged over axis 1.
    embedding
        Initial embedding stored in the result.
    state
        Task state used for status, progress, partial results and
        interruption.

    Returns
    -------
    Result
        Result whose ``embedding`` is the last published embedding.
    """
    res = Result(embedding=embedding)

    # simulate wasteful calculation (increase 'steps')
    steps = 10
    state.set_status("Calculating...")

    for step in range(1, steps + 1):
        for _ in range(steps):
            row_means = np.array(np.mean(data.X, axis=1))
            if row_means.ndim == 2:
                row_means = row_means.ravel()
            random_column = np.random.rand(len(row_means))
            embedding = np.vstack((row_means, random_column)).T

        if step % (steps / 10) == 0:
            state.set_progress_value(100 * step / steps)

        # Interruption is checked before publishing, so the embedding of an
        # interrupted step is not stored in the result.
        if state.is_interruption_requested():
            return res

        res.embedding = embedding
        state.set_partial_result(res)

    return res
def run_freeviz(data: Table, projector: FreeViz, state: TaskState):
    """
    Re-run the FreeViz projector until its anchors stop changing.

    After every run the anchors are stored as the projector's ``initial``
    value and the result is published as a partial result. The loop ends
    when consecutive anchors are close (``rtol=1e-5``, ``atol=1e-4``) or
    when an interruption is requested.

    Parameters
    ----------
    data
        Table passed to the projector.
    projector
        Projector that is called on ``data``; its ``components_`` are read
        after each call.
    state
        Task state used for status, progress, partial results and
        interruption.

    Returns
    -------
    Result
        Result with the ``projector`` and a copy of the last ``projection``.
    """
    res = Result(projector=projector, projection=None)
    previous_anchors = res.projector.components_.T
    iteration = 0
    state.set_status("Calculating...")

    while True:
        # Needs a copy because projection should not be modified inplace.
        # If it is modified inplace, the widget and the thread hold a
        # reference to the same object. When the thread is interrupted it
        # is still modifying the object, but the widget receives it
        # (the modified object) with a delay.
        res.projection = res.projector(data).copy()
        anchors = res.projector.components_.T
        res.projector.initial = anchors

        state.set_partial_result(res)
        converged = np.allclose(
            previous_anchors, anchors, rtol=1e-5, atol=1e-4)
        if converged:
            break
        previous_anchors = anchors

        iteration += 1
        state.set_progress_value(100 * iteration / MAX_ITERATIONS)
        if state.is_interruption_requested():
            break

    return res
def runner(self, state: TaskState) -> Table:
    """
    Return the normalized expression table for the selected collection.

    When ``self.data_table`` is empty, the expressions, sample metadata and
    (optionally) QC data are read from the collection, converted to a
    table, its attributes are matched to genes, and the table is cached in
    ``self.data_table``. The cached or new table is then passed through
    ``self.normalize``.

    Parameters
    ----------
    state
        Task state used for status, progress and interruption.

    Returns
    -------
    Table
        The table returned by ``self.normalize``.

    Raises
    ------
    Exception
        Raised by the progress callback when an interruption is requested.
    """
    exp_type = self.data_output_options.expression_type[self.exp_type].type
    exp_source = self.data_output_options.expression_sources[
        self.exp_source]
    proc_slug = self.data_output_options.process[self.proc_slug].slug
    collection_id = self.selected_collection_id

    table = self.data_table
    progress_steps_download = iter(np.linspace(0, 50, 2))

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        # Raising is the only way to abort from inside the callback.
        if state.is_interruption_requested():
            raise Exception

    if not table:
        collection = self.res.get_collection_by_id(collection_id)
        coll_table = resdk.tables.RNATables(
            collection,
            expression_source=exp_source,
            expression_process_slug=proc_slug,
            progress_callable=wrap_callback(callback, end=0.5),
        )
        species = coll_table._data[0].output['species']
        sample = coll_table._samples[0]

        state.set_status('Downloading ...')
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
            df_exp = df_exp.rename(index=coll_table.readable_index)
            df_metas = coll_table.meta
            df_metas = df_metas.rename(index=coll_table.readable_index)
            df_qc = None
            if self.append_qc_data:
                # TODO: check if there is a way to detect if collection
                #       table contains QC data
                try:
                    df_qc = coll_table.qc
                    df_qc = df_qc.rename(index=coll_table.readable_index)
                except ValueError:
                    pass
        finally:
            # Close the loop even when the download fails or is
            # interrupted, so the event loop is not leaked.
            loop.close()

        state.set_status('To data table ...')

        # Field names that appear under more than one section.
        duplicates = {
            item
            for item, count in Counter([
                label.split('.')[1]
                for label in df_metas.columns.to_list()
                if '.' in label
            ]).items()
            if count > 1
        }

        # what happens if there are more nested sections?
        section_name_to_label = {
            section['name']: section['label']
            for section in sample.descriptor_schema.schema
        }

        column_labels = {}
        for field_schema, fields, path in iterate_schema(
                sample.descriptor, sample.descriptor_schema.schema, path=''):
            path = path[1:]  # this is ugly, but cant go around it
            if path not in df_metas.columns:
                continue
            label = field_schema['label']
            section_name, field_name = path.split('.')
            # Duplicated field names are prefixed with the section label.
            column_labels[path] = (
                label
                if field_name not in duplicates
                else f'{section_name_to_label[section_name]} - {label}'
            )

        df_exp = df_exp.reset_index(drop=True)
        df_metas = df_metas.astype('object')
        df_metas = df_metas.fillna(np.nan)
        df_metas = df_metas.replace('nan', np.nan)
        df_metas = df_metas.rename(columns=column_labels)
        if df_qc is not None:
            df_metas = pd.merge(
                df_metas, df_qc, left_index=True, right_index=True)

        xym, domain_metas = vars_from_df(df_metas)
        x, _, m = xym
        x_metas = np.hstack((x, m))
        attrs = [ContinuousVariable(col) for col in df_exp.columns]
        metas = domain_metas.attributes + domain_metas.metas
        domain = Domain(attrs, metas=metas)
        table = Table(domain, df_exp.to_numpy(), metas=x_metas)
        # NOTE(review): the first value of linspace(0, 50, 2) is 0, so
        # this sets progress to 0 - confirm whether 50 was intended.
        state.set_progress_value(next(progress_steps_download))

        state.set_status('Matching genes ...')
        progress_steps_gm = iter(
            np.linspace(50, 99, len(coll_table.gene_ids)))

        def gm_callback():
            state.set_progress_value(next(progress_steps_gm))

        tax_id = species_name_to_taxid(species)
        gm = GeneMatcher(tax_id, progress_callback=gm_callback)
        table = gm.match_table_attributes(table, rename=True)
        table.attributes[TableAnnotation.tax_id] = tax_id
        table.attributes[TableAnnotation.gene_as_attr_name] = True
        table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
        # Cache the table so later runs skip the download.
        self.data_table = table

    state.set_status('Normalizing ...')
    table = self.normalize(table)
    state.set_progress_value(100)

    return table