Example #1
def run_mds(matrix: DistMatrix, max_iter: int, step_size: int, init_type: int,
            embedding: np.ndarray, state: TaskState):
    res = Result(embedding=embedding)

    iterations_done = 0
    init = embedding
    state.set_status("Running...")
    oldstress = np.finfo(float).max  # np.float is removed in NumPy >= 1.24

    while True:
        step_iter = min(max_iter - iterations_done, step_size)
        mds = MDS(
            dissimilarity="precomputed", n_components=2,
            n_init=1, max_iter=step_iter,
            init_type=init_type, init_data=init
        )

        mdsfit = mds(matrix)
        iterations_done += step_iter

        embedding, stress = mdsfit.embedding_, mdsfit.stress_
        emb_norm = np.sqrt(np.sum(embedding ** 2, axis=1)).sum()
        if emb_norm > 0:
            stress /= emb_norm

        res.embedding = embedding
        state.set_partial_result(res)
        state.set_progress_value(100 * iterations_done / max_iter)
        if iterations_done >= max_iter or stress == 0 or \
                (oldstress - stress) < mds.params["eps"]:
            return res
        init = embedding
        oldstress = stress
        if state.is_interruption_requested():
            return res
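The run_* functions in these examples are meant to execute on a worker thread; a widget hands them to a task runner and consumes the Result objects pushed through state.set_partial_result. A minimal sketch of that consuming side, assuming Orange's ConcurrentWidgetMixin API (start, cancel, on_partial_result, on_done) and using illustrative widget/method names and iteration parameters:

from Orange.widgets.widget import OWWidget
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin


class OWExampleMDS(OWWidget, ConcurrentWidgetMixin):  # hypothetical widget
    name = "Example MDS"

    def __init__(self):
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)

    def _start_optimization(self, matrix, init_embedding):
        # stop any previous task, then run run_mds (Example #1) on a worker
        # thread; the framework supplies the TaskState as the last argument
        self.cancel()
        self.start(run_mds, matrix, 300, 25, 0, init_embedding)

    def on_partial_result(self, result):
        # invoked in the GUI thread for every state.set_partial_result(...)
        self._update_plot(result.embedding)

    def on_done(self, result):
        # invoked with the value returned by run_mds
        self._update_plot(result.embedding)

    def _update_plot(self, embedding):
        ...  # hypothetical helper that refreshes the scatter plot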
Example #2
def _run(
    data: Table,
    group_by_attrs: List[Variable],
    aggregations: Dict[Variable, Set[str]],
    result: Result,
    state: TaskState,
) -> Result:
    def progress(part):
        state.set_progress_value(part * 100)
        if state.is_interruption_requested():
            raise Exception

    state.set_status("Aggregating")
    # group table rows
    if result.group_by is None:
        result.group_by = data.groupby(group_by_attrs)
    state.set_partial_result(result)

    aggregations = {
        var: [(agg, AGGREGATIONS[agg].function)
              for agg in sorted(aggs, key=AGGREGATIONS_ORD.index)]
        for var, aggs in aggregations.items()
    }
    result.result_table = result.group_by.aggregate(
        aggregations, wrap_callback(progress, 0.2, 1))
    return result
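Example #2 hands the aggregation step a rescaled callback, wrap_callback(progress, 0.2, 1): the aggregation reports its own 0..1 progress into the 20 %..100 % band, implicitly leaving the first 20 % for grouping. The helper itself is not shown on this page; the sketch below is a re-implementation of the idea for illustration only, with a signature inferred from the calls in Examples #2 and #9, not taken from the library source:

def wrap_callback(callback, start=0.0, end=1.0):
    # Return a callback that maps a progress value from [0, 1] into
    # [start, end], so a sub-step can report into its own slice of the
    # overall progress bar.
    def wrapped(progress):
        callback(start + progress * (end - start))
    return wrapped


# usage matching Example #2 (illustrative): the reported progress reaches
# 0.2 when aggregation starts and 1.0 when it finishes
# result.group_by.aggregate(aggregations, wrap_callback(progress, 0.2, 1))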
Example #3
def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
    """
    This function runs the computation for new features.
    All results are reported as partial results.

    Parameters
    ----------
    corpus
        The corpus on which the computation is performed.
    statistics
        Tuple of statistic pairs to be computed:
        (statistics id, string pattern)
    state
        State used to report progress and partial results.
    """
    # the callback is called once per (corpus element, statistic) pair
    tick_values = iter(np.linspace(0, 100, len(corpus) * len(statistics)))

    def advance():
        state.set_progress_value(next(tick_values))

    for s, pattern in statistics:
        fun = STATISTICS_FUNCTIONS[s]
        result = fun(corpus, pattern, advance)
        if result is not None:
            result = result + (ComputeValue(fun, pattern),)
        state.set_partial_result((s, pattern, result))
Example #4
def run_mds(matrix: DistMatrix, max_iter: int, step_size: int, init_type: int,
            embedding: np.ndarray, state: TaskState):
    res = Result(embedding=embedding)

    iterations_done = 0
    init = embedding
    state.set_status("Running...")
    oldstress = np.finfo(float).max  # np.float is removed in NumPy >= 1.24

    while True:
        step_iter = min(max_iter - iterations_done, step_size)
        mds = MDS(
            dissimilarity="precomputed", n_components=2,
            n_init=1, max_iter=step_iter,
            init_type=init_type, init_data=init
        )

        mdsfit = mds(matrix)
        iterations_done += step_iter

        embedding, stress = mdsfit.embedding_, mdsfit.stress_
        emb_norm = np.sqrt(np.sum(embedding ** 2, axis=1)).sum()
        if emb_norm > 0:
            stress /= emb_norm

        res.embedding = embedding
        state.set_partial_result(res)
        state.set_progress_value(100 * iterations_done / max_iter)
        if iterations_done >= max_iter or stress == 0 or \
                (oldstress - stress) < mds.params["eps"]:
            return res
        init = embedding
        oldstress = stress
        if state.is_interruption_requested():
            return res
Example #5
def _prepare_dir_and_save_images(paths_queue, dir_name, target_size,
                                 previously_saved, state: TaskState):
    """
    This function prepares the directory structure and calls the function
    that saves the images.

    Parameters
    ----------
    previously_saved : int
        Number of saved images in the previous process. If the process is
        resumed it is non-zero.
    """
    res = Result(paths=paths_queue)
    if previously_saved == 0:
        _clean_dir(dir_name)

    steps = len(paths_queue) + previously_saved
    loader = ImageLoader()
    while res.paths:
        from_path, to_path = res.paths.popleft()
        _save_an_image(loader, from_path, to_path, target_size)

        state.set_progress_value((1 - len(res.paths) / steps) * 100)
        state.set_partial_result(res)
        if state.is_interruption_requested():
            return res

    return res
Example #6
def run_embedding(
    images: Table,
    file_paths_attr: Variable,
    embedder_name: str,
    state: TaskState,
) -> Result:
    """
    Run the embedding process

    Parameters
    ----------
    images
        Data table with images to embed.
    file_paths_attr
        The attribute of the table that holds the image file paths.
    embedder_name
        The name of the selected embedder.
    state
        State object used for control and progress reporting.

    Returns
    -------
    The object that holds embedded images, skipped images, and number
    of skipped images.
    """
    embedder = ImageEmbedder(model=embedder_name)

    file_paths = images[:, file_paths_attr].metas.flatten()

    file_paths_mask = file_paths == file_paths_attr.Unknown
    file_paths_valid = file_paths[~file_paths_mask]

    # init progress bar and function
    ticks = iter(np.linspace(0.0, 100.0, file_paths_valid.size))

    def advance(success=True):
        if state.is_interruption_requested():
            embedder.set_canceled()
        if success:
            state.set_progress_value(next(ticks))

    try:
        emb, skip, n_skip = embedder(images,
                                     col=file_paths_attr,
                                     callback=advance)
    except EmbeddingConnectionError:
        # recompute ticks to go from current state to 100
        ticks = iter(np.linspace(next(ticks), 100.0, file_paths_valid.size))

        state.set_partial_result("squeezenet")
        embedder = ImageEmbedder(model="squeezenet")
        emb, skip, n_skip = embedder(images,
                                     col=file_paths_attr,
                                     callback=advance)

    return Result(embedding=emb, skip_images=skip, num_skipped=n_skip)
Example #7
def run_vizrank(compute_score: Callable, iterate_states: Callable,
                saved_state: Optional[Iterable], scores: List, progress: int,
                state_count: int, task: TaskState):
    task.set_status("Getting combinations...")
    task.set_progress_value(0.1)
    states = iterate_states(saved_state)

    task.set_status("Getting scores...")
    res = Result(queue=Queue(), scores=None)
    scores = scores.copy()
    can_set_partial_result = True

    def do_work(st, next_st):
        try:
            score = compute_score(st)
            if score is not None:
                pos = bisect_left(scores, score)
                res.queue.put_nowait(
                    QueuedScore(position=pos,
                                score=score,
                                state=st,
                                next_state=next_st))
                scores.insert(pos, score)
        except Exception:  # ignore current state in case of any problem
            pass
        res.scores = scores.copy()

    def reset_flag():
        nonlocal can_set_partial_result
        can_set_partial_result = True

    state = None
    next_state = next(states)
    try:
        while True:
            if task.is_interruption_requested():
                return res
            task.set_progress_value(int(progress * 100 / max(1, state_count)))
            progress += 1
            state = copy.copy(next_state)
            next_state = copy.copy(next(states))
            do_work(state, next_state)
            # for simple scores (e.g. the correlations widget) and many
            # feature combinations, the 'partial_result_ready' signal
            # (emitted by invoking 'task.set_partial_result') would fire too
            # frequently over a long period, making the widget unresponsive
            if can_set_partial_result:
                task.set_partial_result(res)
                can_set_partial_result = False
                Timer(0.01, reset_flag).start()
    except StopIteration:
        do_work(state, None)
        task.set_partial_result(res)
    return res
Example #8
def compute_secondary_clusters(embedding: Table, result: Result, state: TaskState):
    if not result.clusters.groups or not embedding:
        result.clusters.secondary_table = None
    else:
        state.set_status("Finding secondary clusters...")
        hulls = {k: v[2] for k, v in result.clusters.groups.items()}
        clusters = result.clusters.table
        domain = clusters and clusters.domain["Clusters"]
        table = cluster_additional_points(embedding, hulls, domain)
        result.clusters.secondary_table = table
    state.set_partial_result(("secondary_clusters", result))
Example #9
def _run(
    corpus: Corpus,
    words: List[str],
    scoring_methods: List[str],
    aggregation: str,
    additional_params: dict,
    state: TaskState,
) -> None:
    """
    Perform word scoring with selected scoring methods

    Parameters
    ----------
    corpus
        Corpus of documents
    words
        List of words used for scoring
    scoring_methods
        Methods to score documents with
    aggregation
        Aggregation applied to word scores for each document
    additional_params
        Additional parameters for scoring methods (e.g. embedding needs the
        text language)
    state
        TaskState for reporting the task status and giving partial results
    """
    def callback(i: float) -> None:
        state.set_progress_value(i * 100)
        if state.is_interruption_requested():
            raise Exception

    cb_part = 1 / (len(scoring_methods) + 1)  # +1 for preprocessing

    words = _preprocess_words(corpus, words,
                              wrap_callback(callback, end=cb_part))
    if len(words) == 0:
        raise Exception(
            "Empty word list after preprocessing. Please provide a valid set of words."
        )
    for i, sm in enumerate(scoring_methods):
        scoring_method = SCORING_METHODS[sm][1]
        sig = signature(scoring_method)
        add_params = {
            k: v
            for k, v in additional_params.items() if k in sig.parameters
        }
        scs = scoring_method(
            corpus, words,
            wrap_callback(callback,
                          start=(i + 1) * cb_part,
                          end=(i + 2) * cb_part), **add_params)
        scs = AGGREGATIONS[aggregation](scs, axis=1)
        state.set_partial_result((sm, aggregation, scs))
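The progress budget in Example #9 is split evenly between preprocessing and the scoring methods: cb_part = 1 / (len(scoring_methods) + 1), preprocessing reports into [0, cb_part], and method i into [(i + 1) * cb_part, (i + 2) * cb_part]. A quick check of that arithmetic with illustrative method names:

scoring_methods = ["m1", "m2", "m3"]      # illustrative names
cb_part = 1 / (len(scoring_methods) + 1)  # 0.25
bands = [(0.0, cb_part)] + [
    ((i + 1) * cb_part, (i + 2) * cb_part)
    for i in range(len(scoring_methods))
]
print(bands)  # [(0.0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]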
Example #10
def compute_clusters(embedding: Table, result: Result, state: TaskState):
    if not result.scores.table or not embedding:
        result.clusters.table = None
        result.clusters.groups = None
    else:
        state.set_status("Finding clusters...")
        kwargs = {}
        if result.clusters.epsilon is not None:
            kwargs["eps"] = result.clusters.epsilon
        clusters = annotate_projection(result.scores.table, embedding, **kwargs)
        result.clusters.table = clusters[0]
        result.clusters.groups = clusters[1]
        result.clusters.epsilon = clusters[2]
    state.set_partial_result(("clusters", result))
Example #11
def run(data: Table, embedding: Optional[np.ndarray], state: TaskState):
    res = Result(embedding=embedding)

    # simulate wasteful calculation (increase 'steps')
    step, steps = 0, 10
    state.set_status("Calculating...")
    while step < steps:
        for _ in range(steps):
            x_data = np.array(np.mean(data.X, axis=1))
            if x_data.ndim == 2:
                x_data = x_data.ravel()
            y_data = np.random.rand(len(x_data))
            embedding = np.vstack((x_data, y_data)).T
        step += 1
        if step % (steps / 10) == 0:
            state.set_progress_value(100 * step / steps)

        if state.is_interruption_requested():
            return res

        res.embedding = embedding
        state.set_partial_result(res)
    return res
Example #12
def run(data: Table, embedding: Optional[np.ndarray], state: TaskState):
    res = Result(embedding=embedding)

    # simulate wasteful calculation (increase 'steps')
    step, steps = 0, 10
    state.set_status("Calculating...")
    while step < steps:
        for _ in range(steps):
            x_data = np.array(np.mean(data.X, axis=1))
            if x_data.ndim == 2:
                x_data = x_data.ravel()
            y_data = np.random.rand(len(x_data))
            embedding = np.vstack((x_data, y_data)).T
        step += 1
        if step % (steps / 10) == 0:
            state.set_progress_value(100 * step / steps)

        if state.is_interruption_requested():
            return res

        res.embedding = embedding
        state.set_partial_result(res)
    return res
Example #13
def run_freeviz(data: Table, projector: FreeViz, state: TaskState):
    res = Result(projector=projector, projection=None)
    step, steps = 0, MAX_ITERATIONS
    initial = res.projector.components_.T
    state.set_status("Calculating...")
    while True:
        # Needs a copy because projection should not be modified inplace.
        # If it is modified inplace, the widget and the thread hold a
        # reference to the same object. When the thread is interrupted it
        # is still modifying the object, but the widget receives it
        # (the modified object) with a delay.
        res.projection = res.projector(data).copy()
        anchors = res.projector.components_.T
        res.projector.initial = anchors

        state.set_partial_result(res)
        if np.allclose(initial, anchors, rtol=1e-5, atol=1e-4):
            return res
        initial = anchors

        step += 1
        state.set_progress_value(100 * step / steps)
        if state.is_interruption_requested():
            return res
Example #14
def run_freeviz(data: Table, projector: FreeViz, state: TaskState):
    res = Result(projector=projector, projection=None)
    step, steps = 0, MAX_ITERATIONS
    initial = res.projector.components_.T
    state.set_status("Calculating...")
    while True:
        # Needs a copy because projection should not be modified inplace.
        # If it is modified inplace, the widget and the thread hold a
        # reference to the same object. When the thread is interrupted it
        # is still modifying the object, but the widget receives it
        # (the modified object) with a delay.
        res.projection = res.projector(data).copy()
        anchors = res.projector.components_.T
        res.projector.initial = anchors

        state.set_partial_result(res)
        if np.allclose(initial, anchors, rtol=1e-5, atol=1e-4):
            return res
        initial = anchors

        step += 1
        state.set_progress_value(100 * step / steps)
        if state.is_interruption_requested():
            return res
Example #15
def compute_scores(
    data: Table,
    genes: Table,
    p_threshold: float,
    p_value_fun: str,
    scoring: str,
    start: float,
    end: float,
    result: Result,
    state: TaskState,
):
    if not data or not genes:
        result.scores.z_vals = None
        result.scores.annotations = None
        result.scores.p_vals = None
        result.scores.table = None
    else:
        state.set_status("Computing scores...")
        weights = np.array([15, 75, 10]) * (end - start) / 100

        if not result.scores.z_vals:
            result.scores.z_vals = AnnotateSamplesMeta.mann_whitney_test(
                data)
            state.set_partial_result(("scores", result))
        state.set_progress_value(weights[0])
        if state.is_interruption_requested():
            return

        if not result.scores.annotations or not result.scores.p_vals:
            annot, p_vals = AnnotateSamplesMeta.assign_annotations(
                result.scores.z_vals,
                genes,
                data,
                p_value_fun=p_value_fun,
                scoring=scoring)
            result.scores.annotations = annot
            result.scores.p_vals = p_vals
            state.set_partial_result(("scores", result))
        state.set_progress_value(weights[1])
        if state.is_interruption_requested():
            return

        result.scores.table = AnnotateSamplesMeta.filter_annotations(
            result.scores.annotations,
            result.scores.p_vals,
            p_threshold=p_threshold)

    state.set_partial_result(("scores", result))
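Examples #8, #10, and #15 emit their partial results as a (key, result) pair so the widget can tell which stage just finished. A minimal sketch of the receiving side, assuming the on_partial_result callback of Orange's ConcurrentWidgetMixin; the helper method names are illustrative:

def on_partial_result(self, value):
    # value is the ("scores" / "clusters" / "secondary_clusters", result)
    # tuple pushed by state.set_partial_result in the examples above
    which, result = value
    if which == "scores":
        self._show_scores(result.scores.table)                 # hypothetical
    elif which == "clusters":
        self._show_clusters(result.clusters.table)             # hypothetical
    elif which == "secondary_clusters":
        self._show_clusters(result.clusters.secondary_table)   # hypothetical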
Example #16
def worker(self, state: TaskState):
    while True:
        state.set_partial_result(self.update_frame())
        time.sleep(1 / 10)
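Unlike the other examples, this worker loop never polls state.is_interruption_requested(), so it cannot stop cooperatively. A hedged variant with the same structure plus the interruption check used elsewhere on this page (update_frame is kept from the original; stopping by simply returning is an assumption):

def worker(self, state: TaskState):
    # push a fresh frame roughly ten times per second until the widget
    # requests interruption
    while not state.is_interruption_requested():
        state.set_partial_result(self.update_frame())
        time.sleep(1 / 10)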