def aggregate_by_assigned_entity(annotated_chromatograms, delta_rt=0.25):
    aggregated = defaultdict(list)
    finished = []
    log_handle.log("Aggregating Common Entities: %d chromatograms" %
                   (len(annotated_chromatograms, )))
    for chroma in annotated_chromatograms:
        if chroma.composition is not None:
            if chroma.entity is not None:
                aggregated[chroma.entity].append(chroma)
            else:
                aggregated[chroma.composition].append(chroma)
        else:
            finished.append(chroma)
    for entity, group in aggregated.items():
        out = []
        group = sorted(group, key=lambda x: x.start_time)
        chroma = group[0]
        for obs in group[1:]:
            if chroma.chromatogram.overlaps_in_time(obs) or (
                    chroma.end_time - obs.start_time) < delta_rt:
                chroma = chroma.merge(obs)
            else:
                out.append(chroma)
                chroma = obs
        out.append(chroma)
        finished.extend(out)
    log_handle.log("After merging: %d chromatograms" % (len(finished), ))
    return finished
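A minimal call sketch (the input list comes from the upstream annotation stage; `annotated` and the loader name below are placeholders, not part of the library):

# `annotated` is assumed to be a list of chromatogram solutions exposing
# .composition, .entity, .start_time/.end_time and .merge(), as used above.
annotated = load_annotated_chromatograms()  # hypothetical loader
merged = aggregate_by_assigned_entity(annotated, delta_rt=0.25)
log_handle.log("Reduced %d chromatograms to %d" % (len(annotated), len(merged)))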
 def add_solution(self, item):
     case_mass = item.precursor_information.neutral_mass
     if abs(case_mass - self.chromatogram.neutral_mass) > 100:
         log_handle.log(
             "Warning, mis-assigned spectrum match to chromatogram %r, %r" %
             (self, item))
     self.tandem_solutions.append(item)
Example 3
 def add_task(self, task):
     log_handle.log("Received Task %r (%s, %r)" % (task, task.name, task.id))
     context = self.make_task_context(task.name)
     task.update_control_context(context)
     self.task_manager.add_task(task)
     path = self.get_task_path(task.name)
     with open(path, 'wb') as handle:
         dill.dump(task.args[:-1], handle)
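A matching read side would recover the persisted arguments with `dill.load`; this is a sketch assuming the same path convention as above, and the method name is illustrative:

 def load_task_args(self, task_name):
     # Load whatever add_task persisted for this task name (sketch, not library code)
     path = self.get_task_path(task_name)
     with open(path, 'rb') as handle:
         return dill.load(handle)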
Example 4
    def run(self):
        self.loader = MSFileLoader(self.mzml_path)

        if self.start_scan is not None:
            self.loader.start_from_scan(self.start_scan)

        count = 0
        if self.max_scans is None:
            max_scans = float('inf')
        else:
            max_scans = self.max_scans

        end_scan = self.end_scan
        while count < max_scans:
            try:
                batch, ids = self._make_scan_batch()
                if len(batch) > 0:
                    self.queue.put(batch)
                count += len(ids)
                if end_scan in ids or len(ids) == 0:
                    break
            except StopIteration:
                break
            except Exception as e:
                log_handle.error("An error occurred while fetching scans", e)
                break

        if self.no_more_event is not None:
            self.no_more_event.set()
            log_handle.log("All Scan IDs have been dealt. %d scan bunches." %
                           (count, ))
        else:
            self.queue.put(DONE)
Example 5
    def run(self):
        self.loader = MSFileLoader(self.ms_file_path,
                                   huge_tree=huge_tree,
                                   decode_binary=False)

        if self.start_scan is not None:
            try:
                self.loader.start_from_scan(
                    self.start_scan,
                    require_ms1=self.loader.has_ms1_scans(),
                    grouped=True)
            except IndexError as e:
                log_handle.error("An error occurred while locating start scan",
                                 e)
                self.loader.reset()
                self.loader.make_iterator(grouped=True)
            except AttributeError as e:
                log_handle.error(
                    "The reader does not support random access, start time will be ignored",
                    e)
                self.loader.reset()
                self.loader.make_iterator(grouped=True)
        else:
            self.loader.make_iterator(grouped=True)

        count = 0
        last = 0
        if self.max_scans is None:
            max_scans = float('inf')
        else:
            max_scans = self.max_scans

        end_scan = self.end_scan
        while count < max_scans:
            try:
                batch, ids = self._make_scan_batch()
                if len(batch) > 0:
                    self.queue.put(batch)
                count += len(ids)
                if (count - last) > 1000:
                    last = count
                    self.queue.join()
                if (end_scan is not None and end_scan in ids) or len(ids) == 0:
                    log_handle.log("End Scan Found")
                    break
            except StopIteration:
                break
            except Exception as e:
                log_handle.error("An error occurred while fetching scans", e)
                break

        if self.no_more_event is not None:
            self.no_more_event.set()
            log_handle.log("All Scan IDs have been dealt. %d scan bunches." %
                           (count, ))
        else:
            self.queue.put(DONE)
Example 6
 def drain_queue():
     current_work = []
     try:
         while len(current_work) < 300:
             current_work.append(self.queue.get_nowait())
     except QueueEmptyException:
         pass
     if len(current_work) > 5:
         log_handle.log("Drained Write Queue of %d items" % (len(current_work),))
     return current_work
Example 8
 def complete(self):
     self.save()
     self.serializer.complete()
     try:
         self.serializer.format()
     except OSError as e:
         if e.errno == 32:  # errno 32 == EPIPE (broken pipe)
             log_handle.log("Could not reformat the file in-place")
     except Exception:
         import traceback
         traceback.print_exc()
Example 10
    def explore_grid(self):
        if self.network_reduction is None:
            self.network_reduction = self.model.find_threshold_and_lambda(
                rho=DEFAULT_RHO, threshold_step=0.1, fit_tau=True)
        log_handle.log("... Exploring Grid Landscape")
        stack = []
        tau_magnitude = []
        thresholds = []

        for level in self.network_reduction:
            thresholds.append(level.threshold)

        # Pull the distribution slightly to the right
        bias_shift = 1 - (1 / self.threshold_bias)
        # Reduces the influence of the threshold
        bias_scale = self.threshold_bias

        for level in self.network_reduction:
            stack.append(np.array(level.taus).mean(axis=0))
            tau_magnitude.append(
                np.abs(level.optimal_tau).sum() *
                ((level.threshold / bias_scale) + bias_shift))
        tau_magnitude = np.array(tau_magnitude)
        if len(tau_magnitude) == 0:
            # No solutions, so these will be empty
            return GridSearchSolution(stack, tau_magnitude, thresholds,
                                      np.array([]), thresholds)
        elif len(tau_magnitude) <= 2:
            # Too few points for peak picking; wrap the argmax in an array so the
            # boolean indexing below behaves the same as in the general case.
            apex = np.array([np.argmax(tau_magnitude)])
        else:
            apex = peak_indices(tau_magnitude)
            if len(apex) == 0:
                apex = np.array([np.argmax(tau_magnitude)])

        thresholds = np.array(thresholds)
        apex_threshold = tau_magnitude[apex].max() * self.apex_threshold
        if apex_threshold != 0:
            apex = apex[(tau_magnitude[apex] > apex_threshold)]
        else:
            # The tau threshold may be 0, in which case any point will do, but this
            # solution carries no generalization.
            apex = apex[(tau_magnitude[apex] >= apex_threshold)]
        target_thresholds = [t for t in thresholds[apex]]
        solution = GridSearchSolution(stack, tau_magnitude, thresholds, apex,
                                      target_thresholds)
        log_handle.log("... %d Candidate Solutions" %
                       (len(target_thresholds), ))
        return solution
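For reference, the bias terms above amount to a linear reweighting of each level's total |tau|: with b = self.threshold_bias, the factor applied at threshold t is w(t) = t / b + (1 - 1 / b), so w(1) = 1 and w(0) = 1 - 1/b. Larger values of b flatten the dependence on the threshold, which is what the inline comments describe.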
Example 11
 def force_build_indices(self):
     log_handle.log("Building Sample Index")
     self.sample_manager.rebuild()
     log_handle.log("Building Analysis Index")
     self.analysis_manager.rebuild()
     log_handle.log("Building Hypothesis Index")
     self.hypothesis_manager.rebuild()
def smooth_network(network, observed_compositions, threshold_step=0.5, apex_threshold=0.95,
                   belongingness_matrix=None, rho=DEFAULT_RHO, lambda_max=1,
                   include_missing=False, lmbda=None, model_state=None,
                   observation_aggregator=VariableObservationAggregation,
                   belongingness_normalization=NORMALIZATION):
    convert = GlycanCompositionSolutionRecord.from_chromatogram
    observed_compositions = [
        convert(o) for o in observed_compositions if _has_glycan_composition(o)]
    model = GlycomeModel(
        observed_compositions, network,
        belongingness_matrix=belongingness_matrix,
        observation_aggregator=observation_aggregator,
        belongingness_normalization=belongingness_normalization)
    log_handle.log("... Begin Model Fitting")
    if model_state is None:
        reduction = model.find_threshold_and_lambda(
            rho=rho, threshold_step=threshold_step,
            lambda_max=lambda_max)
        if len(reduction) == 0:
            log_handle.log("... No Network Reduction Found")
            return None, None, None
        search = ThresholdSelectionGridSearch(model, reduction, apex_threshold)
        params = search.average_solution(lmbda=lmbda)
    else:
        search = ThresholdSelectionGridSearch(model, None, apex_threshold)
        model_state.reindex(model)
        params = model_state
        if lmbda is not None:
            params.lmbda = lmbda
    log_handle.log("... Projecting Solution Onto Network")
    network = search.annotate_network(params, include_missing=include_missing)

    return network, search, params
    def _worker_loop(self):
        has_work = True
        i = 0

        def drain_queue():
            current_work = []
            try:
                while len(current_work) < 300:
                    current_work.append(self.queue.get_nowait())
            except QueueEmptyException:
                pass
            if len(current_work) > 5:
                log_handle.log("Drained Write Queue of %d items" %
                               (len(current_work), ))
            return current_work

        while has_work:
            try:
                next_bunch = self.queue.get(True, 1)
                if next_bunch == DONE:
                    has_work = False
                    continue
                if self.log_inserts and (i % 100 == 0):
                    log_handle.log("Saving %r" % (next_bunch[0].id, ))
                self._save_bunch(*next_bunch)
                self.commit_counter += 1 + len(next_bunch[1])
                i += 1

                if self.queue.qsize() > 0:
                    current_work = drain_queue()
                    for next_bunch in current_work:
                        if next_bunch == DONE:
                            has_work = False
                        else:
                            if self.log_inserts and (i % 100 == 0):
                                log_handle.log("Saving %r" %
                                               (next_bunch[0].id, ))
                            self._save_bunch(*next_bunch)
                            self.commit_counter += 1 + len(next_bunch[1])
                            i += 1

                if self.commit_counter - self.last_commit_count > self.commit_interval:
                    self.last_commit_count = self.commit_counter
                    log_handle.log(
                        "Syncing Scan Cache To Disk (%d items waiting)" %
                        (self.queue.qsize(), ))
                    self.serializer.commit()
                    if self.serializer.is_sqlite():
                        self.serializer.session.execute(
                            "PRAGMA wal_checkpoint(SQLITE_CHECKPOINT_RESTART);"
                        )
                    self.serializer.session.expunge_all()
            except QueueEmptyException:
                continue
            except Exception as e:
                log_handle.error(
                    "An error occurred while writing scans to disk", e)
        self.serializer.commit()
        self.serializer.session.expunge_all()
Example 14
def smooth_network(network,
                   observed_compositions,
                   threshold_step=0.5,
                   apex_threshold=0.95,
                   belongingness_matrix=None,
                   rho=DEFAULT_RHO,
                   lambda_max=1,
                   include_missing=False,
                   lmbda=None,
                   model_state=None,
                   observation_aggregator=VariableObservationAggregation,
                   belongingness_normalization=NORMALIZATION,
                   annotate_network=True):
    convert = GlycanCompositionSolutionRecord.from_chromatogram
    observed_compositions = [
        convert(o) for o in observed_compositions if _has_glycan_composition(o)
    ]
    model = GlycomeModel(
        observed_compositions,
        network,
        belongingness_matrix=belongingness_matrix,
        observation_aggregator=observation_aggregator,
        belongingness_normalization=belongingness_normalization)
    log_handle.log("... Begin Model Fitting")
    if model_state is None:
        reduction = model.find_threshold_and_lambda(
            rho=rho, threshold_step=threshold_step, lambda_max=lambda_max)
        if len(reduction) == 0:
            log_handle.log("... No Network Reduction Found")
            return None, None, None
        search = ThresholdSelectionGridSearch(model, reduction, apex_threshold)
        params = search.average_solution(lmbda=lmbda)
        if params is None:
            log_handle.log("... No Acceptable Solution. Could not fit model.")
            return None, None, None
    else:
        search = ThresholdSelectionGridSearch(model, None, apex_threshold)
        model_state.reindex(model)
        params = model_state
        if lmbda is not None:
            params.lmbda = lmbda
    if annotate_network:
        log_handle.log("... Projecting Solution Onto Network")
        annotated_network = search.annotate_network(
            params, include_missing=include_missing)
    else:
        annotated_network = None

    return annotated_network, search, params
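A call sketch for the fuller signature above (the argument names are placeholders; DEFAULT_RHO is the module constant already used as the default):

# Hypothetical call site: `glycan_network` stands for the composition network and
# `chromatogram_solutions` for a list convertible by
# GlycanCompositionSolutionRecord.from_chromatogram, as the function expects.
network, search, params = smooth_network(
    glycan_network, chromatogram_solutions,
    rho=DEFAULT_RHO, annotate_network=True)
if network is None:
    log_handle.log("... Smoothing produced no acceptable solution")
else:
    log_handle.log("... Fitted parameters: %r" % (params, ))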
Example 15
 def is_project_resolved(self, ratio=0.5):
     n = 0
     k = 0
     for record in self.hypotheses():
         log_handle.log("Testing %r" % (record, ))
         if record.is_resolvable():
             k += 1
         n += 1
     for record in self.samples():
         log_handle.log("Testing %r" % (record, ))
         if record.is_resolvable():
             k += 1
         n += 1
     for record in self.analyses():
         log_handle.log("Testing %r" % (record, ))
         if record.is_resolvable():
             k += 1
         n += 1
     if n == 0:
         return True
     return k / float(n) > ratio
Example 16
    def merge_common_entities(self,
                              annotated_chromatograms,
                              delta_rt=0.25,
                              require_unmodified=True,
                              threshold_fn=lambda x: x.q_value < 0.05):
        aggregated = defaultdict(list)
        finished = []
        self.log("Aggregating Common Entities: %d chromatograms" %
                 (len(annotated_chromatograms, )))
        for chroma in annotated_chromatograms:
            if chroma.composition is not None:
                if chroma.entity is not None:
                    # Convert to string to avoid redundant sequences from getting
                    # binned differently due to random ordering of ids.
                    aggregated[str(chroma.entity)].append(chroma)
                else:
                    aggregated[str(chroma.composition)].append(chroma)
            else:
                finished.append(chroma)
        for entity, group in aggregated.items():
            out = []
            group = sorted(group, key=lambda x: x.start_time)
            chroma = group[0]
            for obs in group[1:]:
                if chroma.chromatogram.overlaps_in_time(obs) or (
                        chroma.end_time - obs.start_time) < delta_rt:
                    chroma = chroma.merge(obs)
                else:
                    out.append(chroma)
                    chroma = obs
            out.append(chroma)
            finished.extend(out)
        self.log("After merging: %d chromatograms" % (len(finished), ))
        if require_unmodified:
            out = []
            for chromatogram in finished:
                # the structure's best match has not been identified in an unmodified state
                if Unmodified not in chromatogram.mass_shifts:
                    solutions = chromatogram.most_representative_solutions(
                        threshold_fn, reject_shifted=True)
                    # if there is a reasonable solution in an unmodified state
                    if solutions:
                        # select the best solution
                        solutions = sorted(solutions,
                                           key=lambda x: x.score,
                                           reverse=True)

                        # remove the invalidated mass shifts
                        current_shifts = chromatogram.chromatogram.mass_shifts
                        partitions = []
                        for shift in current_shifts:
                            partition, _ = chromatogram.chromatogram.bisect_mass_shift(
                                shift)
                            partitions.append(
                                partition.deduct_node_type(shift))
                        accumulated_chromatogram = partitions[0]
                        for partition in partitions[1:]:
                            accumulated_chromatogram = accumulated_chromatogram.merge(
                                partition)
                        chromatogram.chromatogram = accumulated_chromatogram

                        # update the tandem annotations
                        chromatogram.assign_entity(
                            solutions[0],
                            entity_chromatogram_type=chromatogram.chromatogram.
                            __class__)
                        chromatogram.representative_solutions = solutions
                        out.append(chromatogram)
                    else:
                        log_handle.log(
                            "... Could not find an alternative option for %r" %
                            (chromatogram, ))
                        out.append(chromatogram)
                else:
                    out.append(chromatogram)
            finished = []
            aggregated = defaultdict(list)
            for chroma in out:
                if chroma.composition is not None:
                    if chroma.entity is not None:
                        aggregated[chroma.entity].append(chroma)
                    else:
                        aggregated[chroma.composition].append(chroma)
                else:
                    finished.append(chroma)
            for entity, group in aggregated.items():
                out = []
                group = sorted(group, key=lambda x: x.start_time)
                chroma = group[0]
                for obs in group[1:]:
                    if chroma.chromatogram.overlaps_in_time(obs) or (
                            chroma.end_time - obs.start_time) < delta_rt:
                        chroma = chroma.merge(obs)
                    else:
                        out.append(chroma)
                        chroma = obs
                out.append(chroma)
                finished.extend(out)
        return finished
    def find_threshold_and_lambda(self, rho, lambda_max=1., lambda_step=0.02, threshold_start=0.,
                                  threshold_step=0.2, fit_tau=True, drop_missing=True,
                                  renormalize_belongingness=NORMALIZATION):
        r'''Iterate over score thresholds and smoothing factors (lambda), sampling points
        from the parameter grid and computing the PRESS residual at each point.

        This produces a :class:`NetworkReduction` data structure recording the results for
        later local maximum detection.

        Parameters
        ----------
        rho: float
            The scale of the variance of the observed score
        lambda_max: float
            The maximum value of lambda to consider on the grid
        lambda_step: float
            The size of the change in lambda at each iteration
        threshold_start: float
            The minimum observed score threshold to start the grid search at
        threshold_step: float
            The size of the change in the observed score threshold at each iteration
        fit_tau: bool
            Whether or not to estimate :math:`\tau` for each iteration when computing
            the PRESS
        drop_missing: bool
            Whether or not to remove nodes from the graph which are not observed above
            the threshold, restructuring the graph, which in turn changes the Laplacian.
        renormalize_belongingness: str
            A string constant which names the belongingness normalization technique to
            use.

        Returns
        -------
        :class:`NetworkReduction`:
            The recorded grid of sampled points and snapshots of the model at each point
        '''
        solutions = NetworkReduction()
        limit = max(self.S0)
        start = max(min(self.S0) - 1e-3, threshold_start)
        current_network = self.network.clone()
        thresholds = np.arange(start, limit, threshold_step)
        last_solution = None
        last_raw_observations = None
        last_aggregate = None
        for i_threshold, threshold in enumerate(thresholds):
            if i_threshold % 10 == 0:
                log_handle.log("... Threshold = %r (%0.2f%%)" % (
                    threshold, (100.0 * i_threshold / len(thresholds))))
            # Aggregate the raw observations into averaged, variance reduced records
            # and annotate the network with these new scores
            raw_observations = [c for c in self._observed_compositions if c.score > threshold]

            # cache on the explicit raw observations used because the step size may be smaller than
            # the next highest difference, and aggregating observations can be expensive. There is
            # no solution to the general problem as it calls for inverting a potentially large matrix
            # to only be used in this loop.
            if raw_observations == last_raw_observations:
                observations, summarized_state, obs_ix = last_aggregate
            else:
                agg = self.observation_aggregator(self.network)
                agg.collect(raw_observations)

                observations, summarized_state = agg.build_records()
                obs_ix = agg.observed_indices()
                last_aggregate = (observations, summarized_state, obs_ix)
                last_raw_observations = raw_observations

            variance_matrix = summarized_state.variance_matrix
            inverse_variance_matrix = summarized_state.inverse_variance_matrix
            variance_matrix = np.diag(variance_matrix[obs_ix, obs_ix])
            inverse_variance_matrix = np.diag(inverse_variance_matrix[obs_ix, obs_ix])

            # clear the scores from the network
            current_network = current_network.clone()
            for i, node in enumerate(current_network):
                node.score = 0
            # assign aggregated scores to the network
            network = assign_network(current_network, observations)

            # Filter the network, marking nodes for removal and recording observed
            # nodes for future use.
            obs = []
            missed = []
            for i, node in enumerate(network):
                if node.score < threshold:
                    missed.append(node)
                    node.marked = True
                else:
                    obs.append(node.score)
            if len(obs) == 0:
                break
            obs = np.array(obs)
            press = []

            if drop_missing:
                # drop nodes whose score does not exceed the threshold
                for node in missed:
                    network.remove_node(node, limit=5)

            if last_solution is not None:
                # If after pruning the network, no new nodes have been removed,
                # the optimal solution won't have changed from previous iteration
                # so just reuse the solution
                if last_solution.network == network:
                    current_solution = last_solution.copy()
                    current_solution.threshold = threshold
                    solutions[threshold] = current_solution
                    last_solution = current_solution
                    current_network = network
                    continue
            wpl = weighted_laplacian_matrix(network)
            ident = np.eye(wpl.shape[0])
            lum = LaplacianSmoothingModel(
                network, self.normalized_belongingness_matrix, threshold,
                neighborhood_walker=self.neighborhood_walker,
                belongingness_normalization=renormalize_belongingness,
                variance_matrix=variance_matrix,
                inverse_variance_matrix=inverse_variance_matrix)
            updates = []
            taus = []
            lambda_values = np.arange(0.01, lambda_max, lambda_step)
            for lambd in lambda_values:
                if fit_tau:
                    tau = lum.estimate_tau_from_S0(rho, lambd)
                else:
                    tau = np.zeros(self.A0.shape[1])
                T = lum.optimize_observed_scores(lambd, lum.A0.dot(tau))
                A = ident + lambd * wpl

                H = np.linalg.inv(A)
                diag_H = np.diag(H)
                if len(diag_H) != len(T):
                    diag_H = diag_H[lum.obs_ix]
                    assert len(diag_H) == len(T)

                press_value = sum(
                    ((obs - T) / (1 - (diag_H - np.finfo(float).eps))) ** 2) / len(obs)
                press.append(press_value)
                updates.append(T)
                taus.append(tau)
            current_solution = NetworkTrimmingSearchSolution(
                threshold, lambda_values, np.array(press), network, np.array(obs),
                updates, taus, lum)

            solutions[threshold] = current_solution
            last_solution = current_solution
            current_network = network
        return solutions
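For reference, the press_value computed in the inner loop is the standard leave-one-out (PRESS) shortcut for a linear smoother: with hat matrix H = (I + \lambda L_w)^{-1} (L_w the weighted graph Laplacian) and T the smoothed scores of the observed nodes at that lambda,

    \mathrm{PRESS}(\lambda) = \frac{1}{n} \sum_i \left( \frac{s_i - T_i}{1 - H_{ii}} \right)^2

where s are the observed node scores; the epsilon subtraction in the code only keeps the denominator away from zero when H_ii is numerically 1.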
Example 18
 def validate_indices(self, ratio=0.5):
     with self._data_lock:
         if not self.is_project_resolved(ratio):
             log_handle.log("Rebuilding Project Indices")
             self.force_build_indices()
Example 19
    def find_threshold_and_lambda(self,
                                  rho,
                                  lambda_max=1.,
                                  lambda_step=0.02,
                                  threshold_start=0.,
                                  threshold_step=0.2,
                                  fit_tau=True,
                                  drop_missing=True,
                                  renormalize_belongingness=NORMALIZATION):
        r'''Iterate over score thresholds and smoothing factors (lambda), sampling points
        from the parameter grid and computing the PRESS residual at each point.

        This produces a :class:`NetworkReduction` data structure recording the results for
        later local maximum detection.

        Parameters
        ----------
        rho: float
            The scale of the variance of the observed score
        lambda_max: float
            The maximum value of lambda to consider on the grid
        lambda_step: float
            The size of the change in lambda at each iteration
        threshold_start: float
            The minimum observed score threshold to start the grid search at
        threshold_step: float
            The size of the change in the observed score threshold at each iteration
        fit_tau: bool
            Whether or not to estimate :math:`\tau` for each iteration when computing
            the PRESS
        drop_missing: bool
            Whether or not to remove nodes from the graph which are not observed above
            the threshold, restructuring the graph, which in turn changes the Laplacian.
        renormalize_belongingness: str
            A string constant which names the belongingness normalization technique to
            use.

        Returns
        -------
        :class:`NetworkReduction`:
            The recorded grid of sampled points and snapshots of the model at each point
        '''
        solutions = NetworkReduction()
        limit = max(self.S0)
        start = max(min(self.S0) - 1e-3, threshold_start)
        current_network = self.network.clone()
        thresholds = np.arange(start, limit, threshold_step)
        last_solution = None
        last_raw_observations = None
        last_aggregate = None
        for i_threshold, threshold in enumerate(thresholds):
            if i_threshold % 10 == 0:
                log_handle.log("... Threshold = %r (%0.2f%%)" %
                               (threshold,
                                (100.0 * i_threshold / len(thresholds))))
            # Aggregate the raw observations into averaged, variance reduced records
            # and annotate the network with these new scores
            raw_observations = [
                c for c in self._observed_compositions if c.score > threshold
            ]

            # cache on the explicit raw observations used because the step size may be smaller than
            # the next highest difference, and aggregating observations can be expensive. There is
            # no solution to the general problem as it calls for inverting a potentially large matrix
            # to only be used in this loop.
            if raw_observations == last_raw_observations:
                observations, summarized_state, obs_ix = last_aggregate  # pylint: disable=unpacking-non-sequence
            else:
                agg = self.observation_aggregator(self.network)
                agg.collect(raw_observations)

                observations, summarized_state = agg.build_records()
                obs_ix = agg.observed_indices()
                last_aggregate = (observations, summarized_state, obs_ix)
                last_raw_observations = raw_observations

            # Extract pre-calculated variance matrices
            variance_matrix = summarized_state.variance_matrix
            inverse_variance_matrix = summarized_state.inverse_variance_matrix
            variance_matrix = np.diag(variance_matrix[obs_ix, obs_ix])
            inverse_variance_matrix = np.diag(inverse_variance_matrix[obs_ix,
                                                                      obs_ix])

            # clear the scores from the network
            current_network = current_network.clone()
            for node in current_network:
                node.score = 0
                node.internal_score = 0

            # assign aggregated scores to the network
            network = assign_network(current_network, observations)

            # Filter the network, marking nodes for removal and recording observed
            # nodes for future use.
            obs = []
            missed = []
            for i, node in enumerate(network):
                if node.score < threshold:
                    missed.append(node)
                    node.marked = True
                else:
                    obs.append(node.score)
            if len(obs) == 0:
                break
            obs = np.array(obs)
            press = []

            if drop_missing:
                # drop nodes whose score does not exceed the threshold
                for node in missed:
                    network.remove_node(node, limit=5)

            if last_solution is not None:
                # If after pruning the network, no new nodes have been removed,
                # the optimal solution won't have changed from previous iteration
                # so just reuse the solution
                if last_solution.network == network:
                    current_solution = last_solution.copy()
                    current_solution.threshold = threshold
                    solutions[threshold] = current_solution
                    last_solution = current_solution
                    current_network = network
                    continue
            wpl = weighted_laplacian_matrix(network)
            ident = np.eye(wpl.shape[0])

            # The network passed into LaplacianSmoothingModel will have its indices changed,
            # and will not match the ordering of the belongingness matrix, so make sure the
            # observed indices are aligned.
            lum = LaplacianSmoothingModel(
                network,
                self.normalized_belongingness_matrix[obs_ix, :],
                threshold,
                neighborhood_walker=self.neighborhood_walker,
                belongingness_normalization=renormalize_belongingness,
                variance_matrix=variance_matrix,
                inverse_variance_matrix=inverse_variance_matrix)
            updates = []
            taus = []
            lambda_values = np.arange(0.01, lambda_max, lambda_step)
            for lambd in lambda_values:
                if fit_tau:
                    tau = lum.estimate_tau_from_S0(rho, lambd)
                else:
                    tau = np.zeros(self.A0.shape[1])
                T = lum.optimize_observed_scores(lambd, lum.A0.dot(tau))
                A = ident + lambd * wpl

                H = np.linalg.inv(A)
                diag_H = np.diag(H)
                if len(diag_H) != len(T):
                    diag_H = diag_H[lum.obs_ix]
                    assert len(diag_H) == len(T)

                press_value = sum(
                    ((obs - T) /
                     (1 - (diag_H - np.finfo(float).eps)))**2) / len(obs)
                press.append(press_value)
                updates.append(T)
                taus.append(tau)
            current_solution = NetworkTrimmingSearchSolution(
                threshold, lambda_values, np.array(press), network,
                np.array(obs), updates, taus, lum)

            solutions[threshold] = current_solution
            last_solution = current_solution
            current_network = network
        return solutions
 def log(self, message):
     log_handle.log(message)
Example 22
 def _log(self, message):
     log_handle.log(message)
 def complete(self):
     self.save()
     log_handle.log("Completing Serializer")
     self.serializer.complete()