def run(self, rdd: RDD) -> RDD:  # type: ignore
    rdd = rdd.cache()
    n_points = rdd.count()
    m = n_points / self.n_partitions
    # Sampling rate chosen so that an average partition-sized chunk of m
    # points contributes about log(n_points * n_partitions) samples in
    # expectation, since optimal_p * m == log(n_points * n_partitions).
    optimal_p = math.log(n_points * self.n_partitions) / m
    rdd = self.assign_buckets(  # type: ignore
        rdd, p=optimal_p, key_func=_label_first_coord_and_type
    )
    rdd = self.sort_and_assign_labels(rdd)  # type: ignore
    return rdd
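# A minimal, self-contained sketch of what a sampling-based bucket
# assignment of the kind ``assign_buckets`` performs could look like, for
# readers following the ``optimal_p`` derivation above.  The real
# implementation lives elsewhere in this class; the names below
# (``assign_buckets_sketch``, its parameters) are illustrative only.
import bisect

def assign_buckets_sketch(rdd: RDD, p: float, key_func, n_partitions: int) -> RDD:
    # Sample records independently with probability p; the sorted sample
    # keys act as splitters bounding n_partitions buckets.
    sample = sorted(
        rdd.sample(withReplacement=False, fraction=p).map(key_func).collect()
    )
    step = max(len(sample) // n_partitions, 1)
    bounds = sample[step::step][:n_partitions - 1]

    # Tag each record with the index of the bucket its key falls into.
    return rdd.map(lambda rec: (bisect.bisect_left(bounds, key_func(rec)), rec))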
def run(
    self, rdd: RDD, key_func: Callable[[Tuple[Any]], Tuple[Any]] = lambda x: x
) -> RDD:  # type: ignore
    rdd = rdd.cache()
    n_points = rdd.count()
    m = n_points / self.n_partitions
    optimal_p = math.log(n_points * self.n_partitions) / m
    rdd = self.assign_buckets(rdd, p=optimal_p, key_func=key_func)  # type: ignore
    rdd = self.sort(rdd, key_func=key_func)  # type: ignore
    return rdd
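# Hedged usage sketch for the generic ``run`` above.  The class name
# ``SampleSorter`` and the record layout are hypothetical, not this
# module's actual API:
#
#     sorter = SampleSorter(n_partitions=8)
#     by_first_coord = sorter.run(points_rdd, key_func=lambda rec: (rec[0],))
#
# With the default identity ``key_func``, records are bucketed and sorted
# by their full value.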
def normal_order(self, terms: RDD, **kwargs):
    """Normal order the terms according to generalized Wick theorem.

    The actual expansion is based on the information given in the
    subclasses by the abstract properties.
    """
    comparator = kwargs.pop('comparator', self.comparator)
    contractor = kwargs.pop('contractor', self.contractor)
    if len(kwargs) != 0:
        raise ValueError('Invalid arguments to Wick normal order', kwargs)

    phase = self.phase
    symms = self.symms
    resolvers = self.resolvers

    terms.cache()
    terms_to_proc = terms.filter(lambda x: len(x.vecs) > 1)
    keep_top = 0 if comparator is None else 1
    terms_to_keep = terms.filter(lambda x: len(x.vecs) <= keep_top)
    terms_to_proc.cache()
    if terms_to_proc.count() == 0:
        return terms_to_keep

    # Triples: term, contractions, schemes.
    wick_terms = terms_to_proc.map(lambda x: _prepare_wick(
        x, comparator, contractor, symms.value, resolvers.value
    ))

    if self._wick_parallel == 0:
        # Expand every term's contraction schemes within the same task.
        normal_ordered = wick_terms.flatMap(lambda x: [
            _form_term_from_wick(x[0], x[1], phase, resolvers.value, i)
            for i in x[2]
        ])
    elif self._wick_parallel == 1:
        # Flatten the schemes first so the expansion work can be
        # rebalanced across partitions before the terms are formed.
        flattened = wick_terms.flatMap(
            lambda x: [(x[0], x[1], i) for i in x[2]]
        )
        if self._num_partitions is not None:
            flattened = flattened.repartition(self._num_partitions)
        normal_ordered = flattened.map(lambda x: _form_term_from_wick(
            x[0], x[1], phase, resolvers.value, x[2]
        ))
    elif self._wick_parallel == 2:
        # This level of parallelism is reserved for really hard problems.
        expanded = []
        for term, contrs, schemes in wick_terms.collect():
            # To work around a probable Spark bug.  Problem occurs when we
            # have closures inside a loop to be distributed out.
            form_term = functools.partial(
                _form_term_from_wick_bcast, term, contrs, phase, resolvers
            )
            curr = self._ctx.parallelize(schemes).map(form_term)
            expanded.append(curr)
        normal_ordered = self._ctx.union(expanded)
    else:
        raise ValueError(
            'Invalid Wick expansion parallel level', self._wick_parallel
        )

    return terms_to_keep.union(normal_ordered)
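# The level-2 branch above binds ``term`` and ``contrs`` with
# ``functools.partial`` rather than a closure.  One plausible reading of
# the "probable Spark bug" comment is Python's late-binding closures:
# lambdas created in a loop capture variables by reference, so by the time
# they are serialized and run they may all see the last iteration's
# values.  ``partial`` snapshots the arguments at creation time.  A
# minimal standalone illustration of the pitfall (plain Python, no Spark):
import functools

def _late_binding_demo():
    items = [1, 2, 3]
    # Buggy: every lambda closes over the same ``item`` cell.
    closures = [lambda: item for item in items]
    # Fixed: ``partial`` binds the current value eagerly, mirroring the
    # ``form_term`` construction in the level-2 branch.
    partials = [functools.partial(lambda x: x, item) for item in items]
    assert [f() for f in closures] == [3, 3, 3]
    assert [f() for f in partials] == [1, 2, 3]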