def naive_multiplication_rdd(mat_a: pyspark.RDD, mat_b: pyspark.RDD, is_triangle=False):
    """
    Multiply two coordinate matrices given as RDDs of MatrixEntry records.

    :param mat_a: the left matrix
    :param mat_b: the right matrix
    :param is_triangle: if True, mat_a stores only one triangle of a symmetric
        matrix and is mirrored (averaging duplicate entries) before the join
    :return: RDD of MatrixEntry records of the product
    """
    if is_triangle:
        left_rdd = (
            mat_a.flatMap(lambda x: [((x.j, x.i), x.value), ((x.i, x.j), x.value)])
            .aggregateByKey(zeroValue=(0.0, 0.0),
                            seqFunc=lambda x, y: (x[0] + y, x[1] + 1),
                            combFunc=lambda x, y: (x[0] + y[0], x[1] + y[1]))
            .mapValues(lambda x: x[0] / x[1])
            .map(lambda x: (x[0][0], (x[0][1], x[1])))
        )
    else:
        left_rdd = mat_a.map(lambda x: (x.j, (x.i, x.value)))
    right_rdd = mat_b.map(lambda x: (x.i, (x.j, x.value)))
    combined_rdd = (left_rdd.join(right_rdd).map(lambda x: x[1])
                    .map(lambda x: ((x[0][0], x[1][0]), x[0][1] * x[1][1]))
                    .reduceByKey(lambda x, y: x + y)
                    .map(lambda x: distributed.MatrixEntry(i=x[0][0], j=x[0][1], value=x[1]))
                    )
    return combined_rdd
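# Hedged usage sketch for naive_multiplication_rdd: the entry values and the
# SparkContext setup are hypothetical, but the MatrixEntry records match what
# the function expects and returns.
from pyspark import SparkContext
from pyspark.mllib.linalg import distributed

sc = SparkContext.getOrCreate()
# A = [[1, 2], [3, 4]] and B = [[5, 6], [7, 8]] as coordinate entries.
mat_a = sc.parallelize([
    distributed.MatrixEntry(0, 0, 1.0), distributed.MatrixEntry(0, 1, 2.0),
    distributed.MatrixEntry(1, 0, 3.0), distributed.MatrixEntry(1, 1, 4.0),
])
mat_b = sc.parallelize([
    distributed.MatrixEntry(0, 0, 5.0), distributed.MatrixEntry(0, 1, 6.0),
    distributed.MatrixEntry(1, 0, 7.0), distributed.MatrixEntry(1, 1, 8.0),
])
product = naive_multiplication_rdd(mat_a, mat_b)
# Expected entries of A @ B: (0, 0, 19.0), (0, 1, 22.0), (1, 0, 43.0), (1, 1, 50.0)
print(sorted(product.map(lambda e: (e.i, e.j, e.value)).collect()))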
def extract_claims(wikidata_items: RDD, b_property_map: Broadcast, b_item_map: Broadcast):
    def parse_item_claims(item):
        item_id = item['id']
        item_map = b_item_map.value
        if item_id not in item_map:
            return []
        item_label = item_map[item_id]
        property_map = b_property_map.value
        if 'enwiki' in item['sitelinks']:
            title = item['sitelinks']['enwiki']['title']
        else:
            title = None
        item_claims = []
        for property_id, property_claims in item['claims'].items():
            if property_id not in property_map:
                continue
            property_name = property_map[property_id]
            for claim in property_claims:
                mainsnak = claim['mainsnak']
                if 'datatype' in mainsnak and 'datavalue' in mainsnak:
                    datatype = mainsnak['datatype']
                    datavalue = mainsnak['datavalue']
                    if datatype in datatype_parsers:
                        wiki_object = datatype_parsers[datatype](datavalue)
                        if wiki_object is not None:
                            item_claims.append(
                                Claim(item_label, property_name, wiki_object,
                                      datatype, title, property_id, item_id))
        return item_claims

    return wikidata_items.flatMap(parse_item_claims)
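# Hedged wiring sketch for extract_claims: the maps below are tiny hypothetical
# stand-ins for the broadcast property/item dictionaries, the sample entity is a
# heavily trimmed Wikidata item, and whether its claim survives parsing depends
# on the module-level `datatype_parsers` registry and `Claim` record used above.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
b_property_map = sc.broadcast({'P31': 'instance of'})
b_item_map = sc.broadcast({'Q42': 'Douglas Adams'})
wikidata_items = sc.parallelize([{
    'id': 'Q42',
    'sitelinks': {'enwiki': {'title': 'Douglas Adams'}},
    'claims': {'P31': [{'mainsnak': {'datatype': 'wikibase-item',
                                     'datavalue': {'type': 'wikibase-entityid',
                                                   'value': {'id': 'Q5'}}}}]},
}])
print(extract_claims(wikidata_items, b_property_map, b_item_map).collect())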
def train_model(data: RDD, l=1.0) -> MLNaiveBayesModel:
    aggregated = data.flatMap(lambda x: [(label, x['features']) for label in x['labels']]) \
        .combineByKey(lambda v: (1, v),
                      lambda c, v: (c[0] + 1, c[1] + v),
                      lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])) \
        .sortBy(lambda x: x[0]) \
        .collect()

    num_labels = len(aggregated)
    num_documents = data.count()
    num_features = aggregated[0][1][1].size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggregated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        # Flatten the summed term-frequency row to a plain 1-D array.
        sum_term_freq_dense = sum_term_freq.toarray().ravel()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i, :] = np.log(sum_term_freq_dense + l) - theta_log_denom
        i += 1
    return MLNaiveBayesModel(labels, pi, theta)
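# Hedged usage sketch for train_model: the documents are made up, the features
# are assumed to be SciPy CSR row vectors (anything supporting +, .sum(), .size
# and .toarray() would do), and MLNaiveBayesModel comes from the surrounding module.
from pyspark import SparkContext
from scipy.sparse import csr_matrix

sc = SparkContext.getOrCreate()
docs = [
    {'labels': [0], 'features': csr_matrix([[2.0, 1.0, 1.0]])},
    {'labels': [1], 'features': csr_matrix([[1.0, 1.0, 3.0]])},
    {'labels': [0, 1], 'features': csr_matrix([[1.0, 2.0, 1.0]])},
]
model = train_model(sc.parallelize(docs), l=1.0)
# The returned model wraps the labels, the log class priors (pi) and the
# log term likelihoods (theta).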
def extract_item_page_map(wikidata_items: RDD):
    def parse_item_page(item):
        item_id = item['id']
        if 'enwiki' in item['sitelinks']:
            return [(item_id, item['sitelinks']['enwiki']['title'])]
        else:
            return []

    return wikidata_items.flatMap(parse_item_page).collectAsMap()
def extract_claim_types(wikidata_items: RDD):
    def parse_types(item):
        value_types = []
        for property_claims in item["claims"].values():
            for c in property_claims:
                mainsnak = c["mainsnak"]
                if "datatype" in mainsnak:
                    value_types.append(mainsnak["datatype"])
        return value_types

    return set(wikidata_items.flatMap(parse_types).distinct().collect())
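# Hedged usage sketch for the two extractors above: the entity is a hypothetical,
# heavily trimmed Wikidata item, but it carries the keys the parsers read
# ('id', 'sitelinks', 'claims', 'mainsnak', 'datatype').
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
wikidata_items = sc.parallelize([{
    'id': 'Q42',
    'sitelinks': {'enwiki': {'title': 'Douglas Adams'}},
    'claims': {'P31': [{'mainsnak': {'datatype': 'wikibase-item'}}]},
}])
print(extract_item_page_map(wikidata_items))  # {'Q42': 'Douglas Adams'}
print(extract_claim_types(wikidata_items))    # {'wikibase-item'}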
def __group_by_blocks(train: RDD, test: RDD, similarity: RDD, n_bucket_block: int,
                      n_cross_block: int, n_item_block: int) -> (RDD, RDD, RDD):
    """
    :param train:      RDD<(cross_block, item_block), (cross_bucket, item, rating)>
    :param test:       RDD<(bucket_block, item_block), (bucket, item)>
    :param similarity: RDD<(bucket_block, cross_block), (bucket, cross_bucket, similarity)>

    After tagging and replication:
    train      -> RDD<(b, cross_block, item_block), ((cross_bucket, item, rating), 0)>,         b = 0, ..., n_bucket_block - 1
    test       -> RDD<(bucket_block, c, item_block), ((bucket, item), 1)>,                      c = 0, ..., n_cross_block - 1
    similarity -> RDD<(bucket_block, cross_block, i), ((bucket, cross_bucket, similarity), 2)>, i = 0, ..., n_item_block - 1
    """
    train = train.flatMap(lambda u: (
        ((b, u[0][0], u[0][1]), (u[1], 0)) for b in range(n_bucket_block)))
    test = test.flatMap(lambda u: (
        ((u[0][0], c, u[0][1]), (u[1], 1)) for c in range(n_cross_block)))
    similarity = similarity.flatMap(lambda u: (
        ((u[0][0], u[0][1], i), (u[1], 2)) for i in range(n_item_block)))
    return train, test, similarity
def __find_candidates(self, signature: RDD) -> RDD:
    """
    Generate candidates from signatures.

    :param signature: RDD<(Hashable, tuple<int>)>
    :return: RDD<(Hashable, Hashable)>
        Item pairs which are candidates for computing similarity later.
    """
    divide = self.__divide_signature
    generate = self.__generate_candidates
    n_bands, n_rows = self.__n_bands, self.__n_rows
    return signature.flatMap(
        lambda key_values: divide(*key_values, n_bands=n_bands, n_rows=n_rows)
    ).aggregateByKey(
        tuple(),
        lambda u, v: u + (v,),
        lambda u1, u2: u1 + u2
    ).flatMap(
        lambda key_values: generate(key_values[1])
    ).distinct()
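# Hypothetical sketch of the two private helpers used above, following the usual
# LSH banding scheme; the project's real __divide_signature/__generate_candidates
# may differ, this only illustrates the shapes the pipeline expects.
from itertools import combinations

def divide_signature(key, signature, n_bands, n_rows):
    # One ((band index, band slice), item key) pair per band, so items that agree
    # on an entire band collide under aggregateByKey.
    return [((band, signature[band * n_rows:(band + 1) * n_rows]), key)
            for band in range(n_bands)]

def generate_candidates(keys):
    # All unordered pairs of items that collided in at least one band
    # (keys are assumed orderable so mirrored pairs collapse under distinct()).
    return combinations(sorted(keys), 2)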
def _nest_bind_no_balance(rdd: RDD, func):
    """Nest the flat map of the given function without load balancing."""
    def wrapped(obj):
        """Wrapped function for nest bind."""
        curr = [obj]
        res = []
        while len(curr) > 0:
            new_curr = []
            for i in curr:
                step_res = func(i)
                if step_res is None:
                    # `func` is done with this element; keep it as a result.
                    res.append(i)
                else:
                    # Otherwise expand it and keep iterating on the children.
                    new_curr.extend(step_res)
            curr = new_curr
        return res

    return rdd.flatMap(wrapped)
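# Hedged usage example for _nest_bind_no_balance: the splitting function below is
# made up; it only shows the contract (return None to keep an element as a final
# result, or return an iterable of elements to keep expanding).
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

def split_range(interval):
    lo, hi = interval
    if hi - lo <= 1:
        return None                    # small enough: keep as a leaf
    mid = (lo + hi) // 2
    return [(lo, mid), (mid, hi)]      # otherwise keep expanding

leaves = _nest_bind_no_balance(sc.parallelize([(0, 4)]), split_range)
print(sorted(leaves.collect()))        # [(0, 1), (1, 2), (2, 3), (3, 4)]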
def __call__(self, rdd: RDD, **kwargs: Any) -> RDD:
    """
    Performs a single step of an algorithm, running all operations in sequence
    and ensuring data is partitioned correctly.

    Any additional keyword arguments passed to this function will be available
    in all life-cycle functions of the step:

    - `group`
    - `emit_by_group`
    - `broadcast`
    - `step`

    **DO NOT OVERRIDE WHEN DEFINING CUSTOM STEPS.**
    """
    if rdd.getNumPartitions() != self._n_partitions:
        rdd = rdd.repartition(self._n_partitions)

    step_cls: Type[Step] = self.__class__
    rdd = step_cls.group(
        rdd, **kwargs
    ).cache()  # cache because we use it twice (emit and step)

    def unwrap_emit(kv: Tuple[Any, Iterable[Any]]) -> Optional[Tuple[Any, Any]]:
        k, v = kv
        new_v = step_cls.emit_by_group(k, v, **kwargs)
        return new_v

    emitted = list(rdd.map(unwrap_emit, preservesPartitioning=True).collect())
    to_broadcast = step_cls.broadcast(emitted, **kwargs)
    broadcast: Broadcast = self._sc.broadcast(to_broadcast)

    def unwrap_step(kv: Tuple[Any, Iterable[Any]]) -> Iterable[Any]:
        k, v = kv
        for new_v in step_cls.step(k, v, broadcast, **kwargs):
            yield new_v

    rdd = rdd.flatMap(unwrap_step, preservesPartitioning=True)
    return rdd
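# Hedged sketch of a custom step built on the life-cycle hooks named above; the
# Step base class and the exact hook signatures are inferred from the call sites
# in __call__, so the real project's conventions may differ. Records are assumed
# to be (key, number) pairs.
class MeanCenterStep(Step):
    @staticmethod
    def group(rdd, **kwargs):
        # Group raw (key, value) records by key.
        return rdd.groupByKey()

    @staticmethod
    def emit_by_group(key, values, **kwargs):
        # Emit one (key, group mean) pair per group for the driver to collect.
        values = list(values)
        return key, sum(values) / len(values)

    @staticmethod
    def broadcast(emitted, **kwargs):
        # Combine the emitted pairs into one dict that every executor receives.
        return dict(emitted)

    @staticmethod
    def step(key, values, broadcast, **kwargs):
        # Re-emit each record with its group mean subtracted.
        for value in values:
            yield key, value - broadcast.value[key]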
def __call__(self, rows: RDD) -> RDD:
    return rows.flatMap(self.deserialize_uast)
def train_glove(spark: SparkContext,
                word_cooc: RDD,
                num_iterations=100,
                vector_size=10,
                learning_rate=0.001,
                max_value=100,
                alpha=3. / 4) -> (Dict[str, Array], Dict[str, float]):
    """Train a GloVe model.

    TODO: add option to initialize from existing parameters for continued training

    Parameters
    ----------
    spark : The Spark context of the session
    word_cooc : The co-occurrence RDD of words, ([word, word], count)
    num_iterations : The number of training iterations to run
    vector_size : The dimensionality of the word vectors
    learning_rate : The learning rate used for the updates
    max_value : The maximum value of the loss weighting. Counts higher than this
        do not have the loss weighting applied to them
    alpha : Part of the loss weighting; must lie between 0 and 1

    Returns
    -------
    The trained word vectors and word biases, keyed by word.
    """
    if num_iterations <= 0:
        raise ValueError(
            'The number of training iterations must be greater than 0')
    if (alpha > 1) or (alpha < 0):
        raise ValueError('Alpha should be between 0 and 1')

    # Model hyper-parameters
    max_value_bc = spark.broadcast(max_value)
    learning_rate_bc = spark.broadcast(learning_rate)
    alpha_bc = spark.broadcast(alpha)

    # Get the unique words to initialize the parameter dicts
    unique_words = word_cooc.keys().flatMap(lambda x: x).distinct().collect()

    # Initialize the model parameters
    init_vectors, init_biases, init_vectors_grads, init_biases_grads = _initialize_parameters(
        unique_words, vector_size)

    # Broadcast the new model params
    word_vectors = spark.broadcast(init_vectors)
    word_biases = spark.broadcast(init_biases)
    word_vector_grads = spark.broadcast(init_vectors_grads)
    word_bias_grads = spark.broadcast(init_biases_grads)

    # Start training
    for i in range(1, num_iterations + 1):
        print('Iteration Number:', i)
        print('\tComputing Gradients...')
        # Compute the gradient contribution of every word co-occurrence
        updates = word_cooc.flatMap(lambda x: _gradient_update(
            x, word_vectors.value, word_vector_grads.value, word_biases.value,
            word_bias_grads.value, max_value_bc.value, learning_rate_bc.value,
            alpha_bc.value))

        # Collect gradients and sum over words
        aggregated_grads = updates.reduceByKey(
            lambda x, y: [x[i] + y[i] for i in range(4)]).collect()

        print('\tUpdating Params')
        # Separate update components
        updated_vectors = {}
        for word, grad in [(word, grads[0]) for word, grads in aggregated_grads]:
            updated_vectors[word] = word_vectors.value[word] - grad

        updated_biases = {}
        for word, grad in [(word, grads[1]) for word, grads in aggregated_grads]:
            updated_biases[word] = word_biases.value[word] - grad

        updated_vector_grads = {}
        for word, grad in [(word, grads[2]) for word, grads in aggregated_grads]:
            updated_vector_grads[word] = word_vector_grads.value[word] + grad

        updated_bias_grads = {}
        for word, grad in [(word, grads[3]) for word, grads in aggregated_grads]:
            updated_bias_grads[word] = word_bias_grads.value[word] + grad

        # Un-persist old values
        for bc_var in [word_vectors, word_vector_grads, word_biases, word_bias_grads]:
            bc_var.unpersist()

        # Broadcast updates
        word_vectors = spark.broadcast(updated_vectors)
        word_biases = spark.broadcast(updated_biases)
        word_vector_grads = spark.broadcast(updated_vector_grads)
        word_bias_grads = spark.broadcast(updated_bias_grads)

    # noinspection PyUnboundLocalVariable
    return updated_vectors, updated_biases
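# Hedged usage sketch for train_glove: the co-occurrence counts are tiny, made-up
# values, and the module-level helpers (_initialize_parameters, _gradient_update)
# are assumed to be defined alongside the function above.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
word_cooc = sc.parallelize([
    (('the', 'cat'), 10.0),
    (('the', 'mat'), 7.0),
    (('cat', 'mat'), 3.0),
])
vectors, biases = train_glove(sc, word_cooc, num_iterations=5, vector_size=8,
                              learning_rate=0.05)
print(vectors['cat'])  # an 8-dimensional word vector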
def __call__(self, rows: RDD):
    return rows.flatMap(self.process_row)
def call_func(self, rows: RDD):
    return rows.flatMap(self.extract_functions_from_row)
def flatten_trips(
        rdd: RDD,
        airport: AirportFinder,
        air_info,
        sticky: float = float("inf"),
        time_threshold: float = float("inf")
) -> RDD:
    def flatmap_split_trips(record,
                            airport: AirportFinder = airport,
                            air_info=air_info,
                            sticky=sticky,
                            time_threshold=time_threshold):
        """
        Split the record for each plane into multiple records:
        one for each trip the plane took.
        """
        alt, talt, lat, long, speed, time = (record.Alt, record.TargetAlt, record.Lat,
                                             record.Long, record.Speed, record.Time)

        # Ideas: - Apply a low-frequency filter on the altitude to remove noise from the data?
        #        - Do the same on Speed values?

        # Get local minimums of the altitude -> these might be take-offs and landings
        if len(time) <= 2:
            return []

        # local_alt_bool = np.array([a < local_alt for a, local_alt in zip(alt,
        #     [airport.local_max_alt(pos, air_info) for pos in zip(lat, long)])])
        # split_indices = [i for i, (a, b) in enumerate(zip(np.gradient(np.sign(np.gradient(alt))) > 0,
        #                                                   local_alt_bool))
        #                  if a and b]

        ta_max = 0.5 * max(talt)
        local_alt_bool = np.array([
            a < local_alt and ta < ta_max
            for a, ta, local_alt in zip(alt, talt, [
                airport.local_max_alt(pos, air_info) for pos in zip(lat, long)
            ])
        ])
        split_indices = [
            i for i, (a, b, c) in enumerate(
                zip(
                    np.gradient(np.sign(np.gradient(alt))) > 0,
                    np.gradient(np.sign(np.gradient(talt))) >= 0,
                    local_alt_bool))
            if a and b and c
        ]

        # TODO: Here compute distance airport - position and validate the landing.
        # TODO: if too big time shifts appear -> filter out?
        #       -> land and cut?
        corresponding_airports = [
            airport.closest_airport((lat[i], long[i])) for i in split_indices
        ]

        couples = [(i, j) for i, j in zip(split_indices[:-1], split_indices[1:])]

        trip_l = []
        for k, (i, j) in enumerate(couples):
            air_from = air_info[corresponding_airports[k]]
            air_to = air_info[corresponding_airports[k + 1]]

            # Keep real trips only
            if air_from == air_to:
                continue

            # A trip must last a bit
            if time[j] - time[i] < time_threshold:
                continue

            # TODO: control here for monotonicity of lat, long and not too long a trip
            #   -> thoughts for monotonicity:
            #      -> real monotonicity (bad)
            #      -> having a direction (between airports), make sure at least
            #         80% of the steps go in that direction
            #      -> cut into multiple trips in the middle where monotonicity fails!
            row = Row(
                Icao=record.Icao,
                Op=record.Op,
                Engines=record.Engines,
                Mil=record.Mil,
                Cou=record.Cou,
                Mdl=record.Mdl,
                From=air_from,
                To=air_to,
                Lat=lat[i:j + 1],
                Long=long[i:j + 1],
                Alt=alt[i:j + 1],
                TargetAlt=talt[i:j + 1],
                Speed=speed[i:j + 1],
                Time=time[i:j + 1],
            )
            trip_l.append(row)
        return trip_l

    rdd = rdd.flatMap(flatmap_split_trips)
    # print(f"Applied trips recognition, remains {rdd.count()} records")
    return rdd
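# Hedged wiring sketch for flatten_trips: `position_reports` is assumed to be an
# RDD of Rows carrying the fields read above (Icao, Op, ..., Alt, TargetAlt, Lat,
# Long, Speed, Time as sequences); `airport_finder` and `air_info` come from
# elsewhere in the project, and the minimum-duration threshold is a made-up value.
trips = flatten_trips(position_reports, airport_finder, air_info,
                      time_threshold=15 * 60)
print(trips.count())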