def naive_multiplication_rdd(mat_a: pyspark.RDD, mat_b: pyspark.RDD, is_triangle=False):
    """
    Multiply two distributed matrices stored as RDDs of MatrixEntry.

    mat_a is the left matrix, mat_b is the right matrix.

    :param mat_a: left matrix, RDD of MatrixEntry
    :param mat_b: right matrix, RDD of MatrixEntry
    :param is_triangle: if True, mat_a stores one triangle of a symmetric matrix;
        mirror its entries (averaging duplicates) before multiplying.
    :return: RDD of MatrixEntry representing mat_a * mat_b
    """
    if is_triangle:
        left_rdd = (
            mat_a.flatMap(lambda x: [((x.j, x.i), x.value), ((x.i, x.j), x.value)])
            .aggregateByKey(zeroValue=(0.0, 0.0),
                            seqFunc=lambda x, y: (x[0] + y, x[1] + 1),
                            combFunc=lambda x, y: (x[0] + y[0], x[1] + y[1]))
            .mapValues(lambda x: x[0] / x[1])
            .map(lambda x: (x[0][0], (x[0][1], x[1])))
        )
    else:
        left_rdd = mat_a.map(lambda x: (x.j, (x.i, x.value)))
    right_rdd = mat_b.map(lambda x: (x.i, (x.j, x.value)))
    combined_rdd = (
        left_rdd.join(right_rdd)
        .map(lambda x: x[1])
        .map(lambda x: ((x[0][0], x[1][0]), x[0][1] * x[1][1]))
        .reduceByKey(lambda x, y: x + y)
        .map(lambda x: distributed.MatrixEntry(i=x[0][0], j=x[0][1], value=x[1]))
    )
    return combined_rdd

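# Usage sketch for naive_multiplication_rdd (not from the original source):
# multiplies a 2x2 matrix by the identity, so the product should equal the
# left matrix. Assumes `distributed` above is pyspark.mllib.linalg.distributed
# and that a local SparkContext can be created.
from pyspark import SparkContext
from pyspark.mllib.linalg import distributed

sc = SparkContext.getOrCreate()
mat_a = sc.parallelize([distributed.MatrixEntry(0, 0, 1.0),
                        distributed.MatrixEntry(0, 1, 2.0),
                        distributed.MatrixEntry(1, 0, 3.0),
                        distributed.MatrixEntry(1, 1, 4.0)])
mat_b = sc.parallelize([distributed.MatrixEntry(0, 0, 1.0),
                        distributed.MatrixEntry(1, 1, 1.0)])
product = naive_multiplication_rdd(mat_a, mat_b)
print(sorted(product.map(lambda e: (e.i, e.j, e.value)).collect()))
# Expected: [(0, 0, 1.0), (0, 1, 2.0), (1, 0, 3.0), (1, 1, 4.0)]
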
def __init_parameters(self, train: RDD):
    """
    _n_buckets/_n_items: The number of distinct buckets/items in the train RDD.
    _bucket_block_size/_cross_block_size/_item_block_size: The size of blocks
        when dividing buckets/cross buckets/items into blocks.
    _n_bucket_block/_n_cross_block/_n_item_block: The number of blocks when
        dividing buckets/cross buckets/items into blocks.
    """
    self._n_buckets = train.map(lambda u: u[0]).distinct().count()
    if self._n_buckets <= self._k:
        self._k = float("inf")

    # For the bucket dimension.
    if self._bucket_block_size is None:
        # Derive bucket_block_size from n_bucket_block.
        self._bucket_block_size = self._n_buckets // self._n_bucket_block + 1
    else:
        self._n_bucket_block = self._n_buckets // self._bucket_block_size + 1

    # For the cross dimension.
    if self._cross_block_size is None:
        self._cross_block_size = self._n_buckets // self._n_cross_block + 1
    else:
        self._n_cross_block = self._n_buckets // self._cross_block_size + 1

    # For the item dimension.
    self._n_items = train.map(lambda u: u[1]).distinct().count()
    if self._item_block_size is None:
        self._item_block_size = self._n_items // self._n_item_block + 1
    else:
        self._n_item_block = self._n_items // self._item_block_size + 1
    return self

def join_multiple_keys(left: RDD, right: RDD, n: int) -> RDD:
    """
    Join RDDs with multiple keys.

    ((key1, key2, ...), value_left) x (key_i, value_right_i)
        -> ((key1, key2, ...), (value_left, value_right_1, value_right_2, ...))

    :param left: RDD<tuple<int>, value>
    :param right: RDD<int, value>
    :param n: int, the length of the key in left-RDD
    :return: joint RDD.
    """
    left = left.map(
        lambda u: (-1, (u[0], (u[1],)))
    )  # (_, (tuple<key>, tuple<value>))
    right = right.map(
        lambda u: (u[0], (u[1],))
    ).cache()  # (_, tuple<value>)
    for key_order in range(n):
        # Bind key_order as a default argument: PySpark pickles closures
        # lazily, so a plain reference would see the loop's final value.
        left = left.map(
            lambda u, k=key_order: (u[1][0][k], u[1])  # (_, (tuple<key>, tuple<value>))
        ).join(
            right  # (_, ((tuple<key>, tuple<value>), tuple<value>))
        ).map(
            lambda u: (-1, (u[1][0][0], u[1][0][1] + u[1][1]))
        )  # (_, (tuple<key>, tuple<value>))
    left = left.map(
        lambda u: u[1]
    )  # (tuple<key>, tuple<value>)
    return left

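# Usage sketch for join_multiple_keys (not from the original source): joins a
# composite-keyed RDD against per-key metadata. Assumes a SparkContext named
# sc; the toy data is made up.
ratings = sc.parallelize([(("u1", "i1"), 5.0), (("u2", "i1"), 3.0)])
metadata = sc.parallelize([("u1", "adult"), ("u2", "teen"), ("i1", "book")])
joined = join_multiple_keys(left=ratings, right=metadata, n=2)
print(sorted(joined.collect()))
# Expected: [(('u1', 'i1'), (5.0, 'adult', 'book')),
#            (('u2', 'i1'), (3.0, 'teen', 'book'))]
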
def cStress(rdd: RDD) -> RDD:
    # TODO: TWH Temporary
    ecg_sampling_frequency = 64.0
    rip_sampling_frequency = 64.0
    accel_sampling_frequency = 64.0 / 6.0

    # Timestamp correct datastreams
    ecg_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['ecg'],
                          sampling_frequency=ecg_sampling_frequency)))
    rip_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['rip'],
                          sampling_frequency=rip_sampling_frequency)))
    accelx_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelx'],
                          sampling_frequency=accel_sampling_frequency)))
    accely_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accely'],
                          sampling_frequency=accel_sampling_frequency)))
    accelz_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelz'],
                          sampling_frequency=accel_sampling_frequency)))

    accel_group = accelx_corrected.join(accely_corrected).join(
        accelz_corrected).map(fix_two_joins)
    accel = accel_group.map(lambda ds: (
        ds[0],
        autosense_sequence_align(datastreams=[ds[1][0], ds[1][1], ds[1][2]],
                                 sampling_frequency=accel_sampling_frequency)))

    # Accelerometer feature computation
    accel_features = accel.map(
        lambda ds: (ds[0], accelerometer_features(ds[1], window_length=10.0)))

    # RIP feature computation
    peak_valley = rip_corrected.map(
        lambda ds: (ds[0], rip.compute_peak_valley(rip=ds[1])))
    rip_features = peak_valley.map(
        lambda ds: (ds[0], rip_feature_computation(ds[1][0], ds[1][1])))

    # R-peak datastream computation
    ecg_rr_rdd = ecg_corrected.map(
        lambda ds: (ds[0], compute_rr_intervals(ds[1], ecg_sampling_frequency)))
    ecg_features = ecg_rr_rdd.map(
        lambda ds: (ds[0], ecg_feature_computation(ds[1], window_size=60,
                                                   window_offset=60)))

    # return rip_features.join(ecg_features).join(accel_features).map(fix_two_joins)
    return ecg_features

def evaluate(truth: RDD, prediction: RDD) -> float:
    """
    Calculate RMSE between truth and predictions.

    :param truth: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :param prediction: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :return: float = RMSE
    """
    truth = truth.map(lambda u: ((u[0], u[1]), u[2]))
    prediction = prediction.map(lambda u: ((u[0], u[1]), u[2]))
    return truth.join(prediction).map(lambda u: (u[1][0] - u[1][1]) ** 2).mean() ** 0.5

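# Usage sketch for evaluate (not from the original source): RMSE over the two
# (bucket, item) pairs present in both RDDs. Assumes a SparkContext named sc.
truth = sc.parallelize([("u1", "i1", 4.0), ("u1", "i2", 2.0)])
prediction = sc.parallelize([("u1", "i1", 3.0), ("u1", "i2", 1.0)])
print(evaluate(truth, prediction))  # sqrt(((4-3)^2 + (2-1)^2) / 2) = 1.0
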
def __preprocessRdd(self, rdd: RDD):
    rddc = rddCorrector()
    rdd = rdd.map(lambda l: rddc.correct(l))
    if rdd is not None and not rdd.isEmpty():
        rdd = rdd.map(lambda l: l.replace("<tweet>", ""))
        rdd = rdd.map(lambda l: l.replace("</tweet>", ""))
        df = DataFrameWorks().convertDataFrame(rdd, self.__spark)
        df = CleanText().clean(df, self.__spark)
        return df
    return None

def convertDataFrame(self, rdd: RDD, SqlObject) -> DataFrame:
    """RDD to DataFrame"""
    # rdd = rdd.map(lambda l: l.replace("½", ""))
    rdd = rdd.map(lambda l: (l[:19], l[19:]))
    schema = [StructField("id", StringType(), False),
              StructField("rawData", StringType(), False),
              StructField("preprocessedData",
                          ArrayType(elementType=StringType(), containsNull=True),
                          True),
              StructField("sentiment", FloatType(), True)]
    final_struct = StructType(fields=schema)
    rdd = rdd.map(lambda l: (l[0], l[1], [None], None))
    return SqlObject.createDataFrame(rdd, schema=final_struct)

def java_to_python_rdd(sc, rdd, is_pair, is_json):
    jrdd = sc._jvm.SerDe.javaToPython(rdd)
    output = RDD(jrdd, sc)
    if is_pair:
        if is_json:
            return output.map(
                lambda x: (x.split("\t")[0], json.loads(x.split("\t")[1])))
        return output.map(lambda x: (x.split("\t")[0], x.split("\t")[1]))
    if is_json:
        return output.map(lambda x: json.loads(x))
    return output

def extract_items(wikidata_items: RDD, b_property_map: Broadcast,
                  b_item_page_map: Broadcast):
    def parse_item(item):
        property_map = b_property_map.value
        item_page_map = b_item_page_map.value
        if "enwiki" in item["sitelinks"]:
            page_title = item["sitelinks"]["enwiki"]["title"]
        else:
            return None, None

        claims = {}
        for prop_id, property_claims in item["claims"].items():
            if prop_id in property_map:
                prop_name = property_map[prop_id]
                parsed_claims = []
                for c in property_claims:
                    if "datavalue" in c["mainsnak"]:
                        c = c["mainsnak"]["datavalue"]["value"]
                        if isinstance(c, dict) and "entity-type" in c:
                            claim_item_id = c["id"]
                            if claim_item_id in item_page_map:
                                c = item_page_map[claim_item_id]
                            else:
                                continue
                        parsed_claims.append(c)
                claims[prop_name] = parsed_claims
        return page_title, claims

    return (wikidata_items.map(parse_item)
            .filter(lambda pc: pc[0] is not None)
            .reduceByKey(lambda x, y: x)
            .collectAsMap())

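# Usage sketch for extract_items (not from the original source): one toy
# Wikidata-style item with a single mapped property. Assumes a SparkContext
# named sc; the Q/P identifiers here are illustrative.
toy_item = {
    "id": "Q42",
    "sitelinks": {"enwiki": {"title": "Douglas Adams"}},
    "claims": {
        "P31": [{"mainsnak": {"datavalue": {
            "value": {"entity-type": "item", "id": "Q5"}}}}],
    },
}
b_props = sc.broadcast({"P31": "instance of"})
b_pages = sc.broadcast({"Q5": "Human"})
print(extract_items(sc.parallelize([toy_item]), b_props, b_pages))
# Expected: {'Douglas Adams': {'instance of': ['Human']}}
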
def __call__(self, rdd: RDD) -> RDD:
    def select_fields(row):
        return Row(**{f: getattr(row, f) for f in self.fields})

    res = rdd.map(select_fields)
    if self.explained:
        self._log.info("toDebugString():\n%s", res.toDebugString().decode())
    return res

def transform_online_retail(
        sc: SparkSession,
        raw_rdd: RDD,
        schema: str,
        max_month: Optional[int] = None,
) -> DataFrame:
    """Transform the raw online-retail RDD into a typed DataFrame.

    Empty strings become None, numeric columns are cast, and the invoice date
    is parsed day-first when its middle field is below max_month, otherwise
    month-first.

    :param sc: active SparkSession
    :param raw_rdd: RDD of raw CSV rows
    :param schema: schema applied to the resulting DataFrame
    :param max_month: threshold used to disambiguate the date format
    :return: typed DataFrame
    """
    # initial transformation of the raw RDD
    raw_rdd = raw_rdd.map(lambda retail: (
        retail[0],                                           # InvoiceNo
        retail[1],                                           # StockCode
        retail[2] if retail[2] != '' else None,              # Description
        int(retail[3]),                                      # Quantity
        datetime.strptime(retail[4], '%d/%m/%Y %H:%M')
        if int(retail[4].split('/')[1]) < max_month
        else datetime.strptime(retail[4], '%m/%d/%Y %H:%M'),  # InvoiceDate
        float(retail[5]),                                    # UnitPrice
        int(retail[6]) if retail[6] != '' else None,         # CustomerID
        retail[7] if retail[7] != '' else None))             # Country

    return sc.createDataFrame(raw_rdd, schema=schema)

def __compute_signature(self, data: RDD) -> RDD:
    """
    Compute signature for items.

    :param data: RDD<(Hashable, Iterator<Hashable>)> = RDD<(item, content)>
    :return: RDD<(Hashable, tuple<int>)> = RDD<(item, signature)>
    """
    hashing_range = self.__hashing_range
    signature_length = self.__signature_length
    random_seed = self.__seed
    min_hash_func = self.__min_hash

    def _signature(key_values: (Hashable, Iterator)) -> (Hashable, tuple):
        """
        Compute signature for each item.

        :return: (Hashable, tuple<int>) = (item, signature)
        """
        item, content = key_values
        signature = [hashing_range for _ in range(signature_length)]
        for element in content:
            for index_i, hashed_value in enumerate(
                    min_hash_func(element, signature_length,
                                  hashing_range, random_seed)):
                signature[index_i] = min(hashed_value, signature[index_i])
        return item, tuple(signature)

    return data.map(_signature)

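# Hypothetical sketch of a hash family compatible with __compute_signature
# (not from the original source): one deterministic hash value per signature
# position, independent of Python's per-process hash randomisation. The name
# example_min_hash and its use of MD5 are assumptions, not the project's API.
import hashlib


def example_min_hash(element, signature_length, hashing_range, random_seed):
    values = []
    for index in range(signature_length):
        digest = hashlib.md5(
            f"{random_seed}-{index}-{element}".encode()).hexdigest()
        values.append(int(digest, 16) % hashing_range)
    return values
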
def _compute_tfid(texts: RDD) -> RDD:
    # Returns an RDD of texts with their TF-IDF vectors attached (the original
    # annotation said IDFModel, but the IDF model is only used internally).
    tf = HashingTF().transform(texts.map(lambda t: t.words))
    tf.cache()
    idf = IDF().fit(tf)
    tfidfs = idf.transform(tf)
    text_tfs = texts.zip(tfidfs)
    return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))

def run(self, data_rdd: RDD, query_rdd: RDD, n_dim: int) -> RDD:  # type: ignore
    empty_result_rdd = query_rdd.map(lambda idx_coords: (idx_coords[0], 0))

    data_rdd = data_rdd.map(
        lambda idx_coords: ((), idx_coords[1], (DATA, idx_coords[0]))
    )
    query_rdd = query_rdd.map(
        lambda idx_coords: ((), idx_coords[1], (QUERY, idx_coords[0]))
    )
    rdd = data_rdd.union(query_rdd)

    for _ in range(n_dim):
        rdd = self.assign_next_label(rdd=rdd)  # type: ignore

    rdd = empty_result_rdd.union(self.get_results_by_label(rdd))  # type: ignore
    rdd = self.aggregate_results_by_query(rdd).sortByKey()  # type: ignore
    return rdd

def parseDNSInfo(pcap_packets: RDD) -> RDD:
    timer = Timer()
    rddDns = pcap_packets.map(
        lambda bytes_packet: DNSInfo(bytes_packet[0], bytes_packet[1])
    ).filter(
        lambda dns: not dns.notDns
        and dns.sip not in Global.TRUSTED_DNS
        and dns.dip not in Global.TRUSTED_DNS)
    log.info(f'Time spent on parsing chunk = {timer.elapsed()}')
    return rddDns

def extract_item_map(wikidata_items: RDD):
    def parse_item(item):
        if 'en' in item['labels']:
            label = item['labels']['en']['value']
            return item['id'], label
        else:
            return None

    return wikidata_items.map(parse_item).filter(
        lambda i: i is not None).collectAsMap()

def __blocking_matrix(self, train: RDD = None, test: RDD = None,
                      similarity=None) -> RDD:
    """
    Divide a matrix into blocks to reduce the number of distinct keys.

    :param train: RDD<(Hashable, Hashable, float)> = RDD<bucket, item, rating>
    :param test: RDD<(Hashable, Hashable)> = RDD<bucket, item>
    :param similarity: RDD<(Hashable, Hashable, float)>
        = RDD<bucket, bucket, similarity>
    :return: RDD<(int, int), (Hashable, Hashable, float)>
        = RDD<(bucket_block, item_block), (bucket, item, rating)>
        or RDD<(bucket_block, bucket_block), (bucket, bucket, similarity)>
    """
    seed = self._seed
    n_bucket_block = self._n_bucket_block
    n_item_block = self._n_item_block
    n_cross_block = self._n_cross_block

    if train is not None:
        train = train.map(lambda u: (
            (hash2int(u[0], max_value=n_cross_block, seed=seed),
             hash2int(u[1], max_value=n_item_block, seed=seed)),
            u)).cache()
        train.count()
        return train
    if test is not None:
        test = test.map(lambda u: (
            (hash2int(u[0], max_value=n_bucket_block, seed=seed),
             hash2int(u[1], max_value=n_item_block, seed=seed)),
            u)).cache()
        test.count()
        return test
    if similarity is not None:
        similarity = similarity.flatMap(
            lambda u: [(u[0], u[1], u[2]), (u[1], u[0], u[2])]
        ).map(lambda u: (
            (hash2int(u[0], max_value=n_bucket_block, seed=seed),
             hash2int(u[1], max_value=n_cross_block, seed=seed)),
            u)).cache()
        similarity.count()
        return similarity

def calc_llr(ss: SparkSession, from_rdd: RDD, to_rdd: RDD) -> RDD:
    """
    Compute the co-/cross-occurrence LLR matrix for lists of objects used in
    A -> B recommendations.

    First the matrix product left_rdd.T * right_rdd is computed, then the
    log-likelihood ratio for each cell.

    :param ss: SparkSession object
    :param from_rdd: RDD with a sparse matrix of statistics for objects A.
        Value format: (list ID, object A ID)
    :param to_rdd: RDD with a sparse matrix of statistics for objects B.
        Value format: (list ID, object B ID)
    :return: sparse matrix of co-/cross-occurrence LLR between tracks
    """
    logger.info("Calculating co-/cross-occurrence LLR matrix...")
    sc = ss.sparkContext

    lists_count = from_rdd.map(itemgetter(0)).distinct().count()
    bc_lists_count = sc.broadcast(lists_count)

    from_items_counts = from_rdd.map(itemgetter(1)).countByValue()
    bc_from_items_counts = sc.broadcast(from_items_counts)

    to_items_counts = to_rdd.map(itemgetter(1)).countByValue()
    bc_to_items_counts = sc.broadcast(to_items_counts)

    def llr_cell(x):
        i, j, cooc_count = x
        i_count = bc_to_items_counts.value[i]
        j_count = bc_from_items_counts.value[j]
        res_llr = llr_sqrt(
            cooc_count,
            j_count - cooc_count,
            i_count - cooc_count,
            bc_lists_count.value - i_count - j_count + cooc_count)
        return i, j, res_llr

    llr_rdd = to_rdd \
        .join(from_rdd) \
        .map(lambda x: (x[1], 1)) \
        .reduceByKey(add) \
        .map(lambda x: (x[0][0], x[0][1], x[1])) \
        .map(llr_cell)

    logger.info("Co-/cross-occurrence LLR matrix calculated")
    return llr_rdd

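# Usage sketch for calc_llr (not from the original source): two lists with one
# co-occurring pair. Assumes a SparkSession named spark and that llr_sqrt,
# logger, itemgetter and add are available in the surrounding module.
from_rdd = spark.sparkContext.parallelize([(1, "a"), (1, "b"), (2, "a")])
to_rdd = spark.sparkContext.parallelize([(1, "x"), (2, "x")])
llr = calc_llr(spark, from_rdd, to_rdd)
print(llr.collect())  # entries of the form (object B ID, object A ID, LLR)
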
def evaluate(self, lables_and_predictions: RDD):
    TP = lables_and_predictions.map(
        lambda x: (set(x[0]), set(p for p, w in x[1][:self._pred_n]))
    ).filter(lambda x: len(x[0].intersection(x[1])) > self._intersect_n)
    accuracy = 100.0 * TP.count() / lables_and_predictions.count()
    if self._verbose:
        print('accuracy: ', accuracy)
    self._results.append(accuracy)
    return accuracy

def to_pandas_df(rdd: RDD, string_conversion=False, init_condition: dict = None):
    if init_condition is not None and string_conversion is False:
        # Typefull
        return to_spark(rdd=rdd, init_condition=init_condition).toPandas()
    elif init_condition is None and string_conversion is True:
        # String
        return rdd.map(
            lambda d: Row(**{k: str(v) for k, v in d.items()})).toDF()
    else:
        # Typeless
        return to_pandas(rdd)

def lp_to_simple_rdd(lp_rdd: RDD, categorical: bool = False, nb_classes: int = None):
    """Convert a LabeledPoint RDD into an RDD of feature-label pairs

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, if labels should be one-hot encoded when returned
    :param nb_classes: int, number of total classes
    :return: Spark RDD with feature-label pairs
    """
    if categorical:
        if not nb_classes:
            labels = np.asarray(lp_rdd.map(lambda lp: lp.label).collect(),
                                dtype='int32')
            nb_classes = np.max(labels) + 1
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features),
                                     encode_label(lp.label, nb_classes)))
    else:
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))
    return rdd

def from_labeled_point(rdd: RDD, categorical: bool = False, nb_classes: int = None):
    """Convert a LabeledPoint RDD back to a pair of numpy arrays

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, if labels should be one-hot encoded when returned
    :param nb_classes: optional int, indicating the number of class labels
    :return: pair of numpy arrays, features and labels
    """
    features = np.asarray(
        rdd.map(lambda lp: from_vector(lp.features)).collect())
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')
    if categorical:
        if not nb_classes:
            nb_classes = np.max(labels) + 1
        temp = np.zeros((len(labels), nb_classes))
        for i, label in enumerate(labels):
            temp[i, label] = 1.
        labels = temp
    return features, labels

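# Usage sketch for the two LabeledPoint helpers above (not from the original
# source). Assumes a SparkContext named sc and that from_vector/encode_label
# are the surrounding module's vector-to-numpy and one-hot helpers.
from pyspark.mllib.regression import LabeledPoint

lp_rdd = sc.parallelize([LabeledPoint(0, [0.0, 1.0]),
                         LabeledPoint(1, [1.0, 0.0])])
features, labels = from_labeled_point(lp_rdd, categorical=True)
print(features.shape)  # (2, 2), assuming from_vector yields NumPy arrays
print(labels)          # one-hot rows [[1. 0.], [0. 1.]]
pairs = lp_to_simple_rdd(lp_rdd, categorical=True, nb_classes=2)
print(pairs.first())   # (feature array, one-hot label) pair
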
def toDf(cls, spatialPairRDD: RDD, sparkSession: SparkSession):
    """
    Build a DataFrame from an RDD of spatial pairs.

    :param spatialPairRDD: RDD of (left geometry, right geometry) pairs
    :param sparkSession: active SparkSession
    :return: DataFrame with both geometries and their user-data columns
    """
    spatialPairRDD_mapped = spatialPairRDD.map(
        lambda x: [x[0].geom, *x[0].getUserData().split("\t"),
                   x[1].geom, *x[1].getUserData().split("\t")]
    )
    df = sparkSession.createDataFrame(spatialPairRDD_mapped)
    return df

def calculate_node_tree(
        config: dict,
        record_rdd: RDD,
        spark: SparkSession,
        repo_type: str = "ecflow",
) -> dict:
    """Calculate node tree for each date in record RDDs.

    Parameters
    ----------
    config: dict
        processor's config
    record_rdd: RDD
        records RDD
    spark: SparkSession
        spark session
    repo_type: ["ecflow", "sms"]
        repo type

    Returns
    -------
    dict
        bunch map dict, key: date, value: bunch map
    """
    # **STEP**: map to (date, node_path)
    # record object => (date, node_path) distinct
    def node_path_map(record):
        return record.date, record.node_path

    date_node_path_rdd = record_rdd.map(node_path_map).distinct()

    # **STEP**: group by date
    # (date, node_path) => (record_date, list of record_fullname)
    date_node_path_list_rdd = date_node_path_rdd.groupByKey()

    # **STEP**: collect
    date_with_node_path_list = date_node_path_list_rdd.collect()

    # **STEP**: generate bunch
    logger.info("Generating bunch...")
    bunch_class = get_bunch_class(repo_type)
    bunch_map = {}
    for day, node_path_list in date_with_node_path_list:
        bunch = bunch_class()
        for node_path in node_path_list:
            if node_path is not None:
                bunch.add_node(node_path)
        logger.info(f"Generating bunch...done for {day}")
        bunch_map[day] = bunch
    return bunch_map

def __call__(self, head: RDD):
    if self.keymap is None:
        return head.coalesce(self.partitions, self.shuffle)
    # partitionBy the key extracted using self.keymap
    try:
        # this checks if keymap is an identity
        probe = self.keymap("probe")
    except:  # noqa: E722
        probe = None
    if probe != "probe":
        head = head.map(lambda x: (self.keymap(x), x))
    return head \
        .partitionBy(self.partitions) \
        .map(lambda x: x[1])

def _check_data(train: RDD = None, test: RDD = None) -> RDD:
    # Data-type check
    if isinstance(train, RDD):
        is_legal_train = train.map(
            lambda u: len(u) >= 3 and u[0] is not None and u[1] is not None
            and isinstance(u[2], Number)).reduce(lambda u1, u2: u1 and u2)
        if not is_legal_train:
            raise ValueError(
                "Parameter train should be an RDD<(user, item, rating)>")
        num_partitions_of_train = train.getNumPartitions()
        return train
    if isinstance(test, RDD):
        is_legal_test = test.map(
            lambda u: len(u) >= 2 and u[0] is not None and u[1] is not None
        ).reduce(lambda u1, u2: u1 and u2)
        if not is_legal_test:
            raise ValueError(
                "Parameter test should be an RDD<(user, item)>")
        num_partitions_of_test = test.getNumPartitions()
        return test
    raise ValueError("RDD train/test need to be input.")

def to_spark_df(rdd: RDD, spark: SparkSession = None, init_condition: dict = None):
    if init_condition is not None and spark is not None:
        # Typefull
        return to_spark(rdd, init_condition)
    elif spark is None and init_condition is None:
        # String
        return rdd.map(
            lambda d: Row(**{k: str(v) for k, v in d.items()})).toDF()
    else:
        # Typeless
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result

def make_dataframe_from_alerts(rdd: RDD, colnames: list) -> DataFrame:
    """ Make a DataFrame from an RDD of alerts and column names.

    Parameters
    ----------
    rdd: Apache Spark RDD of dictionaries
        RDD whose elements are dictionaries (decoded alerts)
    colnames: list of str
        List containing the keys to include. For nested levels, chain the
        keys with a colon: firstdic:seconddic:key

    Returns
    -------
    out: DataFrame
        Dataframe from the input RDD and columns names.
    """
    return rdd.map(lambda x: tuple(ret(x, k) for k in colnames)).toDF(colnames)

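# Usage sketch for make_dataframe_from_alerts (not from the original source):
# two decoded alerts with one nested field. Assumes a SparkContext named sc,
# an active SparkSession, and that `ret` resolves the colon-separated keys as
# described above; the field names are illustrative.
alerts = sc.parallelize([
    {"objectId": "ZTF1", "candidate": {"magpsf": 17.2}},
    {"objectId": "ZTF2", "candidate": {"magpsf": 18.9}},
])
df = make_dataframe_from_alerts(alerts, ["objectId", "candidate:magpsf"])
df.show()
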
def __calculate_similarity(train: RDD, lsh_params: dict,
                           maximum_num_partitions: int) -> RDD:
    """
    Calculate Jaccard similarity from the train RDD.

    :param train: RDD<(Hashable, Hashable, float)>
    :return: RDD<Hashable, Hashable, float> = RDD<bucket, bucket, similarity>
    """
    train = train.map(lambda u: (u[0], u[1])) \
        .groupByKey().map(lambda u: (u[0], list(u[1]))).cache()
    similarity_among_buckets = JaccardSimilarity(
        **lsh_params).predict(train).cache()
    if similarity_among_buckets.getNumPartitions() > maximum_num_partitions:
        similarity_among_buckets = similarity_among_buckets.coalesce(
            maximum_num_partitions).cache()
    return similarity_among_buckets

def normal_order(self, terms: RDD, **kwargs):
    """Normal order the terms in the RDD."""
    if len(kwargs) > 0:
        raise ValueError('Invalid keyword arguments', kwargs)

    symms = self.symms
    swapper = self.swapper
    resolvers = self.resolvers

    init = terms.map(lambda term: _NOState(
        pivot=1, front=2, term=term.canon4normal(symms.value)))

    res = nest_bind(
        init,
        lambda x: _sort_vec(x, swapper=swapper, resolvers=resolvers.value),
        full_balance=self.full_balance)

    return res.map(lambda x: x.term)

def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x: Data point (feature vector), or an RDD of data points
        (feature vectors).
    """
    pythonAPI = self._sc._jvm.PythonMLLibAPI()
    if isinstance(x, RDD):
        # Bulk prediction
        if x.count() == 0:
            return self._sc.parallelize([])
        dataBytes = _get_unmangled_double_vector_rdd(x, cache=False)
        jSerializedPreds = \
            pythonAPI.predictDecisionTreeModel(self._java_model,
                                               dataBytes._jrdd)
        serializedPreds = RDD(jSerializedPreds, self._sc, NoOpSerializer())
        return serializedPreds.map(
            lambda bytes: _deserialize_double(bytearray(bytes)))
    else:
        # Assume x is a single data point.
        x_ = _serialize_double_vector(x)
        return pythonAPI.predictDecisionTreeModel(self._java_model, x_)

def evaluate(self, lables_and_predictions: RDD):
    result = lables_and_predictions.map(
        lambda p: _hamming_loss(p[0], p[1])).mean()
    self._results.append(result)
    return result

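# Hypothetical sketch of the _hamming_loss helper assumed above (not from the
# original source): one plausible definition, the fraction of label positions
# that disagree between a binary truth vector and a binary prediction vector.
def _example_hamming_loss(y_true, y_pred):
    mismatches = sum(1 for t, p in zip(y_true, y_pred) if t != p)
    return mismatches / len(y_true)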