def hashString(v):
    """Hash a map of string features into a sparse ``{hash: count}`` dict.

    For every ``(key, value)`` pair the key is hashed with the surrounding
    ``seed``; that result is then used as the seed for hashing the value.
    Each pair adds ``1.0`` to the bucket its value hash lands in, so
    colliding pairs accumulate.

    ``_mh3`` and ``seed`` are resolved from the surrounding scope.

    Returns an empty dict for null or empty input, like the other hashing
    helpers in this file, instead of echoing the input back (which yielded
    ``None`` for a null input).
    """
    if not v:
        # Always hand back a dict so the result is never None.
        return {}
    hashVector = defaultdict(float)
    # Distinct loop names: the original rebound ``v`` while iterating over it.
    for key, value in v.items():
        h = _mh3(value, seed=_mh3(key, seed=seed))
        hashVector[h] += 1.
    return dict(hashVector)
def _transform(self, dataset):
    """Append the output column holding the hashed form of a map column.

    The input column must be a map with string keys and either numeric or
    string values (checked with assertions). A column-specific seed is
    derived by hashing the input column name with the configured seed.

    * Numeric values: each key is hashed with the column seed and the
      value is added to that bucket.
    * String values: each key is hashed with the column seed, the result
      seeds the hash of the value, and the bucket is incremented by 1.0.

    Both UDFs are declared to return ``MapType(IntegerType, FloatType)``
    and return an empty map for null or empty input.

    Returns ``dataset`` with the output column added via ``withColumn``.
    """
    inputCol = self.getInputCol()
    dataType = dataset.schema[inputCol].dataType
    # Kept as assertions so callers still see AssertionError on a bad schema.
    assert isinstance(dataType, T.MapType)
    assert isinstance(dataType.keyType, T.StringType)
    assert isinstance(dataType.valueType, (T.NumericType, T.StringType))

    # Mixing the column name into the seed gives each column its own hashing.
    seed = _mh3(inputCol, seed=self.getSeed())

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashNumeric(v):
        if not v:
            return {}
        hashVector = defaultdict(float)
        for key, value in v.items():
            h = _mh3(key, seed=seed)
            hashVector[h] += value
        return dict(hashVector)

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashString(v):
        if not v:
            # Was ``return v``: that produced null for a null input while
            # hashNumeric produced an empty map. Both now return {}.
            return {}
        hashVector = defaultdict(float)
        for key, value in v.items():
            h = _mh3(value, seed=_mh3(key, seed=seed))
            hashVector[h] += 1.
        return dict(hashVector)

    if isinstance(dataType.valueType, T.NumericType):
        hasher = hashNumeric
    else:
        hasher = hashString
    return dataset.withColumn(self.getOutputCol(), hasher(dataset[inputCol]))
def _transform(self, dataset):
    """Append the output column holding the hashed form of a scalar column.

    The input column must be boolean, numeric or string (asserted). A
    column-specific seed is obtained by hashing the column name with the
    configured seed.

    * Boolean/numeric input maps to ``{seed: float(value)}``.
    * String input maps to ``{_mh3(value, seed=seed): 1.0}``.

    Any falsy input (null, zero, ``False``, empty string) maps to an empty
    dict. Both UDFs are declared as ``MapType(IntegerType, FloatType)``.
    """
    column = self.getInputCol()
    columnType = dataset.schema[column].dataType
    assert isinstance(columnType, (T.BooleanType, T.NumericType, T.StringType))

    seed = _mh3(column, seed=self.getSeed())

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashNumeric(v):
        # The column seed itself serves as the single bucket index.
        return {seed: float(v)} if v else {}

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashString(v):
        return {_mh3(v, seed=seed): 1.} if v else {}

    isNumberLike = isinstance(columnType, (T.BooleanType, T.NumericType))
    hasher = hashNumeric if isNumberLike else hashString
    return dataset.withColumn(self.getOutputCol(), hasher(dataset[column]))
def hashNumeric(v):
    """Hash a map of numeric features into a sparse ``{hash: weight}`` dict.

    Every key is hashed with the surrounding ``seed`` and its value is
    added to that bucket, so keys that collide have their values summed.
    Null or empty input gives an empty dict.

    ``_mh3`` and ``seed`` are resolved from the surrounding scope.
    """
    buckets = {}
    if v:
        for name, weight in v.items():
            slot = _mh3(name, seed=seed)
            # Start from 0.0 so the stored total is a float sum.
            buckets[slot] = buckets.get(slot, 0.0) + weight
    return buckets
def hash_(v):
    """Hash a sequence of items into a sparse ``{hash: count}`` dict.

    Each element is hashed with the surrounding ``seed`` and its bucket is
    incremented by 1.0, so repeated or colliding elements accumulate.
    Null or empty input gives an empty dict.

    ``_mh3`` and ``seed`` are resolved from the surrounding scope.
    """
    counts = {}
    if v:
        for item in v:
            slot = _mh3(item, seed=seed)
            counts[slot] = counts.get(slot, 0.0) + 1.
    return counts
def _transform(self, dataset):
    """Append the output column holding the hashed form of an array column.

    The input column must be an array of strings (asserted). A
    column-specific seed is obtained by hashing the column name with the
    configured seed; each array element is then hashed with that seed and
    its bucket incremented by 1.0. Null or empty arrays map to an empty
    dict. The UDF is declared as ``MapType(IntegerType, FloatType)``.
    """
    column = self.getInputCol()
    columnType = dataset.schema[column].dataType
    assert isinstance(columnType, T.ArrayType)
    assert isinstance(columnType.elementType, T.StringType)

    seed = _mh3(column, seed=self.getSeed())

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hash_(v):
        counts = {}
        if v:
            for item in v:
                slot = _mh3(item, seed=seed)
                counts[slot] = counts.get(slot, 0.0) + 1.
        return counts

    outputName = self.getOutputCol()
    hashed = hash_(dataset[column])
    return dataset.withColumn(outputName, hashed)
def hashString(v):
    """Hash a single string into a one-entry ``{hash: 1.0}`` dict.

    The value is hashed with the surrounding ``seed``. A null or empty
    value gives an empty dict.

    ``_mh3`` and ``seed`` are resolved from the surrounding scope.
    """
    return {_mh3(v, seed=seed): 1.} if v else {}