コード例 #1
0
 def hashString(v):
     """Hash a map of string keys to string values into a sparse count map.

     Each ``(key, value)`` pair is assigned a bucket by hashing the value with
     ``_mh3``, seeded by the ``_mh3`` hash of the key (which is itself seeded
     by ``seed`` from the enclosing scope). Every pair adds ``1.0`` to its
     bucket, so pairs landing in the same bucket accumulate.

     Args:
         v: Object exposing ``.items()`` yielding ``(key, value)`` pairs, or a
             falsy value such as ``None`` or an empty mapping.

     Returns:
         A plain ``dict`` mapping bucket hash to float count. Falsy input
         always yields ``{}`` and never ``None``.
     """
     if not v:
         # Return an empty map instead of echoing the input back, so a None
         # input cannot leak through as a None result.
         return {}
     hashVector = defaultdict(float)
     # Distinct loop names avoid rebinding the parameter ``v`` mid-iteration.
     for key, value in v.items():
         bucket = _mh3(value, seed=_mh3(key, seed=seed))
         hashVector[bucket] += 1.
     return dict(hashVector)
コード例 #2
0
    def _transform(self, dataset):
        """Append the output column holding a hashed sparse map of the input map.

        The input column must be a map with string keys and either numeric or
        string values. Numeric maps are hashed by key with the value added to
        the bucket; string maps are hashed by (key, value) pair with ``1.0``
        added to the bucket.

        Args:
            dataset: Object exposing ``schema``, column indexing and
                ``withColumn`` (presumably a Spark DataFrame -- confirm
                against the caller).

        Returns:
            The result of ``dataset.withColumn`` with the output column set to
            a map from integer bucket to float weight.

        Raises:
            AssertionError: If the input column is not a map from string to
                numeric or string values.
        """
        inputCol = self.getInputCol()
        dataType = dataset.schema[inputCol].dataType
        assert isinstance(dataType, T.MapType), \
            "column %r must be a MapType" % (inputCol,)
        assert isinstance(dataType.keyType, T.StringType), \
            "column %r must have string keys" % (inputCol,)
        assert isinstance(dataType.valueType, (T.NumericType, T.StringType)), \
            "column %r must have numeric or string values" % (inputCol,)
        # The column name is folded into the seed, so the same key hashes to a
        # different bucket when it comes from a different input column.
        seed = _mh3(inputCol, seed=self.getSeed())

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashNumeric(v):
            # Falsy input (None or empty map) becomes an empty map.
            if not v:
                return {}
            hashVector = defaultdict(float)
            for key, value in v.items():
                # Colliding keys have their values summed.
                hashVector[_mh3(key, seed=seed)] += value
            return dict(hashVector)

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashString(v):
            # Return an empty map for falsy input, exactly like hashNumeric,
            # instead of echoing a None input back as a None result.
            if not v:
                return {}
            hashVector = defaultdict(float)
            for key, value in v.items():
                # The key's hash seeds the value's hash, so the bucket depends
                # on the (key, value) pair rather than on the value alone.
                hashVector[_mh3(value, seed=_mh3(key, seed=seed))] += 1.
            return dict(hashVector)

        if isinstance(dataType.valueType, T.NumericType):
            hasher = hashNumeric
        else:
            hasher = hashString
        return dataset.withColumn(self.getOutputCol(),
                                  hasher(dataset[inputCol]))
コード例 #3
0
    def _transform(self, dataset):
        """Append the output column holding a one-entry hashed map per row.

        The input column must be boolean, numeric or string. Boolean and
        numeric values are stored as a float under the column's seed hash;
        string values are hashed with ``_mh3`` and stored with weight ``1.0``.
        Falsy values (``None``, ``0``, ``False``, ``""``) produce an empty map.

        Args:
            dataset: Object exposing ``schema``, column indexing and
                ``withColumn`` (presumably a Spark DataFrame -- confirm
                against the caller).

        Returns:
            The result of ``dataset.withColumn`` for the output column.
        """
        column = self.getInputCol()
        columnType = dataset.schema[column].dataType
        numericLike = (T.BooleanType, T.NumericType)
        assert isinstance(columnType, numericLike + (T.StringType,))
        # The column name is mixed into the seed used by both hashers below.
        columnSeed = _mh3(column, seed=self.getSeed())

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashNumeric(value):
            # The bucket is the column seed itself; the weight is the value.
            return {columnSeed: float(value)} if value else {}

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashString(text):
            # The bucket is the hash of the string; the weight is always 1.0.
            return {_mh3(text, seed=columnSeed): 1.} if text else {}

        if isinstance(columnType, numericLike):
            hasher = hashNumeric
        else:
            hasher = hashString
        return dataset.withColumn(self.getOutputCol(),
                                  hasher(dataset[column]))
コード例 #4
0
 def hashNumeric(v):
     """Hash a map of keys to numeric values into a sparse weight map.

     Each key is assigned a bucket with ``_mh3`` (seeded by ``seed`` from the
     enclosing scope) and its value is added to that bucket, so keys sharing a
     bucket have their values summed.

     Args:
         v: Object exposing ``.items()`` yielding ``(key, number)`` pairs, or a
             falsy value such as ``None`` or an empty mapping.

     Returns:
         A plain ``dict`` mapping bucket hash to accumulated value; ``{}`` for
         falsy input.
     """
     if v:
         buckets = defaultdict(float)
         for key, weight in v.items():
             buckets[_mh3(key, seed=seed)] += weight
         return dict(buckets)
     return {}
コード例 #5
0
 def hash_(v):
     """Hash an iterable of items into a sparse count map.

     Each item is assigned a bucket with ``_mh3`` (seeded by ``seed`` from the
     enclosing scope) and adds ``1.0`` to that bucket, so repeated or
     colliding items accumulate.

     Args:
         v: Iterable of items to hash, or a falsy value such as ``None`` or an
             empty sequence.

     Returns:
         A plain ``dict`` mapping bucket hash to float count; ``{}`` for falsy
         input.
     """
     if v:
         counts = defaultdict(float)
         for item in v:
             counts[_mh3(item, seed=seed)] += 1.
         return dict(counts)
     return {}
コード例 #6
0
    def _transform(self, dataset):
        """Append the output column holding hashed counts of a string array.

        The input column must be an array of strings. Every element is hashed
        with ``_mh3`` and adds ``1.0`` to its bucket; a falsy array (``None``
        or empty) produces an empty map.

        Args:
            dataset: Object exposing ``schema``, column indexing and
                ``withColumn`` (presumably a Spark DataFrame -- confirm
                against the caller).

        Returns:
            The result of ``dataset.withColumn`` for the output column.
        """
        column = self.getInputCol()
        columnType = dataset.schema[column].dataType
        assert isinstance(columnType, T.ArrayType)
        assert isinstance(columnType.elementType, T.StringType)
        # The column name is mixed into the seed used for every element hash.
        columnSeed = _mh3(column, seed=self.getSeed())

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hash_(tokens):
            if tokens:
                counts = defaultdict(float)
                for token in tokens:
                    counts[_mh3(token, seed=columnSeed)] += 1.
                return dict(counts)
            return {}

        outputCol = self.getOutputCol()
        hashed = hash_(dataset[column])
        return dataset.withColumn(outputCol, hashed)
コード例 #7
0
 def hashString(v):
     """Hash a single value into a one-entry sparse map.

     Args:
         v: Value passed to ``_mh3`` (seeded by ``seed`` from the enclosing
             scope), or a falsy value such as ``None`` or ``""``.

     Returns:
         ``{bucket: 1.0}`` where ``bucket`` is the ``_mh3`` hash of ``v``, or
         ``{}`` when ``v`` is falsy.
     """
     return {_mh3(v, seed=seed): 1.} if v else {}