Example #1
 from py4j.java_gateway import JavaObject
 from pyspark.ml.param import TypeConverters

 def convert(value):
     if value is None:
         raise TypeError("None is not allowed.")
     elif isinstance(value, JavaObject):
         # A Java map arrives as a py4j JavaObject; convert each entry value to float.
         return {k: TypeConverters.toFloat(value[k]) for k in value.keySet().toArray()}
     elif isinstance(value, dict):
         return {k: TypeConverters.toFloat(v) for k, v in value.items()}
     else:
         raise TypeError("Invalid type.")
Example #2
 # From a unittest.TestCase; assumes the test module imports:
 #   import array
 #   import numpy as np
 #   from pyspark.ml.linalg import DenseVector, SparseVector
 #   from pyspark.ml.param import TypeConverters
 # (xrange is Python 2; on Python 3 use range.)
 def test_list(self):
     l = [0, 1]
     for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
                      array.array('l', l), xrange(2), tuple(l)]:
         converted = TypeConverters.toList(lst_like)
         self.assertEqual(type(converted), list)
         self.assertListEqual(converted, l)
Example #3
 # Variant of Example #2 from another codebase; the only difference is that
 # the array module is imported as "import array as pyarray".
 def test_list(self):
     l = [0, 1]
     for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
                      pyarray.array('l', l), xrange(2), tuple(l)]:
         converted = TypeConverters.toList(lst_like)
         self.assertEqual(type(converted), list)
         self.assertListEqual(converted, l)
Example #4
 def convert(value):
     if value is None:
         return None
     else:
         # Normalize the container to a list, then convert each element
         # to a dense vector.
         return [
             H2OTypeConverters.toDenseVector()(v)
             for v in TypeConverters.toList(value)
         ]
Example #5
 def convert(value):
     if value is None:
         raise TypeError("None is not allowed.")
     else:
         # Normalize the container to a list, then run the pair-string
         # converter on each element.
         return [
             H2OTypeConverters.toPairString()(v)
             for v in TypeConverters.toList(value)
         ]
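Examples #4 and #5 share a composition pattern: TypeConverters.toList normalizes the container, then an element-level converter handles each item. A minimal sketch of the same pattern without the H2O dependency (to_list_of is a hypothetical helper, not a PySpark API):

 from pyspark.ml.param import TypeConverters

 def to_list_of(element_converter):
     # Hypothetical factory: map element_converter over any list-like
     # value, passing None through unchanged.
     def convert(value):
         if value is None:
             return None
         return [element_converter(v) for v in TypeConverters.toList(value)]
     return convert

 to_float_list = to_list_of(TypeConverters.toFloat)
 print(to_float_list((1, 2, 3)))  # [1.0, 2.0, 3.0]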
Example #6
 # Assumes: import tensorflow as tf
 #          from pyspark.ml.param import TypeConverters
 def toStringOrTFTensor(value):
     if isinstance(value, tf.Tensor):
         return value
     else:
         try:
             return TypeConverters.toString(value)
         except TypeError:
             raise TypeError("Could not convert %s to tensorflow.Tensor or str" % type(value))
Example #7
        def convert(value):
            if value is None:
                raise TypeError("None is not allowed.")
            else:
                valueForConversion = value
                # A Java array or collection arrives as a py4j JavaObject;
                # materialize it into a Python list before conversion.
                if isinstance(value, JavaObject):
                    valueForConversion = list(value)

                return TypeConverters.toListString(valueForConversion)
Example #8
 def convert(value):
     if value is None:
         raise TypeError("None is not allowed.")
     else:
         return TypeConverters.toBoolean(value)
Example #9
 def convert(value):
     if value is None:
         return None
     else:
         return TypeConverters.toBoolean(value)
Example #10
 def convert(value):
     if value is None:
         return None
     else:
         return TypeConverters.toFloat(value)
Example #11
 def convert(value):
     if value is None:
         return None
     else:
         return TypeConverters.toString(value)
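Examples #8 through #11 differ only in the wrapped converter and in whether None is rejected or passed through. Both behaviors fall out of one factory; a minimal sketch (nullable and required are hypothetical names):

 from pyspark.ml.param import TypeConverters

 def nullable(base_converter):
     # Hypothetical factory: pass None through, otherwise delegate.
     def convert(value):
         return None if value is None else base_converter(value)
     return convert

 def required(base_converter):
     # Hypothetical factory: reject None, otherwise delegate.
     def convert(value):
         if value is None:
             raise TypeError("None is not allowed.")
         return base_converter(value)
     return convert

 print(nullable(TypeConverters.toFloat)(2))      # 2.0
 print(nullable(TypeConverters.toString)(None))  # None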
Example #12
 def convert(value):
     # enumClass comes from the enclosing scope; the value is validated on
     # the JVM side by the Scala object H2OAlgoParamsHelper.
     package = getattr(_jvm().ai.h2o.sparkling.ml.params, "H2OAlgoParamsHelper$")
     return package.__getattr__("MODULE$").getValidatedEnumValue(
         enumClass, TypeConverters.toString(value))
Example #13
def train():
    sparkUrl = 'spark://ubuntu02:7077'

    file_path = 'hdfs://ubuntu02:9000/vectors/sentences_vector.csv'
    hdfs_url = 'http://ubuntu02:50070'
    user = '******'

    # Bucketing parameter (smaller r means more buckets).
    r = 0.002

    # Load the full set of sentence vectors.
    sc = get_conf(sparkUrl, 'LSH_train', "8g")
    df = load_sentence_data_frame(sc, file_path)

    # Randomly sample one projection vector v.
    v = df.sample(False, 0.1, seed=0).rdd.first()['_vector']

    # Compute a hash code for each vector u: floor(dot(u, v) / r).
    tmp = df.rdd.flatMap(
        lambda x: {
            Row(x['id'], x['sentence'], x['vector'],
                TypeConverters.toInt(np.floor(x['_vector'].dot(v) / r)))
        })

    # Rename the columns.
    df = SQLContext(sc).createDataFrame(tmp) \
        .selectExpr("_1 as id",
                    "_2 as sentence",
                    "_3 as vector",
                    "_4 as hash_code")

    # Cache the DataFrame to speed up the repeated filters below.
    df.persist()

    # Show how many sentences fall into each bucket.
    summary = df.groupby("hash_code").count()
    summary.persist()

    # Collect the names of all buckets.
    names = summary.rdd.map(lambda x: x.hash_code).collect()

    # Iterate over the buckets.
    for name in names:
        print('save to ' + str(name))

        tmp = df.filter(df['hash_code'] == name)

        # Drop the hash_code column to save space.
        tmp = tmp.drop('hash_code')

        # Write the bucket out as CSV (this step is extremely slow).
        tmp.toPandas().to_csv('/home/hadoop/new/' + str(name) + '.csv',
                              sep=',',
                              index=False,
                              encoding='utf-8')

    with open('/home/hadoop/new/meta.txt', 'w') as f:
        f.write('vector(v):\n')
        for e in v:
            f.write(str(e) + ',')

        f.write('\nnames:\n')
        for name in names:
            f.write(str(name) + ',')

    print('all done!!')
    return
Example #14
# Assumes: import tensorflow as tf
#          from pyspark.ml.param import TypeConverters
def _tensor_name(tensor):
    # Accept a tf.Tensor (recurse on its name) or anything str-convertible.
    if isinstance(tensor, tf.Tensor):
        return _tensor_name(tensor.name)
    return TypeConverters.toString(tensor)
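All of these converters are designed to be passed as the typeConverter argument of pyspark.ml.param.Param, which applies them whenever the parameter is set. A minimal sketch (threshold is a hypothetical parameter, used only for illustration):

 from pyspark.ml.param import Param, Params, TypeConverters

 class HasThreshold(Params):
     # Hypothetical parameter; the converter runs on every set.
     threshold = Param(Params._dummy(), "threshold",
                       "decision threshold",
                       typeConverter=TypeConverters.toFloat)

Setting the parameter then coerces the value: _set(threshold=1) stores 1.0, while a non-numeric value raises TypeError at set time.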