Example 1
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix

# HITS (hubs and authorities). edges / edges_transpose are RDDs of
# (row, col, value) triples and sc is an existing SparkContext.
L = CoordinateMatrix(edges).toBlockMatrix()
L_transpose = CoordinateMatrix(edges_transpose).toBlockMatrix()

# Start all 1000 hub scores at 1 (a 1000 x 1 column vector).
h_init = []

for i in range(1000):
  h_init.append((i, 0, 1))

h = CoordinateMatrix(sc.parallelize(h_init)).toBlockMatrix()

a = None  # authority scores, filled in by the loop below

# 40 HITS iterations: a = L^T h, then h = L a, each rescaled so that
# its largest entry is 1 (via a diagonal scaling matrix).
for i in range(40):

  # Authority update, then rescale by the maximum entry.
  a_new = L_transpose.multiply(h)
  a_new_max = np.max(np.array(a_new.toLocalMatrix().toArray()))
  a_new_max_inverse = []
  for j in range(1000):
    a_new_max_inverse.append((j, j, 1 / a_new_max))
  a_new_max_inverse = CoordinateMatrix(sc.parallelize(a_new_max_inverse)).toBlockMatrix()
  a = a_new_max_inverse.multiply(a_new)

  # Hub update, rescaled the same way.
  h_new = L.multiply(a)
  h_new_max = np.max(np.array(h_new.toLocalMatrix().toArray()))
  h_new_max_inverse = []
  for j in range(1000):
    h_new_max_inverse.append((j, j, 1 / h_new_max))
  h_new_max_inverse = CoordinateMatrix(sc.parallelize(h_new_max_inverse)).toBlockMatrix()
  h = h_new_max_inverse.multiply(h_new)
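
As a minimal sketch (assuming the loop above has run to completion), the converged score vectors can be pulled to the driver, which is cheap at 1000 entries, and the strongest hubs and authorities listed:

# Hypothetical follow-up: collect the converged vectors and list the
# five highest-scoring hub and authority nodes.
h_local = h.toLocalMatrix().toArray().flatten()
a_local = a.toLocalMatrix().toArray().flatten()
print('top hubs:', sorted(range(1000), key=lambda n: -h_local[n])[:5])
print('top authorities:', sorted(range(1000), key=lambda n: -a_local[n])[:5])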
Example 2
import logging
import sys
import time
from contextlib import contextmanager

import pyspark
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry


@contextmanager
def time_usage(name):
    # Log how long the wrapped block takes.
    start = time.time()
    yield
    end = time.time()
    elapsed_seconds = float("%.4f" % (end - start))
    logging.info('%s: elapsed seconds: %s', name, elapsed_seconds)


logging.getLogger().setLevel(logging.INFO)

def to_matrix_entry(x):
    # Each input line is "i,j,value".
    i, j, v = x.split(',')
    return MatrixEntry(int(i), int(j), float(v))

sc = pyspark.SparkContext(appName="Matrix Multiplication")
spark = SparkSession(sc)

# Time the full multiplication nine times.
for i in range(1, 10):
    with time_usage("matrix multiplication time"):
        matrix_a_raw = sc.textFile(sys.argv[1])
        matrix_b_raw = sc.textFile(sys.argv[2])

        entries_a = matrix_a_raw.map(to_matrix_entry)
        entries_b = matrix_b_raw.map(to_matrix_entry)

        mat_a = CoordinateMatrix(entries_a).toBlockMatrix()
        mat_b = CoordinateMatrix(entries_b).toBlockMatrix()

        product = mat_a.multiply(mat_b)
        # Collect to the driver so the multiplication is actually executed.
        product.toLocalMatrix()

# To print the entries instead:
# for t in product.toCoordinateMatrix().entries.collect():
#     print('%s, %s, %s' % (t.i, t.j, t.value))
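
For reference, a hypothetical invocation of this script (the file names are illustrative), with each input file holding one i,j,value triple per line:

# spark-submit matmul.py matrix_a.csv matrix_b.csv
#
# matrix_a.csv:
#   0,0,1.5
#   0,1,2.0
#   1,0,3.0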
Example 3
import pyspark
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix

# create the context (conf is assumed to be defined earlier in the script)
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')
BETA = 0.8

# Nodes are 1-indexed in the file; shift to 0-based and drop duplicate edges.
source_dest_pair = full_data.map(lambda x: (int(x.split('\t')[0]) - 1, int(x.split('\t')[1]) - 1)).distinct()
# Adjacency entries (dest, source, 1) and the diagonal D^-1 of inverse out-degrees.
edges = source_dest_pair.map(lambda x: (x[1], x[0], 1))
degrees = source_dest_pair.map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x + y).map(lambda x: (x[0], x[0], 1 / x[1]))

edge_matrix = CoordinateMatrix(edges).toBlockMatrix()
degree_inverse_matrix = CoordinateMatrix(degrees).toBlockMatrix()

# Column-stochastic transition matrix: M[d, s] = 1 / out-degree(s).
M = edge_matrix.multiply(degree_inverse_matrix)

# r: uniform initial rank vector; beta: diagonal damping matrix;
# teleport: the (1 - BETA) / n teleport vector.
r_init = []
beta_init = []
teleport_init = []
for i in range(1000):
  r_init.append((i, 0, 1 / 1000))
  beta_init.append((i, i, BETA))
  teleport_init.append((i, 0, (1 - BETA) / 1000))

r = CoordinateMatrix(sc.parallelize(r_init)).toBlockMatrix()
beta = CoordinateMatrix(sc.parallelize(beta_init)).toBlockMatrix()
teleport = CoordinateMatrix(sc.parallelize(teleport_init)).toBlockMatrix()

# 40 power iterations: r = (1 - BETA) / n + BETA * M * r
for i in range(40):
  r = teleport.add(beta.multiply(M).multiply(r))
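
A minimal sketch, assuming the 40 iterations above have converged, of reading off the best-ranked nodes (IDs were shifted to 0-based earlier, so shift back when reporting):

# Hypothetical follow-up: collect the rank vector and print the top five
# nodes with their PageRank scores, restoring the original 1-based IDs.
ranks = r.toLocalMatrix().toArray().flatten()
top5 = sorted(range(1000), key=lambda n: -ranks[n])[:5]
print([(n + 1, ranks[n]) for n in top5])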
Example 4
from math import sqrt

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Symmetric affinity matrix W, assembled from its upper and lower triangles.
W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# Which Laplacian to build is chosen on the command line.
laplacian = sys.argv[1]

if laplacian == 'unnormalized':
    # L = D - W
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    # L = I - D^-1 W  (random-walk normalisation)
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N,
                         numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()
elif laplacian == 'symmetric':
    # L = I - D^-1/2 W D^-1/2
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / sqrt(x[1])))
    D_invsq = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N,
                         numRows=N).toBlockMatrix()
    tmp = D_invsq.multiply(W.toBlockMatrix()).multiply(D_invsq)
    L = I.subtract(tmp).toCoordinateMatrix()
else:
    raise ValueError('Unknown type of Laplacian.')

## SVD of the Laplacian; the top-K right singular vectors form the
## spectral embedding, which is then turned into a DataFrame.
svd = L.toRowMatrix().computeSVD(k=K, computeU=False)
V = svd.V.toArray().tolist()
VV = spark.createDataFrame(V)
kmeans = KMeans().setK(K).setSeed(1)
vecAssembler = VectorAssembler(inputCols=VV.schema.names, outputCol='features')
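
A sketch of how the pipeline might be completed from here; it assumes the spectral embedding in VV should simply be clustered into the same K groups (VectorAssembler's outputCol matches KMeans' default featuresCol):

# Hypothetical continuation: assemble the embedding columns into a single
# feature vector per node and cluster the nodes with k-means.
features = vecAssembler.transform(VV)
model = kmeans.fit(features)
model.transform(features).select('prediction').show()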