Example #1
import sys
import heapq

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.linalg import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix


def main():
    datasetfile = sys.argv[1]
    beta = 0.8
    iterations = 40
    top_k = 5

    sparkcontext = SparkContext("local", "Page Rank")
    data = sparkcontext.textFile(datasetfile)
    # Map each input line with helper functions defined elsewhere in the original script
    source_dest = data.map(make_key_value_pair_1)
    source_dest_count = data.map(make_key_value_pair_2)
    groupbykey = source_dest.groupByKey()
    number_of_nodes = groupbykey.count()
    out_degree = groupbykey.map(calc_out_degree)
    pair_map = groupbykey.collectAsMap()

    # Build the dense transition matrix M: column j holds the out-link
    # probabilities of node j (node ids are 1-indexed, matrix entries 0-indexed)
    matrix_m = np.zeros(shape=(number_of_nodes, number_of_nodes))
    for key, value in pair_map.items():
        for ind_value in value:
            matrix_m[ind_value - 1][key - 1] += 1 / len(list(value))

    # Distribute M as a RowMatrix so it can be multiplied on the cluster
    matrix_m = sparkcontext.parallelize(matrix_m)
    matrix_m = RowMatrix(matrix_m)

    # Initial rank vector r: every node starts with probability 1 / N
    vector_r_prev = np.empty([number_of_nodes, 1])
    vector_r_prev.fill(1 / number_of_nodes)
    vector_r_prev = DenseMatrix(number_of_nodes, 1, vector_r_prev)

    # Power iteration: r = beta * M * r + (1 - beta) / N
    index = 0
    while index < iterations:
        mul_val = matrix_m.multiply(vector_r_prev).rows.collect()
        mul_val = [i * beta for i in mul_val]
        mul_val = [i + (1 - beta) / number_of_nodes for i in mul_val]
        vector_r_prev = DenseMatrix(number_of_nodes, 1, mul_val)
        index += 1

    # Flatten the (N, 1) result so the top-k selections below work on scalars
    vector_r_prev = vector_r_prev.toArray().flatten()
    largest_values = heapq.nlargest(top_k, vector_r_prev)
    largest_indexes = heapq.nlargest(top_k, range(number_of_nodes),
                                     vector_r_prev.__getitem__)
    smallest_values = heapq.nsmallest(top_k, vector_r_prev)
    smallest_indexes = heapq.nsmallest(top_k, range(number_of_nodes),
                                       vector_r_prev.__getitem__)

    largest_indexes = [val + 1 for val in largest_indexes]
    smallest_indexes = [val + 1 for val in smallest_indexes]

    print("Value of largest n nodes\n", largest_values)
    print("Node numbers of largest n nodes\n", largest_indexes)
    print("Value of smallest n nodes\n", smallest_values)
    print("Node numbers of smallest n nodes\n", smallest_indexes)
    sparkcontext.stop()
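
The loop above implements the damped PageRank update r = beta * M * r + (1 - beta) / N. A minimal local sketch of the same update in plain NumPy, using a tiny made-up 3-node graph (the example graph and variable names are illustrative assumptions, not part of the original script):

import numpy as np

beta, iterations = 0.8, 40
# Column j holds node j's out-link probabilities (columns sum to 1)
M = np.array([[0.0, 0.5, 1.0],
              [0.5, 0.0, 0.0],
              [0.5, 0.5, 0.0]])
N = M.shape[0]
r = np.full(N, 1.0 / N)
for _ in range(iterations):
    r = beta * M.dot(r) + (1.0 - beta) / N
print(r)  # approximate PageRank score for each node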
Example #2
    def test_dense_matrix_is_transposed(self):
        # Row-major values with isTransposed=True describe the same matrix
        # as the column-major values in the default layout below
        mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
        mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
        self.assertEqual(mat1, mat)

        expected = [[0, 4], [1, 6], [3, 9]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat1[i, j], expected[i][j])
        self.assertTrue(array_equal(mat1.toArray(), expected))

        sm = mat1.toSparse()
        self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
        self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
        self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))
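
DenseMatrix stores its values in column-major order by default; isTransposed=True interprets the same flat list in row-major order, which is what the test above relies on. A small standalone illustration of the difference (the 2x2 values are arbitrary):

from pyspark.mllib.linalg import DenseMatrix

# Default layout: values fill column 0 first, then column 1
col_major = DenseMatrix(2, 2, [1, 2, 3, 4])
print(col_major.toArray())   # [[1. 3.]
                             #  [2. 4.]]

# isTransposed=True: the same values fill row 0 first, then row 1
row_major = DenseMatrix(2, 2, [1, 2, 3, 4], isTransposed=True)
print(row_major.toArray())   # [[1. 2.]
                             #  [3. 4.]]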
# Embeddings

import pandas as pd
from pyspark.mllib.linalg import DenseMatrix
from sklearn.manifold import TSNE

# Item (business) factor vectors from the previously trained recommendation model
a = model.itemFactors
b = a.sort("id")
b.show()

# Create a dense matrix from the business embeddings: features are sorted by
# business id and flattened into one long column-major value list
values = (b.rdd.map(lambda x: (x.id, x.features)).sortByKey().flatMap(
    lambda kv: kv[1]).collect())

# nrow = embedding dimension, ncol = number of businesses, so each column of
# the DenseMatrix below is one business's embedding
nrow = len(b.rdd.map(lambda x: x.features).first())
ncol = b.count()

dm = DenseMatrix(nrow, ncol, values)
print(dm.toArray().shape)  # (embedding dimension, number of businesses)
# Transpose so each row is one business's embedding, as expected by t-SNE
z = dm.toArray().transpose()

# t-SNE

# Reduce the embeddings to 2-D for visualization
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(z)

# Combine the t-SNE coordinates with the corresponding business ids
e = sqlContext.createDataFrame(pd.DataFrame(X_tsne))
e_df = e.toPandas()
j = b.select("id")
j_df = j.toPandas()
# Columns 0 and 1 are the t-SNE dimensions, column 2 is the business id;
# pd.concat already returns a DataFrame
result = pd.concat([e_df, j_df], axis=1, ignore_index=True)
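
A natural follow-up is to plot the 2-D t-SNE coordinates. A minimal sketch with matplotlib (the plotting code is an illustrative addition, not part of the original snippet; it assumes the column layout produced by the concat above):

import matplotlib.pyplot as plt

# Columns 0 and 1 hold the two t-SNE dimensions, column 2 the business id
plt.figure(figsize=(8, 8))
plt.scatter(result[0], result[1], s=5, alpha=0.5)
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.title("t-SNE projection of business embeddings")
plt.show()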