Example #1
def process(sparkContext, sqlContext):

    print("Building Graph...")
    G_apa = buildGraphAPA()
    
    print("Meta Path...")
    paths = metaPathAPA(G_apa)
     
    print("Training...")
    authorIndex = []
    authorDegree = []
   
    authors = paths[0]
    pathNumber = paths[2]
    pathNumberArray = []
    for pn in pathNumber.keys():
        pathNumberArray.append(str(pn) + ":" + str(pathNumber.get(pn)))
    
    # Encode each author's degree and index as "author:value" strings so they
    # can be shipped through RDDs and rebuilt as lookup dicts below.
    index = 0
    for author in authors:
        authorDegree.append(str(author) + ":" + str(len(G_apa[author])))
        authorIndex.append(str(author) + ":" + str(index))
        index = index + 1

    
#     unique_authors = authors 
     
    authorsRDD = sparkContext.parallelize(authors)
    authorIndex = sparkContext.parallelize(authorIndex)
    pathNumber = sparkContext.parallelize(pathNumberArray)
    authorDegree = sparkContext.parallelize(authorDegree)
     
    
    
    authors = authorsRDD.collect()

    ai = authorIndex.collect()
    authorIndex = dict()
    for a in ai:
        p = a.split(":")
        authorIndex[p[0]]=p[1]
#     print authorIndex
    
    ad = authorDegree.collect()
    authorDegree = dict()
    for a in ad:
        p = a.split(":")
        authorDegree[p[0]]=p[1]
#     print authorDegree
    
    pn = pathNumber.collect()
    pathNumber = dict()
    for a in pn:
        p = a.split(":")
        pathNumber[p[0]]=p[1]
#     print pathNumber
    
    
    def matEntry(author, authors):
        """Build one row of the similarity matrix for a single author as
        (row_index, col_index, value) tuples."""
        row = []
        for a in authors:
            if author == a:
                row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 1.0))
            else:
                key = str(author) + str(a)
                if key in pathNumber:
                    # Normalize the meta-path count by the two authors' degrees.
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])),
                                2.0 * float(pathNumber.get(key)) / (float(authorDegree[author]) + float(authorDegree[a]))))
                else:
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 0.0))
        return row

    def matEntryNoArgs():
        """Driver-side variant: build the entries for every author pair at once."""
        row = []
        for author in authors:
            row.extend(matEntry(author, authors))
        return row

    print("Computing matrix entries...")
    me = authorsRDD.flatMap(lambda author: matEntry(author, authors)).collect()
#     me = matEntryNoArgs()
    print("Building coordinate matrix...")

    entries = sparkContext.parallelize(me)
    # Create a CoordinateMatrix from an RDD of (row, col, value) tuples.
    mat = CoordinateMatrix(entries)

    print(mat)
#     mat.entries.saveAsTextFile("/home/xuepeng/uts/metapath.txt")

    # Get its size.
    print(mat.numRows())
    print(mat.numCols())
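
For intuition, each off-diagonal entry is the meta-path count normalized by the two authors' degrees. A tiny worked sketch with made-up numbers (these dicts are illustrative, not output of buildGraphAPA/metaPathAPA):

# Illustrative values only: authors "a" and "b" share 3 A-P-A meta-paths
# and have co-authorship degrees 4 and 2.
pathNumber = {"ab": 3}
authorDegree = {"a": 4, "b": 2}
sim = 2.0 * pathNumber["ab"] / (authorDegree["a"] + authorDegree["b"])
print(sim)  # 2 * 3 / (4 + 2) = 1.0
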
Example #2
def process(sparkContext, sqlContext):

    print("Building Graph...")
    G_apa = buildGraphAPA()

    print("Meta Path...")
    paths = metaPathAPA(G_apa)

    print("Training...")
    authorIndex = []
    authorDegree = []

    authors = paths[0]
    pathNumber = paths[2]
    pathNumberArray = []
    for pn in pathNumber.keys():
        pathNumberArray.append(str(pn) + ":" + str(pathNumber.get(pn)))

    index = 0
    for author in authors:
        authorDegree.append(str(author) + ":" + str(len(G_apa[author])))
        authorIndex.append(str(author) + ":" + str(index))
        index = index + 1

#     unique_authors = authors

    authorsRDD = sparkContext.parallelize(authors)
    authorIndex = sparkContext.parallelize(authorIndex)
    pathNumber = sparkContext.parallelize(pathNumberArray)
    authorDegree = sparkContext.parallelize(authorDegree)

    authors = authorsRDD.collect()

    ai = authorIndex.collect()
    authorIndex = dict()
    for a in ai:
        p = a.split(":")
        authorIndex[p[0]] = p[1]
#     print authorIndex

    ad = authorDegree.collect()
    authorDegree = dict()
    for a in ad:
        p = a.split(":")
        authorDegree[p[0]] = p[1]
#     print authorDegree

    pn = pathNumber.collect()
    pathNumber = dict()
    for a in pn:
        p = a.split(":")
        pathNumber[p[0]] = p[1]
#     print pathNumber

    def matEntry(author, authors):
        """Build one row of the similarity matrix for a single author as
        (row_index, col_index, value) tuples."""
        row = []
        for a in authors:
            if author == a:
                row.append((int(float(authorIndex[author])),
                            int(float(authorIndex[a])), 1.0))
            else:
                key = str(author) + str(a)
                if key in pathNumber:
                    # Normalize the meta-path count by the two authors' degrees.
                    row.append((int(float(authorIndex[author])),
                                int(float(authorIndex[a])),
                                2.0 * float(pathNumber.get(key)) /
                                (float(authorDegree[author]) +
                                 float(authorDegree[a]))))
                else:
                    row.append((int(float(authorIndex[author])),
                                int(float(authorIndex[a])), 0.0))

        return row

    def matEntryNoArgs():
        """Driver-side variant: build the entries for every author pair at once."""
        row = []
        for author in authors:
            row.extend(matEntry(author, authors))
        return row

    print("Computing matrix entries...")
    me = authorsRDD.flatMap(lambda author: matEntry(author, authors)).collect()
    # me = matEntryNoArgs()
    print("Building coordinate matrix...")

    entries = sparkContext.parallelize(me)
    # Create a CoordinateMatrix from an RDD of (row, col, value) tuples.
    mat = CoordinateMatrix(entries)

    print(mat)
    # mat.entries.saveAsTextFile("/home/xuepeng/uts/metapath.txt")

    # Get its size.
    print(mat.numRows())
    print(mat.numCols())
Example #3
def multiply_coordinate_matrices(left: CoordinateMatrix,
                                 right: CoordinateMatrix):
    """Multiply 2 spark Coordindate Matrices
    without converting either of them into a DenseMatrix.

    NOTE: spark does not provide distributed matrix multiplication of sparse matrices
    for this reason a custom approach has to be used which is discussed here
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703
    """
    def key_by_col(x):
        """Take a MatrixEntry of (row, col, val) and
        return a 2-tuple of (col, (row, val))"""
        return (x.j, (x.i, x.value))

    def key_by_row(x):
        """Take a MatrixEntry of (row, col, val) and
        return a 2-tuple of (row, (col, val))"""
        return (x.i, (x.j, x.value))

    left_by_col = left.entries.map(key_by_col)
    right_by_row = right.entries.map(key_by_row)

    # Next we perform a row by col matrix multiplication
    # where a shared "key" is used to group entries of the left matrix
    # with COLUMN j and entries of the right matrix with ROW j.
    # Note that entries with the same j will stick together.
    # This should be obvious if you recall that matrix multiplication
    # matches the index of the left column with the index of the right row.
    col_by_row = left_by_col.join(right_by_row)

    def row_by_col_multiplication(x):
        """The input is a key-pair tuple in the following format:
        (key, ((left_row, left_val), (right_col, right_val)))

        the output is a pair of tuples in the following format:
        ((left_row, right_col), (left_val, right_val))

        Note that having finished the grouping we no longer need the shared key anymore,
        (i.e. we no longer need the original indices of the left_col or right_row).
        This is because summed values will go into the output matrix at the
        location (left_row, right_col) and thus we can  regroup by these indices and sum
        """
        return ((x[1][0][0], x[1][1][0]), (x[1][0][1] * x[1][1][1]))

    # multiply each left-matrix value by the matching right-matrix value
    products = col_by_row.map(row_by_col_multiplication)

    # Sum up all the products for a given (left_row, right_col) pair
    summed = products.reduceByKey(lambda accum, n: accum + n)

    # unnest the keys so we can convert back to a coordinate matrix
    flattened = summed.map(lambda x: (x[0][0], x[0][1], x[1]))

    res = CoordinateMatrix(flattened)

    log.info(
        "finished creating coord matrix from dot product",
        rows=res.numRows(),
        cols=res.numCols(),
    )
    return res
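
A minimal way to exercise this function is to multiply two small 2x2 matrices and check the result by hand; this is a sketch that assumes a live SparkContext named `sc` and the module-level `log` object used above are in scope:

# Hypothetical smoke test for multiply_coordinate_matrices.
from pyspark.mllib.linalg.distributed import CoordinateMatrix

# A = [[1, 2],        B = [[5, 6],
#      [3, 4]]             [7, 8]]
A = CoordinateMatrix(sc.parallelize([(0, 0, 1.0), (0, 1, 2.0),
                                     (1, 0, 3.0), (1, 1, 4.0)]))
B = CoordinateMatrix(sc.parallelize([(0, 0, 5.0), (0, 1, 6.0),
                                     (1, 0, 7.0), (1, 1, 8.0)]))

AB = multiply_coordinate_matrices(A, B)
# Expected entries: (0, 0, 19), (0, 1, 22), (1, 0, 43), (1, 1, 50)
print(sorted(AB.entries.map(lambda e: (e.i, e.j, e.value)).collect()))
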
Example #4
import pyspark
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from operator import add
from pyspark.sql import SparkSession

sc = SparkContext()
r=sc.textFile("part-00000")
m=r.flatMap(lambda x: x.split('\n')).filter(lambda x : "A" in x).map(lambda x : (x.strip("A, ")).split(' ')).map(lambda x: tuple(list(map(int, x))))
#n=m.map(lambda x : MatrixEntry(tuple(x)))

spark = SparkSession(sc)
#m.toDF().show()
print(hasattr(m,"toDF"))

cmat = CoordinateMatrix(m)
# mat = CoordinateMatrix(n)
# o = mat.entries.take(5)
print(cmat.numRows())
print(cmat.numCols())

rowmat = cmat.toRowMatrix()

print(rowmat.numRows())
print(rowmat.numCols())
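
The commented-out `n = m.map(...)` line above hints at the MatrixEntry-based construction; a small sketch of that variant, reusing the `m` RDD already built in this example:

# MatrixEntry takes (row, col, value) as separate arguments, so the tuples
# from `m` are unpacked rather than passed as a single tuple.
n = m.map(lambda x: MatrixEntry(x[0], x[1], x[2]))
mat_from_entries = CoordinateMatrix(n)
print(mat_from_entries.numRows(), mat_from_entries.numCols())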

Example #5
def sparse_dot_product_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_vector_col: str,
    primary_df: DataFrame,
    secondary_row_number_col: str,
    secondary_vector_col: str,
    secondary_df: DataFrame,
):
    """Calculate the dot product for every pair of items between
    a column of SparseVectors in the primary dataframe and a
    column of SparseVectors in the secondary dataframe.

    The input dataframes must have a row number attached. This will
    correspond to the row number in the resulting row matrix.
    It does not matter if the row numbers are sequential as long
    as they are unique within their dataframes respectively.

    NOTE: if you are using this function in order to generate cosine similarity
    scores then remember to normalize your input vectors first. This way the
    resulting coordinate matrix will represent the similarity scores."""
    def primary_row_to_coords(row):
        """Convert a sparse vector to a list of coords
        in the format of (row_num, col_num, value)"""
        row_num = row[primary_row_number_col]
        vec = row[primary_vector_col]
        return [(row_num, i, v) for i, v in zip(vec.indices, vec.values)]

    primary_rdd = primary_df.select(F.col(primary_row_number_col),
                                    F.col(primary_vector_col)).rdd.flatMap(
                                        lambda row: primary_row_to_coords(row))

    if primary_rdd.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")

    primary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    def secondary_row_to_coords(row):
        """Convert a sparse vector to a list of coords
        in the format of (row_num, col_num, value)"""
        row_num = row[secondary_row_number_col]
        vec = row[secondary_vector_col]
        # IMPORTANT - note that we are actually creating
        # the transpose of the secondary matrix, hence
        # the coordinates are swapped
        return [(i, row_num, v) for i, v in zip(vec.indices, vec.values)]

    secondary_rdd = secondary_df.select(
        F.col(secondary_row_number_col),
        F.col(secondary_vector_col)).rdd.flatMap(
            lambda row: secondary_row_to_coords(row))

    secondary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    if secondary_rdd.isEmpty():
        raise ValueError(
            "Secondary RDD is empty. Cannot perform matrix multiplication")

    # create the primary coordinate matrix from the coords
    primary_matrix = CoordinateMatrix(primary_rdd)

    log.info(
        "finished creating primary coordinate matrix",
        rows=primary_matrix.numRows(),
        cols=primary_matrix.numCols(),
    )

    # create the secondary coordinate matrix from the coords
    secondary_matrix = CoordinateMatrix(secondary_rdd)

    log.info(
        "finished creating secondary coordinate matrix transpose",
        rows=secondary_matrix.numRows(),
        cols=secondary_matrix.numCols(),
    )
    coords_matrix = multiply_coordinate_matrices(primary_matrix,
                                                 secondary_matrix)

    res = coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )

    primary_rdd.unpersist()
    secondary_rdd.unpersist()

    return res
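
Following the docstring's note about cosine similarity, a minimal usage sketch might look like the following; the input dataframes (`primary_raw`, `secondary_raw` with a SparseVector column named "features"), the column names, and the `spark` handle are assumptions for illustration, not part of the original module:

from pyspark.ml.feature import Normalizer
from pyspark.sql import functions as F

# L2-normalize the vectors so the resulting dot products are cosine similarities.
normalizer = Normalizer(inputCol="features", outputCol="norm_features", p=2.0)

# Row numbers only need to be unique within each dataframe.
primary = normalizer.transform(primary_raw).withColumn(
    "row_num", F.monotonically_increasing_id())
secondary = normalizer.transform(secondary_raw).withColumn(
    "row_num", F.monotonically_increasing_id())

similarities = sparse_dot_product_cross_join(
    spark,
    output_col="cosine_sim",
    primary_row_number_col="row_num",
    primary_vector_col="norm_features",
    primary_df=primary,
    secondary_row_number_col="row_num",
    secondary_vector_col="norm_features",
    secondary_df=secondary,
)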