Example No. 1
    def __init__(self, args, sc):

        self.EPSILON = 1.0e-5

        self.ctx = sc

        self.numPartitions = args.partitions

        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Read Matrix input data
        # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)

        if (self.numPartitions != 0):
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))\
                .repartition(self.numPartitions)
        else:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))

        self.inputMatrix = IndexedRowMatrix(inputMatrixData)

        self.inputVector = readVector(self.inputVectorPath, self.ctx)

        if (self.numIterations == 0):
            self.numIterations = self.inputVector.size * 2

        self.result = Vectors.zeros(self.inputVector.size)
Example No. 2
    def test_row_matrix_invalid_type(self):
        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        invalid_type = ""
        matrix = RowMatrix(rows)
        self.assertRaises(TypeError, matrix.multiply, invalid_type)

        irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]), IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        self.assertRaises(TypeError, imatrix.multiply, invalid_type)
Example No. 3
def multiply_matrices2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()

    listB = B.tolist()
    rddB = sc.parallelize([IndexedRow(i, listB[i]) for i in range(len(listB))])
    matB = IndexedRowMatrix(rddB).toBlockMatrix()

    matC = matA.multiply(matB).toLocalMatrix()
    return matC.toArray()
def readMovieChar(spark, f_name):
    my_data = list()
    with open(f_name, 'r') as handle:
        reader = csv.reader(handle, delimiter=",", quotechar='"')
        for row in reader:
            my_data.append(row)
    my_data.pop(0)

    matrix = np.zeros(shape=(int(my_data[-1][0]) + 1, len(movie_genre)),
                      dtype=int)
    movie_list = dict()

    for movie in my_data:
        movie_id = int(movie[0])
        movie_list[movie_id] = movie[1]

        genres = movie[2].split('|')
        for each in genres:
            col_idx = movie_genre.get(each, movie_genre['Other'])
            matrix[movie_id][col_idx] = 1

    indexedRows = spark.sparkContext.parallelize(
        [IndexedRow(i, matrix[i]) for i in range(len(matrix))])
    mat = IndexedRowMatrix(indexedRows)
    return mat, movie_list
Example No. 5
def MatrixTranspose(mat):
    # Known issues:
    # 1. Fails for some data for unclear reasons, but reducing the number of rows can help.
    # 2. The transpose sometimes returns a wrong result, which seems to be a partitioning issue; repartition(1) sometimes fixes it.
    #    PySpark also changes the order of rows when a transposed CoordinateMatrix is converted back to a RowMatrix.
    #    Bug reference: https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an IndexedRowMatrix and reordering partially works around this, but it is awkward.
    '''
    Transpose a row matrix; to save space/memory, sparse vectors are used when the input rows are sparse vectors.
    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This turns everything into dense matrix entries; avoid this function when efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # back to sparse first then convert to indexedrowmatrix
    transposed_mat = transposed_mat.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0], row["vector"].values[
                np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(transposed_mat)
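
# A minimal usage sketch for MatrixTranspose (hypothetical data; assumes an active
# SparkContext `sc` plus a SparkSession, since the function converts rows to a
# DataFrame internally). It builds a tiny IndexedRowMatrix, transposes it, and
# collects the rows to check shape and row order.
small = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1.0, 2.0]),
                                         IndexedRow(1, [3.0, 4.0]),
                                         IndexedRow(2, [5.0, 6.0])]))
small_t = MatrixTranspose(small)
print(small_t.numRows(), small_t.numCols())  # expected: 2 3
for r in sorted(small_t.rows.collect(), key=lambda r: r.index):
    print(r.index, r.vector)
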
def compute_similarity(df):
    """
    Compute cosine similarity between the movie columns of the user-movie rating matrix.
    :param df: DataFrame of ratings by user for movies
    :return: None; the similarities are written out as Parquet and CSV
    """

    # df = df.filter(df.movieId.isin([91542.0, 1.0, 5.0, 90.0, 2541.0, 1246.0, 1552.0, 4084.0, 5679.0]))

    df = df.groupBy("userId").pivot("movieId").agg(
        first(col('rating')).cast("double"))

    mat = IndexedRowMatrix(
        df.rdd.map(lambda row: IndexedRow(row[0], Vectors.dense(row[1:]))))

    cs = mat.columnSimilarities()

    path = "test"

    cs.entries.toDF().write.parquet(path)

    cs.entries.toDF().coalesce(1)\
       .write.format("com.databricks.spark.csv")\
       .option("header", "true")\
       .save("testtest.csv")
Example No. 7
def vectorDFtoIndexedMatrix(df, vecvar, idcol):
    '''
    Applicable to a DataFrame that already has an assembled (sparse) vector column.
    '''
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row[vecvar].size, row[vecvar].indices, row[vecvar].
                            values)))
    return IndexedRowMatrix(df)
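
# A hedged usage sketch for vectorDFtoIndexedMatrix (the `spark` session, column
# names and data are illustrative). The DataFrame must already carry an assembled
# *sparse* vector column, because the function reads `.indices` on each vector.
from pyspark.ml.linalg import Vectors as MLVectors
demo = spark.createDataFrame(
    [(0, MLVectors.sparse(3, [0, 2], [1.0, 2.0])),
     (1, MLVectors.sparse(3, [1], [4.0]))],
    ["id", "features"])
mat = vectorDFtoIndexedMatrix(demo, vecvar="features", idcol="id")
print(mat.numRows(), mat.numCols())  # expected: 2 3
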
Example No. 8
    def __init__(self, args, sc):
        self.ctx = sc

        self.numPartitions = args.partitions

        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        self.alpha = args.alpha
        self.beta = args.beta

        # Read Matrix input data
        # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)

        if (self.numPartitions != 0):
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))\
                .repartition(self.numPartitions)
        else:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))

        print "Number of rows in Matrix with type" + str(type(inputMatrixData)) + " is: " + str(inputMatrixData.count())

        # PipelinedRDD to RDD
        # newData = sc.parallelize(inputMatrixData.collect())

        inputMatrix = IndexedRowMatrix(inputMatrixData)

        inputVector = readVector(self.inputVectorPath, self.ctx)

        print "Vector size is: " + str(inputVector.size)

        result = Vectors.zeros(inputVector.size)

        # print result

        # DGEMV(alpha, A, x, beta, y, jsc):
        result = L2.DGEMV(self.alpha, inputMatrix, inputVector, self.beta, result, self.ctx)

        # writeVector(self.outputVectorPath, result)

        printVector(result)
Example No. 9
def df_to_indexed_row_matrix(row_number_col: str, vector_col: str,
                             df: DataFrame):
    """Convert a dataframe containing a row number and vector to a block matrix"""
    indexed_rows = (df.where(F.col(vector_col).isNotNull())
                    .select(F.col(row_number_col), F.col(vector_col))
                    .rdd.map(lambda row: IndexedRow(row[row_number_col],
                                                    row[vector_col].toArray())))

    if indexed_rows.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")

    return IndexedRowMatrix(indexed_rows)
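
# A hedged usage sketch for df_to_indexed_row_matrix (the `spark` session and the
# column names below are illustrative). Rows whose vector column is NULL are
# filtered out before the IndexedRowMatrix is built.
from pyspark.ml.linalg import Vectors as MLVectors
demo = spark.createDataFrame(
    [(0, MLVectors.dense([1.0, 2.0])),
     (1, MLVectors.dense([3.0, 4.0]))],
    ["row_number", "vector"])
mat = df_to_indexed_row_matrix("row_number", "vector", demo)
print(mat.numRows(), mat.numCols())  # expected: 2 2
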
    def _dist_matrix(self, rddv1, rddv2, sc):
        dlist1 = rddv1.collect()
        dlist2 = rddv2.collect()
        irows1 = [IndexedRow(i, dlist1[i][0].toArray()) for i in range(len(dlist1))]
        irows2 = [IndexedRow(i, dlist2[i][0].toArray()) for i in range(len(dlist2))]
        IMatrix1 = IndexedRowMatrix(sc.parallelize(irows1))
        IMatrix2 = IndexedRowMatrix(sc.parallelize(irows2))
        cart = IMatrix1.rows.cartesian(IMatrix2.rows)
        A = cart.map(lambda x: (x[0].index, x[1].index,
                                np.sqrt(np.sum(np.power(
                                    np.array(x[0].vector) - np.array(x[1].vector), 2))))).collect()
        A.sort()
        Arr = self.__dist_array(A)
        return Arr
Example No. 11
def DFtoIndexedMatrix(df, quantvars, idcol):
    '''
    Convert a numeric DataFrame to an IndexedRowMatrix built from sparse vectors; not applicable to a DataFrame that already has an assembled vector column.
    '''
    # VectorAssembler emits sparse vectors when they are the more compact form,
    # so the sparse-vector construction below should be fine.
    df = VectorAssembler(inputCols=quantvars, outputCol="features") \
        .transform(df).select([idcol, "features"])
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row.features.size, row.features.indices, row.
                            features.values)))
    return IndexedRowMatrix(df)
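
# A hedged sketch of DFtoIndexedMatrix on a plain numeric DataFrame (`spark` and
# the column names are illustrative). With mostly-zero rows VectorAssembler emits
# SparseVectors, which is what the `.indices` access inside the function expects.
demo = spark.createDataFrame(
    [(0, 1.0, 0.0, 0.0, 0.0, 0.0),
     (1, 0.0, 0.0, 3.0, 0.0, 0.0)],
    ["id", "x1", "x2", "x3", "x4", "x5"])
mat = DFtoIndexedMatrix(demo, quantvars=["x1", "x2", "x3", "x4", "x5"], idcol="id")
print(mat.numRows(), mat.numCols())  # expected: 2 5
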
Example No. 12
	def getConnectivity(self,rddv,spark):
		sc = spark.sparkContext
		radius = self.getRadius()
		dist = self.getDistance()
		dlist = rddv.collect()
		featurecol = self.getFeaturesCol()
		irows = [IndexedRow(i,dlist[i][featurecol].toArray()) for i in range(0,len(dlist))]
		imatrix = IndexedRowMatrix(sc.parallelize(irows))
		cart = imatrix.rows.cartesian(imatrix.rows)

		rows = Row("id","vector")
		usr_row = [rows(i,np.float_(x).tolist()) for i,x in enumerate(dlist)]
		verts = spark.createDataFrame(usr_row)
		A = cart.filter(lambda x : dist(x[0].vector,x[1].vector) <= radius).map(lambda x : (x[0].index, x[1].index, 1))
		edges = spark.createDataFrame(A,['src','dst','connected'])
		return GraphFrame(verts,edges)
Example No. 13
def multiply_transpose2(A: np.ndarray) -> np.ndarray:  # computes A * A.T
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()
    matT = matA.transpose()
    matR = matA.multiply(matT)
    res = matR.toLocalMatrix().toArray()
    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:",
          elapsed_time)
    counter = counter + 1
    return res
Example No. 14
    def __index_row_matrix_rdd(self, scale_df):
        """

        :param scale_df:
        :return:
        """
        try:
            vector_mllib = MLUtils.convertVectorColumnsFromML(
                scale_df, 'scaled_features').drop('features')
            vector_rdd = vector_mllib.select(
                'scaled_features',
                'id').rdd.map(lambda x: IndexedRow(x[1], x[0]))
            self.__logger.info("Build Index Row Matrix RDD")
            return IndexedRowMatrix(vector_rdd)
        except TypeError as te:
            raise OpheliaMLException(
                f"An error occurred while calling __index_row_matrix_rdd() method: {te}"
            )
Example No. 15
import os
os.environ["SPARK_HOME"] = "C:\spark"
os.environ["HADOOP_HOME"] = "C:\winutils"

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)  # IndexRowMatrix

    # Method 1:
    m = sc.parallelize([[2, 2, 2], [3, 3, 3]]).zipWithIndex()
    n = sc.parallelize([[1, 1], [4, 4], [3, 3]]).zipWithIndex()

    # Create an Index Row Matrix
    # Convert to Block Matrix
    mat1 = IndexedRowMatrix(m.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()
    mat2 = IndexedRowMatrix(n.map(lambda row2: IndexedRow(row2[1], row2[0]))).toBlockMatrix()

    # Method 2:
    #mat1 = BlockMatrix(m, 2, 3)
    #mat2 = BlockMatrix(n, 3, 2)

    # Use of multiply function from pyspark.mllib.linalg
    mat_mul_output = mat1.multiply(mat2).toLocalMatrix()
    print(mat_mul_output)
Example No. 16
#     --.map(lambda x: (x[0], sorted(x[1], key=lambda x: x[0], reverse=True)))\
#     --.map(lambda x: (x[0], [p[1] for p in x[1]]))\
#     --.map(lambda x: x[1])\
#     --.zipWithIndex()
# ------------------------------------------

# do I have a 2D matrix now?
print(
    "# do I have a 2D matrix now --> FULLY PREDICTED ????????????????????????")
for item in final_stars_FINAL_READY.collect():
    print(item)
print(
    "# do I have a 2D matrix now --> FULLY PREDICTED ??????????????????????? ==> NOW WE KNOw ........."
)
iris_irm = IndexedRowMatrix(
    final_stars_FINAL_READY.map(lambda x: IndexedRow(x[1], x[0])))

# ------------------------------------------
# https://blog.paperspace.com/dimension-reduction-with-principal-component-analysis/
# do SVD:
num_of_top_sing_values = 2
SVD = iris_irm.computeSVD(num_of_top_sing_values, True)

U = SVD.U
S = SVD.s.toArray()

# compute the eigenvalues and number of components to retain
n = final_stars_FINAL_READY.count()
eigvals = S**2 / (n - 1)
eigvals = np.flipud(np.sort(eigvals))
cumsum = eigvals.cumsum()
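
# A hedged continuation of the eigenvalue bookkeeping above: choose how many
# components to retain from the cumulative explained variance (the 0.95 threshold
# is illustrative, and the ratio is relative to the k singular values computed).
explained_variance_ratio = cumsum / eigvals.sum()
k = int(np.searchsorted(explained_variance_ratio, 0.95)) + 1
k = min(k, len(eigvals))
print("components retained:", k)
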
    print(f)
    print(root + folders[f])
    data = sc.wholeTextFiles(root + folders[f])
    data.cache()
    documents = data.map(lambda s: tokenize(s[1])).map(
        lambda s: remove_stopwords(s, stopwords))
    files = data.map(lambda s: s[0]).collect()
    documents.cache()
    hashingTF = HashingTF()
    featurizedData = hashingTF.transform(documents)
    idf = IDF()
    idfModel = idf.fit(featurizedData)
    featurizedData.cache()
    tfidfs = idfModel.transform(featurizedData)
    tfidfs.cache()
    final_rdd = tfidfs.zipWithIndex().map(lambda s: IndexedRow(s[1], s[0]))
    final_rdd.cache()
    sims = IndexedRowMatrix(final_rdd).toCoordinateMatrix().transpose(
    ).toIndexedRowMatrix().columnSimilarities()
    pairs = sims.entries.map(lambda m: [m.i, m.j, m.value]).collect()
    for p in range(0, len(pairs)):
        pairs.append([pairs[p][1], pairs[p][0], pairs[p][2]])
    results = []
    for p in range(0, len(files)):
        results.append([p, 0, 0.0])

    for p in range(0, len(pairs)):
        index = pairs[p][0]
        if pairs[p][2] > results[index][2]:
            results[index] = [index, pairs[p][1], pairs[p][2]]
    file_object = open("/home/user/out/" + folders[f] + ".csv", "w")
Example No. 18
def as_block_matrix(rdd, rowsPerBlock=65000, colsPerBlock=65000):
    return IndexedRowMatrix(
        rdd.zipWithIndex().map(lambda xi: IndexedRow(xi[1], xi[0]))
    ).toBlockMatrix(rowsPerBlock, colsPerBlock)
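
# A hedged usage sketch for as_block_matrix (assumes an active SparkContext `sc`;
# the data is illustrative). zipWithIndex supplies the row indices, so plain RDDs
# of lists can be multiplied as BlockMatrix objects.
left = as_block_matrix(sc.parallelize([[1.0, 2.0], [3.0, 4.0]]))
right = as_block_matrix(sc.parallelize([[5.0, 6.0], [7.0, 8.0]]))
print(left.multiply(right).toLocalMatrix())
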
Example No. 19
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import numpy as np
import os
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix

os.environ["SPARK_HOME"] = "C:\\Users\\plfoley\\spark-2.3.1-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:\\Users\\plfoley\\winutils"

sc = SparkContext()
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) \
    .zipWithIndex()
rows2 = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) \
    .zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)
rows = IndexedRowMatrix( \
    rows \
    .map(lambda row: IndexedRow(row[1], row[0])) \
    ).toBlockMatrix()

rows2 = IndexedRowMatrix( \
    rows2 \
    .map(lambda row2: IndexedRow(row2[1], row2[0])) \
    ).toBlockMatrix()

mat_product = rows.multiply(rows2).toLocalMatrix()
print(mat_product)
Example No. 20
        .getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])

    articles = lines.map(lambda urls: getArticletText(urls))

    hashingTF = HashingTF()
    tf = hashingTF.transform(articles)

    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    
    rows = tfidf.zipWithIndex()
    
    bm = IndexedRowMatrix(rows.map(lambda row : IndexedRow(row[1], row[0]))).toBlockMatrix()

    #bm_t = bm.transpose()
    #result_mat = bm.multiply(bm_t)
    #exact = result_mat.toIndexedRowMatrix().toRowMatrix()

    exact = bm.transpose().toIndexedRowMatrix().columnSimilarities()

    print(exact.entries.collect())

    #print(exact.entries.collect()[0])

    #parsedArticles = articles.collect()

    #tfidf = TfidfVectorizer().fit_transform(parsedArticles)
    #pairwise_similarity = tfidf * tfidf.T
Example No. 21
# final_stars_FINAL_READY = final_stars_FINAL.rdd\
#     --.map(lambda x: (x[0], [(x[1], x[2])]))\
#     --.reduceByKey(lambda a,b: a+b)\
#     --.map(lambda x: (x[0], sorted(x[1], key=lambda x: x[0], reverse=True)))\
#     --.map(lambda x: (x[0], [p[1] for p in x[1]]))\
#     --.map(lambda x: x[1])\
#     --.zipWithIndex()
# ------------------------------------------


# do I have a 2D matrix now?
print("# do I have a 2D matrix now --> FULLY PREDICTED ????????????????????????")
for item in final_stars_FINAL_READY.collect():
    print(item)
print("# do I have a 2D matrix now --> FULLY PREDICTED ??????????????????????? ==> NOW WE KNOw .........")
iris_irm = IndexedRowMatrix(final_stars_FINAL_READY.map(lambda x: IndexedRow(x[1], x[0])))

# ------------------------------------------
# https://blog.paperspace.com/dimension-reduction-with-principal-component-analysis/
# do SVD:
num_of_top_sing_values = 2
SVD = iris_irm.computeSVD(num_of_top_sing_values, True)
Example No. 22
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(dataFrame)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()
# Normalization and transformation of the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity computation using the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")\
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
        psf.col("j.num").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .sort("i", "j")\
    .show()

tempcosine = data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
       .select(
           psf.col("i.num").alias("i"),
Example No. 23
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices

dm = Matrices.dense(2, 3, [5.0, 0.0, 0.0, 3.0, 1.0, 4.0])
sm = Matrices.sparse(2, 3, [0, 1, 2, 4], [0, 1, 0, 1], [5.0, 3.0, 1.0, 4.0])
sm.toDense()
dm.toSparse()
dm[1, 1]

#Section 7.2.2
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
rmind = IndexedRowMatrix(
    rm.rows.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

#Section 7.4
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(
    lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

#Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix
housingMat = RowMatrix(housingVals)
from pyspark.mllib.stat import Statistics
housingStats = Statistics.colStats(housingVals)
housingStats.min()

#Section 7.4.4
from pyspark.mllib.regression import LabeledPoint
mat = RowMatrix(rows)

m = mat.numRows()
n = mat.numCols()

print(m)
print(n)

# An IndexedRowMatrix is similar to a RowMatrix but carries row indices, which can be used
# to identify specific rows and are useful for executing joins.

from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# a RDD of indexed rows
indexed = sc.parallelize([
    IndexedRow(0, [1, 2, 3]),
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12])
])
mat = IndexedRowMatrix(indexed)
print(mat)

# convert to row matrix
rowMat = mat.toRowMatrix()
print(rowMat)

# A CoordinateMatrix is distributed and stored in an object called a coordinate list.

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
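
# The CoordinateMatrix example itself is not shown here; a minimal hedged sketch
# (assumes an active SparkContext `sc`): each MatrixEntry is an (i, j, value)
# coordinate, which suits very sparse matrices.
entries = sc.parallelize([MatrixEntry(0, 0, 1.0),
                          MatrixEntry(1, 2, 3.5),
                          MatrixEntry(2, 1, 2.0)])
coordMat = CoordinateMatrix(entries)
print(coordMat.numRows(), coordMat.numCols())  # 3 3
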
conf = SparkConf().setAppName("labeledPoints")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

debug = Debugger()
debug.TIMESTAMP(1)
spark = SparkSession(sc)





data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda _ : np.array(_.strip().split()).astype(float))
data = data.map(lambda _ : _/np.linalg.norm(_))
U = data.zipWithIndex().map(lambda _ : IndexedRow(_[1], _[0]))
U = IndexedRowMatrix(U)



UT = U.toCoordinateMatrix()
UT = UT.transpose()



U = U.toBlockMatrix()
UT = UT.toBlockMatrix()

S = U.multiply(UT)

S_coord = S.toCoordinateMatrix()
Example No. 26
    def get_Total_Related_Downloads(self, dfmain):
        #total downloads
        download_count = dfmain.groupby(['_id'])['_id'].agg(['count'])

        #build datasets vs ip similarity matrix
        group = pd.DataFrame({
            'download_count':
            dfmain.groupby(['_id', 'ip']).size()
        }).reset_index()
        person_u = list(group.ip.unique())
        dataset_u = list(group._id.unique())

        outF = open(self.DATA_LIST_FILE, "w")
        for line in dataset_u:
            outF.write(str(line))
            outF.write("\n")
        outF.close()

        data = group['download_count'].tolist()
        row = pd.Categorical(group._id, categories=dataset_u).codes
        cols = pd.Categorical(group.ip, categories=person_u).codes
        len_dataset = len(dataset_u)
        len_person = len(person_u)
        print("Datasets vs Ips :", str(len_dataset),
              str(len_person))  #(309235, 81566)
        sparsemat = sparse.csr_matrix((data, (row, cols)),
                                      dtype=np.int8,
                                      shape=(len_dataset, len_person))
        m, n = sparsemat.shape

        def f(x):
            d = {}
            for i in range(len(x)):
                d[str(i)] = float(x[i])
            return d

        # load PySpark using findSpark package

        #SparkContext.setSystemProperty('spark.executor.memory', '5g')
        #SparkContext.setSystemProperty('spark.driver.memory', '5g')
        #SparkContext.setSystemProperty('spark.executor.heartbeatInterval', '1000000000s')

        #conf = SparkConf().setAppName("simdownload")
        #conf = (conf.setMaster('local[*]').set('spark.executor.memory', '4G'))#.set('spark.executor.heartbeatInterval','1000000s')
        #sc = SparkContext(conf=conf)
        #sc = SparkContext("local", "simdownload")
        sc = SparkContext(appName="simdownload")
        sqlContext = SQLContext(sc)
        #print(sc._conf.getAll())
        sv_rdd = sc.parallelize(sparsemat.toarray())
        #populate the values from rdd to dataframe
        dfspark = sv_rdd.map(lambda x: Row(**f(x))).toDF()

        row_with_index = Row(*["id"] + dfspark.columns)

        def make_row(columns):
            def _make_row(row, uid):
                row_dict = row.asDict()
                return row_with_index(*[uid] +
                                      [row_dict.get(c) for c in columns])

            return _make_row

        print('parallelize-ok')

        f = make_row(dfspark.columns)
        # create a new dataframe with id column (use indexes)
        dfidx = (dfspark.rdd.zipWithIndex().map(lambda x: f(*x)).toDF(
            StructType([StructField("id", LongType(), False)] +
                       dfspark.schema.fields)))
        #compute cosine sim by rows
        pred = IndexedRowMatrix(
            dfidx.rdd.map(lambda row: IndexedRow(row.id, row[1:])))
        pred1 = pred.toBlockMatrix().transpose().toIndexedRowMatrix()
        pred_sims = pred1.columnSimilarities()
        #convert coordinatematrix (pred_sims) into a dataframe
        columns = ['from', 'to', 'sim']
        vals = pred_sims.entries.map(lambda e: (e.i, e.j, e.value))
        dfsim = sqlContext.createDataFrame(vals, columns)

        print('Sim Done!')
        print('Time Sim Done: ' + time.strftime("%H:%M:%S"))

        json_data = {}
        for i in range(m):
            target_id = int(dataset_u[i])
            dftemp = dfsim.where((psf.col("from") == i)
                                 | (psf.col("to") == i)).sort(
                                     psf.desc("sim")).limit(
                                         self.num_top_dataset)
            df = dftemp.toPandas()
            # v = df.iloc[:, :-1].values
            # ii = np.arange(len(df))[:, None]
            # ji = np.argsort(v == i, axis=1)  # replace `1` with your ID
            # related_ids = (v[ii, ji][:, 0]).tolist()
            # related_datasets = [dataset_u[i] for i in related_ids]
            myarr = []
            for index, rw in df.iterrows():  # this is a bit faster than the numpy approach above
                from_id = rw['from']
                to_id = rw['to']
                if (from_id != i):
                    myarr.append(int(from_id))
                if (to_id != i):
                    myarr.append(int(to_id))
            related_datasets = [int(dataset_u[i]) for i in myarr]

            downloads = download_count.loc[target_id]['count']
            data = {}
            data['related_datasets'] = related_datasets
            data['total_downloads'] = int(downloads)
            json_data[target_id] = data

        print('Time JSONUSAGE_FILE 1: ' + time.strftime("%H:%M:%S"))
        with open(self.JSONUSAGE_FILE, 'w') as fp:
            json.dump(json_data, fp)

        print('Time JSONUSAGE_FILE 2: ' + time.strftime("%H:%M:%S"))
        sc.stop()
Example No. 27
path = "/home/forrest/workspace/LINE/Baselines/AMR/results/19-05-23__23-07-42__MSRParaphraseCorpus/matrix/document-concept-matrix.npz"

# Load training data
# training = spark.read.format("libsvm").load(path)

sc = spark.sparkContext

doc_conc_mtx = sparse.load_npz(path)

doc_conc_mtx = doc_conc_mtx.todense()

shape = doc_conc_mtx.shape

indexed_doc_concept = [
    IndexedRow(idx, doc_conc_mtx[idx].tolist()[0])
    for idx in range(0, shape[0])
]

# indexed_sample = [IndexedRow(idx, doc_conc_list[idx]) for idx in range(0, len(sample_list))]

rows = sc.parallelize(indexed_doc_concept)

matrix = IndexedRowMatrix(rows)

del doc_conc_mtx, indexed_doc_concept

np_matrix = indexed_row_matrix_to_numpy_matrix(matrix, (11604, 14428))

print(np_matrix.shape)
# svd = mtx.computeSVD(k=100)
Example No. 28
    def calculate_distance(self, sdf1, sdf2):
        """
        This will calculate the distance between the vector-type columns of two spark dataframes

        :param sdf1: expected to have columns id1 (dtype int) and v1 (dtype Vector)
        :param sdf2: expected to have columns id2 (dtype int) and v2 (dtype Vector)
        :return:
        """

        cov = RowMatrix(
            sdf1.select(["v1"]).withColumnRenamed("v1", "v").union(
                sdf2.select(["v2"]).withColumnRenamed(
                    "v2", "v")).rdd.map(lambda row: Vectors.fromML(row.asDict(
                    )["v"]))).computeCovariance().toArray()

        x, v = np.linalg.eigh(cov)

        indices = 1e-10 <= x

        # we are trying to enforce the data types to be only python types
        n = int(v.shape[0])
        m = int(indices.sum())

        v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]

        v_spark = DenseMatrix(n, m, v_vals)

        x_vals = [
            float(val)
            for val in np.diag(x[indices]**-0.5).reshape(-1, ).tolist()
        ]

        x_spark = DenseMatrix(m, m, x_vals)

        # we get the index to maintain the order
        _sdf1 = sdf1.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1, index=val_key[1])).toDF()

        _sdf1.persist()

        _sdf2 = sdf2.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2, index=val_key[1])).toDF()

        _sdf2.persist()

        # we get our indexed row matrix
        _sdf1_mat = IndexedRowMatrix(
            _sdf1.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v1"]))))

        _sdf2_mat = IndexedRowMatrix(
            _sdf2.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v2"]))))

        # we apply our transformation and then set it as our new variable
        _sdf1 = _sdf1.drop("v1").join(_sdf1_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v1=indexed_row.vector)).toDF(), "index")

        _sdf2 = _sdf2.drop("v2").join(_sdf2_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v2=indexed_row.vector)).toDF(), "index")

        @F.udf(DoubleType())
        def tmp(vec):
            return float(vec[0].squared_distance(vec[1]))**0.5

        all_sdf = _sdf1.crossJoin(_sdf2)

        dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))

        dist_sdf.persist()

        return dist_sdf
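
# A hedged usage sketch for calculate_distance (the instance `calc`, the `spark`
# session and the data are illustrative). Each input DataFrame needs an integer id
# column and a pyspark.ml vector column named as in the docstring.
from pyspark.ml.linalg import Vectors as MLVectors
sdf1 = spark.createDataFrame(
    [(0, MLVectors.dense([0.0, 1.0])), (1, MLVectors.dense([2.0, 3.0]))],
    ["id1", "v1"])
sdf2 = spark.createDataFrame(
    [(0, MLVectors.dense([1.0, 1.0])), (1, MLVectors.dense([4.0, 0.0]))],
    ["id2", "v2"])
dist_sdf = calc.calculate_distance(sdf1, sdf2)
dist_sdf.select("id1", "id2", "diff").show()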