# Imports needed by this snippet (not shown in the original):
import numpy as np
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# Read the input vectors, parse each line into a float array and L2-normalize it
data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda line: np.array(line.strip().split()).astype(float))
data = data.map(lambda row: row / np.linalg.norm(row))

# Attach row indices and build an IndexedRowMatrix U
U = data.zipWithIndex().map(lambda pair: IndexedRow(pair[1], pair[0]))
U = IndexedRowMatrix(U)



# Build U^T by going through a CoordinateMatrix, which supports transpose()
UT = U.toCoordinateMatrix()
UT = UT.transpose()



# Convert both matrices to BlockMatrix so they can be multiplied
U = U.toBlockMatrix()
UT = UT.toBlockMatrix()

# S = U * U^T; since the rows were L2-normalized, S holds the pairwise
# cosine similarities of the input vectors
S = U.multiply(UT)

# Pull the result back as coordinate entries and inspect the first 100
S_coord = S.toCoordinateMatrix()
sim = S_coord.entries
print(sim.take(100))
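
# Optional local sanity check (a sketch, not in the original snippet): for a small
# sample, dot products of the normalized rows should match the corresponding
# entries of S, since both are cosine similarities.
sample = np.array(data.take(5))   # collect 5 normalized rows to the driver
print(sample @ sample.T)          # 5x5 block of cosine similarities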





debug.TIMESTAMP(2)
Example #2
from operator import itemgetter
from pyspark import sql
from pyspark.mllib.linalg.distributed import IndexedRowMatrix

# Create an SQLContext so that DataFrame conversion (toDF) is available
sql.SQLContext(sc)
data = sc.textFile(dataset)

#data = (data.map(lambda s: (list(map(lambda x: float(x), s.split()))))).zipWithIndex().map(lambda x: ((x[1], 0), DenseMatrix(1, 1000, x[0])))

# Read the matrix into (row_number, vector) pairs
data = data.map(lambda s: [float(x) for x in s.split()]) \
           .zipWithIndex() \
           .map(lambda x: (x[1], x[0]))

# Create the transpose of the matrix: tag every value with its column index,
# emit (column, (row, value)) pairs and group them by column
tdata = sc.textFile(dataset).map(
    lambda s: [(j, float(v)) for j, v in enumerate(s.split())]
).zipWithIndex().flatMap(
    lambda x: [(j, (x[1], v)) for j, v in x[0]]
).groupByKey()

# Bring the transposed data to the same (row_number, vector) format as the
# normal matrix: sort each group by the original row index and keep the values
# (as a list, rather than a lazy map object, so IndexedRowMatrix can consume it)
tdata = tdata.map(lambda x: (
    x[0], [s[1] for s in sorted(x[1], key=itemgetter(0))]))
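
# Illustrative trace of the transpose construction (a sketch, not part of the
# original code): for the 2x3 input
#   1 2 3
#   4 5 6
# the enumerate/flatMap step emits (0, (0, 1.0)), (1, (0, 2.0)), (2, (0, 3.0)),
# (0, (1, 4.0)), (1, (1, 5.0)), (2, (1, 6.0)); grouping by the column index and
# sorting each group by row index yields the transpose rows
# (0, [1.0, 4.0]), (1, [2.0, 5.0]), (2, [3.0, 6.0]).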

# Create BlockMatrix for the normal matrix and its transpose
mat = IndexedRowMatrix(data)
mat = mat.toBlockMatrix()
matTranspose = IndexedRowMatrix(tdata).toBlockMatrix()

# Get the final result by multiplying mat * mat^T * mat
matAAT = mat.multiply(matTranspose)  # mat * mat^T
matRes = matAAT.multiply(mat)        # (mat * mat^T) * mat

print('Done')
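
# Optional (a sketch, not in the original example): materialize a few entries of
# the result by converting the BlockMatrix back to a CoordinateMatrix.
print(matRes.toCoordinateMatrix().entries.take(10))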
Example #3
    def get_Total_Related_Downloads(self, dfmain):
        #total downloads
        download_count = dfmain.groupby(['_id'])['_id'].agg(['count'])

        #build datasets vs ip similarity matrix
        group = pd.DataFrame({
            'download_count':
            dfmain.groupby(['_id', 'ip']).size()
        }).reset_index()
        person_u = list(group.ip.unique())
        dataset_u = list(group._id.unique())

        # Persist the list of dataset ids, one per line
        with open(self.DATA_LIST_FILE, "w") as outF:
            for line in dataset_u:
                outF.write(str(line))
                outF.write("\n")

        data = group['download_count'].tolist()
        # Map dataset ids and ips to integer category codes for the sparse matrix
        row = pd.Categorical(group._id, categories=dataset_u).codes
        cols = pd.Categorical(group.ip, categories=person_u).codes
        len_dataset = len(dataset_u)
        len_person = len(person_u)
        print("Datasets vs Ips :", str(len_dataset),
              str(len_person))  #(309235, 81566)
        sparsemat = sparse.csr_matrix((data, (row, cols)),
                                      dtype=np.int8,
                                      shape=(len_dataset, len_person))
        m, n = sparsemat.shape

        def f(x):
            # Convert a dense row (array) into a {column_index: value} dict so
            # it can be expanded into a Row via Row(**f(x))
            return {str(i): float(v) for i, v in enumerate(x)}

        # load PySpark using findSpark package

        #SparkContext.setSystemProperty('spark.executor.memory', '5g')
        #SparkContext.setSystemProperty('spark.driver.memory', '5g')
        #SparkContext.setSystemProperty('spark.executor.heartbeatInterval', '1000000000s')

        #conf = SparkConf().setAppName("simdownload")
        #conf = (conf.setMaster('local[*]').set('spark.executor.memory', '4G'))#.set('spark.executor.heartbeatInterval','1000000s')
        #sc = SparkContext(conf=conf)
        #sc = SparkContext("local", "simdownload")
        sc = SparkContext(appName="simdownload")
        sqlContext = SQLContext(sc)
        #print(sc._conf.getAll())
        sv_rdd = sc.parallelize(sparsemat.toarray())
        #populate the values from rdd to dataframe
        dfspark = sv_rdd.map(lambda x: Row(**f(x))).toDF()

        row_with_index = Row(*["id"] + dfspark.columns)

        def make_row(columns):
            def _make_row(row, uid):
                row_dict = row.asDict()
                return row_with_index(*[uid] +
                                      [row_dict.get(c) for c in columns])

            return _make_row

        print('parallelize-ok')

        to_row = make_row(dfspark.columns)
        # create a new dataframe with an explicit id column (row indexes)
        dfidx = (dfspark.rdd.zipWithIndex().map(lambda x: to_row(*x)).toDF(
            StructType([StructField("id", LongType(), False)] +
                       dfspark.schema.fields)))
        #compute cosine sim by rows
        pred = IndexedRowMatrix(
            dfidx.rdd.map(lambda row: IndexedRow(row.id, row[1:])))
        pred1 = pred.toBlockMatrix().transpose().toIndexedRowMatrix()
        pred_sims = pred1.columnSimilarities()
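        # Note: columnSimilarities() returns only the strictly upper-triangular
        # entries (i < j) of the similarity matrix, which is why the filter
        # further below checks both the "from" and the "to" column.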
        #convert coordinatematrix (pred_sims) into a dataframe
        columns = ['from', 'to', 'sim']
        vals = pred_sims.entries.map(lambda e: (e.i, e.j, e.value))
        dfsim = sqlContext.createDataFrame(vals, columns)

        print('Sim Done!')
        print('Time Sim Done: ' + time.strftime("%H:%M:%S"))

        json_data = {}
        for i in range(m):
            target_id = int(dataset_u[i])
            dftemp = dfsim.where((psf.col("from") == i)
                                 | (psf.col("to") == i)).sort(
                                     psf.desc("sim")).limit(
                                         self.num_top_dataset)
            df = dftemp.toPandas()
            # v = df.iloc[:, :-1].values
            # ii = np.arange(len(df))[:, None]
            # ji = np.argsort(v == i, axis=1)  # replace `1` with your ID
            # related_ids = (v[ii, ji][:, 0]).tolist()
            # related_datasets = [dataset_u[i] for i in related_ids]
            myarr = []
            # Iterating the small result frame is a bit faster here than the
            # numpy approach commented out above
            for index, rw in df.iterrows():
                from_id = rw['from']
                to_id = rw['to']
                if (from_id != i):
                    myarr.append(int(from_id))
                if (to_id != i):
                    myarr.append(int(to_id))
            related_datasets = [int(dataset_u[idx]) for idx in myarr]

            downloads = download_count.loc[target_id]['count']
            data = {}
            data['related_datasets'] = related_datasets
            data['total_downloads'] = int(downloads)
            json_data[target_id] = data

        print('Time JSONUSAGE_FILE 1: ' + time.strftime("%H:%M:%S"))
        with open(self.JSONUSAGE_FILE, 'w') as fp:
            json.dump(json_data, fp)

        print('Time JSONUSAGE_FILE 2: ' + time.strftime("%H:%M:%S"))
        sc.stop()
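
# Usage sketch (hypothetical, not part of the original class): the method expects
# a pandas DataFrame with '_id' and 'ip' columns and an object providing
# DATA_LIST_FILE, JSONUSAGE_FILE and num_top_dataset, e.g.
#   stats = DownloadStats()                      # hypothetical config-holding class
#   stats.get_Total_Related_Downloads(dfmain)    # dfmain: download log as a pandas DataFrame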
# Parse each line of A into a list of floats
A = A.map(lambda s: [float(x) for x in s.split()])

# Zip index values with cell values
A = A.zipWithIndex().map(lambda x: (x[1], x[0]))

# Print step 1 ready. With full set 1min.
print(" ")
print("Step 1 ready")
print(" ")

# Convert A to an IndexedRowMatrix
A = IndexedRowMatrix(A)

# Convert A to blockmatrices and set block size.
#A = A.toBlockMatrix(1000, 1000) # Works with sample set. Data to 1 block.
A = A.toBlockMatrix(100, 1000) # Testing with full dataset

# Cache A, because it is used multiple times
A.cache()

# Print step 2 ready. With full set 3mins.
print(" ")
print("Step 2 ready")
print(" ")

# Next the multiplications: we need to calculate A*AT*A.
# A is 1000000 x 1000, so A*AT would be 1000000 x 1000000,
# while AT*A is only 1000 x 1000. For better performance use the
# associativity rule (A*AT)*A = A*(AT*A).
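
# A minimal sketch of the cheaper ordering (assumes A is the cached BlockMatrix
# from above):
# AT = A.transpose()
# res = A.multiply(AT.multiply(A))   # A * (AT * A): the inner product is only 1000 x 1000
# print(res.toCoordinateMatrix().entries.take(10))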

# Calculate A transpose