# updateUsersProfiles.py
''' Updates the users' profile sparse matrix with the stream_xxxxxxxx files that have not yet been processed. '''
import sys
import getopt
import os

from pyspark import SparkContext

''' Same helper as in the previous script; defined here to add an overwrite option. '''
def saveAsTextFile(rdd, path, overwrite=True):
    if overwrite:
        # Spark refuses to write into an existing directory; note that "rm -r" only
        # removes a local output path, not a directory stored on HDFS.
        os.system("rm -r " + path)
    rdd.saveAsTextFile(path, compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")

# Retrieves and parses the command line arguments
def getArguments(argv):
    matrixFiles = ""
    inputFiles = ""
    outputFile = ""
    try:
        options, args = getopt.getopt(argv, "hm:s:o:")
    except getopt.GetoptError:
        print('updateUsersProfiles.py -m <matrixDirectory> -s <inputFiles> -o <outputFile>')
        sys.exit(2)
    for option, arg in options:
        if option == '-h':
            print('updateUsersProfiles.py -m <matrixDirectory> -s <inputFiles> -o <outputFile>')
            sys.exit()
        elif option == "-m":
            # Normalise the matrix directory so it always ends with "/" and, unless -o is
            # given later on the command line, reuse it (without the trailing "/") as output.
            if arg.endswith("/"):
                matrixFiles = arg
                outputFile = arg[:-1]
            else:
                matrixFiles = arg + "/"
                outputFile = arg
        elif option == "-s":
            inputFiles = arg
        elif option == "-o":
            outputFile = arg
    return matrixFiles, inputFiles, outputFile
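
# A hypothetical invocation (paths are only illustrative), submitted through spark-submit:
#   spark-submit updateUsersProfiles.py -m /data/usersProfile/ -s "/data/stream_*.log" -o /data/usersProfileNew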

def main(argv):
    ''' matrixDirectory: the HDFS directory containing the users' profile matrix. It is assumed to be
        gzip-compressed and split across several part files.
        streamFiles: the files used to update the matrix, in userId|country|artistId|trackId format.
        outputFile: optional output directory for the updated matrix. By default, we simply overwrite
        the current one. '''
    matrixDirectory, streamFiles, outputFile = getArguments(argv)

    sc = SparkContext(appName="usersProfile")

    # Load both the current matrix and the not-yet-processed stream_xxxxxxxx files.
    # Turn each line into a (key, value) pair, where key = (user, track), to prepare the merge.
    matrix = (sc.textFile(matrixDirectory + "*.gz")
              .map(lambda line: line.split(" "))
              .map(lambda t: ((int(t[0]), int(t[1])), int(t[2]))))

    streamData = (sc.textFile(streamFiles)
                  .map(lambda line: line.split("|"))
                  .map(lambda t: ((int(t[0]), int(t[3])), 1)))
    # union + reduceByKey keeps the (user, track) pairs that received no new plays and also
    # adds pairs seen for the first time; the values for a key look like: old count, 1, 1, ...
    outData = (matrix.union(streamData)
               .reduceByKey(lambda a, b: a + b)  # new count = old count + number of new plays
               .sortByKey()
               .map(lambda t: " ".join(map(str, (t[0][0], t[0][1], t[1])))))  # back to "user track count" lines

    saveAsTextFile(outData, path=outputFile, overwrite=True)
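
# A tiny worked example of the update (illustrative numbers only):
#   existing matrix line : "42 1001 3"            -> ((42, 1001), 3)
#   stream lines         : "42|FR|7|1001" twice   -> ((42, 1001), 1) twice
#   after union + reduceByKey                     -> ((42, 1001), 5), written back as "42 1001 5"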
if __name__ == "__main__":
    main(sys.argv[1:])