def mapTestCategoricalFeatures():
    """Map the categorical features of the test data and persist the result.

    Obtains the Spark context from the imported helper module, applies
    ``testMapper`` to every record of the test RDD, and writes the mapped
    records to ``TEST_DIR`` as text files.
    """
    # Spark context is configured by the imported helper module.
    sc = ex.configureSpark()
    test_rdd = ex.getDeviceRDD(sc, TEST_FILE)
    mapped_rdd = test_rdd.map(testMapper)
    mapped_rdd.saveAsTextFile(TEST_DIR)
def extractDeviceTypes():
    """Return the sorted list of distinct device types from the devices file.

    NOTE: collect() is safe only while the distinct device types fit in the
    driver's memory; for larger results the RDD should instead be written
    out with saveAsTextFile to the HDFS cluster.
    """
    sc = ex.configureSpark()
    # RDD of already-split device fields from the helper module.
    fields = ex.xploreDevices(sc, DEVICES_FILE)
    # field[2] holds the device type; de-duplicate on the cluster, then
    # pull the (small) distinct set back to the driver.
    distinct_types = fields.map(lambda f: f[2]).distinct().collect()
    return sorted(distinct_types)
def joinRDD(output_path="joined-devices-cookies"):
    """Join device and cookie records on their shared drawbridge handle.

    Bug fix: the original called ``join_val.saveAsTextFile()`` with no
    argument, but ``RDD.saveAsTextFile`` requires an output path, so the
    call raised a TypeError at runtime. The path is now a keyword
    parameter with a default, keeping the zero-argument call signature
    backward compatible.

    Args:
        output_path: directory the joined (handle, (device_line,
            cookie_line)) pairs are written to as text files.
    """
    sc = explore.configureSpark()
    # Raw line RDDs for both data sets.
    devices = explore.getDeviceRDD(sc, DEVICES_FILE)
    cookies = explore.getDeviceRDD(sc, COOKIES_FILE)
    # Key both data sets by the drawbridge handle (first CSV column).
    device_pairs = devices.map(lambda line: (line.split(",")[0], line))
    cookie_pairs = cookies.map(lambda line: (line.split(",")[0], line))
    # Inner join on the common handle, then persist the joined pairs.
    join_val = device_pairs.join(cookie_pairs)
    join_val.saveAsTextFile(output_path)
def mapCookieCategoricalFeatures():
    """Build and pickle categorical-to-numeric index maps for cookie data.

    For each categorical cookie field — computer OS type (field[2]),
    browser version (field[3]) and cookie country (field[4]) — the distinct
    values are collected and mapped to string indices in the corresponding
    module-level ``*_DICT``, which is then serialized with pickle for later
    reuse.

    Idiom fix: the manual ``index = index + 1`` counter loops of the
    original are replaced with ``enumerate``; the produced mappings are
    identical.
    """
    sc = ex.configureSpark()
    dataRDD = ex.getCookieRDD(sc, COOKIE_FILE_RAW)
    data_fields = dataRDD.map(lambda line: line.split(","))
    # Spill to disk only — the data is scanned once per feature below.
    # NOTE(review): the original persists dataRDD although the repeated
    # scans are over data_fields; consider persisting data_fields instead
    # (as the sibling mapCategoricalFeatures does) — confirm intent.
    dataRDD.persist(StorageLevel.DISK_ONLY)

    global COMP_OS_TYPES
    COMP_OS_TYPES = data_fields.map(
        lambda field: field[2]).distinct().collect()
    for index, os_type in enumerate(COMP_OS_TYPES):
        COMP_OS_TYPES_DICT[str(os_type)] = str(index)
    # Serialize COMP_OS_TYPES_DICT with pickle.
    with open("comp-os-type-dict.pickle", 'wb') as f:
        pickle.dump(COMP_OS_TYPES_DICT, f)

    global BROWSER_VERSION
    BROWSER_VERSION = data_fields.map(
        lambda field: field[3]).distinct().collect()
    for index, version in enumerate(BROWSER_VERSION):
        BROWSER_VERSION_DICT[str(version)] = str(index)
    # Serialize BROWSER_VERSION_DICT with pickle.
    with open("browser-version-dict.pickle", 'wb') as f:
        pickle.dump(BROWSER_VERSION_DICT, f)

    global COOKIE_COUNTRY
    COOKIE_COUNTRY = data_fields.map(
        lambda field: field[4]).distinct().collect()
    for index, country in enumerate(COOKIE_COUNTRY):
        COOKIE_COUNTRY_DICT[str(country)] = str(index)
    # Serialize the country dictionary with pickle.
    with open("cookie-country-dict.pickle", "wb") as f:
        pickle.dump(COOKIE_COUNTRY_DICT, f)
def mapCookieCategoricalFeatures():
    """Build and pickle categorical-to-numeric index maps for cookie data.

    NOTE(review): this file defines mapCookieCategoricalFeatures twice with
    identical bodies; at import time this later definition shadows the
    earlier one. Consider deleting one copy.

    Distinct values of computer OS type (field[2]), browser version
    (field[3]) and cookie country (field[4]) are mapped to string indices
    in the module-level ``*_DICT`` globals, each of which is then pickled.

    Idiom fix: manual ``index = index + 1`` counters replaced with
    ``enumerate``; the produced mappings are identical.
    """
    sc = ex.configureSpark()
    dataRDD = ex.getCookieRDD(sc, COOKIE_FILE_RAW)
    data_fields = dataRDD.map(lambda line: line.split(","))
    # Disk-only persistence: data may not fit in memory.
    dataRDD.persist(StorageLevel.DISK_ONLY)

    global COMP_OS_TYPES
    COMP_OS_TYPES = data_fields.map(
        lambda field: field[2]).distinct().collect()
    for index, os_type in enumerate(COMP_OS_TYPES):
        COMP_OS_TYPES_DICT[str(os_type)] = str(index)
    # Serialize COMP_OS_TYPES_DICT with pickle.
    with open("comp-os-type-dict.pickle", 'wb') as f:
        pickle.dump(COMP_OS_TYPES_DICT, f)

    global BROWSER_VERSION
    BROWSER_VERSION = data_fields.map(
        lambda field: field[3]).distinct().collect()
    for index, version in enumerate(BROWSER_VERSION):
        BROWSER_VERSION_DICT[str(version)] = str(index)
    # Serialize BROWSER_VERSION_DICT with pickle.
    with open("browser-version-dict.pickle", 'wb') as f:
        pickle.dump(BROWSER_VERSION_DICT, f)

    global COOKIE_COUNTRY
    COOKIE_COUNTRY = data_fields.map(
        lambda field: field[4]).distinct().collect()
    for index, country in enumerate(COOKIE_COUNTRY):
        COOKIE_COUNTRY_DICT[str(country)] = str(index)
    # Serialize the country dictionary with pickle.
    with open("cookie-country-dict.pickle", "wb") as f:
        pickle.dump(COOKIE_COUNTRY_DICT, f)
def mapCategoricalFeatures(): #get spark context from import specified spark_contxt = ex.configureSpark() #get device RDD dataRDD = ex.getDeviceRDD(spark_contxt, DATA_FILE) #retrieve device fields data_fields = dataRDD.map(lambda line: line.split(",")) dataRDD.persist() #get device types global DEVICE_TYPES DEVICE_TYPES = data_fields.map(lambda field: field[3]).distinct().collect() #create device type feature map to numeric values andd store in dictionary index = 0 for d in DEVICE_TYPES: DEVICE_TYPES_DICT[str(d)] = str(index) index = index + 1 #get device os global DEVICE_OS DEVICE_OS = data_fields.map(lambda field: field[4]).distinct().collect() #create device os feature map to numeric values andd store in dictionary index = 0 for o in DEVICE_OS: DEVICE_OS_DICT[str(o)] = str(index) index = index + 1 #get device country global DEVICE_COUNTRY DEVICE_COUNTRY = data_fields.map(lambda field: field[7]).distinct().collect() #create device country feature map to numeric values andd store in dictionary index = 0 for c in DEVICE_COUNTRY: DEVICE_COUNTRY_DICT[str(c)] = str(index) index = index + 1 #get comp os type global COMP_OS_TYPES COMP_OS = data_fields.map(lambda field: field[5]).distinct() COMP_OS_TYPES_COUNT = COMP_OS.count() COMP_OS_TYPES = COMP_OS.collect() print "Distinct COMP OS ", COMP_OS_TYPES_COUNT index = 0 for o in COMP_OS_TYPES: COMP_OS_TYPES_DICT[str(o)] = str(index) index = index + 1 print "COMP OS dctionary size ", len(COMP_OS_TYPES_DICT) #get browser version global BROWSER_VERSION BROWSER_VERSION = data_fields.map(lambda field: field[6]).distinct().collect() index = 0 for b in BROWSER_VERSION: BROWSER_VERSION_DICT[str(b)] = str(index) index = index + 1 ''' #get anonymous_c1 feature global ANON_C1 ANON_C1 = data_fields.map(lambda field: field[6]).distinct().collect() #create categorical-to-numeric mapping in dictionary index = 0 for a1 in ANON_C1: ANON_C1_DICT[str(a1)] = str(index) index = index + 1 #get anonymous_c2 feature global ANON_C2 ANON_C2 = 
data_fields.map(lambda field: field[7]).distinct().collect() #create categorical-to-numeric mapping in dictionary index = 0 for a2 in ANON_C2: ANON_C2_DICT[str(a2)] = str(index) index = index + 1 ''' print DEVICE_TYPES_DICT print DEVICE_OS_DICT print DEVICE_COUNTRY_DICT print COMP_OS_TYPES_DICT print BROWSER_VERSION_DICT
def _index_and_pickle(values, target_dict, pickle_path):
    """Map each value to a string index in target_dict and pickle the dict.

    ``target_dict`` is one of the module-level ``*_DICT`` globals; it is
    mutated in place so other readers of the global see the populated
    mapping. The dictionary is then serialized to ``pickle_path`` for
    later deserialization and reuse.
    """
    for index, value in enumerate(values):
        target_dict[str(value)] = str(index)
    with open(pickle_path, 'wb') as f:
        pickle.dump(target_dict, f)


def mapCategoricalFeatures():
    """Build and pickle categorical-to-numeric index maps for cookies and devices.

    NOTE(review): this file defines mapCategoricalFeatures twice; at
    import time the later definition shadows the earlier one — confirm
    which is intended.

    Cookie features: computer OS type (field[2]), browser version
    (field[3]), cookie country (field[4]). Device features: device type
    (field[3]), device OS (field[4]). Each distinct-value list is stored
    in its module-level global and its index dictionary is pickled under
    ``PATH + "dictionary/"``. Finishes by calling ``saveExtractedData``.

    Changes from the original: the five copy-pasted index-then-pickle
    sections are factored into the private ``_index_and_pickle`` helper,
    replacing the manual ``index = index + 1`` counters with
    ``enumerate``; the produced files and mappings are identical.
    """
    sc = ex.configureSpark()

    cookieRDD = ex.getCookieRDD(sc, COOKIE_FILE_RAW)
    cookie_fields = cookieRDD.map(lambda line: line.split(","))
    # Disk-only persistence: the split fields are scanned three times.
    cookie_fields.persist(StorageLevel.DISK_ONLY)

    global COMP_OS_TYPES
    COMP_OS_TYPES = cookie_fields.map(
        lambda field: field[2]).distinct().collect()
    _index_and_pickle(COMP_OS_TYPES, COMP_OS_TYPES_DICT,
                      PATH + "dictionary/comp-os-type-dict.pickle")

    global BROWSER_VERSION
    BROWSER_VERSION = cookie_fields.map(
        lambda field: field[3]).distinct().collect()
    _index_and_pickle(BROWSER_VERSION, BROWSER_VERSION_DICT,
                      PATH + "dictionary/browser-version-dict.pickle")

    global COOKIE_COUNTRY
    COOKIE_COUNTRY = cookie_fields.map(
        lambda field: field[4]).distinct().collect()
    _index_and_pickle(COOKIE_COUNTRY, COOKIE_COUNTRY_DICT,
                      PATH + "dictionary/cookie-country-dict.pickle")

    deviceRDD = ex.getDeviceRDD(sc, DEVICE_FILE_RAW)
    device_fields = deviceRDD.map(lambda line: line.split(','))
    # Disk-only persistence: the split fields are scanned twice.
    device_fields.persist(StorageLevel.DISK_ONLY)

    global DEVICE_TYPES
    DEVICE_TYPES = device_fields.map(
        lambda field: field[3]).distinct().collect()
    _index_and_pickle(DEVICE_TYPES, DEVICE_TYPES_DICT,
                      PATH + "dictionary/dev-type-dict.pickle")

    global DEVICE_OS
    DEVICE_OS = device_fields.map(
        lambda field: field[4]).distinct().collect()
    _index_and_pickle(DEVICE_OS, DEVICE_OS_DICT,
                      PATH + "dictionary/device-os-dict.pickle")

    saveExtractedData(sc)