import heapq

import dataParser

# TMP_DATA_DIR_PATH and DATA_QUERY are expected to be defined elsewhere in the
# project (the temp output directory and the query-token data file, respectively).


def generateAd2UsersGivenAdSet(data_training, adSet):
    """Map each AdID in adSet to the set of UserIDs the ad was shown to."""
    ad2Users = dict([(adid, set()) for adid in adSet])
    for line in file(data_training):
        fields = dataParser.parseTrainData(line)
        if fields is None or len(fields) == 0:
            # a line that fails to parse aborts the whole scan
            return
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
            Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = fields
        if UserID == '0' or AdID not in adSet:
            continue
        ad2Users[AdID].add(UserID)
    return ad2Users

def generateTopAdsUsersByClick(data_training, top=200):
    """Return the `top` most-clicked ads as (clickCnt, adid) pairs."""
    AdClickCnt = dict()
    for line in file(data_training):
        fields = dataParser.parseTrainData(line)
        if fields is None or len(fields) == 0:
            return
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
            Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = fields
        if AdID not in AdClickCnt:
            AdClickCnt[AdID] = 0
        AdClickCnt[AdID] += Click
    adClickCntList = [(clickCnt, adid) for adid, clickCnt in AdClickCnt.items()]
    return heapq.nlargest(top, adClickCntList)

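# A minimal usage sketch, not part of the original pipeline: the (clickCnt, adid)
# pairs from generateTopAdsUsersByClick reduce to an ad set, which then drives
# generateAd2UsersGivenAdSet. The default training-file path is a placeholder
# assumption, not a path taken from this repo.
def _exampleBuildAd2Users(data_training='data/training.txt', top=200):
    topAds = generateTopAdsUsersByClick(data_training, top)
    adSet = set(adid for clickCnt, adid in topAds)
    ad2Users = generateAd2UsersGivenAdSet(data_training, adSet)
    # collect every user who was shown at least one of the top ads
    userSet = set()
    for users in ad2Users.values():
        userSet |= users
    return adSet, userSet, ad2Users
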
def dumpAd2UserStatus(data_training, adSet, userSet, fn_out):
    """Dump (AdID, UserID, Click, Impression) records for matching ads/users."""
    output = file(fn_out, 'w')
    format = '%s\t%s\t%d\t%d\n'
    for line in file(data_training):
        fields = dataParser.parseTrainData(line)
        if fields is None or len(fields) == 0:
            return
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
            Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = fields
        #if (Click == 0 and Impression <= 2) or AdID not in adSet or UserID not in userSet:
        #    continue
        if AdID not in adSet or UserID not in userSet:
            continue
        output.write(format % (AdID, UserID, Click, Impression))
    output.close()

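# A reader sketch for the status file written above; the helper name and the
# returned structure are illustrative assumptions. Each record is
# AdID<TAB>UserID<TAB>Click<TAB>Impression.
def _exampleLoadAd2UserStatus(fn_status):
    status = {}
    for line in file(fn_status):
        AdID, UserID, Click, Impression = line.rstrip('\n').split('\t')
        status[(AdID, UserID)] = (int(Click), int(Impression))
    return status
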
def dumpUserRawFeatureGivenUserSet(data_training, userSet, fn):
    """Aggregate per-user query/title/description IDs and dump them to fn."""
    userDict = dict([(userid, {'queryIDlist': [],
                               'titleIDlist': [],
                               'descIDList': []}) for userid in userSet])
    queryIDset = set()
    titleIDset = set()
    descIDset = set()
    for line in file(data_training):
        fields = dataParser.parseTrainData(line)
        if fields is None or len(fields) == 0:
            return
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
            Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = fields
        if UserID == '0' or UserID not in userSet:
            continue
        queryIDset.add(QueryID)
        titleIDset.add(TitleID)
        descIDset.add(DescriptionID)
        userDict[UserID]['queryIDlist'].append(QueryID)
        # only track clicked ads' information
        if Click > 0:
            userDict[UserID]['titleIDlist'].append(TitleID)
            userDict[UserID]['descIDList'].append(DescriptionID)
    # dump aggregation result to file
    dump_format = '%s\x01%s\x02%s\x02%s\n'
    aggregateUserResult = file(fn, 'w')
    for user in userDict:
        aggregateUserResult.write(dump_format %
                                  (user,
                                   '\t'.join(userDict[user]['queryIDlist']),
                                   '\t'.join(userDict[user]['titleIDlist']),
                                   '\t'.join(userDict[user]['descIDList'])))
    aggregateUserResult.close()
    # dump all ID sets to files which will be used to filter additional data
    dumpFilesName = {TMP_DATA_DIR_PATH + 'queryID.set': queryIDset,
                     TMP_DATA_DIR_PATH + 'titleID.set': titleIDset,
                     TMP_DATA_DIR_PATH + 'descID.set': descIDset}
    for filename, s in dumpFilesName.items():
        dumpfile = file(filename, 'w')
        for item in s:
            dumpfile.write('%s\n' % (item))
        dumpfile.close()

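# A reader sketch for the per-user dump written above; the helper name is an
# illustrative assumption. Records are UserID \x01 queryIDs \x02 titleIDs \x02
# descIDs, with tab-separated IDs inside each field.
def _exampleLoadUserRawFeature(fn):
    userFeatures = {}
    for line in file(fn):
        user, rest = line.rstrip('\n').split('\x01')
        queries, titles, descs = rest.split('\x02')
        userFeatures[user] = {'queryIDlist': queries.split('\t') if queries else [],
                              'titleIDlist': titles.split('\t') if titles else [],
                              'descIDList': descs.split('\t') if descs else []}
    return userFeatures
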
def genQueryToken(input_file, ADID):
    """Collect each pre-filtered user's queries for ADID and expand them to query tokens."""
    preFilterUserSet = set(line.split()[1] for line in
                           file(TMP_DATA_DIR_PATH + 'status/%s.ad2userStatus.dat' % ADID))
    user_query = {}
    num = 1
    for line in file(input_file):
        if num % 100000 == 0:
            print ADID, num
        num += 1
        fields = dataParser.parseTrainData(line)
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, Position, \
            QueryID, KeywordID, TitleID, DescriptionID, UserID = fields
        if UserID not in preFilterUserSet:
            continue
        if UserID not in user_query:
            user_query[UserID] = []
        user_query[UserID].append(QueryID)
    # dump UserID \x01 tab-joined QueryIDs for this ad
    writer = file(TMP_DATA_DIR_PATH + 'userQuery/%s.user.query' % ADID, 'w')
    for user in user_query:
        writer.write('%s\x01%s\n' % (user, '\t'.join(user_query[user])))
    writer.close()
    # map every referenced QueryID to its token string, then dump per-user tokens
    writer = file(TMP_DATA_DIR_PATH + 'userQuery/%s.user.queryTokens' % ADID, 'w')
    querySet = set()
    for line in file(TMP_DATA_DIR_PATH + 'userQuery/%s.user.query' % ADID):
        for q in line.strip().split('\x01')[1].split():
            querySet.add(q)
    queryMap = dict(line.strip().split() for line in file(DATA_QUERY)
                    if line.strip().split()[0] in querySet)
    for line in file(TMP_DATA_DIR_PATH + 'userQuery/%s.user.query' % ADID):
        user, query = line.strip().split('\x01')
        query = query.split()
        writer.write('%s\x01%s\n' % (user, '|'.join(queryMap[q] for q in query)))
    writer.close()
    user_query = None
    querySet = None
    queryMap = None

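# A hedged sketch of wiring genQueryToken to the helpers above. genQueryToken
# expects one pre-filter file per ad at TMP_DATA_DIR_PATH +
# 'status/<AdID>.ad2userStatus.dat' with the UserID in the second
# whitespace-separated column; how those per-ad files are produced in the
# original pipeline is not shown in this module, so the version below simply
# derives them from an ad2Users mapping (e.g. the one returned by
# _exampleBuildAd2Users). The 'status/' and 'userQuery/' directories under
# TMP_DATA_DIR_PATH are assumed to exist already.
def _exampleRunGenQueryToken(data_training, ad2Users):
    for adid, users in ad2Users.items():
        statusFile = file(TMP_DATA_DIR_PATH + 'status/%s.ad2userStatus.dat' % adid, 'w')
        for user in users:
            # AdID first, UserID second, matching line.split()[1] in genQueryToken
            statusFile.write('%s\t%s\n' % (adid, user))
        statusFile.close()
        genQueryToken(data_training, adid)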