def runJob(appName, qUserDictPath, outputDir, beginDay, interval_, isForward):
    """Submit the ``getQuserInLast5Monthly`` script through a MissionContext.

    A MissionConf named ``appName`` is created, the context's ``getFolder``
    and ``getEmrPyFile`` are invoked, and the script is submitted with
    ``pySubmit``. The returned status is printed and then handed to
    ``stepSleep``.

    The script parameters are joined with commas in this fixed order:
    qUserDictPath, outputDir, beginDay, interval_, isForward. They are
    combined with ``str.join`` and so must all be strings.
    """
    conf = MissionConf().setAppName(appName)
    context = MissionContext(conf=conf)
    context.getFolder()
    context.getEmrPyFile()

    # Order matters: the submitted script receives one comma-separated string.
    jobArgs = (qUserDictPath, outputDir, beginDay, interval_, isForward)
    status = context.pySubmit(
        "getQuserInLast5Monthly",
        conf.getS3ScriptPath(),
        params=','.join(jobArgs),
        taskNodes='4',
        is_new_cluster='0',
        clusterName='ykang')
    print(status)
    context.stepSleep(status)
def runJob(qPackageDictPath, outputDir, beginDay, interval_, isForward):
    """Submit the ``getUserOpenPackageWeeklyByGivenQpackage`` script.

    The same name is used for the MissionConf application name and for the
    job passed to ``pySubmit``. The status returned by ``pySubmit`` is
    forwarded to ``stepSleep``.

    Script parameters are sent as one comma-separated string in the order
    qPackageDictPath, outputDir, beginDay, interval_, isForward; each must
    be a string because they are combined with ``str.join``.
    """
    jobName = 'getUserOpenPackageWeeklyByGivenQpackage'
    conf = MissionConf().setAppName(jobName)
    context = MissionContext(conf=conf)
    context.getFolder()
    context.getEmrPyFile()

    jobArgs = (qPackageDictPath, outputDir, beginDay, interval_, isForward)
    status = context.pySubmit(
        jobName,
        scriptPath=conf.getS3ScriptPath(),
        params=','.join(jobArgs),
        taskNodes='4',
        is_new_cluster='0',
        clusterName='ykang')
    context.stepSleep(status)
def runJob(outputDir, beginDay, interval_, isForward):
    """Submit the ``getUserTotalNumber_30tian`` script through a MissionContext.

    The status returned by ``pySubmit`` is printed and then passed to
    ``stepSleep``.

    Script parameters are sent as one comma-separated string in the order
    outputDir, beginDay, interval_, isForward; each must be a string because
    they are combined with ``str.join``.
    """
    jobName = 'getUserTotalNumber_30tian'
    conf = MissionConf().setAppName(jobName)
    context = MissionContext(conf=conf)
    context.getFolder()
    context.getEmrPyFile()

    jobArgs = (outputDir, beginDay, interval_, isForward)
    submitStatus = context.pySubmit(
        jobName,
        scriptPath=conf.getS3ScriptPath(),
        params=','.join(jobArgs),
        taskNodes='4',
        is_new_cluster='0',
        clusterName='ykang')
    print(submitStatus)
    context.stepSleep(submitStatus)
def runJob(qPackageDictPath, outputDir, beginDay, interval_, isForward):
    """Run the ``getUserOpenPackageWeeklyByGivenQpackage`` job.

    Builds a MissionConf/MissionContext pair under that application name,
    calls ``getFolder`` and ``getEmrPyFile`` on the context, submits the
    script with ``pySubmit`` and passes the returned status to
    ``stepSleep``.

    All five arguments are joined with ',' (in signature order) into the
    single ``params`` string, so they must be strings.
    """
    APP_NAME = 'getUserOpenPackageWeeklyByGivenQpackage'

    missionConf = MissionConf().setAppName(APP_NAME)
    missionCtx = MissionContext(conf=missionConf)
    missionCtx.getFolder()
    missionCtx.getEmrPyFile()

    submitKwargs = dict(
        scriptPath=missionConf.getS3ScriptPath(),
        params=','.join(
            (qPackageDictPath, outputDir, beginDay, interval_, isForward)),
        taskNodes='4',
        is_new_cluster='0',
        clusterName='ykang',
    )
    stepStatus = missionCtx.pySubmit(APP_NAME, **submitKwargs)
    missionCtx.stepSleep(stepStatus)
def __init__(self, appName, qUserPath, candPath, qPackageToId):
    """Store the input arguments and set up the mission configuration/context.

    ``self.mconf`` is a MissionConf named ``appName`` and ``self.msc`` is the
    MissionContext built from it. ``self.appPath`` is the second element of
    the pair returned by ``self.msc.getFolder()``; the first is discarded.
    """
    # Plain inputs, kept as given.
    self.qUserPath = qUserPath
    self.candPath = candPath
    self.qPackageToId = qPackageToId

    self.mconf = MissionConf().setAppName(appName)
    self.msc = MissionContext(self.mconf)
    _, self.appPath = self.msc.getFolder()
def main(
        qUserOpenPackageToOpenTimes,
        qUserPackageToUserS3Path='s3://datamining.ym/dmuser/ykang/results/qUserPackageToUser_2016_01_24_30tian',
        isDownload=True):
    """Compute TF-IDF package scores and publish the resulting score file.

    Steps, as performed by this function:

    1. Create the ``TF_IDF_Qpackage_UserNumber`` mission context and take
       the second element returned by ``getFolder()`` as the working folder.
    2. When ``isDownload`` is true, recursively copy
       ``qUserPackageToUserS3Path`` into the working folder.
    3. Load the per-package data with ``getQpackageToOpenTimes`` and call
       ``tf_idf`` with it and ``qUserOpenPackageToOpenTimes``.
    4. Copy ``qPackageToScore.txt`` from the working folder to
       ``Qpackage.QPACKAGE_SCORE_TXT`` (presumably written by ``tf_idf`` --
       confirm against that helper).
    """
    conf = MissionConf().setAppName('TF_IDF_Qpackage_UserNumber')
    context = MissionContext(conf=conf)
    _, appPath = context.getFolder()

    if isDownload:
        BashUtil.s3Cp(qUserPackageToUserS3Path, appPath, recursived=True)

    qPackageToOpenTimes = getQpackageToOpenTimes(appPath)
    tf_idf(appPath, qUserOpenPackageToOpenTimes, qPackageToOpenTimes)

    scoreFile = os.path.join(appPath, 'qPackageToScore.txt')
    shutil.copyfile(scoreFile, Qpackage.QPACKAGE_SCORE_TXT)
def runJob(qUserPackageDictPath, outputDir, beginDay, interval_, isForward):
    """Submit the ``getQuserPackageToUser`` script through a MissionContext.

    A MissionConf/MissionContext pair is created under the application name
    ``getQuserPackageToUser``; ``getFolder`` and ``getEmrPyFile`` are called
    on the context, the script is submitted with ``pySubmit``, and the
    returned status is printed and passed to ``stepSleep``.

    Script parameters are sent as one comma-separated string in the order
    qUserPackageDictPath, outputDir, beginDay, interval_, isForward; each
    must be a string because they are combined with ``str.join``.
    """
    mconf = MissionConf().setAppName('getQuserPackageToUser')
    msc = MissionContext(conf=mconf)
    msc.getFolder()
    msc.getEmrPyFile()
    cStatus = msc.pySubmit(
        'getQuserPackageToUser',
        # Fix: call the accessor. Previously the bound method object itself
        # was passed as scriptPath instead of the path it returns.
        scriptPath=mconf.getS3ScriptPath(),
        params=','.join([
            qUserPackageDictPath, outputDir, beginDay, interval_, isForward
        ]),
        taskNodes='4',
        is_new_cluster='0',
        clusterName='ykang')
    print(cStatus)
    msc.stepSleep(cStatus)
def getQuserOpenPackage(
        basePath='s3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay',
        beginDay='2016-01-24',
        interval_='30',
        isForward='0',
        s3DictBasePath='s3://datamining.ym/dmuser/ykang/data/spark.ouwan.qUserOpenPackage',
        isDownload=True):
    """Aggregate per-package counts from daily gzip files and publish them.

    Workflow:

    1. Create the ``getQuserOpenPackage`` mission context and take the second
       element returned by ``getFolder()`` as the local working folder.
    2. When ``isDownload`` is true, copy ``basePath/<day>`` into
       ``<working folder>/<day>`` for every day yielded by
       ``getDaysGen(beginDay, int(interval_), int(isForward))``.
    3. Walk the working folder and read every ``.gz`` file as '|'-separated
       rows. For each row whose first field is not in the ``mask`` table, add
       ``int(row[2])`` to the running total for the key ``row[1]``.
    4. Write ``key|total`` lines, sorted by total in descending order, to
       ``qUserOpenPackage.txt`` in the working folder and copy that file to
       ``s3DictBasePath``.

    Args:
        basePath: S3 prefix holding one sub-folder per day.
        beginDay: first argument forwarded to ``getDaysGen``.
        interval_: converted with ``int()`` and forwarded to ``getDaysGen``.
        isForward: converted with ``int()`` and forwarded to ``getDaysGen``.
        s3DictBasePath: S3 prefix the result file is uploaded under.
        isDownload: skip the S3 download step when false.

    Returns:
        dict mapping ``row[1]`` keys to their summed integer counts.

    Raises:
        ValueError: if ``interval_``, ``isForward`` or a row's third field is
            not an integer literal. Open file handles are closed before the
            error propagates.
    """
    mconf = MissionConf().setAppName('getQuserOpenPackage')
    msc = MissionContext(conf=mconf)
    _, appPath = msc.getFolder()

    if isDownload:
        for theDay in getDaysGen(beginDay, int(interval_), int(isForward)):
            BashUtil.s3Cp(os.path.join(basePath, theDay),
                          appPath + os.sep + theDay,
                          recursived=True)

    openPackage = {}
    # Rows whose first field matches one of these values are ignored.
    # NOTE(review): these look like placeholder/test IMEIs -- confirm.
    mask = {
        'imei=333333333333333': 1,
        'imei=123456789abcdef': 1,
        'imei=111111111111111': 1,
        'imei=012345678912345': 1,
        'imei=000000000000000': 1,
        'imei=00000000000000': 1
    }

    for (dirPath, _, files) in os.walk(appPath):
        print(dirPath)
        for gzfile in files:
            if os.path.splitext(gzfile)[1] != '.gz':
                continue
            f = SepFile('|')
            f.open(dirPath + os.sep + gzfile, mode='gzip', flag='rb')
            # Close the reader even when a malformed row raises.
            try:
                for line in f:
                    if line[0] not in mask:
                        openPackage[line[1]] = (
                            openPackage.get(line[1], 0) + int(line[2]))
            finally:
                f.close()

    print('sorting')
    # sorted() is stable (also with reverse=True), so keys with equal totals
    # keep the dict's iteration order, as with the former index-based sort.
    ranked = sorted(openPackage.items(), key=lambda kv: kv[1], reverse=True)
    print('sorted')

    localResult = os.path.join(appPath, 'qUserOpenPackage.txt')
    writer = LineFile()
    writer.open(localResult, mode='txt', flag='w')
    # Close the writer even when a write fails.
    try:
        for key, value in ranked:
            writer.writeLine(key + '|' + str(value))
    finally:
        writer.close()

    BashUtil.s3Cp(localResult,
                  dst=os.path.join(s3DictBasePath, 'qUserOpenPackage.txt'),
                  recursived=False)
    return openPackage
mode='txt', flag='w') for i in index: key = packs[i] value = openPackage[key] writer.writeLine(key + '|' + str(value)) writer.close() BashUtil.s3Cp(os.path.join(appPath, 'qUserOpenPackage.txt'), dst=os.path.join(s3DictBasePath, 'qUserOpenPackage.txt'), recursived=False) return openPackage if __name__ == '__main__': mconf = MissionConf().setAppName('getQuserOpenPackage') msc = MissionContext(conf=mconf) [_, appPath] = msc.getFolder() basePath = 's3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay' for theDay in getDaysGen('2016-01-24', 30, 0): BashUtil.s3Cp(basePath + os.sep + theDay, appPath + os.sep + theDay, recursived=True) openPackage = {} mask = { 'imei=333333333333333': 1, 'imei=123456789abcdef': 1, 'imei=111111111111111': 1, 'imei=012345678912345': 1,
from com.um.ykang.mission.MissionConf import MissionConf
from com.um.ykang.mission.MissionContext import MissionContext

# Script entry point for the 'getQuserInLast5EachDay' mission: set up the
# context, call getSample on one S3 part file, call getEmrFile, then submit
# with fixed resource settings.
if __name__ == '__main__':
    mconf = MissionConf().setAppName('getQuserInLast5EachDay')
    msc = MissionContext(conf=mconf)
    msc.getFolder()

    # NOTE(review): the second positional argument (10) is presumably the
    # sample size -- confirm against MissionContext.getSample.
    samplePath = 's3://datamining.ym/user_profile/last5/2016-04-14/part-00000.gz'
    msc.getSample(samplePath, 10, recursived=False)

    msc.getEmrFile()
    msc.submit(mem='3G', coreN='5', taskN='2', coreMulti='2')
def makeFolder(): if not os.path.exists(Constance.WORK_SPACE): os.makedirs(Constance.WORK_SPACE) if not os.path.exists(DATA_BASE_PATH): os.makedirs(DATA_BASE_PATH) os.mkdir(os.path.join(DATA_BASE_PATH, 'dict')) #存放字典目录,qPackage字典的存放路径 os.mkdir(os.path.join(DATA_BASE_PATH, 'payQualityUsers')) #优质用户存放路径 os.mkdir(os.path.join(DATA_BASE_PATH, 'candidatesInfo')) if __name__ == '__main__': makeFolder() setPath() mconf = MissionConf().setAppName('main') msc = MissionContext(conf=mconf) [_, appPath] = msc.getFolder() # BashUtil.s3Cp(Quser.TOTAL_QUSER_TXT, payQuserS3Path, recursived=False) # #从last5中计算一个月的优质用户行为 # getQuserInLast5Monthly.runJob(payQuserS3Path, # qUserInLast5EachDayS3Path, # tfBeginDay,tfInterval,tfIsForward) # # #将getQuserInLast5Monthly的结果下载到本地计算tf,并将优质用户打开的所有包上传到S3 # qUserOpenPackageToOpenTimes = getQuserOpenPackage(qUserInLast5EachDayS3Path, # tfBeginDay, tfInterval, tfIsForward, # qUserOpenPackageS3Path, isDownload=True) # # #计算每个包到使用用户个数的字典,用于计算idf # getQuserPackageToUser.runJob(qUserOpenPackageS3Path,