コード例 #1
0
def runJob(appName, qUserDictPath, outputDir, beginDay, interval_, isForward):
    """Submit the ``getQuserInLast5Monthly`` script through ``pySubmit``.

    ``appName`` names the mission configuration.  The remaining five
    arguments are joined with ``','`` into the single ``params`` string
    passed to the job, so each of them must be a string.  The status
    returned by ``pySubmit`` is printed and then handed to ``stepSleep``.
    """
    conf = MissionConf().setAppName(appName)
    context = MissionContext(conf=conf)
    context.getFolder()
    context.getEmrPyFile()

    scriptPath = conf.getS3ScriptPath()
    jobArgs = (qUserDictPath, outputDir, beginDay, interval_, isForward)
    status = context.pySubmit(
        "getQuserInLast5Monthly",
        scriptPath,
        params=','.join(jobArgs),
        taskNodes='4',
        is_new_cluster='0',
        clusterName='ykang')
    print(status)
    context.stepSleep(status)
コード例 #2
0
def runJob(qPackageDictPath, outputDir, beginDay, interval_, isForward):
    """Submit ``getUserOpenPackageWeeklyByGivenQpackage`` through ``pySubmit``.

    All five arguments are joined with ``','`` into the job's ``params``
    string (they must therefore be strings).  The status returned by
    ``pySubmit`` is passed on to ``stepSleep``.
    """
    jobName = 'getUserOpenPackageWeeklyByGivenQpackage'
    conf = MissionConf().setAppName(jobName)
    context = MissionContext(conf=conf)
    context.getFolder()
    context.getEmrPyFile()

    submitKwargs = {'scriptPath': conf.getS3ScriptPath()}
    submitKwargs['params'] = ','.join(
        (qPackageDictPath, outputDir, beginDay, interval_, isForward))
    submitKwargs['taskNodes'] = '4'
    submitKwargs['is_new_cluster'] = '0'
    submitKwargs['clusterName'] = 'ykang'

    status = context.pySubmit(jobName, **submitKwargs)
    context.stepSleep(status)
コード例 #3
0
def runJob(outputDir, beginDay, interval_, isForward):
    """Submit the ``getUserTotalNumber_30tian`` script through ``pySubmit``.

    The four arguments are comma-joined into the job's ``params`` string,
    so each must be a string.  The returned status is printed and then
    passed to ``stepSleep``.
    """
    conf = MissionConf().setAppName('getUserTotalNumber_30tian')
    context = MissionContext(conf=conf)
    context.getFolder()
    context.getEmrPyFile()

    s3Script = conf.getS3ScriptPath()
    joinedParams = ','.join((outputDir, beginDay, interval_, isForward))
    submitStatus = context.pySubmit(
        'getUserTotalNumber_30tian',
        scriptPath=s3Script,
        params=joinedParams,
        taskNodes='4',
        is_new_cluster='0',
        clusterName='ykang')
    print(submitStatus)
    context.stepSleep(submitStatus)
コード例 #4
0
def runJob(qPackageDictPath, outputDir, beginDay, interval_, isForward):
    """Run the ``getUserOpenPackageWeeklyByGivenQpackage`` job.

    The arguments (all strings) are joined with ``','`` and passed as the
    ``params`` value of ``MissionContext.pySubmit``; the status it returns
    is then given to ``stepSleep``.
    """
    missionConf = MissionConf().setAppName(
        'getUserOpenPackageWeeklyByGivenQpackage')
    mission = MissionContext(conf=missionConf)
    mission.getFolder()
    mission.getEmrPyFile()

    scriptLocation = missionConf.getS3ScriptPath()
    argFields = [qPackageDictPath, outputDir, beginDay, interval_, isForward]
    paramString = ','.join(argFields)

    result = mission.pySubmit('getUserOpenPackageWeeklyByGivenQpackage',
                              scriptPath=scriptLocation,
                              params=paramString,
                              taskNodes='4',
                              is_new_cluster='0',
                              clusterName='ykang')
    mission.stepSleep(result)
コード例 #5
0
 def __init__(self, appName, qUserPath, candPath, qPackageToId):
     """Create the mission objects and remember the given inputs.

     Builds a ``MissionConf`` named ``appName`` and a ``MissionContext``
     around it, stores the second element returned by ``getFolder()`` as
     ``self.appPath``, and keeps the three remaining arguments as
     attributes of the same name.
     """
     conf = MissionConf().setAppName(appName)
     self.mconf = conf
     context = MissionContext(conf)
     self.msc = context
     # getFolder() yields a pair; only its second element is kept.
     _, self.appPath = context.getFolder()
     self.qUserPath = qUserPath
     self.candPath = candPath
     self.qPackageToId = qPackageToId
コード例 #6
0
def main(
        qUserOpenPackageToOpenTimes,
        qUserPackageToUserS3Path='s3://datamining.ym/dmuser/ykang/results/qUserPackageToUser_2016_01_24_30tian',
        isDownload=True):
    """Compute package scores with ``tf_idf`` and publish the score file.

    When ``isDownload`` is true, ``qUserPackageToUserS3Path`` is first copied
    recursively into the application folder.  ``getQpackageToOpenTimes`` is
    then run on that folder, its result is fed to ``tf_idf`` together with
    ``qUserOpenPackageToOpenTimes``, and finally ``qPackageToScore.txt`` is
    copied from the application folder to ``Qpackage.QPACKAGE_SCORE_TXT``.
    """
    conf = MissionConf().setAppName('TF_IDF_Qpackage_UserNumber')
    context = MissionContext(conf=conf)
    _, workDir = context.getFolder()

    if isDownload:
        BashUtil.s3Cp(qUserPackageToUserS3Path, workDir, recursived=True)

    packageOpenTimes = getQpackageToOpenTimes(workDir)
    tf_idf(workDir, qUserOpenPackageToOpenTimes, packageOpenTimes)

    scoreFile = os.path.join(workDir, 'qPackageToScore.txt')
    shutil.copyfile(scoreFile, Qpackage.QPACKAGE_SCORE_TXT)
コード例 #7
0
def runJob(qUserPackageDictPath, outputDir, beginDay, interval_, isForward):
    """Submit the ``getQuserPackageToUser`` script through ``pySubmit``.

    The five arguments are joined with ``','`` into the single ``params``
    string handed to the job, so each of them must be a string.  The status
    returned by ``pySubmit`` is printed and then passed to ``stepSleep``.
    """
    mconf = MissionConf().setAppName('getQuserPackageToUser')
    msc = MissionContext(conf=mconf)
    msc.getFolder()
    msc.getEmrPyFile()
    cStatus = msc.pySubmit('getQuserPackageToUser',
                           # The getter must be *called*: previously the
                           # bound method object itself was passed as the
                           # script path.
                           scriptPath=mconf.getS3ScriptPath(),
                           params=','.join([
                               qUserPackageDictPath, outputDir, beginDay,
                               interval_, isForward
                           ]),
                           taskNodes='4',
                           is_new_cluster='0',
                           clusterName='ykang')
    print(cStatus)
    msc.stepSleep(cStatus)
コード例 #8
0
def getQuserOpenPackage(
        basePath='s3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay',
        beginDay='2016-01-24',
        interval_='30',
        isForward='0',
        s3DictBasePath='s3://datamining.ym/dmuser/ykang/data/spark.ouwan.qUserOpenPackage',
        isDownload=True):
    """Total the open counts per package and publish them sorted by count.

    Steps:
      1. If ``isDownload`` is true, copy ``basePath/<day>`` into the
         application folder for every day produced by
         ``getDaysGen(beginDay, int(interval_), int(isForward))``.
      2. Walk the application folder and read every ``*.gz`` file through
         ``SepFile('|')``.  For each record, field 0 is checked against a
         list of placeholder IMEI keys (such records are skipped), field 1
         is used as the package key and field 2 is added as an integer count.
      3. Write ``<package>|<count>`` lines, highest count first, to
         ``qUserOpenPackage.txt`` in the application folder and copy that
         file to ``s3DictBasePath``.

    Returns:
        dict mapping package key -> summed integer count.
    """
    mconf = MissionConf().setAppName('getQuserOpenPackage')
    msc = MissionContext(conf=mconf)
    [_, appPath] = msc.getFolder()
    if isDownload:
        for theDay in getDaysGen(beginDay, int(interval_), int(isForward)):
            BashUtil.s3Cp(os.path.join(basePath, theDay),
                          appPath + os.sep + theDay,
                          recursived=True)

    # Records whose first field is one of these placeholder IMEIs are ignored.
    mask = frozenset((
        'imei=333333333333333',
        'imei=123456789abcdef',
        'imei=111111111111111',
        'imei=012345678912345',
        'imei=000000000000000',
        'imei=00000000000000',
    ))

    openPackage = {}
    for (dirPath, _, files) in os.walk(appPath):
        print(dirPath)
        for gzfile in files:
            if os.path.splitext(gzfile)[1] != '.gz':
                continue
            f = SepFile('|')
            f.open(dirPath + os.sep + gzfile, mode='gzip', flag='rb')
            # Always release the handle, even if a record is malformed.
            try:
                for line in f:
                    if line[0] in mask:
                        continue
                    package = line[1]
                    openPackage[package] = (openPackage.get(package, 0) +
                                            int(line[2]))
            finally:
                f.close()

    print('sorting')
    # sorted() is stable, so packages with equal counts keep the dict's
    # iteration order -- the same order the index-based sort produced.
    ranked = sorted(openPackage.items(),
                    key=lambda item: item[1],
                    reverse=True)
    print('sorted')

    localDictPath = os.path.join(appPath, 'qUserOpenPackage.txt')
    writer = LineFile()
    writer.open(localDictPath, mode='txt', flag='w')
    try:
        for key, value in ranked:
            writer.writeLine(key + '|' + str(value))
    finally:
        writer.close()

    BashUtil.s3Cp(localDictPath,
                  dst=os.path.join(s3DictBasePath, 'qUserOpenPackage.txt'),
                  recursived=False)
    return openPackage
コード例 #9
0
                mode='txt',
                flag='w')
    for i in index:
        key = packs[i]
        value = openPackage[key]
        writer.writeLine(key + '|' + str(value))
    writer.close()

    BashUtil.s3Cp(os.path.join(appPath, 'qUserOpenPackage.txt'),
                  dst=os.path.join(s3DictBasePath, 'qUserOpenPackage.txt'),
                  recursived=False)
    return openPackage


if __name__ == '__main__':
    mconf = MissionConf().setAppName('getQuserOpenPackage')
    msc = MissionContext(conf=mconf)
    [_, appPath] = msc.getFolder()

    basePath = 's3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay'

    for theDay in getDaysGen('2016-01-24', 30, 0):
        BashUtil.s3Cp(basePath + os.sep + theDay,
                      appPath + os.sep + theDay,
                      recursived=True)
    openPackage = {}
    mask = {
        'imei=333333333333333': 1,
        'imei=123456789abcdef': 1,
        'imei=111111111111111': 1,
        'imei=012345678912345': 1,
コード例 #10
0
from com.um.ykang.mission.MissionConf import MissionConf
from com.um.ykang.mission.MissionContext import MissionContext

if __name__ == '__main__':
    # Script entry point: configure the mission, fetch a sample of the input
    # and submit the job with fixed resource settings.
    conf = MissionConf()
    mconf = conf.setAppName('getQuserInLast5EachDay')
    msc = MissionContext(conf=mconf)
    msc.getFolder()

    samplePath = 's3://datamining.ym/user_profile/last5/2016-04-14/part-00000.gz'
    msc.getSample(samplePath, 10, recursived=False)
    msc.getEmrFile()

    submitOptions = {'mem': '3G', 'coreN': '5', 'taskN': '2', 'coreMulti': '2'}
    msc.submit(**submitOptions)
コード例 #11
0
ファイル: main.py プロジェクト: KeyKy/look-alike

def makeFolder():
    """Create the workspace and data directories that do not exist yet.

    Every directory is checked on its own, so a ``DATA_BASE_PATH`` that
    already exists but lacks some of its sub-directories is completed
    (previously the sub-directories were created only together with
    ``DATA_BASE_PATH`` itself).
    """
    required = [
        Constance.WORK_SPACE,
        DATA_BASE_PATH,
        # Dictionary directory; the qPackage dictionary is stored here.
        os.path.join(DATA_BASE_PATH, 'dict'),
        # Storage location for the quality (paying) users.
        os.path.join(DATA_BASE_PATH, 'payQualityUsers'),
        os.path.join(DATA_BASE_PATH, 'candidatesInfo'),
    ]
    for path in required:
        if not os.path.exists(path):
            os.makedirs(path)


if __name__ == '__main__':
    makeFolder()
    setPath()
    mconf = MissionConf().setAppName('main')
    msc = MissionContext(conf=mconf)
    [_, appPath] = msc.getFolder()
#     BashUtil.s3Cp(Quser.TOTAL_QUSER_TXT, payQuserS3Path, recursived=False)
#     #从last5中计算一个月的优质用户行为
#     getQuserInLast5Monthly.runJob(payQuserS3Path,
#                                   qUserInLast5EachDayS3Path,
#                                   tfBeginDay,tfInterval,tfIsForward)
#
#     #将getQuserInLast5Monthly的结果下载到本地计算tf,并将优质用户打开的所有包上传到S3
#     qUserOpenPackageToOpenTimes = getQuserOpenPackage(qUserInLast5EachDayS3Path,
#                         tfBeginDay, tfInterval, tfIsForward,
#                         qUserOpenPackageS3Path, isDownload=True)
#
#     #计算每个包到使用用户个数的字典,用于计算idf
#     getQuserPackageToUser.runJob(qUserOpenPackageS3Path,