예제 #1
0
 def getPackageToScore(mask=None, inOrOut=True):
     """Load package scores from ``Qpackage.QPACKAGE_SCORE_TXT``.

     Every record is split on ':'; field 0 is used as the package key and
     field 1 is converted to ``float`` as its score.

     Args:
         mask: Optional container of package keys. ``None`` disables
             filtering.
         inOrOut: Only used when ``mask`` is given. A truthy value keeps the
             keys found in ``mask``; a falsy value keeps the keys that are
             absent from it.

     Returns:
         dict mapping package key -> float score.
     """
     packToScore = {}
     f = SepFile(':')
     f.open(Qpackage.QPACKAGE_SCORE_TXT, mode='txt', flag='r')
     try:
         for line in f:
             package = line[0]
             # Identity test: ``== None`` would call a custom __eq__ on mask.
             if mask is None or (package in mask) == bool(inOrOut):
                 packToScore[package] = float(line[1])
     finally:
         # Release the reader even when a malformed score raises.
         f.close()
     return packToScore
     
     
     
     
     
     
     
     
     
         
 def procInterval(self, week):  # Quser data can only be selected per week, not per record
     """Sum package open counts per user over the selected week folders.

     Walks ``self.qUserPath`` and reads every file whose joined path matches
     the regex built from ``self.qUserPath``, an alternation of the ``week``
     entries and a ``.gz`` file-name pattern. Records are split on '|':
     field 0 is the user id, field 1 the package name, field 2 the open
     count.

     Args:
         week: Iterable of strings joined with '|' to form the directory
             alternation of the path regex.

     Returns:
         dict of user id -> {package name: summed open count}. Only packages
         present in ``self.qPackageToId`` are counted; a user whose packages
         are all unknown still gets an empty inner dict.
     """
     userToOpenPackage = {}
     match_dir = re.compile(
         os.path.join(self.qUserPath, '(' + '|'.join(week) + ')', r'.*\.gz'))
     for (dirpath, _, files) in os.walk(self.qUserPath):
         for gzfile in files:
             gzfile_dir = os.path.join(dirpath, gzfile)
             if not match_dir.search(gzfile_dir):
                 continue
             f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
             try:
                 for line in f:
                     # line[0] userId, line[1] packageName, line[2] openTimes
                     opened = userToOpenPackage.setdefault(line[0], {})
                     if line[1] in self.qPackageToId:
                         opened[line[1]] = (opened.get(line[1], 0) +
                                            int(line[2]))
             finally:
                 # Close the gzip reader even if a count fails to parse.
                 f.close()
     return userToOpenPackage
예제 #3
0
    def getIdToQuser(mask=None):
        """Build a lookup from field 1 to field 0 of ``Quser.QUSER_ID_TXT``.

        Records are split on '|'.

        Args:
            mask: Optional container of field-1 values to keep. ``None``
                keeps every record.

        Returns:
            dict mapping field 1 -> field 0 of each kept record.
        """
        idToQuser = {}
        f = SepFile('|').open(Quser.QUSER_ID_TXT, 'txt', 'r')
        try:
            for line in f:
                if mask is None or line[1] in mask:
                    idToQuser[line[1]] = line[0]
        finally:
            # Always release the reader, even on a short/malformed record.
            f.close()
        return idToQuser


# if __name__ == '__main__':
#     user = set()
#     f = SepFile(',').open('/root/Downloads/look-alike/data/payQualityUsers/payQualityUsers.txt', 'txt', 'r')
#     for line in f:
#         username = line[0]
#         if len(line[0]) == 0 and len(line[1]) != 0:
#             username = line[1]
#         user.add(username)
#     f.close()
#     print len(user)
#     print 'return'


        
 def procInterval(self, weekInterval): # differs from Quser: the 7 days do not need to be summed up here
     """Yield per-user package open counts in bounded batches.

     Walks ``self.candPath`` and reads every file whose joined path matches
     the regex built from ``self.candPath``, an alternation of the
     ``weekInterval`` entries and a ``.gz`` file-name pattern. Records are
     split on '|': field 0 is the user id, field 1 the package name and
     field 2 the open count. Only packages in ``self.qPackageToId`` are
     counted.

     A batch is emitted when a user id not yet in the current batch shows up
     while the batch already holds ``self.max_record`` users or more, so the
     records of one user may end up spread over several batches.

     Args:
         weekInterval: Iterable of strings joined with '|' to form the
             directory alternation of the path regex.

     Yields:
         dict of user id -> {package name: open count}. The final batch is
         always yielded, even when it is empty.
     """
     userToOpenPackage = {}
     num_record = 0  # number of users in the current batch
     match_dir = re.compile(os.path.join(self.candPath, '(' + '|'.join(weekInterval) + ')', r'.*\.gz'))
     for (dirpath, _, files) in os.walk(self.candPath):
         for gzfile in files:
             gzfile_dir = os.path.join(dirpath, gzfile)
             if not match_dir.search(gzfile_dir):
                 continue
             f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
             try:
                 for line in f:
                     if line[0] not in userToOpenPackage:
                         if num_record >= self.max_record:
                             # Batch is full: hand it out and start afresh.
                             yield userToOpenPackage
                             userToOpenPackage = {}
                             num_record = 0
                         userToOpenPackage[line[0]] = {}
                         num_record += 1
                     if line[1] in self.qPackageToId:
                         opened = userToOpenPackage[line[0]]
                         opened[line[1]] = opened.get(line[1], 0) + int(line[2])
             finally:
                 # Also runs when the consumer abandons the generator early.
                 f.close()
     yield userToOpenPackage
예제 #5
0
 def writeQpackageToId():
     """Number the records of the score file and write them as an id file.

     Reads ``Qpackage.QPACKAGE_SCORE_TXT`` (records split on ':') and writes
     one ``<field 0>|<index>`` line per record to
     ``Qpackage.QPACKAGE_ID_TXT``; the index starts at 0 and follows the
     record order.
     """
     f = SepFile(':').open(Qpackage.QPACKAGE_SCORE_TXT, 'txt', 'r')
     try:
         writer = LineFile().open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'w')
         try:
             for idx, line in enumerate(f):
                 writer.writeLine(line[0] + '|' + str(idx))
         finally:
             writer.close()
     finally:
         # Both handles are released even if a write fails midway.
         f.close()
예제 #6
0
 def writeQpackageToId():
     """Assign consecutive ids to the packages listed in the score file.

     Every record of ``Qpackage.QPACKAGE_SCORE_TXT`` (split on ':') yields
     one line ``<field 0>|<index>`` in ``Qpackage.QPACKAGE_ID_TXT``, with
     indices counted from 0 in record order.
     """
     f = SepFile(':').open(Qpackage.QPACKAGE_SCORE_TXT, 'txt', 'r')
     try:
         writer = LineFile().open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'w')
         try:
             for idx, line in enumerate(f):
                 writer.writeLine(line[0] + '|' + str(idx))
         finally:
             writer.close()
     finally:
         # Guarantee the reader is closed on every exit path.
         f.close()
예제 #7
0
 def getQuserToId(mask=None):
     """Build a lookup from field 0 to field 1 of ``Quser.QUSER_ID_TXT``.

     Records are split on '|'.

     Args:
         mask: Optional container of field-0 values to keep. ``None`` keeps
             every record.

     Returns:
         dict mapping field 0 -> field 1 of each kept record.
     """
     qUserToId = {}
     f = SepFile('|').open(Quser.QUSER_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is None or line[0] in mask:
                 qUserToId[line[0]] = line[1]
     finally:
         # Close the reader even when a record is malformed.
         f.close()
     return qUserToId
예제 #8
0
def getQuserOpenPackage(basePath='s3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay', 
                        beginDay='2016-01-24', 
                        interval_='30',
                        isForward='0',
                        s3DictBasePath='s3://datamining.ym/dmuser/ykang/data/spark.ouwan.qPackageToId',
                        isDownload=True):
    """Total the open counts per package and publish a package-id file.

    Steps:
      1. Optionally copy one folder per day from ``basePath`` into the
         mission folder returned by ``MissionContext.getFolder()``.
      2. Read every ``.gz`` file below that folder (records split on '|':
         field 0 user id, field 1 package, field 2 open count) and sum the
         counts per package, skipping the hard-coded user ids in ``mask``.
      3. Write ``qUserOpenPackage.txt`` (``package|count``, highest count
         first) into the mission folder.
      4. Write ``package|index`` lines to ``Qpackage.QPACKAGE_ID_TXT`` and
         copy that file to ``s3DictBasePath``.

    Args:
        basePath: Source prefix holding one sub-folder per day.
        beginDay: First argument passed to ``getDaysGen``.
        interval_: Second argument of ``getDaysGen``; converted with ``int``.
        isForward: Third argument of ``getDaysGen``; converted with ``int``.
        s3DictBasePath: Destination prefix for ``qPackageToId.txt``.
        isDownload: When falsy the download step is skipped.

    Returns:
        dict mapping package -> summed open count.
    """
    mconf = MissionConf().setAppName('getQuserOpenPackage')
    msc = MissionContext(conf=mconf)
    [_, appPath] = msc.getFolder()
    if isDownload:
        for theDay in getDaysGen(beginDay, int(interval_), int(isForward)):
            BashUtil.s3Cp(os.path.join(basePath,theDay), appPath+os.sep+theDay, recursived=True)
    openPackage = {}
    # User ids (field 0) whose records are ignored.
    mask = frozenset(['imei=333333333333333', 'imei=123456789abcdef', 'imei=111111111111111',
                      'imei=012345678912345', 'imei=000000000000000', 'imei=00000000000000'])
    for (dirpath, _, files) in os.walk(appPath):
        print(dirpath)
        for gzfile in files:
            if os.path.splitext(gzfile)[1] != '.gz':
                continue
            f = SepFile('|')
            f.open(dirpath+os.sep+gzfile, mode='gzip', flag='rb')
            try:
                for line in f:
                    if line[0] not in mask:
                        openPackage[line[1]] = openPackage.get(line[1], 0) + int(line[2])
            finally:
                # Release the reader even if a count fails to parse.
                f.close()
    print('sorting')
    # sorted() is stable with reverse=True, so ties keep the dict's own
    # iteration order.
    ranked = sorted(openPackage.items(), key=lambda item: item[1], reverse=True)
    print('sorted')
 
    writer = LineFile()
    writer.open(os.path.join(appPath, 'qUserOpenPackage.txt'), mode='txt', flag='w')
    try:
        for key, value in ranked:
            writer.writeLine(key + '|' + str(value))
    finally:
        writer.close()

    # qUserOpenPackageToOpenTimes can be written to this location: Qpackage.QPACKAGE_ID_TXT
    # Ids follow the dict's iteration order, not the ranking above.
    f = LineFile().open(Qpackage.QPACKAGE_ID_TXT, mode='txt', flag='w')
    try:
        for index, qPackage in enumerate(openPackage):
            f.writeLine(qPackage + '|' + str(index))
    finally:
        f.close()
    
    BashUtil.s3Cp(Qpackage.QPACKAGE_ID_TXT, dst=os.path.join(s3DictBasePath, 'qPackageToId.txt'), recursived=False)
    
    return openPackage
예제 #9
0
 def getIdToPackage(mask=None):
     """Build a lookup from field 1 to field 0 of ``Qpackage.QPACKAGE_ID_TXT``.

     Records are split on '|'.

     Args:
         mask: Optional container of field-1 values to keep. ``None`` keeps
             every record.

     Returns:
         dict mapping field 1 -> field 0 of each kept record.
     """
     idToQpackage = {}
     f = SepFile('|').open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is None or line[1] in mask:
                 idToQpackage[line[1]] = line[0]
     finally:
         # Close the reader on every exit path.
         f.close()
     return idToQpackage
예제 #10
0
 def getIdToCandidate(mask=None):
     """Build a lookup from field 1 to field 0 of ``Candidate.CANDIDATES_ID_TXT``.

     Records are split on '|'. A progress message is printed first.

     Args:
         mask: Optional container of field-1 values to keep. ``None`` keeps
             every record.

     Returns:
         dict mapping field 1 -> field 0 of each kept record.
     """
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print('Info: Loading idToCandidate')
     idToCand = {}
     f = SepFile('|').open(Candidate.CANDIDATES_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is None or line[1] in mask:
                 idToCand[line[1]] = line[0]
     finally:
         # Close the reader even when a record is malformed.
         f.close()
     return idToCand
예제 #11
0
 def getQpackageToId(mask=None):
     """Build a lookup from field 0 to field 1 of ``Qpackage.QPACKAGE_ID_TXT``.

     Records are split on '|'. A progress message is printed first.

     Args:
         mask: Optional container of field-0 values to keep. ``None`` keeps
             every record.

     Returns:
         dict mapping field 0 -> field 1 of each kept record.
     """
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print('Info: loading QpackageToId')
     qPackageToId = {}
     f = SepFile('|').open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is None or line[0] in mask:
                 qPackageToId[line[0]] = line[1]
     finally:
         # Close the reader even when a record is malformed.
         f.close()
     return qPackageToId
예제 #12
0
 def getCandidateToId(mask=None):
     """Return the candidate -> id mapping, optionally restricted by a mask.

     Without a mask the whole mapping is unpickled from
     ``Candidate.CANDIDATES_ID_PICKLE``. With a mask the text file
     ``Candidate.CANDIDATES_ID_TXT`` (records split on '|') is scanned and
     only records whose field 0 is in ``mask`` are kept.

     Args:
         mask: Optional container of field-0 values to keep.

     Returns:
         The unpickled object when ``mask`` is ``None``; otherwise a dict
         mapping field 0 -> field 1.
     """
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print('Info: Loading canditateToId')
     if mask is None:
         # NOTE: unpickling runs arbitrary code from the file; this is only
         # safe for a pickle the project wrote itself.
         with open(Candidate.CANDIDATES_ID_PICKLE, 'rb') as pickled:
             return pickle.load(pickled)
     candToId = {}
     f = SepFile('|').open(Candidate.CANDIDATES_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if line[0] in mask:
                 candToId[line[0]] = line[1]
     finally:
         f.close()
     return candToId
예제 #13
0
 def getQpackageToId(mask=None):
     """Load the package -> id table stored in ``Qpackage.QPACKAGE_ID_TXT``.

     The file is read as '|'-separated records; field 0 becomes the key and
     field 1 the value. A progress message is printed before loading.

     Args:
         mask: Optional container of field-0 values; when given, records
             whose field 0 is not in it are dropped.

     Returns:
         dict mapping field 0 -> field 1.
     """
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print('Info: loading QpackageToId')
     qPackageToId = {}
     f = SepFile('|').open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is not None and line[0] not in mask:
                 continue
             qPackageToId[line[0]] = line[1]
     finally:
         # The reader is released on every exit path.
         f.close()
     return qPackageToId
예제 #14
0
 def getCandidateToId(mask=None):
     """Load the candidate -> id table, from pickle or from the text file.

     With ``mask`` left at ``None`` the object stored in
     ``Candidate.CANDIDATES_ID_PICKLE`` is returned as is. Otherwise
     ``Candidate.CANDIDATES_ID_TXT`` is read as '|'-separated records and a
     dict of field 0 -> field 1 is built from the records whose field 0 is
     in ``mask``.

     Args:
         mask: Optional container of field-0 values to keep.

     Returns:
         The unpickled object, or the filtered dict.
     """
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print('Info: Loading canditateToId')
     if mask is None:
         # NOTE: pickle.load executes code embedded in the file; never point
         # this at a pickle from an untrusted source.
         with open(Candidate.CANDIDATES_ID_PICKLE, 'rb') as pickled:
             candToId = pickle.load(pickled)
         return candToId
     candToId = {}
     f = SepFile('|').open(Candidate.CANDIDATES_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if line[0] in mask:
                 candToId[line[0]] = line[1]
     finally:
         f.close()
     return candToId
예제 #15
0
 def getIdToCandidate(mask=None):
     """Load the id -> candidate table from ``Candidate.CANDIDATES_ID_TXT``.

     The file is read as '|'-separated records; field 1 becomes the key and
     field 0 the value. A progress message is printed before loading.

     Args:
         mask: Optional container of field-1 values; when given, records
             whose field 1 is not in it are dropped.

     Returns:
         dict mapping field 1 -> field 0.
     """
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print('Info: Loading idToCandidate')
     idToCand = {}
     f = SepFile('|').open(Candidate.CANDIDATES_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is not None and line[1] not in mask:
                 continue
             idToCand[line[1]] = line[0]
     finally:
         # The reader is released on every exit path.
         f.close()
     return idToCand
예제 #16
0
def getQpackageToOpenTimes(appPath): #idf
    """Read package open counts from every ``.gz`` file below ``appPath``.

    Each file name found by the directory walk is printed. Files with the
    ``.gz`` extension are read as '|'-separated records; field 0 is the key
    and field 1 is converted to ``int``. A key seen more than once keeps the
    value of the last record read.

    Args:
        appPath: Root directory to walk.

    Returns:
        dict mapping field 0 -> int(field 1).
    """
    qPackageToOpenTimes = {}
    for (dirpath, _, files) in os.walk(appPath):
        for gzfile in files:
            print(gzfile)
            if os.path.splitext(gzfile)[1] != '.gz':
                continue
            reader = SepFile('|')
            reader.open(os.path.join(dirpath, gzfile), mode='gzip', flag='rb')
            try:
                for line in reader:
                    qPackageToOpenTimes[line[0]] = int(line[1])
            finally:
                # Close the reader even when a count fails to parse.
                reader.close()
    return qPackageToOpenTimes
예제 #17
0
파일: Quser.py 프로젝트: KeyKy/look-alike
 def getIdToQuser(mask=None, inOrOut=True):
     """Build a lookup from field 1 to field 0 of ``Quser.QUSER_ID_TXT``.

     Records are split on '|'.

     Args:
         mask: Optional container of field-1 values. ``None`` disables
             filtering.
         inOrOut: Only used when ``mask`` is given. A truthy value keeps the
             records whose field 1 is in ``mask``; a falsy value keeps the
             records whose field 1 is not in it.

     Returns:
         dict mapping field 1 -> field 0 of each kept record.
     """
     idToQuser = {}
     f = SepFile('|').open(Quser.QUSER_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is None or (line[1] in mask) == bool(inOrOut):
                 idToQuser[line[1]] = line[0]
     finally:
         # Close the reader even when a record is malformed.
         f.close()
     return idToQuser
예제 #18
0
 def getIdToQuser(mask=None, inOrOut=True):
     """Load the id -> user table from ``Quser.QUSER_ID_TXT``.

     The file is read as '|'-separated records; field 1 becomes the key and
     field 0 the value.

     Args:
         mask: Optional container of field-1 values used as a filter.
         inOrOut: With a mask, truthy means "keep what is in the mask" and
             falsy means "keep what is not in the mask". Ignored when
             ``mask`` is ``None``.

     Returns:
         dict mapping field 1 -> field 0.
     """
     idToQuser = {}
     f = SepFile('|').open(Quser.QUSER_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             if mask is None:
                 selected = True
             elif inOrOut:
                 selected = line[1] in mask
             else:
                 selected = line[1] not in mask
             if selected:
                 idToQuser[line[1]] = line[0]
     finally:
         # The reader is released on every exit path.
         f.close()
     return idToQuser
예제 #19
0
 def getIdToPackage(mask=None, inOrOut=True):
     """Build a lookup from field 1 to field 0 of ``Qpackage.QPACKAGE_ID_TXT``.

     Records are split on '|'.

     Args:
         mask: Optional container of field-0 values. ``None`` disables
             filtering.
         inOrOut: Only used when ``mask`` is given. A truthy value keeps the
             records whose field 0 is in ``mask``; a falsy value keeps the
             records whose field 0 is not in it.

     Returns:
         dict mapping field 1 -> field 0 of each kept record.
     """
     idToQpackage = {}
     f = SepFile('|').open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'r')
     try:
         for line in f:
             # NOTE(review): the mask is matched against field 0 (the dict
             # value) while the dict is keyed by field 1 - confirm that
             # masking by field 0 is intended.
             if mask is None or (line[0] in mask) == bool(inOrOut):
                 idToQpackage[line[1]] = line[0]
     finally:
         # Close the reader even when a record is malformed.
         f.close()
     return idToQpackage
예제 #20
0
def getUserTotalNumber(s3Path, isDownload=True):
    """Count the records of every ``.gz`` file in the mission folder.

    Optionally copies ``s3Path`` recursively into the folder returned by
    ``MissionContext.getFolder()``, then walks that folder and counts the
    '|'-separated records of each ``.gz`` file.

    Args:
        s3Path: Source location handed to ``BashUtil.s3Cp``.
        isDownload: When falsy the copy step is skipped and only files that
            are already in the folder are counted.

    Returns:
        Total number of records read (presumably one record per user -
        confirm against the data layout).
    """
    mconf = MissionConf().setAppName('userTotalNumber')
    msc = MissionContext(conf=mconf)
    # The first element of the pair is unused here; it was previously bound
    # to a misleading local called ``self``.
    [_, appPath] = msc.getFolder()
    if isDownload:
        BashUtil.s3Cp(s3Path, appPath, recursived=True)
    userTotalNumber = 0
    for (dirpath, _, files) in os.walk(appPath):
        for gzfile in files:
            if os.path.splitext(gzfile)[1] != '.gz':
                continue
            reader = SepFile('|')
            reader.open(os.path.join(dirpath, gzfile), mode='gzip', flag='rb')
            try:
                userTotalNumber += sum(1 for _record in reader)
            finally:
                # Close the reader even if iteration fails midway.
                reader.close()
    return userTotalNumber
예제 #21
0
 def writeQuserToId():
     """Collect the distinct user names and write them with consecutive ids.

     Reads ``Quser.TOTAL_QUSER_TXT`` (records split on ','). The user name
     is the prefix ``'******'`` plus field 0, or plus field 1 when field 0
     is empty and field 1 is not. Duplicates are removed, and one
     ``<user name>|<index>`` line per name is written to
     ``Quser.QUSER_ID_TXT``. Indices follow the iteration order of the
     de-duplicated collection, not the input order.
     """
     qUser = set()
     f = SepFile(',').open(Quser.TOTAL_QUSER_TXT, 'txt', 'r')
     try:
         for line in f:
             if not line[0] and line[1]:
                 qUser.add('******' + line[1])
             else:
                 qUser.add('******' + line[0])
     finally:
         # Close the reader even when a record is malformed.
         f.close()
     qUser = list(qUser)
     
     f = LineFile().open(Quser.QUSER_ID_TXT, 'txt', 'w')
     try:
         for i, username in enumerate(qUser):
             f.writeLine(username + '|' + str(i))
     finally:
         f.close()
예제 #22
0
 def writeQpackageToId(mask=None, inOrOut=True):
     """Number the selected packages of the score file and write an id file.

     Reads ``Qpackage.QPACKAGE_SCORE_TXT`` (records split on ':') and writes
     one ``<field 0>|<index>`` line per selected record to
     ``Qpackage.QPACKAGE_ID_TXT``. The index starts at 0 and only advances
     for records that are written, so the ids stay consecutive.

     Args:
         mask: Optional container of field-0 values. ``None`` selects every
             record.
         inOrOut: Only used when ``mask`` is given. A truthy value selects
             the records whose field 0 is in ``mask``; a falsy value selects
             the records whose field 0 is not in it.
     """
     f = SepFile(':').open(Qpackage.QPACKAGE_SCORE_TXT, 'txt', 'r')
     try:
         idx = 0
         writer = LineFile().open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'w')
         try:
             for line in f:
                 if mask is None or (line[0] in mask) == bool(inOrOut):
                     writer.writeLine(line[0] + '|' + str(idx))
                     idx += 1
         finally:
             writer.close()
     finally:
         # Both handles are released even if a write fails midway.
         f.close()
예제 #23
0
 def writeQpackageToId(mask=None, inOrOut=True):
     """Write ``package|id`` lines for the packages that pass the mask.

     Source records come from ``Qpackage.QPACKAGE_SCORE_TXT`` (split on
     ':'); output goes to ``Qpackage.QPACKAGE_ID_TXT``. Ids are consecutive
     integers starting at 0, assigned in input order to the records that
     are actually written.

     Args:
         mask: Optional container of field-0 values used as a filter.
         inOrOut: With a mask, truthy means "write what is in the mask" and
             falsy means "write what is not in the mask". Ignored when
             ``mask`` is ``None``.
     """
     f = SepFile(':').open(Qpackage.QPACKAGE_SCORE_TXT, 'txt', 'r')
     try:
         idx = 0
         writer = LineFile().open(Qpackage.QPACKAGE_ID_TXT, 'txt', 'w')
         try:
             for line in f:
                 if mask is None:
                     selected = True
                 elif inOrOut:
                     selected = line[0] in mask
                 else:
                     selected = line[0] not in mask
                 if not selected:
                     continue
                 writer.writeLine(line[0] + '|' + str(idx))
                 idx += 1
         finally:
             writer.close()
     finally:
         # Guarantee the reader is closed on every exit path.
         f.close()
예제 #24
0
    def writeCandidateToId():
        """Assign ids to all distinct candidates and persist the mapping.

        Reads field 0 of every '|'-separated record in the gzip files listed
        in ``Candidate.PART_FILE_NAME``, de-duplicates the values and numbers
        them from 0. The mapping is stored twice: as ``candidate|id`` lines
        in ``Candidate.CANDIDATES_ID_TXT`` and as a pickled dict
        (candidate -> id string) in ``Candidate.CANDIDATES_ID_PICKLE``.
        Progress messages are printed along the way.
        """
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print('Info: Writing canditateToId')
        candidate = set()
        for pf in Candidate.PART_FILE_NAME:
            print('Info: processing ' + pf)
            f = SepFile('|').open(pf, 'gzip', 'rb')
            try:
                for line in f:
                    candidate.add(line[0])
            finally:
                f.close()
        candidate = list(candidate)
        writer = LineFile()
        writer.open(Candidate.CANDIDATES_ID_TXT, 'txt', 'w')
        candToId = {}
        try:
            for i, cand in enumerate(candidate):
                candToId[cand] = str(i)
                writer.writeLine(cand + '|' + str(i))
        finally:
            writer.close()

        # Drop the list before pickling (presumably to reduce peak memory).
        del candidate
        gc.collect()
        # The third positional argument is the pickle protocol; True == 1.
        with open(Candidate.CANDIDATES_ID_PICKLE, 'wb') as out:
            pickle.dump(candToId, out, True)
예제 #25
0
 def writeCandidateToId():
     """Number every distinct candidate and save the table as text and pickle.

     Field 0 of each '|'-separated record in the gzip files named by
     ``Candidate.PART_FILE_NAME`` is collected into a set. The distinct
     values are numbered from 0, written as ``candidate|id`` lines to
     ``Candidate.CANDIDATES_ID_TXT`` and pickled as a dict of
     candidate -> id string into ``Candidate.CANDIDATES_ID_PICKLE``.
     Progress messages are printed along the way.
     """
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print('Info: Writing canditateToId')
     candidate = set()
     for pf in Candidate.PART_FILE_NAME:
         print('Info: processing ' + pf)
         f = SepFile('|').open(pf, 'gzip', 'rb')
         try:
             for line in f:
                 candidate.add(line[0])
         finally:
             # Close each part file even when a record is malformed.
             f.close()
     candidate = list(candidate)
     writer = LineFile()
     writer.open(Candidate.CANDIDATES_ID_TXT, 'txt', 'w')
     candToId = {}
     try:
         for i, cand in enumerate(candidate):
             candToId[cand] = str(i)
             writer.writeLine(cand + '|' + str(i))
     finally:
         writer.close()
     
     # Drop the list before pickling (presumably to reduce peak memory).
     del candidate
     gc.collect()
     # The third positional argument is the pickle protocol; True == 1.
     with open(Candidate.CANDIDATES_ID_PICKLE, 'wb') as out:
         pickle.dump(candToId, out, True)
예제 #26
0
    def writeCandidateToId():
        """Assign ids to all distinct candidates found below the base path.

        Walks ``Candidate.BASE_PATH`` and reads every file whose joined path
        matches the base path followed by a ``.gz`` file-name pattern. Field
        0 of each '|'-separated record is collected, de-duplicated and
        numbered from 0. The mapping is written as ``candidate|id`` lines to
        ``Candidate.CANDIDATES_ID_TXT`` and pickled as a dict of
        candidate -> id string into ``Candidate.CANDIDATES_ID_PICKLE``.
        """
        candidates = set()
        match_dir = re.compile(os.path.join(Candidate.BASE_PATH, r'.*\.gz'))
        for (dirpath, _, files) in os.walk(Candidate.BASE_PATH):
            for gzfile in files:
                gzfile_dir = os.path.join(dirpath, gzfile)
                if not match_dir.search(gzfile_dir):
                    continue
                f = SepFile('|').open(gzfile_dir, mode='gzip', flag='r')
                try:
                    for line in f:
                        candidates.add(line[0])
                finally:
                    # Close each file even when a record is malformed.
                    f.close()
        candidates = list(candidates)
        writer = LineFile().open(Candidate.CANDIDATES_ID_TXT, mode='txt', flag='w')
        candToId = {}
        try:
            for i, cand in enumerate(candidates):
                candToId[cand] = str(i)
                writer.writeLine(cand + '|' + str(i))
        finally:
            writer.close()

        # Drop the list before pickling (presumably to reduce peak memory).
        del candidates
        gc.collect()
        # The third positional argument is the pickle protocol; True == 1.
        with open(Candidate.CANDIDATES_ID_PICKLE, 'wb') as out:
            pickle.dump(candToId, out, True)
예제 #27
0
 def procInterval(self, weekInterval):  # differs from Quser: the 7 days do not need to be summed up here
     """Stream per-user package open counts as size-limited batches.

     Every file below ``self.candPath`` whose joined path matches the regex
     built from ``self.candPath``, an alternation of the ``weekInterval``
     entries and a ``.gz`` file-name pattern is read as '|'-separated
     records (field 0 user id, field 1 package name, field 2 open count).
     Packages missing from ``self.qPackageToId`` are ignored.

     When a user id that is not in the current batch arrives and the batch
     already holds at least ``self.max_record`` users, the batch is yielded
     and a new one is started. Records of the same user can therefore land
     in more than one batch.

     Args:
         weekInterval: Iterable of strings joined with '|' to form the
             directory alternation of the path regex.

     Yields:
         dict of user id -> {package name: open count}; the last batch is
         yielded unconditionally and may be empty.
     """
     userToOpenPackage = {}
     num_record = 0  # number of users in the current batch
     match_dir = re.compile(
         os.path.join(self.candPath, '(' + '|'.join(weekInterval) + ')',
                      r'.*\.gz'))
     for (dirpath, _, files) in os.walk(self.candPath):
         for gzfile in files:
             gzfile_dir = os.path.join(dirpath, gzfile)
             if not match_dir.search(gzfile_dir):
                 continue
             f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
             try:
                 for line in f:
                     if line[0] not in userToOpenPackage:
                         if num_record >= self.max_record:
                             # Batch is full: emit it and start a new one.
                             yield userToOpenPackage
                             userToOpenPackage = {}
                             num_record = 0
                         userToOpenPackage[line[0]] = {}
                         num_record += 1
                     if line[1] in self.qPackageToId:
                         opened = userToOpenPackage[line[0]]
                         opened[line[1]] = (opened.get(line[1], 0) +
                                            int(line[2]))
             finally:
                 # Also runs when the consumer abandons the generator early.
                 f.close()
     yield userToOpenPackage
예제 #28
0
    def writeCandidateToId():
        """Number every distinct candidate found in the base-path gzip files.

        Each file below ``Candidate.BASE_PATH`` whose joined path matches the
        base path followed by a ``.gz`` file-name pattern is read as
        '|'-separated records and field 0 is collected into a set. The
        distinct values are numbered from 0, written as ``candidate|id``
        lines to ``Candidate.CANDIDATES_ID_TXT`` and pickled as a dict of
        candidate -> id string into ``Candidate.CANDIDATES_ID_PICKLE``.
        """
        candidates = set()
        match_dir = re.compile(os.path.join(Candidate.BASE_PATH, r'.*\.gz'))
        for (dirpath, _, files) in os.walk(Candidate.BASE_PATH):
            for gzfile in files:
                gzfile_dir = os.path.join(dirpath, gzfile)
                if not match_dir.search(gzfile_dir):
                    continue
                f = SepFile('|').open(gzfile_dir, mode='gzip', flag='r')
                try:
                    for line in f:
                        candidates.add(line[0])
                finally:
                    # Close each file even when a record is malformed.
                    f.close()
        candidates = list(candidates)
        writer = LineFile().open(Candidate.CANDIDATES_ID_TXT,
                                 mode='txt',
                                 flag='w')
        candToId = {}
        try:
            for i, cand in enumerate(candidates):
                candToId[cand] = str(i)
                writer.writeLine(cand + '|' + str(i))
        finally:
            writer.close()

        # Drop the list before pickling (presumably to reduce peak memory).
        del candidates
        gc.collect()
        # The third positional argument is the pickle protocol; True == 1.
        with open(Candidate.CANDIDATES_ID_PICKLE, 'wb') as out:
            pickle.dump(candToId, out, True)
 def procInterval(self, week): # Quser data can only be selected per week, not per record
     """Accumulate package open counts per user for the given weeks.

     Every file below ``self.qUserPath`` whose joined path matches the regex
     built from ``self.qUserPath``, an alternation of the ``week`` entries
     and a ``.gz`` file-name pattern is read as '|'-separated records:
     field 0 user id, field 1 package name, field 2 open count.

     Args:
         week: Iterable of strings joined with '|' to form the directory
             alternation of the path regex.

     Returns:
         dict of user id -> {package name: summed open count}. Packages not
         in ``self.qPackageToId`` are skipped, but their user still gets an
         entry (possibly an empty dict).
     """
     userToOpenPackage = {}
     match_dir = re.compile(os.path.join(self.qUserPath, '(' + '|'.join(week) + ')', r'.*\.gz'))
     for (dirpath, _, files) in os.walk(self.qUserPath):
         for gzfile in files:
             gzfile_dir = os.path.join(dirpath, gzfile)
             if not match_dir.search(gzfile_dir):
                 continue
             f = SepFile('|').open(gzfile_dir, mode='gzip', flag='rb')
             try:
                 for line in f: # line[0] userId, line[1] packageName, line[2] openTimes
                     opened = userToOpenPackage.setdefault(line[0], {})
                     if line[1] in self.qPackageToId:
                         opened[line[1]] = opened.get(line[1], 0) + int(line[2])
             finally:
                 # Close the gzip reader even if a count fails to parse.
                 f.close()
     return userToOpenPackage
예제 #30
0
    def writeQuserToId(mask=None, inOrOut=True):
        """Collect the selected user names and write them with consecutive ids.

        Reads ``Quser.TOTAL_QUSER_TXT`` (records split on ','). The user name
        is the prefix ``'******'`` plus field 0, or plus field 1 when field 0
        is empty and field 1 is not. Selected names are de-duplicated and
        written as ``<user name>|<index>`` lines to ``Quser.QUSER_ID_TXT``.
        Indices follow the iteration order of the de-duplicated collection,
        not the input order.

        Args:
            mask: Optional container of user names (prefix included).
                ``None`` selects every name.
            inOrOut: Only used when ``mask`` is given. A truthy value selects
                the names found in ``mask``; a falsy value selects the names
                absent from it.
        """
        qUser = set()
        f = SepFile(',').open(Quser.TOTAL_QUSER_TXT, 'txt', 'r')
        try:
            for line in f:
                if not line[0] and line[1]:
                    username = '******' + line[1]
                else:
                    username = '******' + line[0]
                if mask is None or (username in mask) == bool(inOrOut):
                    qUser.add(username)
        finally:
            # Close the reader even when a record is malformed.
            f.close()
        qUser = list(qUser)

        f = LineFile().open(Quser.QUSER_ID_TXT, 'txt', 'w')
        try:
            for i, username in enumerate(qUser):
                f.writeLine(username + '|' + str(i))
        finally:
            f.close()
예제 #31
0
def getQpackageToOpenTimes(appPath):  #idf
    """Collect ``key -> count`` pairs from all ``.gz`` files under ``appPath``.

    The name of every file met during the directory walk is printed. Only
    files ending in ``.gz`` are opened; they are read as '|'-separated
    records where field 0 is the key and field 1 is parsed with ``int``.
    Later records overwrite earlier ones that share the same key.

    Args:
        appPath: Root directory to walk.

    Returns:
        dict mapping field 0 -> int(field 1).
    """
    qPackageToOpenTimes = {}
    for (dirpath, _, files) in os.walk(appPath):
        for gzfile in files:
            print(gzfile)
            if os.path.splitext(gzfile)[1] != '.gz':
                continue
            reader = SepFile('|')
            reader.open(os.path.join(dirpath, gzfile),
                        mode='gzip',
                        flag='rb')
            try:
                for line in reader:
                    qPackageToOpenTimes[line[0]] = int(line[1])
            finally:
                # The reader is released even when a count is not numeric.
                reader.close()
    return qPackageToOpenTimes
예제 #32
0
파일: Quser.py 프로젝트: KeyKy/look-alike
 def writeQuserToId(mask=None, inOrOut=True):
     """Write ``user|id`` lines for the distinct users that pass the mask.

     Source records come from ``Quser.TOTAL_QUSER_TXT`` (split on ','). A
     user name is ``'******'`` followed by field 0, falling back to field 1
     only when field 0 is empty and field 1 is not. The selected names are
     de-duplicated, numbered from 0 in the iteration order of the
     de-duplicated collection, and written to ``Quser.QUSER_ID_TXT``.

     Args:
         mask: Optional container of user names (prefix included) used as a
             filter.
         inOrOut: With a mask, truthy means "keep what is in the mask" and
             falsy means "keep what is not in the mask". Ignored when
             ``mask`` is ``None``.
     """
     qUser = set()
     f = SepFile(',').open(Quser.TOTAL_QUSER_TXT, 'txt', 'r')
     try:
         for line in f:
             if not line[0] and line[1]:
                 username = '******' + line[1]
             else:
                 username = '******' + line[0]
             if mask is None:
                 selected = True
             elif inOrOut:
                 selected = username in mask
             else:
                 selected = username not in mask
             if selected:
                 qUser.add(username)
     finally:
         # The reader is released on every exit path.
         f.close()
     qUser = list(qUser)
     
     f = LineFile().open(Quser.QUSER_ID_TXT, 'txt', 'w')
     try:
         for i, username in enumerate(qUser):
             f.writeLine(username + '|' + str(i))
     finally:
         f.close()
예제 #33
0
 def getPackageToScore(mask=None, inOrOut=True):
     """Read the package -> score table from ``Qpackage.QPACKAGE_SCORE_TXT``.

     The file is read as ':'-separated records; field 0 is the package key
     and field 1 is parsed with ``float``.

     Args:
         mask: Optional container of package keys used as a filter.
         inOrOut: With a mask, truthy means "keep what is in the mask" and
             falsy means "keep what is not in the mask". Ignored when
             ``mask`` is ``None``.

     Returns:
         dict mapping package key -> float score.
     """
     packToScore = {}
     f = SepFile(':')
     f.open(Qpackage.QPACKAGE_SCORE_TXT, mode='txt', flag='r')
     try:
         for line in f:
             if mask is None:
                 selected = True
             elif inOrOut:
                 selected = line[0] in mask
             else:
                 selected = line[0] not in mask
             if selected:
                 packToScore[line[0]] = float(line[1])
     finally:
         # The reader is released even when a score is not numeric.
         f.close()
     return packToScore
예제 #34
0
    msc = MissionContext(conf=mconf)
    [_, appPath] = msc.getFolder()
    
    basePath = 's3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay'

    for theDay in getDaysGen('2016-01-24', 30, 0):
        BashUtil.s3Cp(basePath+os.sep+theDay, appPath+os.sep+theDay, recursived=True)
    openPackage = {}
    mask = {'imei=333333333333333':1, 'imei=123456789abcdef':1, 'imei=111111111111111':1, 'imei=012345678912345':1, 'imei=000000000000000':1, 'imei=00000000000000':1}
    #mask = {}
    for (filename, dirs, files) in os.walk(appPath):
        print filename
        for gzfile in files:
            [name, ext] = os.path.splitext(gzfile)
            if ext == '.gz':
                f = SepFile('|')
                f.open(filename+os.sep+gzfile, mode='gzip', flag='rb')
                for line in f:
                    if line[0] not in mask:
                        if line[1] not in openPackage:
                            openPackage[line[1]] = int(line[2])
                        else:
                            openPackage[line[1]] += int(line[2])
                f.close()
    openTimes = []
    print 'sorting'
    packs = openPackage.keys()
    for key in packs:
        openTimes.append(openPackage[key])
    index = sorted(range(len(openTimes)), key=lambda k: openTimes[k], reverse=True)
    print 'sorted'
예제 #35
0
 openPackage = {}
 mask = {
     'imei=333333333333333': 1,
     'imei=123456789abcdef': 1,
     'imei=111111111111111': 1,
     'imei=012345678912345': 1,
     'imei=000000000000000': 1,
     'imei=00000000000000': 1
 }
 #mask = {}
 for (filename, dirs, files) in os.walk(appPath):
     print filename
     for gzfile in files:
         [name, ext] = os.path.splitext(gzfile)
         if ext == '.gz':
             f = SepFile('|')
             f.open(filename + os.sep + gzfile, mode='gzip', flag='rb')
             for line in f:
                 if line[0] not in mask:
                     if line[1] not in openPackage:
                         openPackage[line[1]] = int(line[2])
                     else:
                         openPackage[line[1]] += int(line[2])
             f.close()
 openTimes = []
 print 'sorting'
 packs = openPackage.keys()
 for key in packs:
     openTimes.append(openPackage[key])
 index = sorted(range(len(openTimes)),
                key=lambda k: openTimes[k],
예제 #36
0
def getQuserOpenPackage(
        basePath='s3://datamining.ym/dmuser/ykang/results/qUserInLast5EachDay',
        beginDay='2016-01-24',
        interval_='30',
        isForward='0',
        s3DictBasePath='s3://datamining.ym/dmuser/ykang/data/spark.ouwan.qUserOpenPackage',
        isDownload=True):
    """Total the open counts per package and upload the ranked list.

    Steps:
      1. Optionally copy one folder per day from ``basePath`` into the
         mission folder returned by ``MissionContext.getFolder()``.
      2. Read every ``.gz`` file below that folder (records split on '|':
         field 0 user id, field 1 package, field 2 open count) and sum the
         counts per package, skipping the hard-coded user ids in ``mask``.
      3. Write ``qUserOpenPackage.txt`` (``package|count``, highest count
         first) into the mission folder and copy it to ``s3DictBasePath``.

    Args:
        basePath: Source prefix holding one sub-folder per day.
        beginDay: First argument passed to ``getDaysGen``.
        interval_: Second argument of ``getDaysGen``; converted with ``int``.
        isForward: Third argument of ``getDaysGen``; converted with ``int``.
        s3DictBasePath: Destination prefix for ``qUserOpenPackage.txt``.
        isDownload: When falsy the download step is skipped.

    Returns:
        dict mapping package -> summed open count.
    """
    mconf = MissionConf().setAppName('getQuserOpenPackage')
    msc = MissionContext(conf=mconf)
    [_, appPath] = msc.getFolder()
    if isDownload:
        for theDay in getDaysGen(beginDay, int(interval_), int(isForward)):
            BashUtil.s3Cp(os.path.join(basePath, theDay),
                          appPath + os.sep + theDay,
                          recursived=True)
    openPackage = {}
    # User ids (field 0) whose records are ignored.
    mask = frozenset([
        'imei=333333333333333',
        'imei=123456789abcdef',
        'imei=111111111111111',
        'imei=012345678912345',
        'imei=000000000000000',
        'imei=00000000000000',
    ])
    for (dirpath, _, files) in os.walk(appPath):
        print(dirpath)
        for gzfile in files:
            if os.path.splitext(gzfile)[1] != '.gz':
                continue
            f = SepFile('|')
            f.open(dirpath + os.sep + gzfile, mode='gzip', flag='rb')
            try:
                for line in f:
                    if line[0] not in mask:
                        openPackage[line[1]] = (openPackage.get(line[1], 0) +
                                                int(line[2]))
            finally:
                # Release the reader even if a count fails to parse.
                f.close()
    print('sorting')
    # sorted() is stable with reverse=True, so ties keep the dict's own
    # iteration order.
    ranked = sorted(openPackage.items(),
                    key=lambda item: item[1],
                    reverse=True)
    print('sorted')

    writer = LineFile()
    writer.open(os.path.join(appPath, 'qUserOpenPackage.txt'),
                mode='txt',
                flag='w')
    try:
        for key, value in ranked:
            writer.writeLine(key + '|' + str(value))
    finally:
        writer.close()

    BashUtil.s3Cp(os.path.join(appPath, 'qUserOpenPackage.txt'),
                  dst=os.path.join(s3DictBasePath, 'qUserOpenPackage.txt'),
                  recursived=False)
    return openPackage
예제 #37
0
#
# from com.um.ykang.mission.MissionConf import MissionConf
# from com.um.ykang.mission.MissionContext import MissionContext
#
# mconf = MissionConf().setAppName('test')
# (msc, app) = MissionContext(conf=mconf).getFolder()
# msc.getSample('s3://datamining.ym/dmuser/ykang/results/test2/part-00000.gz', 20).getEmrFile()
from com.um.ykang.data.format.File import SepFile
import os

# writer = open('/root/test.txt', 'w')
# writer.writelines('123')
# writer.close()

# Copy columns 2 and 3 of the tab-separated overlap file into label.txt,
# one "<col 2>\t<col 3>" line per record (col 3 stripped of whitespace).
f = SepFile('\t').open(
    '/home/bk25103378/dataExchangeYiGuan/overlap_full_label.txt', 'txt', 'r')
try:
    with open('/home/bk25103378/dataExchangeYiGuan/label.txt', 'w') as writer:
        for line in f:
            # A text-mode file translates '\n' itself; writing os.linesep
            # would produce '\r\r\n' on platforms where it is '\r\n'.
            writer.write(line[2] + '\t' + line[3].strip() + '\n')
finally:
    # The reader was previously never closed.
    f.close()