Exemple #1
0
def getTemplate(util_gt, running_time, perSlot=3, step_tp=24, Nh_tp=7, Nl_tp=7, retCache=None, basetime=None, userList=set()):
    """Build one time-slot histogram ("template") per ground-truth event.

    Rows are grouped by ``row['event_id']`` and counted into slots of
    ``perSlot`` hours measured from the event's base timestamp.

    Args:
        util_gt: iterable of objects exposing ``db_event_id``; only events
            listed here get a template.
        running_time: only used (through ``helper.timestamp2str``) to name
            the on-disk cache file of the all-users template.
        perSlot: slot width in hours.
        step_tp, Nl_tp: the vector length is ``int(Nl_tp * step_tp / perSlot)``.
        Nh_tp: only used in the cache file name.
        retCache: when ``userList`` is empty, an iterable of rows (mappings
            with ``'event_id'`` and ``'dsttime'``); otherwise a mapping
            ``user -> list of such rows``.
        basetime: optional mapping ``event_id -> base timestamp``.  Events
            missing from it use the earliest ``dsttime`` among their rows.
            The mapping is never modified here.
        userList: users whose rows are counted; empty means "all rows".  The
            default set is only read, never mutated.

    Returns:
        ``(template_vecs, basetime)`` when ``userList`` is empty, otherwise
        just ``template_vecs`` (``event_id -> list of per-slot counts``).
        Events without any row are absent from ``template_vecs``.
    """
    useAllUsers = not userList
    if useAllUsers:
        filename = '%s-Nh%s-lasts%s.template_all' % (helper.timestamp2str(running_time), Nh_tp, Nl_tp)
        if os.path.exists(filename):
            # NOTE: unpickling runs arbitrary code; the cache file must come
            # from a trusted location.
            # Binary mode is what pickle expects; the handle is closed
            # deterministically instead of being left to the GC.
            with open(filename, 'rb') as cacheFile:
                return pickle.load(cacheFile)

    if basetime is None:
        basetime = {}

    # Group the cached rows by event id.
    outretEvent = {}
    if useAllUsers:
        for line in retCache:
            outretEvent.setdefault(line['event_id'], []).append(line)
    else:
        for user in userList:
            if user in retCache:
                for line in retCache[user]:
                    outretEvent.setdefault(line['event_id'], []).append(line)

    slotSeconds = 3600 * perSlot
    slotCount = int(Nl_tp * step_tp / perSlot)

    template_vecs = {}
    for event in util_gt:
        event_id = event.db_event_id
        eventContents = outretEvent.get(event_id)
        if eventContents is None:
            continue

        if event_id in basetime:
            baseTimestamp = basetime[event_id]
        else:
            baseTimestamp = min(line['dsttime'] for line in eventContents)

        countDict = {}
        for line in eventContents:
            # Floor division keeps rows that precede the base timestamp in a
            # negative slot (ignored below) for ints and floats alike, rather
            # than truncating them into slot 0.
            slot = int((line['dsttime'] - baseTimestamp) // slotSeconds)
            countDict[slot] = countDict.get(slot, 0) + 1

        template_vecs[event_id] = [countDict.get(i, 0) for i in range(slotCount)]

    if useAllUsers:
        if saveTemplateToFileFlag:
            with open(filename, 'wb') as cacheFile:
                pickle.dump((template_vecs, basetime), cacheFile)
        return template_vecs, basetime
    return template_vecs
Exemple #2
0
        # Bundle the prediction settings (names prefixed with P are defined
        # earlier, outside this fragment) into one parameter object.
        PParam = pr.predictionParam(PNSh=PNSh,
                                    PNSd=PNSd,
                                    PNSdLong=PNSdLong,
                                    PperSlot=PperSlot,
                                    PsimTemplateThres=PsimTemplateThres,
                                    templateTopSimSize=PtemplateTopSimSize,
                                    Nh_tp=PNh_tp,
                                    Nl_tp=PNl_tp,
                                    PNShSim=PNShSim,
                                    PNstep=PNstep,
                                    PNstepLong=PNstepLong,
                                    combineCount=combineCount)

        print '====== init %s %s ======' % (jobID,
                                            helper.timestamp2str(running_time))

        # Load the ground-truth events for this run.  An empty inputEvent set
        # is passed; the returned dictionary replaces the local one.
        dictionary, corpus_gt, tfidf_gt, index_gt, util_gt = ti.loadEventGT(
            dictionary,
            timePoint=running_time,
            step=step,
            gt_Nh=gt_Nh,
            gt_Nd=gt_Nd,
            jobID=jobID,
            inputEvent=set(),
            detectParam=DParam)
        # Collect the database ids of all loaded ground-truth events.
        eventlist_gt = set()
        for event in util_gt:
            eventlist_gt.add(event.db_event_id)
        corpus_src_init, userInvolveList_init, corpus_init_ret = db.getText(
            timePoint=running_time,
Exemple #3
0
def loadEventGT(dictionary, timePoint=1347724800, step = 1, gt_Nh=0, gt_Nd=0, event_id=0, jobID='tmpgt_', inputEvent=set(), detectParam=None, verbose=False):
    ret= db.getEventInfo(startTime=timePoint, historySlots=gt_Nh*step, detectionSlots=gt_Nd*step, event_id=event_id)
    corpus_src_gt=[]
    util_gt=[]
    event_gt=[]
    if ret:
        # print 'gt_event:'
        for event in ret:
            if len(inputEvent)>0:
                if event['event_id'] not in inputEvent:
                    continue
            if event['event_id_time']<(timePoint-gt_Nh*24*3600):
                continue
            
            # keyword=[word for word in event['split_words'].split(';') if word]
            # keyword=list(set(keyword))
            # gt.append(keyword)
            # print 'ID=%s: ABS=%s TIT=%s' %(event['event_id'], event['abstract'], event['title'])
            keywords= str(event['title'])
            wordseg = wc.para2seglist(keywords, tag=True)
            seglist = wc.seglist4filter(wordseg, srcTag = True, filterLow=False, fromFile=False)
            # print seglist
            seglist=list(set(seglist))
            # for element in seglist:
                # print element,
            # print ''
            corpus_src_gt.append(seglist)
            # print event['event_id'],
            # for word in seglist:
            #     print word,
            # print ''
            # 
            # [detectCount, previousErgency, db_event_id]
            if detectParam:
                detectThres= detectParam.detectThres
            else:
                detectThres=0
            # eventReportTime=event['time_get']
            
            # d = datetime.date(2015,1,5)
            # unixtime = time.mktime(d.timetuple())
            
            a_gt_util = gt_util_param(event['event_id'], detectThres, int(event['reportTime']), int(event['event_id_time']))
            util_gt.append(a_gt_util)
        print 'total gt size: %d' %len(corpus_src_gt)

    dictionary, corpus_gt, metas_gt = corpus_init(corpus_src_gt, dictionary, prefix='detect/%s_gt_%s_his%s_det%s_ev%s' %(jobID, helper.timestamp2str(timePoint), gt_Nh*step, gt_Nd*step, event_id), seg=False)
    tfidf_gt, index_gt = sims_init(corpus_gt, prefix='model/%s_gt_%s_his%sh_det%sh_ev%s' %(jobID, helper.timestamp2str(timePoint), gt_Nh * step, gt_Nd*step, event_id))
    # print 'dict size2=%s'%len(dictionary)
    if verbose:
        for i, corpus in enumerate(corpus_gt):
            print 'gtid=%s'%util_gt[i].db_event_id
            for word in corpus:
                print dictionary[word[0]],
            print ''
    return dictionary, corpus_gt, tfidf_gt, index_gt, util_gt
Exemple #4
0
        # Fresh term dictionary and the run-type flags derived from the
        # command-line selection type.
        dictionary= corpora.Dictionary()
        runFlags= de.TypeFlag(args.selType)
        
        
        # Detection parameters used for the offline selection phase.
        DParam=de.detectParam(isNewDocThres=offline_sim_thres, detectThres=offline_detectThres, userSimThres= offline_userSimThres, ergCoef=ergCoef, costCoef=costCoef, baCoef= baCoef)
        
        # Extra parameter sets are only built when the matching test flag is on.
        if args.offtestFlag:
            offlineTestParam=de.detectParam(isNewDocThres=offline_sim_thres, recallDetectedCountThres=recallDetectedCountThres)
        
        if args.ontestFlag:
            onlineTestParam=de.detectParamOnline(isNewDocThres=online_sim_thres, onlineFilterExistThres=online_filter_thres, onlineCombineThres=online_combine_thres, recallDetectedCountThres=recallDetectedCountThres,precDetectedCountThres=precDetectedCountThres)

        # Prediction settings bundled into one parameter object.
        PParam=pr.predictionParam(PNSh=PNSh, PNSd=PNSd, PNSdLong=PNSdLong, PperSlot=PperSlot, PsimTemplateThres=PsimTemplateThres, templateTopSimSize=PtemplateTopSimSize, Nh_tp=PNh_tp, Nl_tp=PNl_tp, PNShSim=PNShSim, PNstep=PNstep, PNstepLong=PNstepLong, combineCount=combineCount)

        
        print '====== init %s %s ======' %(jobID, helper.timestamp2str(running_time))

        # Load the ground-truth events (no event whitelist) and collect their
        # database ids.
        dictionary, corpus_gt, tfidf_gt, index_gt, util_gt = ti.loadEventGT(dictionary, timePoint=running_time, step=step, gt_Nh=gt_Nh, gt_Nd=gt_Nd, jobID=jobID, inputEvent= set(), detectParam= DParam)
        eventlist_gt=set()
        for event in util_gt:
            eventlist_gt.add(event.db_event_id)
        # Fetch the texts for those events; the third return value is reused
        # below as the cache from which the templates are built.
        corpus_src_init, userInvolveList_init, corpus_init_ret = db.getText(timePoint=running_time, historySlots= Nh * step, timelastsSlots = Nl * step, output_prefix='data/%s_init_%s_%sh%sh%sh_ev%s' %(jobID ,helper.timestamp2str(running_time), Nh*step, Nd*step, Nl*step, len(eventlist_gt)), event_id=eventlist_gt, util_gt=util_gt, cacheFlag=True)
        # No userList is given, so the result is unpacked as a
        # (templates, base times) pair.
        template_vecs_all, basetime_all_init = pr.getTemplate(util_gt, running_time, perSlot=PParam.PperSlot, step_tp=PParam.step_tp, Nh_tp=PParam.Nh_tp, Nl_tp=PParam.Nl_tp, retCache=corpus_init_ret)
        # Stored as a 3-tuple; the middle slot is left as None here.
        PParam.template=(template_vecs_all, None, basetime_all_init)
        # program_start_time_init is set outside this fragment.
        delta_time=time.time() - program_start_time_init
        print("--- init using %s seconds, or %s minutes ---" % (delta_time, delta_time/60.0))

        program_start_time_select=time.time()
        # alg. swc jnt
        if args.selectFlag:
            print '====== selecting heu %s %s ====== ' %(jobID, helper.timestamp2str(running_time))
Exemple #5
0
def selectUserHeu(c, k, userInvolveList, runFlags, dictionary, corpus_gt, tfidf_gt, index_gt, util_gt, running_time, step=1, Nh=0, Nl=0, jobID='tmp_user', eventlist=set(), selectSize=1, detectParam=None, predictParam=None):
    """Greedily select users under a cost budget ``c`` and a head-count ``k``.

    Runs up to ``k`` rounds.  Each round builds the candidate set (involved,
    not yet selected, not blacklisted, ``avr_rp_count > 0`` and, with
    ``runFlags.bondFlag``, within the cost bound from ``costBond``) and
    delegates the actual pick to ``selectUserOneStep``.  Selected users are
    accumulated together with their cost; users blacklisted by a round are
    excluded from later rounds.

    Args:
        c: total cost budget; the cost of each selected user is subtracted.
        k: maximum number of users / rounds.
        userInvolveList: users to consider.
        runFlags: object exposing ``extFlag``, ``bondFlag`` and ``preFlag``.
        dictionary, corpus_gt, tfidf_gt, index_gt, util_gt: ground-truth
            artefacts forwarded to ``selectUserOneStep``; ``util_gt`` is
            deep-copied, not modified.
        running_time, step, Nh, Nl, jobID, eventlist: forwarded to
            ``db.getText`` and ``selectUserOneStep``.
        selectSize: number of users requested per round.
        detectParam: detection parameters (``costCoef`` is read here).
        predictParam: prediction parameters; ``SelectedID`` is updated on it
            before each round when ``runFlags.preFlag`` is set.

    Returns:
        List of selected user ids, in selection order.
    """
    userInfo = db.getUserInfoAll(userInvolveList)
    userInfoDict = dict((info['uid'], info) for info in userInfo)
    # Most followers first; among equal follower counts, cheapest first.
    userInfo = sorted(userInfo, key=lambda info: (info['followers_count'], -info['cost']), reverse=True)

    cur_c = 0
    cur_k = 0
    corpus_src_detect, userInv, corpus_ret = db.getText(timePoint=running_time, historySlots= Nh * step, timelastsSlots= Nl * step, output_prefix='data_sel/%s_heu_%s_his%sh_evall' %(jobID, helper.timestamp2str(running_time), Nh*step), userList=userInvolveList, event_id=eventlist, cacheFlag=True)
    print('corpus detect size=%d' % len(corpus_ret))

    selected_uid = []
    if runFlags.extFlag:
        updated_util_gt_ext = copy.deepcopy(util_gt)
    updated_util_gt = copy.deepcopy(util_gt)
    blackUsers = set()

    PParam = predictParam

    for _round in range(0, k):
        left_c = c - cur_c
        left_k = k - cur_k

        if runFlags.extFlag:
            # Carry detections over into the extended counters, then reset
            # the working copy; an event that reached its threshold gets a
            # working threshold of 0, any other event gets 1.
            for idx, extEvent in enumerate(updated_util_gt_ext):
                working = updated_util_gt[idx]
                if working.detectCount >= 1:
                    extEvent.detectCount += 1
                if extEvent.detectCount >= extEvent.detectThres:
                    working.detectThres = 0
                else:
                    working.detectThres = 1
                working.detectCount = 0

        print('selecting round for %d users, left_k=%s, left_c= %s' % (selectSize, left_k, left_c))
        if left_c <= 0 or left_k <= 0:
            print('selected done!, cur_c=%s, cur_k=%s' % (cur_c, cur_k))
            return selected_uid

        userCand = set()
        bondCount = 0
        for info in userInfo:
            uid = info['uid']
            if uid in blackUsers:
                continue
            if uid not in userInvolveList or uid in selected_uid:
                continue
            if not info['avr_rp_count'] > 0:
                continue
            if runFlags.bondFlag:
                if info['cost'] > costBond(left_c, left_k, detectParam.costCoef) or info['cost'] <= 0:
                    bondCount += 1
                    continue
            userCand.add(uid)

        if not userCand:
            break

        print('selecting from all_user %d' % len(userCand))
        if runFlags.bondFlag:
            print('has bypassed %d bigger than bond' % bondCount)

        if runFlags.preFlag:
            PParam.SelectedID = set(copy.deepcopy(selected_uid))
        stepResult = selectUserOneStep(left_c, left_k, userCand, runFlags,
            userInfoDict, corpus_ret, dictionary, corpus_gt, tfidf_gt, index_gt, updated_util_gt, running_time,
            step=step, Nh=Nh, Nl=Nl, jobID=jobID, eventlist=eventlist, selectSize=selectSize, detectParam=detectParam, predictionParam=PParam)
        # The one-step result carries an extra RMSE element in prediction mode.
        if runFlags.preFlag:
            selSet, updated_util_gt, user_score, blackUsersStep, stepPredictionRMSES = stepResult
        else:
            selSet, updated_util_gt, user_score, blackUsersStep = stepResult

        if len(selSet) == 0:
            break
        if runFlags.preFlag:
            print('selected user %s with score %f, RMSE %s' % (str(selSet), user_score, getRMSEMap(stepPredictionRMSES)))
        else:
            print('selected user %s with score %f' % (str(selSet), user_score))

        for uid in selSet:
            selected_uid.append(uid)
            cur_c += userInfoDict[uid]['cost']
        cur_k += len(selSet)
        blackUsers.update(blackUsersStep)

    print('selected done!, cur_c=%d, cur_k=%s' % (cur_c, cur_k))
    return selected_uid
Exemple #6
0
def selectUserOneStep(left_c, left_k, involveUsers, runFlags, userInfoDict, corpus_ret, dictionary, corpus_gt, tfidf_gt, index_gt, util_gt, running_time, step=1, Nh=0, Nl=0, jobID='tmp_user', eventlist=set(), selectSize=1, detectParam=None, predictionParam=None):
    """Score every candidate user and pick up to ``selectSize`` of them.

    Phases:
      1. Build one ``perUser`` work item per candidate and score it with
         ``parSelect``: inline when ``runFlags.extFlag`` is set, otherwise in
         a process pool, in batches of at most 10000 candidates.
      2. Blacklist candidates whose score is <= 0 or, in prediction mode,
         whose mean RMSE exceeds ``phonyRMSE``.
      3. Repeatedly take the best remaining candidate, skip it when
         ``userSim`` reports it as too similar to the current selection, and
         re-score the remaining candidates with ``updateScore``.
      4. Evaluate the selected set as a whole.

    Args:
        left_c: remaining cost budget; selection stops once the accumulated
            cost of the selected users reaches it.
        left_k: remaining number of users that may still be selected.
        involveUsers: candidate user ids.
        runFlags: object exposing ``preFlag``, ``docFlag`` and ``extFlag``.
        userInfoDict: ``uid -> info`` mapping; ``info['cost']`` is read.
        corpus_ret: cached texts; ``corpus_ret.get(uid, [])`` is used in
            document mode and it is forwarded to ``db.getText`` otherwise.
        dictionary, corpus_gt, tfidf_gt, index_gt, util_gt: ground-truth
            artefacts handed to the scoring helpers.
        running_time, step, Nh, Nl, jobID: forwarded to the data helpers.
        eventlist: unused in this function.
        selectSize: number of users to select in this step.
        detectParam: detection parameters (``userSimThres``,
            ``userSimCountCoef`` and ``baCoef`` are read here).
        predictionParam: prediction parameters; required when
            ``runFlags.preFlag`` is set (its ``predictCache`` is overwritten).

    Returns:
        ``(selID, step_util_gt, user_score, blackUsers)``, with a fifth
        element (the prediction RMSEs) when ``runFlags.preFlag`` is set.
        ``step_util_gt`` and ``user_score`` (and the RMSEs) are ``None``
        when no text could be fetched for the selected ids.
    """
    # Both scoring paths rebind these module-level counters, which parSelect
    # reads for progress reporting.
    global parCount, totalCount, leftCount

    base_util=copy.deepcopy(util_gt)
    if runFlags.preFlag:
        PParam=predictionParam
        PParam.predictCache=corpus_ret

        known_vecs_all, gt_vecs_all, basetime_all = pr.getKnownAndGTAll(corpus_ret, util_gt, perSlot=PParam.PperSlot, NSh= PParam.PNSh, NSd=PParam.PNSd)

    # ---- phase 1: score all candidates ------------------------------------
    all_users=[]
    userScores=[]
    cnt = 0
    total_cnt = 0
    split_size=10000
    for userID in involveUsers:
        cnt += 1
        total_cnt += 1
        user = perUser()
        user.uid=userID
        user.running_time=running_time
        user.Nh=Nh
        user.step=step
        user.cost=userInfoDict[user.uid]['cost']
        user.util_gt=util_gt
        user.runFlags=runFlags
        user.detectParam=detectParam
        if not runFlags.docFlag:
            user.jobID=jobID
            uset=set()
            uset.add(user.uid)
            user.corpus_src_detect, user.userInv = db.getText(timePoint=running_time, historySlots= Nh * step, timelastsSlots=Nl * step, output_prefix='data_sel/%s_user_%s_%s_his%sh_lasts%sh_evl0' %(jobID, user.uid, helper.timestamp2str(running_time), Nh*step, Nl*step), userList=uset, event_id=set(), verboseFlag=False,cacheRet=corpus_ret)
            user.dictionary=dictionary
            user.corpus_gt=corpus_gt
            user.tfidf_gt=tfidf_gt
            user.index_gt=index_gt
        else:
            uset=set()
            uset.add(user.uid)
            user.userInv=uset
            user.corpus_src_detect=corpus_ret.get(user.uid,[])

        all_users.append(user)

        if runFlags.extFlag:
            # Inline scoring: wait until every candidate has been prepared.
            if total_cnt< len(involveUsers):
                continue

            parCount = Value('i', 0)
            totalCount = Value('i', len(involveUsers))
            leftCount = Value('i', left_k)

            for us in all_users:
                ret=parSelect(us)
                userScores.append(ret)

        else:
            # Pool scoring: flush a batch when it is full or when the last
            # candidate has been prepared.
            if cnt <split_size and total_cnt < len(involveUsers):
                continue

            cnt=0
            parCount = Value('i', len(userScores))
            totalCount = Value('i', len(involveUsers))
            leftCount = Value('i', left_k)

            workerThres=20
            workerCount=multiprocessing.cpu_count()
            if workerCount > workerThres:
                workerCount = workerThres
            if runFlags.preFlag:
                pool = Pool(processes = workerCount, maxtasksperchild = 1600, initializer = parSelectInitP, initargs = (parCount, totalCount, leftCount, known_vecs_all, gt_vecs_all, basetime_all, PParam, ))
                chunk, extra= divmod(len(all_users), 40)
                if extra:
                    chunk+=1
                it=pool.imap_unordered(parSelect, all_users, chunk)
            else:
                pool = Pool(processes = workerCount, maxtasksperchild = 4800, initializer = parSelectInit, initargs = (parCount, totalCount, leftCount ))
                chunk, extra= divmod(len(all_users), 8)
                if extra:
                    chunk+=1
                it=pool.imap_unordered(parSelect, all_users, chunk)

            for ret in it:
                userScores.append(ret)
            pool.close()
            pool.join()
            all_users=[]
            pool=None

    # ---- phase 2: blacklist useless candidates -----------------------------
    # Score tuples are indexed as [0]=uid, [1]=score, [2]=utility list and,
    # in prediction mode, [3]=RMSE data.  Entries with score <= 0 are dropped
    # before [2]/[3] are ever read.
    blackUsers=set()
    for user in userScores:
        if user[1]<=0:
            blackUsers.add(user[0])
        elif runFlags.preFlag:
            tempRMSE=numpy.mean(getRMSEMap(user[3]))
            if tempRMSE>phonyRMSE:
                blackUsers.add(user[0])
    userScores=[user for user in userScores if user[0] not in blackUsers]

    # Ascending order: the best candidate sits at the end and is pop()-ed.
    if runFlags.preFlag:
        userScores= sorted(userScores, key = lambda x:x[1]+detectParam.baCoef*numpy.mean(getRMSEMap(x[3])), reverse=False)
    else:
        userScores= sorted(userScores, key = lambda x:x[1], reverse=False)

    # ---- phase 3: greedy selection ----------------------------------------
    selSet=[]
    selCost=0
    selID=set()
    selScore=0.0
    similarCount=0

    while len(selSet)<min(selectSize, left_k) and len(userScores)>0 and selCost<left_c:
        candidateUser=userScores.pop()
        if runFlags.extFlag:
            if candidateUser[1]<=0:
                break
        if candidateUser[0] in selID:
            continue
        if len(selSet)>10:
            # Stop once the best remaining score is negligible compared with
            # the average score of what has been selected so far.
            if candidateUser[1] < 0.0001 * float(selScore)/float(len(selSet)):
                break
        if userSim(candidateUser, selSet, thres=detectParam.userSimThres, countThres=math.ceil(selectSize * detectParam.userSimCountCoef)):
            similarCount+=1
            continue

        selSet.append(candidateUser)
        selCost+=userInfoDict[candidateUser[0]]['cost']
        selID.add(candidateUser[0])
        selScore+=candidateUser[1]

        # eCount: events whose detect count grew compared with the baseline;
        # tCount: those among them that reached their detection threshold.
        eCount=0
        tCount=0
        for i, aut in enumerate(candidateUser[2]):
            if aut.detectCount>base_util[i].detectCount:
                eCount+=1
                if aut.detectThres<=aut.detectCount:
                    tCount+=1

        if runFlags.preFlag:
            print('user %s selected, score %s, count %s, overcount %s, rmse %s' %(candidateUser[0], candidateUser[1], eCount, tCount, getRMSEMap(candidateUser[3])))
            sys.stdout.flush()
        else:
            print('user %s selected, score %s, count %s, overcount %s' %(candidateUser[0], candidateUser[1], eCount, tCount))
            sys.stdout.flush()

        if len(selSet) >= selectSize:
            break

        # Re-score the remaining candidates relative to the new selection.
        newUserScores=[]
        for user in userScores:
            updated_gt, updated_score = updateScore(user[2], candidateUser[2], base_util, userInfoDict[user[0]]['cost'], runFlags, detectParam)
            if runFlags.preFlag:
                newUserScores.append((user[0], updated_score, updated_gt, user[3]))
            else:
                newUserScores.append((user[0], updated_score, updated_gt))

        if runFlags.preFlag:
            userScores= sorted(newUserScores, key = lambda x:x[1]+detectParam.baCoef*numpy.mean(getRMSEMap(x[3])), reverse=False)
        else:
            userScores= sorted(newUserScores, key = lambda x:x[1], reverse=False)
        base_util=copy.deepcopy(candidateUser[2])

    print('this round selected %s users: consider selecting %s, bypass similar %s' %(len(selID), len(involveUsers)-len(blackUsers), similarCount))
    sys.stdout.flush()

    # ---- phase 4: evaluate the selected set -------------------------------
    if not runFlags.docFlag:
        corpus_src_detect, userInv = db.getText(timePoint=running_time, historySlots= Nh * step, output_prefix='data_sel/%s_userstep%s-%s_%s_his%sh_evall' %(jobID, left_k, len(selID), helper.timestamp2str(running_time), Nh*step), userList=selID, event_id=set(), cacheRet=corpus_ret)
        if len(userInv)<1:
            # Keep the tuple arity identical to the regular returns below so
            # that callers can always unpack it (previously a bare 3-tuple,
            # which made the unpacking itself raise ValueError).
            if runFlags.preFlag:
                return selID, None, None, blackUsers, None
            return selID, None, None, blackUsers
        dictionary, corpus_detect,metas = ti.corpus_init(corpus_src_detect, dictionary, prefix='model_sel/%s_userstep%s-%s_%s_his%sh_evall' %(jobID, left_k, len(selID), helper.timestamp2str(running_time), Nh*step), verbose=False, updateDict=False)
        step_util_gt, user_score = detectUtilWithGT(corpus_detect, dictionary, util_gt, corpus_gt, tfidf_gt, index_gt, runFlags, selCost, detectParam, countOnlyOnceFlag=False)
    else:
        corpus_src_detect=[]
        for uID in selID:
            corpus_src_detect.extend(corpus_ret.get(uID,[]))
        if runFlags.extFlag:
            step_util_gt, user_score = detectUtilWithDoc(corpus_src_detect, util_gt, runFlags, selCost, detectParam, countOnlyOnceFlag=True)
        else:
            step_util_gt, user_score = detectUtilWithDoc(corpus_src_detect, util_gt, runFlags, selCost, detectParam, countOnlyOnceFlag=False)
        # Report the utility carried by the last user that was actually
        # selected.  The last *popped* candidate may have been rejected, and
        # is None when no candidate was available at all; in the latter case
        # the value computed just above is kept.
        if selSet:
            step_util_gt=selSet[-1][2]

    if runFlags.preFlag:
        known_vecs_part_step, gt_vecs_part_step= pr.getKnownAndGTPart(util_gt=util_gt, running_time=running_time, selUsers=PParam.SelectedID.union(selID), perSlot=PParam.PperSlot, NSh=PParam.PNSh , NSd=PParam.PNSd, jobID='predict', inputCache=PParam.predictCache, basetime=basetime_all)
        template_part_step = pr.getTemplate(util_gt, running_time, perSlot=PParam.PperSlot, step_tp=PParam.step_tp, Nh_tp=PParam.Nh_tp, Nl_tp=PParam.Nl_tp, retCache=PParam.predictCache, userList=PParam.SelectedID.union(selID), basetime=PParam.template[2])
        rmses= pr.predictResult((PParam.template[0], template_part_step, PParam.template[2]), known_vecs_part_step, known_vecs_all, gt_vecs_part_step, gt_vecs_all, NSh=PParam.PNSh, NSd=PParam.PNSd, NShSim=PParam.PNShSim, Nstep=PParam.PNstep, topSimSize=PParam.templateTopSimSize, simThres=PParam.PsimTemplateThres, combineCount=PParam.combineCount)
        PredictionRMSES = rmses
        return selID, step_util_gt, user_score, blackUsers, PredictionRMSES
    else:
        return selID, step_util_gt, user_score, blackUsers
Exemple #7
0
def parSelect(user):
    """Score a single candidate user; called inline or mapped over a pool.

    Args:
        user: work item carrying ``uid``, ``userInv``, ``runFlags``, ``cost``,
            ``util_gt``, ``detectParam`` and ``corpus_src_detect`` (plus
            ``dictionary``, ``corpus_gt``, ``tfidf_gt``, ``index_gt``,
            ``jobID``, ``running_time``, ``Nh`` and ``step`` when
            ``runFlags.docFlag`` is off).

    Returns:
        ``(uid, -1.0)`` when the user has no involvement at all, otherwise
        ``(uid, score, util)``, extended by the prediction RMSEs when
        ``runFlags.preFlag`` is set.

    Reads the module-level progress counters ``parCount``, ``totalCount``
    and ``leftCount`` and, in prediction mode, the ``*_user`` globals; all of
    them must have been set up before this function is called.
    """
    global parCount,totalCount,leftCount
    global known_vecs_all_user,gt_vecs_all_user,basetime_all_user
    global PParam_user

    if len(user.userInv)<1:
        return user.uid, -1.0

    if not user.runFlags.docFlag:
        user.dictionary, corpus_detect, metas = ti.corpus_init(user.corpus_src_detect, user.dictionary, prefix='model_sel/%s_user_%s_%s_his%sh_evall' %(user.jobID, user.uid, helper.timestamp2str(user.running_time), user.Nh*user.step), verbose=False, outFlag=False, updateDict=False)
        tmp_util_gt, user_score = detectUtilWithGT(corpus_detect, user.dictionary, user.util_gt, user.corpus_gt, user.tfidf_gt, user.index_gt, user.runFlags, user.cost, user.detectParam)
    else:
        tmp_util_gt, user_score = detectUtilWithDoc(user.corpus_src_detect, user.util_gt, user.runFlags, user.cost, user.detectParam)

    countPoint=200
    if user.runFlags.preFlag:
        # If nothing is detected there is no need to compute prediction
        # RMSEs; a placeholder value is reported instead.
        if user_score<=0:
            currentPredictionRMSES=[[phonyRMSE], [phonyRMSE], [phonyRMSE], [phonyRMSE]]
        else:
            known_vecs_part_user, gt_vecs_part_user= pr.getKnownAndGTPart(util_gt=user.util_gt, running_time=user.running_time, selUsers=PParam_user.SelectedID.union(set([user.uid])), perSlot=PParam_user.PperSlot, NSh=PParam_user.PNSh , NSd=PParam_user.PNSd, jobID='predict', inputCache=PParam_user.predictCache, basetime=basetime_all_user)
            template_part_user = pr.getTemplate(user.util_gt, user.running_time, perSlot=PParam_user.PperSlot, step_tp=PParam_user.step_tp, Nh_tp=PParam_user.Nh_tp, Nl_tp=PParam_user.Nl_tp, retCache=PParam_user.predictCache, userList=PParam_user.SelectedID.union(set([user.uid])), basetime=PParam_user.template[2])
            rmses= pr.predictResult((PParam_user.template[0], template_part_user, PParam_user.template[2]), known_vecs_part_user, known_vecs_all_user, gt_vecs_part_user, gt_vecs_all_user, NSh=PParam_user.PNSh, NSd=PParam_user.PNSd, NShSim=PParam_user.PNShSim, Nstep=PParam_user.PNstep, topSimSize=PParam_user.templateTopSimSize, simThres=PParam_user.PsimTemplateThres, select=True, combineCount=PParam_user.combineCount)
            currentPredictionRMSES= rmses

    # "+=" on a shared multiprocessing.Value is a separate read and write, so
    # concurrent workers could lose updates; do the increment and the read
    # used for reporting under the value's own lock.
    with parCount.get_lock():
        parCount.value +=1
        doneCount=parCount.value
    if(doneCount % countPoint == 0):
        if user.runFlags.preFlag:
            print('%s/%s, left %s. user %s score %s rmse %s' %(doneCount,totalCount.value,leftCount.value, user.uid,user_score, getRMSEMap(currentPredictionRMSES)))
        else:
            print('%s/%s, left %s. user %s score %s' %(doneCount,totalCount.value,leftCount.value, user.uid,user_score))

        sys.stdout.flush()
    if user.runFlags.preFlag:
        return user.uid, user_score, tmp_util_gt, currentPredictionRMSES
    else:
        return user.uid, user_score, tmp_util_gt
Exemple #8
0
def loadEventGT(dictionary,
                timePoint=1347724800,
                step=1,
                gt_Nh=0,
                gt_Nd=0,
                event_id=0,
                jobID='tmpgt_',
                inputEvent=set(),
                detectParam=None,
                verbose=False):
    ret = db.getEventInfo(startTime=timePoint,
                          historySlots=gt_Nh * step,
                          detectionSlots=gt_Nd * step,
                          event_id=event_id)
    corpus_src_gt = []
    util_gt = []
    event_gt = []
    if ret:
        # print 'gt_event:'
        for event in ret:
            if len(inputEvent) > 0:
                if event['event_id'] not in inputEvent:
                    continue
            if event['event_id_time'] < (timePoint - gt_Nh * 24 * 3600):
                continue

            # keyword=[word for word in event['split_words'].split(';') if word]
            # keyword=list(set(keyword))
            # gt.append(keyword)
            # print 'ID=%s: ABS=%s TIT=%s' %(event['event_id'], event['abstract'], event['title'])
            keywords = str(event['title'])
            wordseg = wc.para2seglist(keywords, tag=True)
            seglist = wc.seglist4filter(wordseg,
                                        srcTag=True,
                                        filterLow=False,
                                        fromFile=False)
            # print seglist
            seglist = list(set(seglist))
            # for element in seglist:
            # print element,
            # print ''
            corpus_src_gt.append(seglist)
            # print event['event_id'],
            # for word in seglist:
            #     print word,
            # print ''
            #
            # [detectCount, previousErgency, db_event_id]
            if detectParam:
                detectThres = detectParam.detectThres
            else:
                detectThres = 0
            # eventReportTime=event['time_get']

            # d = datetime.date(2015,1,5)
            # unixtime = time.mktime(d.timetuple())

            a_gt_util = gt_util_param(event['event_id'], detectThres,
                                      int(event['reportTime']),
                                      int(event['event_id_time']))
            util_gt.append(a_gt_util)
        print 'total gt size: %d' % len(corpus_src_gt)

    dictionary, corpus_gt, metas_gt = corpus_init(
        corpus_src_gt,
        dictionary,
        prefix='detect/%s_gt_%s_his%s_det%s_ev%s' %
        (jobID, helper.timestamp2str(timePoint), gt_Nh * step, gt_Nd * step,
         event_id),
        seg=False)
    tfidf_gt, index_gt = sims_init(corpus_gt,
                                   prefix='model/%s_gt_%s_his%sh_det%sh_ev%s' %
                                   (jobID, helper.timestamp2str(timePoint),
                                    gt_Nh * step, gt_Nd * step, event_id))
    # print 'dict size2=%s'%len(dictionary)
    if verbose:
        for i, corpus in enumerate(corpus_gt):
            print 'gtid=%s' % util_gt[i].db_event_id
            for word in corpus:
                print dictionary[word[0]],
            print ''
    return dictionary, corpus_gt, tfidf_gt, index_gt, util_gt
Exemple #9
0
def getTemplate(util_gt,
                running_time,
                perSlot=3,
                step_tp=24,
                Nh_tp=7,
                Nl_tp=7,
                retCache=None,
                basetime=None,
                userList=set()):
    """Build one time-slot histogram ("template") per ground-truth event.

    Rows are grouped by ``row['event_id']`` and counted into slots of
    ``perSlot`` hours measured from the event's base timestamp.

    Args:
        util_gt: iterable of objects exposing ``db_event_id``; only events
            listed here get a template.
        running_time: only used (through ``helper.timestamp2str``) to name
            the on-disk cache file of the all-users template.
        perSlot: slot width in hours.
        step_tp, Nl_tp: the vector length is ``int(Nl_tp * step_tp / perSlot)``.
        Nh_tp: only used in the cache file name.
        retCache: when ``userList`` is empty, an iterable of rows (mappings
            with ``'event_id'`` and ``'dsttime'``); otherwise a mapping
            ``user -> list of such rows``.
        basetime: optional mapping ``event_id -> base timestamp``.  Events
            missing from it use the earliest ``dsttime`` among their rows.
            The mapping is never modified here.
        userList: users whose rows are counted; empty means "all rows".  The
            default set is only read, never mutated.

    Returns:
        ``(template_vecs, basetime)`` when ``userList`` is empty, otherwise
        just ``template_vecs`` (``event_id -> list of per-slot counts``).
        Events without any row are absent from ``template_vecs``.
    """
    useAllUsers = not userList
    if useAllUsers:
        filename = '%s-Nh%s-lasts%s.template_all' % (
            helper.timestamp2str(running_time), Nh_tp, Nl_tp)
        if os.path.exists(filename):
            # NOTE: unpickling runs arbitrary code; the cache file must come
            # from a trusted location.
            # Binary mode is what pickle expects; the handle is closed
            # deterministically instead of being left to the GC.
            with open(filename, 'rb') as cacheFile:
                return pickle.load(cacheFile)

    if basetime is None:
        basetime = {}

    # Group the cached rows by event id.
    outretEvent = {}
    if useAllUsers:
        for line in retCache:
            outretEvent.setdefault(line['event_id'], []).append(line)
    else:
        for user in userList:
            if user in retCache:
                for line in retCache[user]:
                    outretEvent.setdefault(line['event_id'], []).append(line)

    slotSeconds = 3600 * perSlot
    slotCount = int(Nl_tp * step_tp / perSlot)

    template_vecs = {}
    for event in util_gt:
        event_id = event.db_event_id
        eventContents = outretEvent.get(event_id)
        if eventContents is None:
            continue

        if event_id in basetime:
            baseTimestamp = basetime[event_id]
        else:
            baseTimestamp = min(line['dsttime'] for line in eventContents)

        countDict = {}
        for line in eventContents:
            # Floor division keeps rows that precede the base timestamp in a
            # negative slot (ignored below) for ints and floats alike, rather
            # than truncating them into slot 0.
            slot = int((line['dsttime'] - baseTimestamp) // slotSeconds)
            countDict[slot] = countDict.get(slot, 0) + 1

        template_vecs[event_id] = [
            countDict.get(i, 0) for i in range(slotCount)
        ]

    if useAllUsers:
        if saveTemplateToFileFlag:
            with open(filename, 'wb') as cacheFile:
                pickle.dump((template_vecs, basetime), cacheFile)
        return template_vecs, basetime
    return template_vecs