def func():
    """Combine each indexed adopter with its out-neighbours and parent index.

    Reads the indexed cascades (CORE_IND_PATH), the edge dict (IND_EDGE_PATH)
    and the parent translation dict (TRANS_PATH) through ``load_obj``, then
    stores ``{rtmid: {uid_ind: [t_diff, out_neis, parent_ind]}}`` at OUT_PATH
    through ``save_obj``.  Adopters whose index is missing from the edge dict
    are left out of the result.
    """
    cascades = load_obj(CORE_IND_PATH)
    edges = load_obj(IND_EDGE_PATH)  # edges[parent] = [child1, child2, ...]
    parent_index = load_obj(TRANS_PATH)

    enriched = {}
    for rtmid in cascades:
        per_cascade = {}
        for raw_ind, record in cascades[rtmid].items():
            t_diff = record[1]
            parent = record[0]
            if parent in parent_index:
                parent_ind = parent_index[parent]
            else:
                # Parent has no translation: use a random value in (-10000, 0].
                parent_ind = -10000 * random.random()

            # Lookups in the edge dict use the decimal string form of the index.
            key = str(int(raw_ind)).replace('L', '')

            # Some nodes in G have no outgoing neighbour and therefore no entry
            # in the edge dict; those adopters are skipped.
            # NOTE(review): the original author was unsure whether skipping is
            # correct and suggested counting how many nodes are dropped here.
            if key not in edges:
                continue
            per_cascade[key] = [t_diff, edges[key], parent_ind]

        enriched[rtmid] = per_cascade

    save_obj(enriched, OUT_PATH)
def ind_core():
    """Index a random sample of cascades and compute per-adopter time offsets.

    Loads the aggregated cascades (CORE_PATH) and the node lookup table
    (LT_PATH), keeps a cascade only when ``random.random()`` exceeds 0.99, and
    for every adopter present in the lookup table stores
    ``core_d_ind[rtmid][ind] = [parent, t_diff]`` with ``t_diff`` in minutes.
    The result is written to CORE_OUT_PATH; the numbers of adopters found in
    and missing from the lookup table are printed.
    """
    aggregated = load_obj(CORE_PATH)
    node_lookup = load_obj(LT_PATH)

    indexed = {}
    found = 0
    missing = 0

    for rtmid in aggregated:
        # Down-sample: skip the cascade unless the draw is above 0.99.
        if random.random() <= 0.99:
            continue

        cascade = aggregated[rtmid]
        # Both bookkeeping keys are removed so that only adopters remain.
        cascade.pop('rtUid')
        root_time = time_trans(cascade.pop('rtTime'))

        per_cascade = {}
        indexed[rtmid] = per_cascade

        for adopter in cascade:
            # Adopters that do not appear in G cannot be indexed; they are only
            # counted.
            if adopter not in node_lookup:
                missing += 1
                continue

            found += 1
            ind = node_lookup[adopter]
            adopt_time = time_trans(cascade[adopter][1])
            parent = cascade[adopter][0]
            delta = root_time - adopt_time
            # NOTE(review): ``.seconds`` ignores the days part of the difference
            # (and its sign) if ``delta`` is a datetime.timedelta -- confirm
            # this is intended rather than ``total_seconds()``.
            minutes = delta.seconds // 60 + 1  # t_diff is in minutes
            per_cascade[ind] = [parent, minutes]

    print('the number of cases that node in G is %d ' % found)
    print('the number of bad cases is %d' % missing)
    save_obj(indexed, CORE_OUT_PATH)
def func():
    """Build per-cascade community lists at a fixed cascade size.

    For every cascade loaded from CORE_PATH that ends with at least THRES_SIZE
    adopters, the first THRES_SIZE adopters (ordered by absolute time offset,
    with a tiny random jitter to break ties) are taken as the adopter set.
    Their out-neighbours that are not adopters themselves are split into two
    groups by exposure time relative to the THRES_SIZE-th adoption:
    ``na`` when the exposure is at least LAMBDA, ``fnt`` otherwise (a node in
    both groups counts as ``na`` only).  Community ids are looked up in the
    dict returned by ``comm_dict()``.

    The resulting dict (also holding ``'size'`` and ``'lbd'``) is saved to
    OUT_PATH.

    Raises:
        ValueError: if the number of selected adopters differs from
            THRES_SIZE.
    """
    print('Choose the size as %d, with lambda equal to %d' % (THRES_SIZE, LAMBDA))

    core_d_outnei = load_obj(CORE_PATH)
    cm_d = comm_dict()

    comm_dist = {'size': THRES_SIZE, 'lbd': LAMBDA}
    ind = 0
    for rtmid in core_d_outnei:
        rtmid_d = core_d_outnei[rtmid]
        final_size = len(rtmid_d)

        # ``ind`` counts every cascade, including the ones skipped below.
        ind += 1

        if final_size < THRES_SIZE:
            continue

        # Time offset per adopter; the jitter keeps the offsets distinct so
        # that exactly THRES_SIZE adopters fall at or below the cutoff.
        tfr_dict = {}
        for adp_ in rtmid_d:
            tfr_dict[int(adp_)] = abs(int(rtmid_d[adp_][0])) + 0.01 * random.random()

        tfr_list = sorted(tfr_dict.values())

        # Time of the THRES_SIZE-th adoption; it is also the time stamp used to
        # decide whether an out-neighbour is NA or FNT.
        t_cut = tfr_list[THRES_SIZE - 1]
        t_time = t_cut - tfr_list[0]
        # Average time for one adoption to happen, not the average exposure
        # time of adopters.
        # NOTE(review): the sum covers THRES_SIZE-1 offsets but is divided by
        # THRES_SIZE -- confirm whether the slice should end at THRES_SIZE.
        avg_t_fr = float(sum(tfr_list[0:(THRES_SIZE - 1)])) / THRES_SIZE

        # Equivalent to "position in the sorted list < THRES_SIZE", without the
        # quadratic list.index() scan.
        adps = [adp for adp in tfr_dict if tfr_dict[adp] <= t_cut]

        set_adps = set(adps)
        cur_size = len(set_adps)
        if cur_size != THRES_SIZE:
            print(cur_size)
            # String exceptions are not valid; raise a real exception type.
            raise ValueError('the size is not correct')

        # Nodes are de-duplicated before their communities are looked up, so
        # each node contributes one community entry.
        adp_comms = []
        na_nodes, fnt_nodes = [], []

        for adp in adps:
            adp_l = rtmid_d[str(adp)]
            if adp not in cm_d:
                continue
            adp_comms.append(cm_d[adp])

            # Drop duplicate out-neighbours and every node that is already an
            # adopter.  ``set_adps`` holds ints, so the comparison is made on
            # the int form of the node id (ids are converted with int() for
            # the community lookup below as well).
            outneis = [x for x in set(adp_l[1]) if int(x) not in set_adps]

            t_exp = abs(t_cut - int(adp_l[0]))
            if t_exp >= LAMBDA:
                na_nodes.extend(outneis)
            else:
                fnt_nodes.extend(outneis)

        set_na_nodes = set(na_nodes)
        na_nodes_ = list(set_na_nodes)
        cur_na_size = len(na_nodes_)
        fnt_nodes_ = [x for x in set(fnt_nodes) if x not in set_na_nodes]
        cur_fnt_size = len(fnt_nodes_)

        # Duplicate community ids are kept on purpose: the lists are used to
        # compute entropy later.
        na_comms = [cm_d[int(node)] for node in na_nodes_]
        fnt_comms = [cm_d[int(node)] for node in fnt_nodes_]

        set_a_comms = set(adp_comms)
        set_na_comms = set(na_comms)

        na_not_a_comms = [comm for comm in na_comms if comm not in set_a_comms]
        f_not_a_comms = [comm for comm in fnt_comms if comm not in set_a_comms]
        f_not_ana_comms = [comm for comm in fnt_comms
                           if comm not in set_na_comms and comm not in set_a_comms]

        comm_dist[rtmid] = {
            'adp': adp_comms,
            'fnt': fnt_comms,
            'na': na_comms,
            'fs': final_size,
            'cs': cur_size,
            'cnas': cur_na_size,
            'fnts': cur_fnt_size,
            'avg_time': avg_t_fr,
            'total_time': t_time,
            'ind': ind,
            'NA-A': na_not_a_comms,
            'F-A': f_not_a_comms,
            'F-A-NA': f_not_ana_comms,
        }
        print(comm_dist[rtmid])
        print(cur_size)
    save_obj(comm_dist, OUT_PATH)
def func():
    """Split per-cascade community statistics into viral / non-viral tables.

    Loads the community distribution dict from PATH, drops its ``'lbd'`` and
    ``'size'`` entries, and for every remaining cascade computes the number of
    distinct communities, the entropy (``entro``) and the pairwise overlap
    (``overlap``) of the ``adp`` / ``fnt`` / ``na`` community lists.  A cascade
    goes to the viral table when its final size ``fs`` is at least T, otherwise
    to the non-viral table.  Each table maps a column name to a list of values
    and is saved next to the other with the suffixes ``v.pkl`` / ``nv.pkl``.
    """
    comm_dist = load_obj(PATH)
    comm_dist.pop('lbd')
    comm_dist.pop('size')

    # Column names; the order must match ``my_data`` below.
    keys = ['ind', 'num_cms_a', 'num_cms_f', 'num_cms_na',
            'ent_a', 'ent_f', 'ent_na',
            'ol_af', 'ol_ana', 'ol_fna',
            'cs', 'fs', 'cnas', 'fnts', 'avg_time', 'total_time']
    nonviral = dict((key, []) for key in keys)
    viral = dict((key, []) for key in keys)

    for rtmid in comm_dist:
        dist_d = comm_dist[rtmid]
        final_size = dist_d['fs']

        comms_a = dist_d['adp']
        comms_f = dist_d['fnt']
        comms_na = dist_d['na']

        my_data = [
            dist_d['ind'],
            # number of distinct communities
            len(set(comms_a)), len(set(comms_f)), len(set(comms_na)),
            # entropy
            entro(comms_a), entro(comms_f), entro(comms_na),
            # overlap
            overlap(comms_a, comms_f),
            overlap(comms_a, comms_na),
            overlap(comms_f, comms_na),
            dist_d['cs'], final_size, dist_d['cnas'], dist_d['fnts'],
            dist_d['avg_time'], dist_d['total_time'],
        ]

        table = viral if final_size >= T else nonviral
        for key, value in zip(keys, my_data):
            table[key].append(value)

    # Every backslash is escaped explicitly; the resulting path is unchanged.
    out_prefix = 'C:\\weibodataset\\for_box_plot_size_new\\' + F_NAME
    save_obj(viral, out_prefix + 'v.pkl')
    save_obj(nonviral, out_prefix + 'nv.pkl')
def func():
    """Build per-cascade community lists at a fixed observation time.

    For every cascade loaded from CORE_PATH with at least five adopters in
    total, the adopters whose absolute time offset is at most T_FR form the
    adopter set.  Their out-neighbours that are not adopters themselves are
    split by exposure time ``abs(T_FR - t_diff)``: ``na`` when it is at least
    LAMBDA, ``fnt`` otherwise (a node in both groups counts as ``na`` only).
    Community ids are looked up in the dict returned by ``comm_dict()``.

    The resulting dict (also holding ``'tfr'`` and ``'lbd'``) is saved to
    OUT_PATH.
    """
    print('Choose the time as %d, with lambda equal to %d' % (T_FR, LAMBDA))

    core_d_outnei = load_obj(CORE_PATH)
    cm_d = comm_dict()

    comm_dist = {'tfr': T_FR, 'lbd': LAMBDA}

    # Cascades that never reach this many adopters are ignored.
    min_final_size = 5

    for rtmid in core_d_outnei:
        rtmid_d = core_d_outnei[rtmid]
        final_size = len(rtmid_d)
        if final_size < min_final_size:
            continue

        # Adopters that had already adopted at time T_FR.
        adps = [int(adp_) for adp_ in rtmid_d
                if abs(int(rtmid_d[adp_][0])) <= T_FR]

        cur_size = len(adps)
        set_adps = set(adps)

        # Nodes are de-duplicated before their communities are looked up, so
        # each node contributes one community entry.
        adp_comms = []
        na_nodes, fnt_nodes = [], []

        for adp in adps:
            adp_l = rtmid_d[str(adp)]
            if adp not in cm_d:
                continue
            adp_comms.append(cm_d[adp])

            # Drop duplicate out-neighbours and every node that is already an
            # adopter.  ``set_adps`` holds ints, so the comparison is made on
            # the int form of the node id (ids are converted with int() for
            # the community lookup below as well).
            outneis = [x for x in set(adp_l[1]) if int(x) not in set_adps]

            # NOTE(review): the adopter test above uses abs(int(...)) while
            # this offset is used without abs() -- confirm offsets are never
            # negative.
            t_exp = abs(T_FR - int(adp_l[0]))
            if t_exp >= LAMBDA:
                na_nodes.extend(outneis)
            else:
                fnt_nodes.extend(outneis)

        set_na_nodes = set(na_nodes)
        na_nodes_ = list(set_na_nodes)
        cur_na_size = len(na_nodes_)
        fnt_nodes_ = [x for x in set(fnt_nodes) if x not in set_na_nodes]
        cur_fnt_size = len(fnt_nodes_)

        # Duplicate community ids are kept on purpose: the lists are used to
        # compute entropy later.
        na_comms = [cm_d[int(node)] for node in na_nodes_]
        fnt_comms = [cm_d[int(node)] for node in fnt_nodes_]

        comm_dist[rtmid] = {
            'adp': adp_comms,
            'fnt': fnt_comms,
            'na': na_comms,
            'fs': final_size,
            'cs': cur_size,
            'cnas': cur_na_size,
            'fnts': cur_fnt_size,
        }
        print(comm_dist[rtmid])
    save_obj(comm_dist, OUT_PATH)
def func():
    """Split per-cascade community statistics into viral / non-viral tables.

    Loads the community distribution dict from PATH, drops its ``'lbd'`` and
    ``'tfr'`` entries, and for every remaining cascade computes the number of
    distinct communities, entropy and Gini value (``entro_gini``) and pairwise
    overlap (``overlap``) of its community lists.  A cascade goes to the viral
    table when its final size ``fs`` is at least T, otherwise to the non-viral
    table.  Each table maps a column name to a list of values and is saved
    with the suffixes ``v.pkl`` / ``nv.pkl``.

    Each cascade entry must provide the keys ``ind``, ``fs``, ``cs``, ``cnas``,
    ``fnts``, ``adp``, ``fnt``, ``na``, ``NA-A``, ``F-A``, ``F-A-NA`` and
    ``intra_comm_ratio``.
    """
    comm_dist = load_obj(PATH)
    comm_dist.pop('lbd')
    comm_dist.pop('tfr')

    # Column names; the order must match ``my_data`` below.  The columns
    # 'NA-A', 'F-A' and 'F-A-NA' receive the entropies of those lists.
    keys = ['ind', 'num_cms_a', 'num_cms_f', 'num_cms_na',
            'ent_a', 'ent_f', 'ent_na',
            'ol_af', 'ol_ana', 'ol_fna',
            'cs', 'fs', 'cnas', 'fnts',
            'NA-A', 'F-A', 'F-A-NA', 'intra_comm_ratio',
            'gini_a', 'gini_f', 'gini_na',
            'gini_NA-A', 'gini_F-A', 'gini_F-A-NA']
    nonviral = dict((key, []) for key in keys)
    viral = dict((key, []) for key in keys)

    for rtmid in comm_dist:
        dist_d = comm_dist[rtmid]
        final_size = dist_d['fs']

        comms_a = dist_d['adp']
        comms_f = dist_d['fnt']
        comms_na = dist_d['na']

        # entropy and Gini value per community list
        ent_a, gini_a = entro_gini(comms_a)
        ent_f, gini_f = entro_gini(comms_f)
        ent_na, gini_na = entro_gini(comms_na)

        ent_na_not_a, gini_na_not_a = entro_gini(dist_d['NA-A'])
        ent_f_not_a, gini_f_not_a = entro_gini(dist_d['F-A'])
        ent_f_not_ana, gini_f_not_ana = entro_gini(dist_d['F-A-NA'])

        my_data = [
            dist_d['ind'],
            # number of distinct communities
            len(set(comms_a)), len(set(comms_f)), len(set(comms_na)),
            ent_a, ent_f, ent_na,
            # overlap
            overlap(comms_a, comms_f),
            overlap(comms_a, comms_na),
            overlap(comms_f, comms_na),
            dist_d['cs'], final_size, dist_d['cnas'], dist_d['fnts'],
            ent_na_not_a, ent_f_not_a, ent_f_not_ana,
            dist_d['intra_comm_ratio'],
            gini_a, gini_f, gini_na,
            gini_na_not_a, gini_f_not_a, gini_f_not_ana,
        ]

        table = viral if final_size >= T else nonviral
        for key, value in zip(keys, my_data):
            table[key].append(value)

    # Every backslash is escaped explicitly; the resulting path is unchanged.
    out_prefix = 'E:\\SNAM_2015\\for_box_plot_time_comparison_infomap\\' + F_NAME
    save_obj(viral, out_prefix + 'v.pkl')
    save_obj(nonviral, out_prefix + 'nv.pkl')