def func():
    """For every cascade, attach each adopter's out-neighbour list and the
    graph index of its parent, then pickle the combined structure.

    Output shape: rtmid -> {uid_ind -> [t_diff, out_neighbours, parent_ind]}.
    """
    cascades = load_obj(CORE_IND_PATH)
    # edge table: node index -> [child1, child2, ...]
    edge_table = load_obj(IND_EDGE_PATH)
    uid_to_ind = load_obj(TRANS_PATH)
    result = {}
    for rtmid, adopters in cascades.iteritems():
        enriched = {}
        for uid_ind, record in adopters.iteritems():
            t_diff = record[1]
            parent = record[0]
            if parent in uid_to_ind:
                parent_ind = uid_to_ind[parent]
            else:
                # unknown parent: mark with a random negative sentinel index
                parent_ind = -10000 * random.random()
            # normalise the key (strips any long-literal 'L' suffix)
            key = str(int(uid_ind)).replace('L', '')
            # Some nodes in G have no outgoing neighbours at all; skip those.
            # NOTE(review): may be worth counting how many are skipped.
            if key in edge_table:
                enriched[key] = [t_diff, edge_table[key], parent_ind]
        result[rtmid] = enriched
    save_obj(result, OUT_PATH)
def ind_core(): #ind it and calculate time diff together aggre_d = load_obj(CORE_PATH) lookup_d = load_obj(LT_PATH) core_d_ind = {} good = 0 bad = 0 for rtmid in aggre_d: if random.random() <= 0.99: continue rtmid_d = aggre_d[rtmid] rtUid = rtmid_d.pop('rtUid') rtTime = rtmid_d.pop('rtTime') rtTime = time_trans(rtTime) core_d_ind[rtmid] = {} for adopter in rtmid_d: #here I am worried about what if this node does not appear in the G? #Maybe I should record the ratio or update G dynamicly because run Louvain is not that slow here using the java package if adopter in lookup_d: good += 1 ind = lookup_d[adopter] t = rtmid_d[adopter][1] t = time_trans(t) parent = rtmid_d[adopter][0] t_diff = rtTime - t t_diff = t_diff.seconds/60 + 1 #t_diff is in minutes core_d_ind[rtmid][ind] = [parent, t_diff] else: bad += 1 print 'the number of cases that node in G is %d ' %good print 'the number of bad cases is %d' %bad save_obj(core_d_ind,CORE_OUT_PATH)
def func(): print 'Choose the size as %d, with lambda equal to %d' %(THRES_SIZE,LAMBDA) core_d_outnei = load_obj(CORE_PATH) cm_d = comm_dict() comm_dist = {} comm_dist['size'] = THRES_SIZE comm_dist['lbd'] = LAMBDA ind = 0 for rtmid in core_d_outnei: rtmid_d = core_d_outnei[rtmid] adps = [] final_size = len(rtmid_d.keys()) ind += 1 if final_size< THRES_SIZE: continue #tfr_list = [] # find the first THRES_SIZE adopters tfr_dict = {} for adp_ in rtmid_d: t_diff = abs(int(rtmid_d[adp_][0]))+0.01*random.random() #print 'time from root for this adp %d is %d, t_diif - T_FR = %d' %(int(adp_),t_diff,t_diff-T_FR) tfr_dict[int(adp_)] = t_diff tfr_list = tfr_dict.values() tfr_list.sort() t_time = tfr_list[THRES_SIZE-1] - tfr_list[0] avg_t_fr = float(sum(tfr_list[0:(THRES_SIZE-1)]))/THRES_SIZE #average time for one adoption behavior happen, not average exposure time for adopters for adp_ in tfr_dict: t_diff = tfr_dict[adp_] if tfr_list.index(t_diff) < THRES_SIZE: adps.append(adp_) #For the size version, we also need a time stamp to judge if the outnei is a NA or FNT #So I pick the time when the 50th adopter reposted the msg as time stamp T_FR = tfr_list[THRES_SIZE-1] #cur_size = len(adps) set_adps = set(adps) cur_size = len(set_adps) if cur_size != THRES_SIZE: print cur_size raise 'the size is not correct' adp_comms,na_comms,fnt_comms = [],[],[] #NEW VERSION #Check dup of fnt and na before look up their comms #this would change the entropy while only decrease the number of comm in fnt while keep the num of comms in na na_nodes,fnt_nodes = [],[] #adp_comms = Parallel(n_jobs=4)(delayed(comm_look_up)(adopter,cm_d) for adopter in adps) for adp in adps: adp_l = rtmid_d[str(adp)] #no dup in adp for it is key of one dict #adp = int(adp) if adp in cm_d: #print 'in' adp_comms.append(cm_d[adp]) outneis = list(set(adp_l[1])) # dup exists in adp_l[1], so we can set delta here, need to think twice #need to delete all adps from this list of outneis outneis = [x for x in outneis if x not in 
set_adps] t_diff = int(adp_l[0]) t_exp = abs(T_FR - t_diff) #comms = [cm_d[int(outnei)] for outnei in outneis] if t_exp >= LAMBDA: #na_comms.extend(comms) na_nodes.extend(outneis) else: #fnt_comms.extend(comms) fnt_nodes.extend(outneis) set_na_nodes = set(na_nodes) na_nodes_ = list(set_na_nodes) cur_na_size = len(na_nodes_) fnt_nodes_ = [x for x in set(fnt_nodes) if x not in set_na_nodes] cur_fnt_size = len(fnt_nodes_) #na_not_a_nodes = [x for x in set(na_nodes_) if x not in set_adps] #f_not_a_nodes = [x for x in set(fnt_nodes_) if x not in set_adps] #f_not_ana_nodes = [x for x in f_not_a_nodes if x not in set(na_nodes_)] for node in na_nodes_: comm = cm_d[int(node)] na_comms.append(comm) for node in fnt_nodes_: comm = cm_d[int(node)] fnt_comms.append(comm) #keep dup of comm here to calculate entropy set_a_comms = set(adp_comms) set_f_comms = set(fnt_comms) set_na_comms = set(na_comms) na_not_a_comms = [comm for comm in na_comms if comm not in set_a_comms] f_not_a_comms = [comm for comm in fnt_comms if comm not in set_a_comms] f_not_ana_comms = [comm for comm in fnt_comms if comm not in set_na_comms and comm not in set_a_comms] comm_dist[rtmid] = {'adp':adp_comms,'fnt':fnt_comms,'na':na_comms,'fs':final_size, 'cs':cur_size,'cnas':cur_na_size, 'fnts':cur_fnt_size,'avg_time':avg_t_fr,'total_time':t_time,'ind':ind,'NA-A':na_not_a_comms, 'F-A':f_not_a_comms,'F-A-NA':f_not_ana_comms} print comm_dist[rtmid] print cur_size #print save_obj(comm_dist,OUT_PATH)
def func():
    """Aggregate per-cascade community statistics (size variant) into
    viral / non-viral box-plot series and pickle both."""
    comm_dist = load_obj(PATH)
    comm_dist.pop('lbd')
    comm_dist.pop('size')
    # placeholders kept from an earlier revision of this script
    res_cmcnt, res_entro, res_ol = {}, {}, {}
    nonviral = {}
    viral = {}
    keys = ['ind','num_cms_a','num_cms_f','num_cms_na',
            'ent_a','ent_f','ent_na',
            'ol_af','ol_ana','ol_fna','cs','fs','cnas','fnts','avg_time','total_time']
    for k in keys:
        nonviral[k] = []
        viral[k] = []
    for rtmid in comm_dist:
        d = comm_dist[rtmid]
        comms_a = d['adp']
        comms_f = d['fnt']
        comms_na = d['na']
        # one row per cascade, in the same order as `keys`
        row = [d['ind'],
               len(set(comms_a)), len(set(comms_f)), len(set(comms_na)),
               entro(comms_a), entro(comms_f), entro(comms_na),
               overlap(comms_a, comms_f), overlap(comms_a, comms_na), overlap(comms_f, comms_na),
               d['cs'], d['fs'], d['cnas'], d['fnts'], d['avg_time'], d['total_time']]
        # cascades with at least T adopters count as viral
        target = viral if d['fs'] >= T else nonviral
        for key, value in zip(keys, row):
            target[key].append(value)
    save_obj(viral,'C:\weibodataset\\for_box_plot_size_new\\'+F_NAME+'v.pkl')
    save_obj(nonviral,'C:\weibodataset\\for_box_plot_size_new\\'+F_NAME+'nv.pkl')
def func(): print 'Choose the time as %d, with lambda equal to %d' %(T_FR,LAMBDA) core_d_outnei = load_obj(CORE_PATH) cm_d = comm_dict() comm_dist = {} comm_dist['tfr'] = T_FR comm_dist['lbd'] = LAMBDA for rtmid in core_d_outnei: rtmid_d = core_d_outnei[rtmid] adps = [] final_size = len(rtmid_d.keys()) if final_size< 5: continue for adp_ in rtmid_d: t_diff = abs(int(rtmid_d[adp_][0])) #print 'time from root for this adp %d is %d, t_diif - T_FR = %d' %(int(adp_),t_diff,t_diff-T_FR) if t_diff <= T_FR: #adopted already adps.append(int(adp_)) cur_size = len(adps) set_adps = set(adps) adp_comms,na_comms,fnt_comms = [],[],[] #NEW VERSION #Check dup of fnt and na before look up their comms #this would change the entropy while only decrease the number of comm in fnt while keep the num of comms in na na_nodes,fnt_nodes = [],[] #adp_comms = Parallel(n_jobs=4)(delayed(comm_look_up)(adopter,cm_d) for adopter in adps) for adp in adps: adp_l = rtmid_d[str(adp)] #no dup in adp for it is key of one dict #adp = int(adp) if adp in cm_d: #print 'in' adp_comms.append(cm_d[adp]) outneis = list(set(adp_l[1])) # dup exists in adp_l[1], so we can set delta here, need to think twice #need to delete all adps from this list of outneis #TODO eliminate dup of frontier and non-adopters outneis = [x for x in outneis if x not in set_adps] t_diff = int(adp_l[0]) t_exp = abs(T_FR - t_diff) #comms = [cm_d[int(outnei)] for outnei in outneis] if t_exp >= LAMBDA: #na_comms.extend(comms) na_nodes.extend(outneis) else: #fnt_comms.extend(comms) fnt_nodes.extend(outneis) set_na_nodes = set(na_nodes) na_nodes_ = list(set_na_nodes) cur_na_size = len(na_nodes_) fnt_nodes_ = [x for x in set(fnt_nodes) if x not in set_na_nodes] cur_fnt_size = len(fnt_nodes_) for node in na_nodes_: comm = cm_d[int(node)] na_comms.append(comm) for node in fnt_nodes_: comm = cm_d[int(node)] fnt_comms.append(comm) #keep dup of comm here to calculate entropy comm_dist[rtmid] = 
{'adp':adp_comms,'fnt':fnt_comms,'na':na_comms,'fs':final_size,'cs':cur_size,'cnas':cur_na_size,'fnts':cur_fnt_size} print comm_dist[rtmid] #print save_obj(comm_dist,OUT_PATH)
def func():
    """Aggregate per-cascade community statistics (time variant, with gini
    and set-difference groups) into viral / non-viral box-plot series and
    pickle both."""
    comm_dist = load_obj(PATH)
    comm_dist.pop('lbd')
    comm_dist.pop('tfr')
    # placeholders kept from an earlier revision of this script
    res_cmcnt, res_entro, res_ol = {}, {}, {}
    nonviral = {}
    viral = {}
    keys = ['ind','num_cms_a','num_cms_f','num_cms_na',
            'ent_a','ent_f','ent_na',
            'ol_af','ol_ana','ol_fna',
            'cs','fs','cnas','fnts',
            'NA-A','F-A','F-A-NA','intra_comm_ratio','gini_a','gini_f','gini_na','gini_NA-A','gini_F-A','gini_F-A-NA']
    for k in keys:
        nonviral[k] = []
        viral[k] = []
    for rtmid in comm_dist:
        d = comm_dist[rtmid]
        comms_a = d['adp']
        comms_f = d['fnt']
        comms_na = d['na']
        comms_na_not_a = d['NA-A']
        comms_f_not_a = d['F-A']
        comms_f_not_ana = d['F-A-NA']
        ent_a, gini_a = entro_gini(comms_a)
        ent_f, gini_f = entro_gini(comms_f)
        ent_na, gini_na = entro_gini(comms_na)
        ent_naa, gini_naa = entro_gini(comms_na_not_a)
        ent_fa, gini_fa = entro_gini(comms_f_not_a)
        ent_fana, gini_fana = entro_gini(comms_f_not_ana)
        # one row per cascade, in the same order as `keys`
        # (NOTE(review): the 'NA-A'/'F-A'/'F-A-NA' columns hold the entropies
        # of those groups - the key names predate the entropy switch)
        row = [d['ind'],
               len(set(comms_a)), len(set(comms_f)), len(set(comms_na)),
               ent_a, ent_f, ent_na,
               overlap(comms_a, comms_f), overlap(comms_a, comms_na), overlap(comms_f, comms_na),
               d['cs'], d['fs'], d['cnas'], d['fnts'],
               ent_naa, ent_fa, ent_fana, d['intra_comm_ratio'],
               gini_a, gini_f, gini_na, gini_naa, gini_fa, gini_fana]
        # cascades with at least T adopters count as viral
        target = viral if d['fs'] >= T else nonviral
        for key, value in zip(keys, row):
            target[key].append(value)
    save_obj(viral,'E:\SNAM_2015\\for_box_plot_time_comparison_infomap\\'+F_NAME+'v.pkl')
    save_obj(nonviral,'E:\SNAM_2015\\for_box_plot_time_comparison_infomap\\'+F_NAME+'nv.pkl')