def recommend(self):
     close_users = BasicNetworkAnalyzer.compute_knearest_neighbors(self.usercircle,
                                                                   self.netdata.get_friends_nodes(self.usercircle),
                                                                   self.interact_type, self.K,data_type="learn")
     #print "Num close users", len(close_users), "Num friends", self.usercircle.get_num_friends()
     if len(close_users) < self.K:
         logging.warning("Cannot find k closest friends for recommend")
         return None
     self.rec_items = self.usercircle.compute_weighted_popular_recs(close_users, self.max_items)
     
     """
     if len(self.rec_items) == 0:
         print "oh"
         for sim, unode in close_users:
             print unode.length_train_ids
     """
     return self.rec_items
 def recommend(self):
     close_users = BasicNetworkAnalyzer.compute_knearest_neighbors(self.usercircle,
                                                                   self.netdata.get_nonfriends_nodes(self.usercircle),
                                                                   self.interact_type, self.K, data_type="learn"
                                                                   )
     
     #print "Num close users", len(close_users)
     if len(close_users) < self.K:
         logging.warning("Cannot find k closest global for recommend")
         return None
     self.rec_items = self.usercircle.compute_weighted_popular_recs(close_users, self.max_items)
    
     """
     Length of recs can be zero because there are so few train_interactions among the close users
     if len(self.rec_items) == 0:
         print "oh"
         for sim, unode in close_users:
             print unode.length_train_ids
     """
     return self.rec_items
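
Both recommend variants above rely on BasicNetworkAnalyzer.compute_knearest_neighbors returning the K most similar users as (similarity, node) pairs (the disabled debug block iterates them as "for sim, unode in close_users"). Below is a minimal, self-contained sketch of that selection step only; similarity_fn is a hypothetical stand-in for the project's node-similarity measure, not its actual code.

import heapq

def knearest_sketch(target, candidates, similarity_fn, k):
    # score every candidate, then keep the k with the highest similarity,
    # returned as (similarity, node) pairs like the calls above expect
    scored = ((similarity_fn(target, c), c) for c in candidates)
    return heapq.nlargest(k, scored, key=lambda pair: pair[0])

# toy usage: "nodes" are ints, similarity is closeness on the number line
print(knearest_sketch(10, [1, 8, 9, 12, 40], lambda a, b: -abs(a - b), 3))
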
Example no. 3
 def recommend(self):
     close_users = BasicNetworkAnalyzer.compute_knearest_neighbors(
         self.usercircle,
         self.netdata.get_friends_nodes(self.usercircle),
         self.interact_type,
         self.K,
         data_type="learn")
     #print "Num close users", len(close_users), "Num friends", self.usercircle.get_num_friends()
     if len(close_users) < self.K:
         logging.warning("Cannot find k closest friends for recommend")
         return None
     self.rec_items = self.usercircle.compute_weighted_popular_recs(
         close_users, self.max_items)
     """
     if len(self.rec_items) == 0:
         print "oh"
         for sim, unode in close_users:
             print unode.length_train_ids
     """
     return self.rec_items
Example no. 4
    def recommend(self):
        close_users = BasicNetworkAnalyzer.compute_knearest_neighbors(
            self.usercircle,
            self.netdata.get_nonfriends_nodes(self.usercircle),
            self.interact_type,
            self.K,
            data_type="learn")

        #print "Num close users", len(close_users)
        if len(close_users) < self.K:
            logging.warning("Cannot find k closest global for recommend")
            return None
        self.rec_items = self.usercircle.compute_weighted_popular_recs(
            close_users, self.max_items)
        """
        Length of recs can be zero because there are so few train_interactions among the close users
        if len(self.rec_items) == 0:
            print "oh"
            for sim, unode in close_users:
                print unode.length_train_ids
        """
        return self.rec_items
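
All four snippets then pass the (similarity, node) pairs to compute_weighted_popular_recs. Its internals are not shown here, so the sketch below only illustrates the usual similarity-weighted popularity idea under that assumption: score each unseen item by the summed similarity of the neighbours who interacted with it and keep the top max_items. Each neighbour is reduced to (similarity, set of item ids) for the sketch; this is an illustration, not the project's implementation.

from collections import defaultdict

def weighted_popular_recs_sketch(close_users, known_items, max_items):
    scores = defaultdict(float)
    for sim, neighbour_items in close_users:       # neighbour_items: set of item ids
        for item in neighbour_items:
            if item not in known_items:            # never re-recommend seen items
                scores[item] += sim
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [item for item, _ in ranked[:max_items]]

# toy usage: two strong neighbours both like "b", so it outranks "c" and "d"
neighbours = [(0.9, {"a", "b"}), (0.4, {"b", "c"}), (0.1, {"d"})]
print(weighted_popular_recs_sketch(neighbours, known_items={"a"}, max_items=2))
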
def run_computation(data, computation_cmd, outf, interact_type, create_fake_prefs,
        allow_duplicates, split_date_str, dataset_domain, dataset_path,
        min_interacts_beforeaftersplit_per_user,
        max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio, 
        traindata_fraction, M):
    net_analyzer = BasicNetworkAnalyzer(data)
    interaction_types = data.interact_types_dict
    filename_prefix = computation_cmd if computation_cmd is not None else ""

    if computation_cmd=="basic_stats" or computation_cmd is None:
        net_analyzer.show_basic_stats()
        ## use below if you want to write a new dataset (e.g. after filtering)
        data.store_ego_dataset("/home/amit/datasets/social_activity_data/lastfm_filtered_listen/", write_maps=False)
        #data.compute_allpairs_sim(interact_type, data_type=ord("a"))

    elif computation_cmd=="random_similarity":
        for type_name, type_index in interaction_types.iteritems():
            circlesims, globalsims = net_analyzer.compare_circle_global_similarity(type_index, num_random_trials=5, cutoff_rating=cutoff_rating)
            #plotter.plotLinesYY(circlesims, globalsims, "Friends", "Global")
            outf.write("User_id\tcircle_sim\tnonfriend_sim\n")
            outf.write(type_name + '\n')
            for ind in range(len(circlesims)):
                outf.write("%s\t%f\t%f\n" %(circlesims[ind][0], circlesims[ind][1], globalsims[ind][1]))
            print "\n", type_name, ":" 
            print "Circle Average", sum([v2 for v1,v2 in circlesims])/float(len(circlesims))
            print "Global Average", sum([v2 for v1,v2 in globalsims])/float(len(globalsims))

    elif computation_cmd=="knn_similarity":
        #Compute K-nearest similarity
        KLIMITS = [10]
        outf.write("User_id\tk\tcircle_sim\tnonfriend_sim\n")
        
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                plot_circle, plot_external = net_analyzer.compare_circle_global_knnsimilarity(type_index, klim=curr_lim, cutoff_rating=cutoff_rating)
                compare_sims(plot_circle, plot_external)
                outf.write(type_name+'\n')
                for ind in range(len(plot_circle)):
                    outf.write("%s\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, plot_circle[ind][1], plot_external[ind][1]))
                #plotter.plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print type_name, "K", curr_lim
                print "Circle Average", utils.mean_sd([v2 for v1,v2 in plot_circle]), len(plot_circle)
                print "Global Average", utils.mean_sd([v2 for v1,v2 in plot_external]), len(plot_external)

    elif computation_cmd=="knn_recommender":
        #Compute K-nearest recommender
        KLIMITS = [10]
        rec_analyzer = RecommenderAnalyzer(data, max_recs_shown=10, traintest_split=0.7, cutoff_rating=cutoff_rating)
        outf.write("User_id\tk\trun_index\tcircle_ndcg\tnonfriend_ndcg\n")
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                local_avg=[]
                global_avg=[]
                Ntotal = 10
                for i in range(Ntotal): # randomize because of training-test split.
                    plot_circle, plot_external = rec_analyzer.compare_knearest_recommenders(type_index, klim=curr_lim, num_processes=2)
                    compare_sims(plot_circle, plot_external)
                    outf.write(type_name + "\n")
                    for ind in range(len(plot_circle)):
                        outf.write("%s\t%d\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, i, plot_circle[ind][1], plot_external[ind][1]))
                    print "\n", type_name, "K", curr_lim

                    #print plot_circle, plot_external
                    curr_avg_local = utils.mean_sd([v2 for v1,v2 in plot_circle])
                    curr_avg_global =  utils.mean_sd([v2 for v1,v2 in plot_external])
                    print "Circle Average", curr_avg_local
                    print "Global Average", curr_avg_global
                    local_avg.append(curr_avg_local[0])
                    global_avg.append(curr_avg_global[0])
                    #plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print "Local", sum(local_avg)/float(Ntotal)
                print "Global", sum(global_avg)/float(Ntotal)
    elif computation_cmd == "circle_coverage":
        lim_friends = [(5,10), (10,20), (20,50), (50,100)]
        for fr_limit in lim_friends:
            locality_analyzer = LocalityAnalyzer(data)
            coverage_list = locality_analyzer.compare_circle_item_coverages(0, fr_limit[0], fr_limit[1])
            plotter.plotLineY(sorted(coverage_list), "User", "Fraction of Items Covered with %d-%d friends" % (fr_limit[0], fr_limit[1]))
            print utils.mean_sd(coverage_list)
    elif computation_cmd == "items_edge_coverage":
        locality_analyzer = LocalityAnalyzer(data)
        items_cov_list, items_popularity, cov_ratio_list = locality_analyzer.compare_items_edge_coverage(1, minimum_interactions=1)
        print utils.mean_sd(items_cov_list)
        print utils.mean_sd(items_popularity)
        #plotter.plotHist(sorted([val for val in cov_ratio_list if val<=1]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #plotter.plotHist(sorted(items_popularity), "Item", "total popularity")
        plotter.plotCumulativePopularity(items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes")
    elif computation_cmd == "network_draw":
        net_visualizor = NetworkVisualizor(data)
        net_visualizor.draw_network()
    elif computation_cmd == "network_item_adopt":
        net_visualizor = NetworkVisualizor(data)
        pprint(net_visualizor.plot_item_adoption(1669118))
    elif computation_cmd == "node_details":
        for node_id in open('user_ids'):
            if node_id.strip('\n') != "User_id":
                net_analyzer.get_node_details(int(node_id.strip('\n')))
    elif computation_cmd=="store_dataset":
        user_interacts = net_analyzer.get_user_interacts(1, cutoff_rating)
        f = open(outf_path+ 'user_interacts_'+dataset_domain+'.tsv', 'w')
        f.write("user_id\titem_id\ttimestamp\n")
        for user_id, item_id, timestamp in user_interacts:
            f.write("%s\t%s\t%s\n" %(user_id, item_id, timestamp)) 
        f.close()
        
        item_pop = net_analyzer.get_items_popularity(1, cutoff_rating)    
        f = open(outf_path+'items_'+dataset_domain+'.tsv','w')
        f.write("item_id\tpopularity\n")
        for item_id, pop in item_pop.iteritems():
            f.write("%s\t%s\n" %(item_id, pop))
        f.close()

        user_friends = net_analyzer.get_user_friends()
        f = open('user_friends_'+dataset_domain+'.tsv','w')
        f.write("user_id\tfriend_id\n")
        for user_id, friend_id in user_friends:
            f.write("%s\t%s\n" %(user_id, friend_id))
        f.close()
        print "Successfully stored tsv dataset"
    elif computation_cmd=="compare_interact_types":
        num_interacts_dict = net_analyzer.compare_interaction_types()
        interact_types = num_interacts_dict.keys()
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], 
                            num_interacts_dict[interact_types[1]],
                            interact_types[0], interact_types[1], 
                            display=True, logyscale=True)
         
        plotter.plotLinesYY(num_interacts_dict[interact_types[1]], 
                            num_interacts_dict[interact_types[2]],
                            interact_types[1], interact_types[2], 
                            display=True, logyscale=True)
         
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], 
                            num_interacts_dict[interact_types[2]],
                            interact_types[0], interact_types[2], 
                            display=True, logyscale=True)
    elif computation_cmd=="influence_test":
        #   ta = TemporalAnalyzer(data)
        #interact_type = data.interact_types_dict["listen"]
        # time_scale can be 'w':wallclock_time or 'o':ordinal_time
        split_date_str = "2008/01/01"
        t_window = -1
        t_scale = ord('w')
        max_tries_val = 10000
        max_node_computes_val = 100
        max_interact_ratio_error = 0.1
        klim_val=5
        split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        # create training/test sets that will be used by fake generation
        data.create_training_test_bytime(interact_type, split_timestamp)
        if create_fake_prefs is not None:
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data,interact_type, split_timestamp, 
                        min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user,
                        time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
            
            #fake_data.generate_random_preferences(data, interact_type, split_timestamp)
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
        # Need to generate again because fake data changes test data           
        data.create_training_test_bytime(interact_type, split_timestamp)
        
        la = LocalityAnalyzer(data)
        inf_tuple = compute.test_influence(la, interact_type=interact_type, 
                               time_diff=t_window, time_scale=ord('w'), split_timestamp=split_timestamp, 
                               #time_diff=100000, split_date_str="1970/06/23", 
                               control_divider=0.01,
                               min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                               max_tries = max_tries_val, max_node_computes=max_node_computes_val, num_processes=4,
                               max_interact_ratio_error=max_interact_ratio_error,
                               klim=klim_val,
                               method="influence")
        print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
        num_vals = len(inf_tuple[0])
        f = open("influence_test", "w")
        for i in range(num_vals):
            f.write("%f\t%f\t%f\t%f\n" % (inf_tuple[0][i], inf_tuple[1][i], 
                        inf_tuple[2][i], inf_tuple[3][i]))
        f.close()
             
    elif computation_cmd=="suscept_test":
        use_artists = "songs" if "songs" in dataset_path else "artists"
        interact_type_str = "listen" if interact_type==0 else "love"
        #M = [50]#,20]#,30,40,50]
        t_scale = ord('o') # ordinal scale, this is the default used in paper.
        NUM_NODES_TO_COMPUTE = 4000000 # maximum number of nodes to compute
        num_threads=4 # the number of threads to spawn
        max_tries_val = None#30000 # should we stop after max_tries?
        max_node_computes_val = NUM_NODES_TO_COMPUTE/num_threads # number of nodes to compute in each thread
        #max_interact_ratio_error =0.2 # these are errors (defaults are 0.1,0.1)
        #max_sim_ratio_error = 0.2
        #min_friends_match_ratio = 0.5 # important to be 1 for simulation--because e.g. in influence, we use all of a person's friends to compute their next like
        klim_val = None # not used for influence test
        nonfr_match = "random" #random, serial, kbest. Default is random.
        num_loop = 1 # number of times we calculate this. For averaging results over multiple runs.
        f = open("suscept_test_results/"+dataset_domain + dataset_path.split("/")[-2] + interact_type_str+ strftime("%Y-%m-%d_%H:%M:%S")+'.dat', 'w')
        f.write("# use_artists=%r\tallow_duplicates=%r\tmax_node_computes_val=%d\tcreate_fake_prefs=%r\tnum_loop=%d\n" % (
                    use_artists, allow_duplicates, max_node_computes_val,
                        create_fake_prefs, num_loop))
        f.write("# split_train_test_date=%s\ttime_scale=%d\tmin_interactions_beforeaftersplit_per_user=%d\tnum_threads=%d\n" % (
                    split_date_str, t_scale, min_interacts_beforeaftersplit_per_user, num_threads))
        f.write("# max_interact_ratio_error=%f\tmax_sim_ratio_error=%f\tmin_friends_match_ratio=%f\n" %(
                    max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio
                    ))
        for t_window in M:
            for h in range(num_loop):
                f.write("\n\n################### ALERTINFO: STARTING ITERATION %d  with M=%d\n" %( h, t_window))
                if split_date_str=="test": split_timestamp = 2000
                else:
                    split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
                #split_timestamp=25000000
                if create_fake_prefs is not None:
                    data.create_training_test_bytime(interact_type, split_timestamp)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                    fake_data.generate_fake_preferences(data,interact_type, split_timestamp,
                            min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                            time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                # Need to generate again because fake data changes test data           
                data.create_training_test_bytime(interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user)
                la = LocalityAnalyzer(data)
                inf_tuple = compute.test_influence(la, interact_type=interact_type, 
                                       time_diff=t_window, time_scale=t_scale, split_timestamp=split_timestamp, 
                                       #time_diff=100000, split_date_str="1970/06/23", 
                                       control_divider=0.01, # not used anymore
                                       min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                                       max_tries = max_tries_val, max_node_computes=max_node_computes_val, num_threads=num_threads,
                                       max_interact_ratio_error = max_interact_ratio_error,
                                       max_sim_ratio_error = max_sim_ratio_error,
                                       min_friends_match_ratio=min_friends_match_ratio,
                                       klim = klim_val,
                                       nonfr_match=nonfr_match,
                                       method="suscept", 
                                       allow_duplicates=allow_duplicates)
                print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
                num_vals = len(inf_tuple[0])
                f.write("TestSetSize\tFrSimilarity\tNonFrSimilarity\tFrOverlap\tNonFrOverlap\tRandom_run_no\tM\n")
                for i in range(num_vals):
                    f.write("%d\t%f\t%f\t%f\t%f\t%d\t%d\n" % (inf_tuple[0][i], inf_tuple[1][i], 
                                inf_tuple[2][i], inf_tuple[3][i], inf_tuple[4][i], h, t_window))
        f.close()
    elif computation_cmd=="gen_adopt_data":
        t_window = 100 
        t_scale = ord('o')
        if split_date_str=="test": split_timestamp = 2000
        else:
            split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        if create_fake_prefs is not None:
            data.create_training_test_bytime(interact_type, split_timestamp)
            #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data,interact_type, split_timestamp,
                    min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                    time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
        
        data.create_training_test_bytime(interact_type, split_timestamp)
        gen_adopt.generate_adoption_data(data, interact_type, split_timestamp, 
            min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user, time_window=t_window, 
            time_scale=t_scale)
    elif computation_cmd=="compute_split_date":
        ret_timestamp = compute.compute_cutoff_date(data, interact_type, traindata_fraction)
        print ret_timestamp
        print datetime.datetime.fromtimestamp(ret_timestamp*86400).strftime("%Y-%m-%d")
    """
Example no. 6
def instantiate_networkdata_class(dataset_domain, dataset_path, impl_type,
                                  max_core_nodes, cutoff_rating, store_dataset,
                                  interact_type_val,
                                  min_interacts_beforeaftersplit_per_user):
    data = None
    #h = hpy()
    #h.setref()
    if dataset_domain == "twitter":
        data = HashtagDataPreparser(dataset_path, impl_type)
    elif dataset_domain == "lastfm":
        data = LastfmDataPreparserCSV(dataset_path,
                                      impl_type,
                                      cutoff_rating,
                                      max_core_nodes,
                                      store_dataset,
                                      use_artists=False)
    elif dataset_domain == "lastfm_simple":
        data = LastfmDataPreparserSimple(
            dataset_path,
            impl_type,
            cutoff_rating,
            max_core_nodes,
            store_dataset,
            use_artists=False,
            interact_type_val=interact_type_val,
            min_interactions_per_user=min_interacts_beforeaftersplit_per_user *
            2)
    elif dataset_domain == "lastfm_lovelisten":
        data = LastfmDataPreparserLovelisten(
            dataset_path,
            impl_type,
            cutoff_rating,
            max_core_nodes,
            store_dataset,
            use_artists=False,
            interact_type_val=interact_type_val,
            min_interactions_per_user=min_interacts_beforeaftersplit_per_user *
            2)
    elif dataset_domain == "goodreads":
        data = GoodreadsDataPreparser(
            dataset_path,
            impl_type,
            cutoff_rating,
            max_core_nodes,
            store_dataset,
            min_interactions_per_user=min_interacts_beforeaftersplit_per_user *
            2)
    elif dataset_domain == "flixster":
        data = FlixsterDataPreparser(
            dataset_path,
            impl_type,
            cutoff_rating,
            max_core_nodes,
            store_dataset,
            min_interactions_per_user=min_interacts_beforeaftersplit_per_user *
            2)
    elif dataset_domain == "flickr":
        data = FlickrDataPreparser(
            dataset_path,
            impl_type,
            cutoff_rating,
            max_core_nodes,
            store_dataset,
            min_interactions_per_user=min_interacts_beforeaftersplit_per_user *
            2)

    try:
        data.get_all_data()
        BasicNetworkAnalyzer(data).show_basic_stats()
    except:
        raise
    return data
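
instantiate_networkdata_class selects a preparser class per dataset_domain through a chain of elif branches. A table-driven sketch of the same dispatch idea follows, using hypothetical stub classes so the snippet runs on its own; the real preparsers take the additional arguments shown above.

class StubLastfmPreparser(object):
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path

class StubGoodreadsPreparser(object):
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path

PREPARSER_BY_DOMAIN = {
    "lastfm": StubLastfmPreparser,
    "goodreads": StubGoodreadsPreparser,
}

def make_preparser(dataset_domain, dataset_path):
    # unknown domains fail loudly instead of leaving data as None
    try:
        cls = PREPARSER_BY_DOMAIN[dataset_domain]
    except KeyError:
        raise ValueError("unknown dataset_domain: %r" % dataset_domain)
    return cls(dataset_path)

print(type(make_preparser("lastfm", "/tmp/lastfm")).__name__)   # StubLastfmPreparser
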
def compute_susceptibility_randomselect(
        netdata, nodes_list, interact_type, cutoff_rating, control_divider,
        min_interactions_per_user, time_diff, time_scale, max_tries,
        max_node_computes, max_interact_ratio_error, nonfr_match,
        allow_duplicates):
    # Find similarity on training set
    max_sim_ratio_error = 0.1
    triplet_nodes = []
    counter = 0
    failed_counter = 0
    eligible_nodes_counter = 0
    count_success = 0
    edges_counter = 0
    total_tries_counter = 0
    time_saved_counter = 0

    if max_tries is None:
        max_tries = netdata.get_total_num_nodes()
    randomized_node_ids = random.sample(
        xrange(1,
               netdata.get_total_num_nodes() + 1), max_tries)

    data_type = "compare_train"
    data_type_code = ord(data_type[0])
    #sim_dict = {}
    for node in nodes_list:
        nonfr_ids = {}
        sim_dict = {}
        num_node_interacts = node.get_num_interactions(
            interact_type)  # return all interactions, no check for duplicates
        #if not node.has_interactions(interact_type) or not node.has_friends():
        if node.length_train_ids < min_interactions_per_user or node.length_test_ids < min_interactions_per_user or not node.has_friends(
        ):
            #print "Node has no interactions. Skipping!"
            counter += 1
            continue
        eligible_nodes_counter += 1
        fnodes = netdata.get_friends_nodes(node)
        control_nonfr_nodes = []
        avg_fsim = 0
        avg_rsim = 0
        num_eligible_friends = 0
        selected_friends = []
        friend_ids = node.get_friend_ids()
        edges_counter += len(friend_ids)
        for fobj in fnodes:
            num_fobj_interacts = fobj.get_num_interactions(interact_type)
            if fobj.length_train_ids >= min_interactions_per_user and fobj.length_test_ids >= min_interactions_per_user:
                """
                fsim2 = node.compute_node_similarity(fobj, interact_type, 
                        cutoff_rating, data_type_code, 
                        min_interactions_per_user, time_diff=500000, time_scale=ord('w'))#time_diff=-1, time_scale=time_scale)
                """
                if (fobj.uid, node.uid) in sim_dict:
                    fsim = sim_dict[(fobj.uid, node.uid)]
                elif (node.uid, fobj.uid) in sim_dict:
                    fsim = sim_dict[(node.uid, fobj.uid)]
                else:
                    fsim = node.compute_node_similarity(
                        fobj,
                        interact_type,
                        cutoff_rating,
                        data_type_code,
                        min_interactions_per_user,
                        time_diff=-1,
                        time_scale=time_scale)
                    sim_dict[(fobj.uid, node.uid)] = fsim


                #if fsim is None:
                #    print "Error:fsim cannot be None"
                #print fsim
                found = False
                if fsim is not None and fsim != -1:
                    num_eligible_friends += 1
                    total_tries_counter += 1
                    tries = 0
                    if nonfr_match == "random":
                        randomized_node_ids = random.sample(
                            xrange(1,
                                   netdata.get_total_num_nodes() + 1),
                            max_tries)
                    elif nonfr_match == "kbest":
                        global_candidates = netdata.get_othernodes_iterable(
                            fobj, should_have_interactions=True)
                        globalk_neighbors = BasicNetworkAnalyzer.compute_knearest_neighbors(
                            fobj,
                            global_candidates,
                            interact_type,
                            1000,
                            data_type=data_type,
                            cutoff_rating=-1,
                            min_interactions_per_user=min_interactions_per_user,
                            time_diff=-1,
                            time_scale=ord('w'))
                        randomized_node_ids = [
                            heapq.heappop(globalk_neighbors)[1].uid
                            for h in xrange(len(globalk_neighbors))
                        ]
                        randomized_node_ids.reverse()
                    elif nonfr_match == "serial":
                        randomized_node_ids = range(1, max_tries + 1)
                    else:
                        print "Error in parameter"
                        sys.exit(1)
                    r_index = 0

                    while not found and r_index < max_tries and r_index < len(
                            randomized_node_ids):
                        rand_node_id = randomized_node_ids[r_index]
                        r_index += 1
                        if rand_node_id in nonfr_ids:
                            continue
                        rand_node = netdata.nodes[rand_node_id]
                        if rand_node.length_train_ids >= min_interactions_per_user and rand_node.length_test_ids >= min_interactions_per_user:
                            ratio_train = abs(rand_node.length_train_ids -
                                              fobj.length_train_ids) / float(
                                                  fobj.length_train_ids)
                            if ratio_train <= max_interact_ratio_error:
                                if rand_node.uid not in friend_ids and rand_node.uid != node.uid:
                                    if (rand_node.uid, node.uid) in sim_dict:
                                        rsim = sim_dict[(rand_node.uid,
                                                         node.uid)]
                                        time_saved_counter += 1
                                    elif (node.uid, rand_node.uid) in sim_dict:
                                        rsim = sim_dict[(node.uid,
                                                         rand_node.uid)]
                                        time_saved_counter += 1
                                    else:
                                        rsim = node.compute_node_similarity(
                                            rand_node,
                                            interact_type,
                                            cutoff_rating,
                                            data_type_code,
                                            min_interactions_per_user,
                                            time_diff=-1,
                                            time_scale=time_scale)
                                        sim_dict[(rand_node.uid,
                                                  node.uid)] = rsim
                                        """
                                        rsim2 = node.compute_node_similarity(rand_node, interact_type, 
                                                        cutoff_rating, data_type_code, min_interactions_per_user, 
                                                        time_diff=500000, time_scale=ord('w'))#time_diff=-1, time_scale=time_scale)
                                                        #time_diff=-1, time_scale=time_scale)
                                        """
                                    num_rnode_interacts = rand_node.get_num_interactions(
                                        interact_type)
                                    if rsim is not None and rsim != -1:
                                        sim_diff = abs(rsim - fsim)
                                        if (
                                                fsim == 0
                                                and sim_diff <= 0.00001
                                        ) or (
                                                fsim > 0 and sim_diff / fsim <=
                                                max_sim_ratio_error
                                        ):  # and (fsim2 >0 and abs(rsim2-fsim2)/fsim2<=max_sim_ratio_error)):
                                            """
                                            fr_nonfr_sim = fobj.compute_node_similarity(rand_node, interact_type, 
                                                        cutoff_rating, data_type_code, min_interactions_per_user, 
                                                        time_diff=-1, time_scale=time_scale)
                                            print fr_nonfr_sim, node.length_train_ids, fobj.length_train_ids, rand_node.length_train_ids, fsim, rsim, r_index, max_tries
                                            if fr_nonfr_sim > 2*fsim:
                                            """
                                            if True:
                                                found = True
                                                avg_fsim += fsim
                                                avg_rsim += rsim
                                                nonfr_ids[rand_node_id] = True
                                                control_nonfr_nodes.append(
                                                    rand_node)
                                                selected_friends.append(fobj)
                        tries += 1
                    if not found:
                        #print "Could not get random non-friend with sim", fsim, "in %d tries" %tries
                        failed_counter += 1
        #print "SEE:", len(control_nonfr_nodes), num_eligible_friends
        if num_eligible_friends > 0 and len(
                control_nonfr_nodes) >= 1 * num_eligible_friends:
            avg_fsim = avg_fsim / float(len(control_nonfr_nodes))
            avg_rsim = avg_rsim / float(len(control_nonfr_nodes))
            #print num_eligible_friends, len(selected_friends)
            if len(selected_friends) != len(control_nonfr_nodes):
                print "ALERT: Something is wrong here!!"
                sys.exit(2)
            if len(control_nonfr_nodes) != num_eligible_friends:
                print "WARN: Cannot match all eligible friends", num_eligible_friends, len(
                    control_nonfr_nodes)
            #print node.uid, [fr.uid for fr in selected_friends]
            triplet_nodes.append((node, selected_friends, control_nonfr_nodes,
                                  0, 0, 0, avg_fsim, avg_rsim))
            count_success += 1
        if counter % 10 == 0:
            print "Done counter", counter
        if max_node_computes is not None:
            if counter > max_node_computes:
                print counter, max_node_computes
                break
        counter += 1
    print "\n--Number of nodes assigned to me(with interactions and friends):", len(
        nodes_list)
    print "--Eligible nodes (with interactions > %d): " % min_interactions_per_user, eligible_nodes_counter
    print "--Total Edges from eligible nodes:", edges_counter
    #print "--Eligible friend-edges (with friend hving interactions >%d): " %min_interactions_per_user, eligible_edges_counter
    print "--Number of tries (and successful caches) to find random non-friend:", total_tries_counter, time_saved_counter
    print "--Number of  successful nodes (can find rnodes):", count_success
    print "--Successful triplets:", len(triplet_nodes)

    # Now compare influencer effect on test set
    data_type = "influence_effect"
    data_type_code = ord(data_type[0])
    influence_arr = compare_susceptibility_effect(
        triplet_nodes, interact_type, cutoff_rating, min_interactions_per_user,
        time_diff, time_scale, data_type_code, allow_duplicates)
    return influence_arr
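
compute_susceptibility_randomselect memoizes pairwise similarities in sim_dict and looks each pair up under both key orders, (a, b) and (b, a). A small self-contained sketch of that caching idea using one canonical key per unordered pair; compute_sim is a stand-in for the expensive node-similarity call.

def cached_similarity(sim_cache, uid_a, uid_b, compute_sim):
    # one canonical key per unordered pair, so (a, b) and (b, a) share an entry
    key = (uid_a, uid_b) if uid_a <= uid_b else (uid_b, uid_a)
    if key not in sim_cache:
        sim_cache[key] = compute_sim(uid_a, uid_b)
    return sim_cache[key]

calls = []
def fake_sim(a, b):
    calls.append((a, b))
    return 0.5

cache = {}
cached_similarity(cache, 3, 7, fake_sim)
cached_similarity(cache, 7, 3, fake_sim)   # answered from the cache
print(len(calls))                          # 1 -> the similarity was computed once
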
Example no. 8
def run_computation(data, computation_cmd, outf, interact_type,
                    create_fake_prefs, allow_duplicates, split_date_str,
                    dataset_domain, dataset_path,
                    min_interacts_beforeaftersplit_per_user,
                    max_interact_ratio_error, max_sim_ratio_error,
                    min_friends_match_ratio, traindata_fraction, M):
    net_analyzer = BasicNetworkAnalyzer(data)
    interaction_types = data.interact_types_dict
    filename_prefix = computation_cmd if computation_cmd is not None else ""

    if computation_cmd == "basic_stats" or computation_cmd is None:
        net_analyzer.show_basic_stats()
        ## use below if you want to write a new dataset (e.g. after filtering)
        data.store_ego_dataset(
            "/home/amit/datasets/social_activity_data/lastfm_filtered_listen/",
            write_maps=False)
        #data.compute_allpairs_sim(interact_type, data_type=ord("a"))

    elif computation_cmd == "random_similarity":
        for type_name, type_index in interaction_types.iteritems():
            circlesims, globalsims = net_analyzer.compare_circle_global_similarity(
                type_index, num_random_trials=5, cutoff_rating=cutoff_rating)
            #plotter.plotLinesYY(circlesims, globalsims, "Friends", "Global")
            outf.write("User_id\tcircle_sim\tnonfriend_sim\n")
            outf.write(type_name + '\n')
            for ind in range(len(circlesims)):
                outf.write("%s\t%f\t%f\n" %
                           (circlesims[ind][0], circlesims[ind][1],
                            globalsims[ind][1]))
            print "\n", type_name, ":"
            print "Circle Average", sum([v2 for v1, v2 in circlesims]) / float(
                len(circlesims))
            print "Global Average", sum([v2 for v1, v2 in globalsims]) / float(
                len(globalsims))

    elif computation_cmd == "knn_similarity":
        #Compute K-nearest similarity
        KLIMITS = [10]
        outf.write("User_id\tk\tcircle_sim\tnonfriend_sim\n")

        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                plot_circle, plot_external = net_analyzer.compare_circle_global_knnsimilarity(
                    type_index, klim=curr_lim, cutoff_rating=cutoff_rating)
                compare_sims(plot_circle, plot_external)
                outf.write(type_name + '\n')
                for ind in range(len(plot_circle)):
                    outf.write("%s\t%d\t%f\t%f\n" %
                               (plot_circle[ind][0], curr_lim,
                                plot_circle[ind][1], plot_external[ind][1]))
                #plotter.plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print type_name, "K", curr_lim
                print "Circle Average", utils.mean_sd(
                    [v2 for v1, v2 in plot_circle]), len(plot_circle)
                print "Global Average", utils.mean_sd(
                    [v2 for v1, v2 in plot_external]), len(plot_external)

    elif computation_cmd == "knn_recommender":
        #Compute K-nearest recommender
        KLIMITS = [10]
        rec_analyzer = RecommenderAnalyzer(data,
                                           max_recs_shown=10,
                                           traintest_split=0.7,
                                           cutoff_rating=cutoff_rating)
        outf.write("User_id\tk\trun_index\tcircle_ndcg\tnonfriend_ndcg\n")
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                local_avg = []
                global_avg = []
                Ntotal = 10
                for i in range(
                        Ntotal):  # randomize because of training-test split.
                    plot_circle, plot_external = rec_analyzer.compare_knearest_recommenders(
                        type_index, klim=curr_lim, num_processes=2)
                    compare_sims(plot_circle, plot_external)
                    outf.write(type_name + "\n")
                    for ind in range(len(plot_circle)):
                        outf.write(
                            "%s\t%d\t%d\t%f\t%f\n" %
                            (plot_circle[ind][0], curr_lim, i,
                             plot_circle[ind][1], plot_external[ind][1]))
                    print "\n", type_name, "K", curr_lim

                    #print plot_circle, plot_external
                    curr_avg_local = utils.mean_sd(
                        [v2 for v1, v2 in plot_circle])
                    curr_avg_global = utils.mean_sd(
                        [v2 for v1, v2 in plot_external])
                    print "Circle Average", curr_avg_local
                    print "Global Average", curr_avg_global
                    local_avg.append(curr_avg_local[0])
                    global_avg.append(curr_avg_global[0])
                    #plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print "Local", sum(local_avg) / float(Ntotal)
                print "Global", sum(global_avg) / float(Ntotal)
    elif computation_cmd == "circle_coverage":
        lim_friends = [(5, 10), (10, 20), (20, 50), (50, 100)]
        for fr_limit in lim_friends:
            locality_analyzer = LocalityAnalyzer(data)
            coverage_list = locality_analyzer.compare_circle_item_coverages(
                0, fr_limit[0], fr_limit[1])
            plotter.plotLineY(
                sorted(coverage_list), "User",
                "Fraction of Items Covered with %d-%d friends" %
                (fr_limit[0], fr_limit[1]))
            print utils.mean_sd(coverage_list)
    elif computation_cmd == "items_edge_coverage":
        locality_analyzer = LocalityAnalyzer(data)
        items_cov_list, items_popularity, cov_ratio_list = locality_analyzer.compare_items_edge_coverage(
            1, minimum_interactions=1)
        print utils.mean_sd(items_cov_list)
        print utils.mean_sd(items_popularity)
        #plotter.plotHist(sorted([val for val in cov_ratio_list if val<=1]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #plotter.plotHist(sorted(items_popularity), "Item", "total popularity")
        plotter.plotCumulativePopularity(
            items_popularity,
            labelx="Item percentile",
            labely="Cum. percent of number of likes")
    elif computation_cmd == "network_draw":
        net_visualizor = NetworkVisualizor(data)
        net_visualizor.draw_network()
    elif computation_cmd == "network_item_adopt":
        net_visualizor = NetworkVisualizor(data)
        pprint(net_visualizor.plot_item_adoption(1669118))
    elif computation_cmd == "node_details":
        for node_id in open('user_ids'):
            if node_id.strip('\n') != "User_id":
                net_analyzer.get_node_details(int(node_id.strip('\n')))
    elif computation_cmd == "store_dataset":
        user_interacts = net_analyzer.get_user_interacts(1, cutoff_rating)
        f = open(outf_path + 'user_interacts_' + dataset_domain + '.tsv', 'w')
        f.write("user_id\titem_id\ttimestamp\n")
        for user_id, item_id, timestamp in user_interacts:
            f.write("%s\t%s\t%s\n" % (user_id, item_id, timestamp))
        f.close()

        item_pop = net_analyzer.get_items_popularity(1, cutoff_rating)
        f = open(outf_path + 'items_' + dataset_domain + '.tsv', 'w')
        f.write("item_id\tpopularity\n")
        for item_id, pop in item_pop.iteritems():
            f.write("%s\t%s\n" % (item_id, pop))
        f.close()

        user_friends = net_analyzer.get_user_friends()
        f = open('user_friends_' + dataset_domain + '.tsv', 'w')
        f.write("user_id\tfriend_id\n")
        for user_id, friend_id in user_friends:
            f.write("%s\t%s\n" % (user_id, friend_id))
        f.close()
        print "Successfully stored tsv dataset"
    elif computation_cmd == "compare_interact_types":
        num_interacts_dict = net_analyzer.compare_interaction_types()
        interact_types = num_interacts_dict.keys()
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]],
                            num_interacts_dict[interact_types[1]],
                            interact_types[0],
                            interact_types[1],
                            display=True,
                            logyscale=True)

        plotter.plotLinesYY(num_interacts_dict[interact_types[1]],
                            num_interacts_dict[interact_types[2]],
                            interact_types[1],
                            interact_types[2],
                            display=True,
                            logyscale=True)

        plotter.plotLinesYY(num_interacts_dict[interact_types[0]],
                            num_interacts_dict[interact_types[2]],
                            interact_types[0],
                            interact_types[2],
                            display=True,
                            logyscale=True)
    elif computation_cmd == "influence_test":
        #   ta = TemporalAnalyzer(data)
        #interact_type = data.interact_types_dict["listen"]
        # time_scale can be 'w':wallclock_time or 'o':ordinal_time
        split_date_str = "2008/01/01"
        t_window = -1
        t_scale = ord('w')
        max_tries_val = 10000
        max_node_computes_val = 100
        max_interact_ratio_error = 0.1
        klim_val = 5
        split_timestamp = int(
            time.mktime(
                datetime.datetime.strptime(split_date_str,
                                           "%Y/%m/%d").timetuple()))
        # create training/test sets that will be used by fake generation
        data.create_training_test_bytime(interact_type, split_timestamp)
        if create_fake_prefs is not None:
            print data.get_nodes_list()[1].get_interactions(interact_type,
                                                            cutoff_rating=-1)
            fake_data.generate_fake_preferences(
                data,
                interact_type,
                split_timestamp,
                min_interactions_beforeaftersplit_per_user=
                min_interacts_beforeaftersplit_per_user,
                time_window=t_window,
                time_scale=t_scale,
                method=create_fake_prefs)

            #fake_data.generate_random_preferences(data, interact_type, split_timestamp)
            print data.get_nodes_list()[1].get_interactions(interact_type,
                                                            cutoff_rating=-1)
        # Need to generate again because fake data changes test data
        data.create_training_test_bytime(interact_type, split_timestamp)

        la = LocalityAnalyzer(data)
        inf_tuple = compute.test_influence(
            la,
            interact_type=interact_type,
            time_diff=t_window,
            time_scale=ord('w'),
            split_timestamp=split_timestamp,
            #time_diff=100000, split_date_str="1970/06/23",
            control_divider=0.01,
            min_interactions_beforeaftersplit_per_user=
            min_interacts_beforeaftersplit_per_user,
            max_tries=max_tries_val,
            max_node_computes=max_node_computes_val,
            num_processes=4,
            max_interact_ratio_error=max_interact_ratio_error,
            klim=klim_val,
            method="influence")
        print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
        num_vals = len(inf_tuple[0])
        f = open("influence_test", "w")
        for i in range(num_vals):
            f.write("%f\t%f\t%f\t%f\n" % (inf_tuple[0][i], inf_tuple[1][i],
                                          inf_tuple[2][i], inf_tuple[3][i]))
        f.close()

    elif computation_cmd == "suscept_test":
        use_artists = "songs" if "songs" in dataset_path else "artists"
        interact_type_str = "listen" if interact_type == 0 else "love"
        #M = [50]#,20]#,30,40,50]
        t_scale = ord('o')  # ordinal scale, this is the default used in paper.
        NUM_NODES_TO_COMPUTE = 4000000  # maximum number of nodes to compute
        num_threads = 4  # the number of threads to spawn
        max_tries_val = None  #30000 # should we stop after max_tries?
        max_node_computes_val = NUM_NODES_TO_COMPUTE / num_threads  # number of nodes to compute in each thread
        #max_interact_ratio_error =0.2 # these are errors (defaults are 0.1,0.1)
        #max_sim_ratio_error = 0.2
        #min_friends_match_ratio = 0.5 # important to be 1 for simulation--because e.g. in influence, we use all of a person's friends to compute their next like
        klim_val = None  # not used for influence test
        nonfr_match = "random"  #random, serial, kbest. Default is random.
        num_loop = 1  # number of times we calculate this. For averaging results over multiple runs.
        f = open(
            "suscept_test_results/" + dataset_domain +
            dataset_path.split("/")[-2] + interact_type_str +
            strftime("%Y-%m-%d_%H:%M:%S") + '.dat', 'w')
        f.write(
            "# use_artists=%r\tallow_duplicates=%r\tmax_node_computes_val=%d\tcreate_fake_prefs=%r\tnum_loop=%d\n"
            % (use_artists, allow_duplicates, max_node_computes_val,
               create_fake_prefs, num_loop))
        f.write(
            "# split_train_test_date=%s\ttime_scale=%d\tmin_interactions_beforeaftersplit_per_user=%d\tnum_threads=%d\n"
            % (split_date_str, t_scale,
               min_interacts_beforeaftersplit_per_user, num_threads))
        f.write(
            "# max_interact_ratio_error=%f\tmax_sim_ratio_error=%f\tmin_friends_match_ratio=%f\n"
            % (max_interact_ratio_error, max_sim_ratio_error,
               min_friends_match_ratio))
        for t_window in M:
            for h in range(num_loop):
                f.write(
                    "\n\n################### ALERTINFO: STARTING ITERATION %d  with M=%d\n"
                    % (h, t_window))
                if split_date_str == "test": split_timestamp = 2000
                else:
                    split_timestamp = int(
                        time.mktime(
                            datetime.datetime.strptime(
                                split_date_str, "%Y/%m/%d").timetuple()))
                #split_timestamp=25000000
                if create_fake_prefs is not None:
                    data.create_training_test_bytime(interact_type,
                                                     split_timestamp)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                    fake_data.generate_fake_preferences(
                        data,
                        interact_type,
                        split_timestamp,
                        min_interactions_beforeaftersplit_per_user=
                        min_interacts_beforeaftersplit_per_user,
                        time_window=t_window,
                        time_scale=t_scale,
                        method=create_fake_prefs)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                # Need to generate again because fake data changes test data
                data.create_training_test_bytime(
                    interact_type,
                    split_timestamp,
                    min_interactions_beforeaftersplit_per_user=
                    min_interacts_beforeaftersplit_per_user)
                la = LocalityAnalyzer(data)
                inf_tuple = compute.test_influence(
                    la,
                    interact_type=interact_type,
                    time_diff=t_window,
                    time_scale=t_scale,
                    split_timestamp=split_timestamp,
                    #time_diff=100000, split_date_str="1970/06/23",
                    control_divider=0.01,  # not used anymore
                    min_interactions_beforeaftersplit_per_user=
                    min_interacts_beforeaftersplit_per_user,
                    max_tries=max_tries_val,
                    max_node_computes=max_node_computes_val,
                    num_threads=num_threads,
                    max_interact_ratio_error=max_interact_ratio_error,
                    max_sim_ratio_error=max_sim_ratio_error,
                    min_friends_match_ratio=min_friends_match_ratio,
                    klim=klim_val,
                    nonfr_match=nonfr_match,
                    method="suscept",
                    allow_duplicates=allow_duplicates)
                print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
                num_vals = len(inf_tuple[0])
                f.write(
                    "TestSetSize\tFrSimilarity\tNonFrSimilarity\tFrOverlap\tNonFrOverlap\tRandom_run_no\tM\n"
                )
                for i in range(num_vals):
                    f.write("%d\t%f\t%f\t%f\t%f\t%d\t%d\n" %
                            (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i],
                             inf_tuple[3][i], inf_tuple[4][i], h, t_window))
        f.close()
    elif computation_cmd == "gen_adopt_data":
        t_window = 100
        t_scale = ord('o')
        if split_date_str == "test": split_timestamp = 2000
        else:
            split_timestamp = int(
                time.mktime(
                    datetime.datetime.strptime(split_date_str,
                                               "%Y/%m/%d").timetuple()))
        if create_fake_prefs is not None:
            data.create_training_test_bytime(interact_type, split_timestamp)
            #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(
                data,
                interact_type,
                split_timestamp,
                min_interactions_beforeaftersplit_per_user=
                min_interacts_beforeaftersplit_per_user,
                time_window=t_window,
                time_scale=t_scale,
                method=create_fake_prefs)

        data.create_training_test_bytime(interact_type, split_timestamp)
        gen_adopt.generate_adoption_data(
            data,
            interact_type,
            split_timestamp,
            min_interactions_beforeaftersplit_per_user=
            min_interacts_beforeaftersplit_per_user,
            time_window=t_window,
            time_scale=t_scale)
    elif computation_cmd == "compute_split_date":
        ret_timestamp = compute.compute_cutoff_date(data, interact_type,
                                                    traindata_fraction)
        print ret_timestamp
        print datetime.datetime.fromtimestamp(ret_timestamp *
                                              86400).strftime("%Y-%m-%d")
    """
 def __init__(self, netdata):
     BasicNetworkAnalyzer.__init__(self, netdata)
     self.interactions_stream = []
     self.items_pop = defaultdict(int)
     self.num_users_with_interactions = 0
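
The __init__ above seeds items_pop as a defaultdict(int); a minimal illustration of how such a counter accumulates item popularity without explicit key initialization:

from collections import defaultdict

items_pop = defaultdict(int)
for item_id in ["song_a", "song_b", "song_a", "song_a"]:
    items_pop[item_id] += 1        # missing keys start at 0, so no KeyError
print(items_pop["song_a"])         # 3
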
def compute_susceptibility_randomselect(netdata, nodes_list, interact_type, 
                                            cutoff_rating, control_divider, min_interactions_per_user, 
                                            time_diff, time_scale, max_tries, max_node_computes,
                                            max_interact_ratio_error, nonfr_match,
                                            allow_duplicates):   
    # Find similarity on training set
    max_sim_ratio_error = 0.1
    triplet_nodes = []
    counter = 0
    failed_counter = 0
    eligible_nodes_counter = 0
    count_success = 0
    edges_counter = 0
    total_tries_counter = 0
    time_saved_counter = 0
   
    if max_tries is None:
        max_tries = netdata.get_total_num_nodes()
    randomized_node_ids = random.sample(xrange(1, netdata.get_total_num_nodes()+1), max_tries)
    
    data_type="compare_train"
    data_type_code=ord(data_type[0]) 
    #sim_dict = {}
    for node in nodes_list:
        nonfr_ids = {}
        sim_dict = {}
        num_node_interacts = node.get_num_interactions(interact_type) # return all interactions, no check for duplicates
        #if not node.has_interactions(interact_type) or not node.has_friends():
        if node.length_train_ids < min_interactions_per_user or node.length_test_ids <min_interactions_per_user or not node.has_friends():
            #print "Node has no interactions. Skipping!"
            counter +=1
            continue
        eligible_nodes_counter += 1
        fnodes = netdata.get_friends_nodes(node)
        control_nonfr_nodes = []
        avg_fsim = 0
        avg_rsim = 0
        num_eligible_friends = 0
        selected_friends = []
        friend_ids = node.get_friend_ids()
        edges_counter += len(friend_ids)
        for fobj in fnodes:
            num_fobj_interacts = fobj.get_num_interactions(interact_type)
            if fobj.length_train_ids >=min_interactions_per_user and fobj.length_test_ids >=min_interactions_per_user:
                """
                fsim2 = node.compute_node_similarity(fobj, interact_type, 
                        cutoff_rating, data_type_code, 
                        min_interactions_per_user, time_diff=500000, time_scale=ord('w'))#time_diff=-1, time_scale=time_scale)
                """
                if (fobj.uid,node.uid) in sim_dict:
                    fsim = sim_dict[(fobj.uid,node.uid)]
                elif (node.uid,fobj.uid) in sim_dict:
                    fsim = sim_dict[(node.uid,fobj.uid)]
                else:
                    fsim = node.compute_node_similarity(fobj, interact_type, 
                            cutoff_rating, data_type_code, 
                            min_interactions_per_user, time_diff=-1, time_scale=time_scale)
                    sim_dict[(fobj.uid, node.uid)] = fsim
                #if fsim is None:
                #    print "Error:fsim cannot be None"
                #print fsim
                found = False
                if fsim is not None and fsim!=-1:
                    num_eligible_friends += 1
                    total_tries_counter += 1
                    tries=0
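                    # build the pool of candidate non-friend ids according to the matching strategy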
                    if nonfr_match=="random":
                        randomized_node_ids = random.sample(xrange(1, netdata.get_total_num_nodes()+1), max_tries)
                    elif nonfr_match=="kbest":
                        global_candidates = netdata.get_othernodes_iterable(fobj, should_have_interactions=True)
                        globalk_neighbors = BasicNetworkAnalyzer.compute_knearest_neighbors(fobj, global_candidates, 
                                                                interact_type,1000, data_type=data_type, 
                                                                cutoff_rating = -1,
                                                                min_interactions_per_user=min_interactions_per_user,
                                                                time_diff=-1, time_scale=ord('w'))
                        randomized_node_ids = [heapq.heappop(globalk_neighbors)[1].uid for h in xrange(len(globalk_neighbors))]
                        randomized_node_ids.reverse()
                    elif nonfr_match=="serial":
                        randomized_node_ids = range(1, max_tries+1)
                    else:
                        print "Error in parameter"; sys.exit(1)
                    r_index = 0
                   
                    while not found and r_index < max_tries and r_index<len(randomized_node_ids):
                        rand_node_id = randomized_node_ids[r_index]
                        r_index += 1
                        if rand_node_id in nonfr_ids:
                            continue
                        rand_node = netdata.nodes[rand_node_id]
                        if rand_node.length_train_ids >=min_interactions_per_user and rand_node.length_test_ids >=min_interactions_per_user:
                            ratio_train = abs(rand_node.length_train_ids-fobj.length_train_ids)/float(fobj.length_train_ids)
                            if ratio_train <= max_interact_ratio_error: 
                                if rand_node.uid not in friend_ids and rand_node.uid!=node.uid:
                                    if (rand_node.uid,node.uid) in sim_dict: 
                                        rsim = sim_dict[(rand_node.uid,node.uid)]
                                        time_saved_counter += 1
                                    elif (node.uid,rand_node.uid) in sim_dict:
                                        rsim = sim_dict[(node.uid,rand_node.uid)]
                                        time_saved_counter += 1
                                    else:
                                        rsim = node.compute_node_similarity(rand_node, interact_type, 
                                                        cutoff_rating, data_type_code, min_interactions_per_user, 
                                                        time_diff=-1, time_scale=time_scale)
                                        sim_dict[(rand_node.uid, node.uid)] = rsim
                                        """
                                        rsim2 = node.compute_node_similarity(rand_node, interact_type, 
                                                        cutoff_rating, data_type_code, min_interactions_per_user, 
                                                        time_diff=500000, time_scale=ord('w'))#time_diff=-1, time_scale=time_scale)
                                                        #time_diff=-1, time_scale=time_scale)
                                        """
                                    num_rnode_interacts = rand_node.get_num_interactions(interact_type)
                                    if rsim is not None and rsim!=-1:
                                        sim_diff = abs(rsim-fsim)
                                        if (fsim==0 and sim_diff<=0.00001) or (fsim>0 and
                                                sim_diff/fsim <= max_sim_ratio_error):# and (fsim2 >0 and abs(rsim2-fsim2)/fsim2<=max_sim_ratio_error)):
                                            """
                                            fr_nonfr_sim = fobj.compute_node_similarity(rand_node, interact_type, 
                                                        cutoff_rating, data_type_code, min_interactions_per_user, 
                                                        time_diff=-1, time_scale=time_scale)
                                            print fr_nonfr_sim, node.length_train_ids, fobj.length_train_ids, rand_node.length_train_ids, fsim, rsim, r_index, max_tries
                                            if fr_nonfr_sim > 2*fsim:
                                            """
                                            if True:
                                                found = True
                                                avg_fsim += fsim
                                                avg_rsim += rsim
                                                nonfr_ids[rand_node_id] = True
                                                control_nonfr_nodes.append(rand_node)
                                                selected_friends.append(fobj)
                        tries += 1
                    if not found:
                        #print "Could not get random non-friend with sim", fsim, "in %d tries" %tries
                        failed_counter += 1
        #print "SEE:", len(control_nonfr_nodes), num_eligible_friends
        if num_eligible_friends >0 and len(control_nonfr_nodes) >= 1*num_eligible_friends:
            avg_fsim = avg_fsim/float(len(control_nonfr_nodes))
            avg_rsim = avg_rsim/float(len(control_nonfr_nodes))
            #print num_eligible_friends, len(selected_friends)
            if len(selected_friends) != len(control_nonfr_nodes):
                print "ALERT: Something is wrong here!!"; sys.exit(2)
            if len(control_nonfr_nodes) != num_eligible_friends:
                print "WARN: Cannot match all eligible friends", num_eligible_friends, len(control_nonfr_nodes)
            #print node.uid, [fr.uid for fr in selected_friends]
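            # triplet layout: (node, matched friends, matched non-friend controls,
            #                  three fixed zero fields, avg friend sim, avg non-friend sim)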
            triplet_nodes.append((node, selected_friends, control_nonfr_nodes, 
                                 0, 0, 0, avg_fsim, avg_rsim))
            count_success +=1
        if counter %10==0:
            print "Done counter", counter
        if max_node_computes is not None:
            if counter > max_node_computes:
                print counter, max_node_computes
                break
        counter += 1
    print "\n--Number of nodes assigned to me(with interactions and friends):", len(nodes_list)
    print "--Eligible nodes (with interactions > %d): " %min_interactions_per_user, eligible_nodes_counter
    print "--Total Edges from eligible nodes:", edges_counter
    #print "--Eligible friend-edges (with friend hving interactions >%d): " %min_interactions_per_user, eligible_edges_counter
    print "--Number of tries (and successful caches) to find random non-friend:", total_tries_counter, time_saved_counter
    print "--Number of  successful nodes (can find rnodes):", count_success
    print "--Successful triplets:", len(triplet_nodes) 


    # Now compare influencer effect on test set
    data_type="influence_effect"
    data_type_code=ord(data_type[0]) 
    influence_arr = compare_susceptibility_effect(triplet_nodes, interact_type, 
                                              cutoff_rating, min_interactions_per_user, 
                                              time_diff, time_scale, data_type_code,
                                              allow_duplicates)
    return influence_arr
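A minimal, hypothetical call sketch for compute_susceptibility_randomselect; the argument values are illustrative only, and data, nodes, and interact_type are assumed to have been prepared by the caller elsewhere:

influence = compute_susceptibility_randomselect(
    data, nodes, interact_type, cutoff_rating=-1, control_divider=None,
    min_interactions_per_user=5, time_diff=-1, time_scale=ord('w'),
    max_tries=None, max_node_computes=None, max_interact_ratio_error=0.1,
    nonfr_match="random", allow_duplicates=False)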
Esempio n. 12
import sys
import numpy as np
import operator
from network_analyzer_example import *
import compare_adopt_share
from compare_adopt_share import *
import socintpy.util.plotter as plotter
from socintpy.networkcompute.basic_network_analyzer import BasicNetworkAnalyzer

if __name__ == "__main__":
    if len(sys.argv) == 2:
        print sys.argv[1] # if "True", then use raw_data csvs and store as ego_nets
        # note: bool() of any non-empty string is True, so compare the flag explicitly
        data = get_data(sys.argv[1] == "True")
    else:
        data = get_data()
    net_analyzer = BasicNetworkAnalyzer(data)
    #net_analyzer.show_basic_stats()

    na = AdoptShareComparer(data)