import sys
from datetime import datetime

import numpy as np

# Note: gm is the project's helper module, and the names used in main() below
# (run_mode, algorithms_list, k_folds, run_entire_flow, ...) are module-level
# config/functions from the original source that is not shown in these snippets.


def compare_scores_histogram(run_mode,
                             algorithms_list=(),
                             evaluated_domain_list=None):
    removed_domains_f = '/home/michal/SALSA_files/tmp/remove_domains_from_results'
    for alg in algorithms_list:

        print '\n--- main: ' + alg
        sys.stdout.flush()
        if 'pagerank' in alg:
            scores_dict = gm.read_object_from_file(
                gm.get_general_file_path(run_mode,
                                         '_'.join([alg, 'a_dict_pickle']),
                                         evaluated_domain_list))
            gm.histogram_of_dict(scores_dict, fn=removed_domains_f, bins=150)
        else:
            a_scores_dict = gm.read_object_from_file(
                gm.get_general_file_path(run_mode,
                                         '_'.join([alg, 'a_dict_pickle']),
                                         evaluated_domain_list))
            print '--- main: authorities'
            sys.stdout.flush()
            gm.histogram_of_dict(a_scores_dict, fn=removed_domains_f, bins=150)
            print '\n--- main: combined'
            sys.stdout.flush()
    scores_dict = combine_scores(algorithms_list)
    gm.histogram_of_dict(scores_dict, fn=removed_domains_f, bins=150)

    return
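
# Usage sketch (hypothetical arguments; assumes the pickled score dicts
# already exist under /home/michal/SALSA_files/tmp for this run mode):
#
#   compare_scores_histogram('real_run',
#                            algorithms_list=['salsa', 'pagerank'],
#                            evaluated_domain_list=None)
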
def plotting_fine_tuning():
    aFile = '/home/michal/SALSA_files/tmp/real_run/salsa_a_dict_pickle'
    hFile = '/home/michal/SALSA_files/tmp/real_run/salsa_h_dict_pickle'
    a = gm.read_object_from_file(aFile)
    h = gm.read_object_from_file(hFile)
    gm.saveDict('/home/michal/SALSA_files/tmp/real_run/salsa_a_dict', a)
    gm.saveDict('/home/michal/SALSA_files/tmp/real_run/salsa_h_dict', h)
    print max(a.values())
    gm.histogram_of_dict(a)
    return
def generate_combined_scores(run_mode,
                             algorithms_list=(),
                             evaluated_domain_list=None):
    startTime = datetime.now()
    dicts_list = []
    for alg in algorithms_list:
        dicts_list.append(
            gm.read_object_from_file(
                gm.get_general_file_path(run_mode,
                                         '_'.join([alg, 'a_dict_pickle']),
                                         evaluated_domain_list)))

    # map each combine type to a callable (avoids eval() on code strings)
    combine_types = {'max': lambda: gm.create_max_dict_from_dicts(dicts_list),
                     'avg': lambda: gm.create_avg_dict_from_dicts(dicts_list),
                     'top_3_avg': lambda: gm.create_avg_dict_from_dicts(dicts_list, n=3),
                     'top_2_avg': lambda: gm.create_avg_dict_from_dicts(dicts_list, n=2)}
    for k, combine_fn in combine_types.items():
        out_file = gm.get_general_file_path(run_mode,
                                            k,
                                            evaluated_domain_list,
                                            dir='outputs')
        comb_score_dict = combine_fn()
        u_pct_dict, l_pct_dict = gm.get_percentiles(comb_score_dict)
        gm.write_union_of_dicts_ordered_by_value_to_file(
            comb_score_dict, [u_pct_dict, l_pct_dict], out_file)
    print '\n--- main: combined scores generation and evaluation took: ' + str(
        datetime.now() - startTime)
    sys.stdout.flush()
    return
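
# For reference, a minimal sketch of the elementwise-max combination that
# gm.create_max_dict_from_dicts presumably performs (an assumption: the real
# gm helper may handle missing keys or ties differently):
def _max_dict_from_dicts_sketch(dicts_list):
    combined = {}
    for d in dicts_list:
        for key, score in d.items():
            # keep the highest score seen for each key across all dicts
            if key not in combined or score > combined[key]:
                combined[key] = score
    return combined
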
def combine_scores(algorithms_list, fn=None):
    dicts_list = []
    for alg in algorithms_list:
        dicts_list.append(
            gm.read_object_from_file(''.join([
                '/home/michal/SALSA_files/tmp/real_run/', alg, '_a_dict_pickle'
            ])))
    #dict_f = '/home/michal/SALSA_files/tmp/real_run/combined.csv'
    d = gm.create_max_dict_from_dicts(dicts_list)  #, dict_f)

    u_pct_dict, l_pct_dict = gm.get_percentiles(d)
    #pct_dict_f = '/home/michal/SALSA_files/outputs/real_run/combined.csv'
    if fn:
        gm.write_union_of_dicts_ordered_by_value_to_file(
            d, [u_pct_dict, l_pct_dict], fn)  #pct_dict_f)
    return d
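
# Usage sketch (hypothetical): combine the authority-score pickles of two
# algorithms and write the result, ordered by score, to the (commented-out)
# output path from the original source:
#
#   d = combine_scores(['salsa', 'pagerank'],
#                      fn='/home/michal/SALSA_files/outputs/real_run/combined.csv')
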
def main():
    from sklearn import cross_validation   # renamed sklearn.model_selection in later sklearn versions
    import stats
    import os
    startTime = datetime.now() 

    domains_risk_dict_f = gm.get_general_file_path(run_mode, 'mal_d/domains_risk', dir='tmp')
    
    # If the domains-label file does not exist, do a 'full run' to create it
    # (needed for the stratified K-folds below)
    if not os.path.exists(domains_risk_dict_f):
        # run the entire flow with an empty evaluated-domains list; this creates
        # the labeled domains risk dict file (1=malicious, 0=otherwise)
        run_entire_flow(run_mode,algorithms_list,[],\
                        redirect_ref=redirect_ref,redirect_weight=redirect_weight,\
                        link_ref=link_ref,link_weight=link_weight,\
                        nstart_flag=nstart_flag,wo_users=wo_users)
        #folds_stats_list =  run_entire_flow(run_mode,algorithms_list,[],wo_users=True)
    src_mal_domains = gm.get_general_file_path(run_mode, 'mal_d/src_mal_domains', dir='tmp') 
    mal_list = np.array(gm.read_list_from_file(src_mal_domains))#[line.strip() for line in open(src_mal_domains,'r')])
    '''if run_mode == 'real_run':  '''
    #mal_domains_list = []
    tests_list = []
    if len(mal_list):    # if the src_mal_domains file is not empty
        #kf = cross_validation.KFold(len(mal_list), n_folds=k_folds, shuffle=True)
        # unzip the {domain: risk} dict into parallel (domains, labels) tuples
        uzip_d_risk = zip(*gm.read_object_from_file(domains_risk_dict_f).items())
        # stratify on the risk labels; cap n_folds at the number of positive (risky) labels
        kf = cross_validation.StratifiedKFold(list(uzip_d_risk[1]),
                                              n_folds=min(k_folds, sum(uzip_d_risk[1])))
        for train_index, test_index in kf:
            # test_dict is the test fold dict
            test = [np.asarray(uzip_d_risk[0])[test_index],np.asarray(uzip_d_risk[1])[test_index]]
            tests_list.append(test)
            #print test_dict; print 'XXXXX',len(tests_list),'\n',tests_list,'\n\n\n'
            #mal_domains_list.append(list(mal_list[test_index]))
       
    folds_stats_list =  run_entire_flow(run_mode,algorithms_list,tests_list,\
                                        redirect_ref=redirect_ref,redirect_weight=redirect_weight,\
                                        link_ref=link_ref,link_weight=link_weight,\
                                        nstart_flag=nstart_flag,wo_users=wo_users,\
                                        multiproc_flag=multiproc_flag)#,)
    #folds_stats_list =  run_entire_flow(run_mode,algorithms_list,mal_domains_list,redirect_ref=False)#,link_weight=1,wo_users=False)
    #folds_stats_list =  run_entire_flow(run_mode,algorithms_list,tests_list,wo_users=True)
    '''else:
        mal_domains_list = mal_list
        print mal_domains_list
        folds_stats_list =  run_entire_flow_iteration(run_mode,algorithms_list,mal_domains_list,redirect_ref=True,redirect_weight=0.5,link_ref=True,link_weight=0.2)#,wo_users=True)'''
    #run_scores_histogram(run_mode,algorithms_list)
    
    out_fn = gm.get_general_file_path(run_mode, 'eval_union_stats', dir='outputs')
    if len(folds_stats_list):   # non-empty means a K-fold cross-validation run took place (not just the baseline)
        stats.stats_union(folds_stats_list, out_fn, raw_flag=True)
    print 'EVALUATION MAIN: Total time: ', datetime.now() - startTime; sys.stdout.flush()

    return
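
# Toy illustration (hypothetical data) of the fold construction in main():
# unzip a {domain: risk} dict into parallel tuples and stratify on the labels.
# Note: in later sklearn versions StratifiedKFold lives in model_selection
# and no longer takes the labels at construction time.
#
#   d_risk = {'a.com': 0, 'b.com': 1, 'c.com': 0, 'd.com': 1}
#   domains, labels = zip(*d_risk.items())
#   kf = cross_validation.StratifiedKFold(list(labels), n_folds=2)
#   for train_index, test_index in kf:
#       test = [np.asarray(domains)[test_index], np.asarray(labels)[test_index]]
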
def create_white_list_for_post_filtering(self, fn):
    ''' Creates a file of white-list domains (if it does not already exist) for domains with
    high incoming traffic, where "high" means at least 10x the average number of users
    entering a domain over all its incoming edges. '''
    import os, math, tldextract
    if not os.path.exists(fn):
        b_domains_set = set()   # known problematic domains to exclude (a set, not a dict)
        '''b_domains_set = {'xhamster.com','adultism.com','dojki.com','mygames.com.ua','bigpoint.com','flashgames.ru',\
                         'a10.com','xnxx.com','nick.de','porngaytube.net','amazingwildtube.com','youporn.com',\
                         'redtube.com','besttubeclips.net','youjizz.com','jacquieetmichel.net','dudesnude.com',\
                         'sourceforge.net','kinox.to','hentai.ms','imagefap.com','movie2k.to','sexuria.com',\
                         'gallerysex.net','ashemaletube.com','xvideos.com','escort-ireland.com','tube8.com',\
                         'perveden.com','gidonline.ru','met-art.com','oddassy.com','cam4.com'}'''
        g_domains_set = {'google','facebook','ebay','twitter','yandex','paypal','citibank','bing','yahoo','youtube',\
                         'apple','tripadvisor'}
        whiteList_set = {'babylon.com','walla.co.il','flickr.com','netflix.com','firefox.com','conduit.com',\
                         'washingtonpost.com','endomondo.com','linkedin.com','avg.com','ask.com','reddit.com'}  # a set literal, not a dict

        added_whiteList = [d for d in self.G.nodes_iter() if tldextract.extract(d).domain in g_domains_set]
        whiteList_set.update(added_whiteList)   # insert the new domains from added_whiteList (list) into whiteList_set (set)
        del added_whiteList[:]  # clear added_whiteList

        g_traffic_dict = self.G.in_degree(weight=self.e_attr.good)
        #gm.write_dict_ordered_by_value_to_file(g_traffic_dict, '_'.join([fn,'DEBUG_goodTraffic']))
        avg = math.ceil(float(sum(g_traffic_dict.values())) / len(g_traffic_dict))
        trh = 10 * avg   # threshold: 10x the average good incoming traffic
        added_whiteList = [k for k, v in g_traffic_dict.items() if v >= trh]
        print 'create_white_list_for_post_filtering: avg=', avg, ' , trh=', trh, ' , added_whiteList before cleaning: len=', len(added_whiteList)
        # Remove known risky domains from the white list: drop domains flagged as risky
        # (from blacklists) or listed as problematic in b_domains_set
        added_whiteList[:] = [d for d in added_whiteList
                              if self.G.node[d][self.n_attr.risk] < gm.risk_threshold and d not in b_domains_set]
        print 'create_white_list_for_post_filtering: added_whiteList AFTER cleaning: len=', len(added_whiteList)
        whiteList_set.update(added_whiteList)   # insert the cleaned domains into whiteList_set

        # for DEBUG:
        self.export_traffic_info_DEBUG('_'.join([fn, 'DEBUG_traffic_info']))
        WL_dict = dict((k, v) for k, v in g_traffic_dict.items() if k in added_whiteList)
        gm.write_dict_ordered_by_value_to_file(WL_dict, '_'.join([fn, 'DEBUG']))  # mainly for debug
        # end DEBUG

        '''white_flag_dict = {d:0 for d in self.G.nodes_iter()}    # init dict of all nodes with value zero
        for d in whiteList_set: white_flag_dict[d] = 1          # flag the domains in whiteList_set with 1
        gm.write_object_to_file(white_flag_dict, fn)  # pickle the white list for later loading
        '''
    else:
        whiteList_set = gm.read_object_from_file(fn)

    #self.add_nodes_attr(self.n_attr.is_white, white_flag_dict)
    return whiteList_set
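
# Toy illustration of the weighted in-degree "good traffic" average used above
# (assumes networkx 1.x, where in_degree(weight=...) returns a plain dict;
# the edge attribute name 'good' stands in for self.e_attr.good):
#
#   import math
#   import networkx as nx
#   G = nx.DiGraph()
#   G.add_edge('u1', 'site.com', good=5)
#   G.add_edge('u2', 'site.com', good=3)
#   G.add_edge('u1', 'other.com', good=1)
#   traffic = G.in_degree(weight='good')  # e.g. {'site.com': 8, 'other.com': 1, 'u1': 0, 'u2': 0}
#   avg = math.ceil(float(sum(traffic.values())) / len(traffic))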