def plot_ROC(to_detect, to_learn, expected, noise, outfile, coefficient=None,
             radius=None):
    """Sweep the test diameter and append cross-validated TPR/FPR to a file.

    The four input files are read as ``CD.ObservedSites`` protos, the noise
    sites are removed from the detect and learn sets, and every remaining
    detect site is labelled 1 if its name is in the expected set, else 0.
    For each diameter in ``np.arange(radius[0], radius[1], 1)`` the test
    split of every stratified fold is scored with a fresh ``Estimator``.
    ``estimator.rate[0]`` is recorded as TPR and ``estimator.rate[1]`` as
    FPR; their means over the folds are appended to ``outfile``.

    Two lines are appended per diameter::

        <train_inconsistent>,<test_inconsistent>,<test_diameter>
        <mean tpr>,<mean fpr>

    Args:
        to_detect: path of the proto file holding the sites to detect.
        to_learn: path of the proto file holding the sites to learn from.
        expected: path of the proto file holding the expected sites.
        noise: path of the proto file holding the noise sites.
        outfile: path of the result file, opened in append mode.
        coefficient: indexable; ``[0]`` is used as ``train_inconsistent``
            and ``[1]`` as ``test_inconsistent``.  Required in practice
            despite the ``None`` default.
        radius: indexable; ``[0]`` and ``[1]`` are the start and the
            exclusive stop of the diameter sweep.  Required in practice
            despite the ``None`` default.

    Raises:
        KeyError: if a filtered detect site has no learn site of the same
            name.
        TypeError: if ``coefficient`` or ``radius`` is left as ``None``.
    """
    def _load_sites(path):
        # Parse one ObservedSites proto from ``path``.
        sites = CD.ObservedSites()
        read_proto_from_file(sites, path)
        return sites

    to_learn_sites = _load_sites(to_learn)
    expected_sites = _load_sites(expected)
    # Single-argument print calls behave the same on Python 2 and 3.
    print("number of expected_sites")
    print(len(expected_sites.site))
    to_detect_sites = _load_sites(to_detect)
    noise_sites = _load_sites(noise)

    filt_to_detect_sites = remove_noise(to_detect_sites, noise_sites)
    filt_to_learn_sites = remove_noise(to_learn_sites, noise_sites)
    expected_set = sites_name_set(expected_sites)

    # Stratified K-fold setup.
    kfold = 5
    total_site_name_list = [site.name for site in filt_to_detect_sites.site]
    to_detect_sites_map = dict(
        (site.name, site) for site in filt_to_detect_sites.site)
    to_learn_sites_map = dict(
        (site.name, site) for site in filt_to_learn_sites.site)

    # Align detect sites, learn sites and labels on the detect-site order so
    # that one fold index array selects matching rows from all three.
    to_detect_sites_list = np.array(
        [to_detect_sites_map[name] for name in total_site_name_list])
    to_learn_sites_list = np.array(
        [to_learn_sites_map[name] for name in total_site_name_list])
    Y = np.array(
        [1 if name in expected_set else 0 for name in total_site_name_list])
    skf = StratifiedKFold(Y, kfold)

    # Scan the parameters and output the FPR and TPR.
    params = {
        'train_inconsistent': coefficient[0],
        'test_inconsistent': coefficient[1],
    }
    # The context manager guarantees the result file is flushed and closed,
    # even if scoring raises part-way through the sweep.
    with open(outfile, 'a') as outf:
        for r in np.arange(radius[0], radius[1], 1):
            params['test_diameter'] = r
            # One slot per fold; sized from kfold rather than a literal.
            tpr = np.zeros(kfold, dtype=float)
            fpr = np.zeros(kfold, dtype=float)
            for fold, (train, test) in enumerate(skf):
                print("train size: {0}, test size: {1}".format(
                    len(train), len(test)))
                # Only the test split is scored; the parameters are fixed
                # by the sweep, so the train split is not needed here.
                estimator = Estimator(filt_to_detect_sites.config)
                estimator.score(to_learn_sites_list[test],
                                to_detect_sites_list[test],
                                Y[test], params)
                tpr[fold] = estimator.rate[0]
                fpr[fold] = estimator.rate[1]
            res_str = ",".join([
                str(params["train_inconsistent"]),
                str(params["test_inconsistent"]),
                str(params["test_diameter"]),
            ]) + "\n"
            res_str += ",".join([str(tpr.mean()), str(fpr.mean())]) + "\n"
            outf.write(res_str)
def integrated_plot_ROC(text_input, text_train, dom_input, dom_train,
                        expected, noise, outfile, c_text_train, c_text_test,
                        c_dom_train, c_dom_test, r_text_start, r_text_end,
                        r_dom_start, r_dom_end):
    """Sweep text and dom test diameters and append mean TPR/FPR to a file.

    The text and dom detect/learn files, the expected file and the noise
    file are read as ``CD.ObservedSites`` protos.  Noise sites are removed
    from the four detect/learn sets, and every remaining text detect site is
    labelled 1 if its name is in the expected set, else 0.  For every
    combination of text diameter in ``np.arange(r_text_start, r_text_end, 1)``
    and dom diameter in ``np.arange(r_dom_start, r_dom_end, 1)``, the test
    split of each stratified fold is scored first by a text ``Estimator``.
    It is then scored by a dom ``Estimator`` that is constructed with the
    text estimator's ``cloaking_sites``.  ``dom_estimator.rate[0]`` is
    recorded as TPR and ``dom_estimator.rate[1]`` as FPR.

    Results are appended to ``outfile + "_text_r_" + str(r_text_start)``,
    two lines per (text diameter, dom diameter) pair::

        <dom train_inconsistent>,<dom test_inconsistent>,<dom test_diameter>
        <mean tpr>,<mean fpr>

    Args:
        text_input: path of the proto file with the text sites to detect.
        text_train: path of the proto file with the text sites to learn from.
        dom_input: path of the proto file with the dom sites to detect.
        dom_train: path of the proto file with the dom sites to learn from.
        expected: path of the proto file holding the expected sites.
        noise: path of the proto file holding the noise sites.
        outfile: prefix of the result file path.
        c_text_train: ``train_inconsistent`` value for the text estimator.
        c_text_test: ``test_inconsistent`` value for the text estimator.
        c_dom_train: ``train_inconsistent`` value for the dom estimator.
        c_dom_test: ``test_inconsistent`` value for the dom estimator.
        r_text_start: first text test diameter.
        r_text_end: exclusive end of the text test diameter sweep.
        r_dom_start: first dom test diameter.
        r_dom_end: exclusive end of the dom test diameter sweep.

    Raises:
        KeyError: if a filtered text detect site has no same-named site in
            any of the other three filtered sets.
    """
    def _load_sites(path):
        # Parse one ObservedSites proto from ``path``.
        sites = CD.ObservedSites()
        read_proto_from_file(sites, path)
        return sites

    def _by_name(sites):
        # Map site name -> site; a later duplicate name wins.
        return dict((site.name, site) for site in sites.site)

    text_to_learn_sites = _load_sites(text_train)
    dom_to_learn_sites = _load_sites(dom_train)
    expected_sites = _load_sites(expected)
    # Single-argument print calls behave the same on Python 2 and 3.
    print("number of expected_sites")
    print(len(expected_sites.site))
    text_to_detect_sites = _load_sites(text_input)
    dom_to_detect_sites = _load_sites(dom_input)
    noise_sites = _load_sites(noise)

    filt_text_to_detect_sites = remove_noise(text_to_detect_sites, noise_sites)
    filt_text_to_learn_sites = remove_noise(text_to_learn_sites, noise_sites)
    filt_dom_to_detect_sites = remove_noise(dom_to_detect_sites, noise_sites)
    filt_dom_to_learn_sites = remove_noise(dom_to_learn_sites, noise_sites)
    expected_set = sites_name_set(expected_sites)

    # Stratified K-fold setup.  The text detect set defines the site order.
    kfold = 5
    total_site_name_list = [
        site.name for site in filt_text_to_detect_sites.site]
    text_to_detect_sites_map = _by_name(filt_text_to_detect_sites)
    text_to_learn_sites_map = _by_name(filt_text_to_learn_sites)
    dom_to_detect_sites_map = _by_name(filt_dom_to_detect_sites)
    dom_to_learn_sites_map = _by_name(filt_dom_to_learn_sites)

    # Align all four site collections and the labels on the same name order
    # so that one fold index array selects matching rows everywhere.
    text_to_detect_sites_list = list()
    text_to_learn_sites_list = list()
    dom_to_detect_sites_list = list()
    dom_to_learn_sites_list = list()
    Y = list()
    for name in total_site_name_list:
        text_to_detect_sites_list.append(text_to_detect_sites_map[name])
        text_to_learn_sites_list.append(text_to_learn_sites_map[name])
        dom_to_detect_sites_list.append(dom_to_detect_sites_map[name])
        dom_to_learn_sites_list.append(dom_to_learn_sites_map[name])
        Y.append(1 if name in expected_set else 0)
    text_to_detect_sites_list = np.array(text_to_detect_sites_list)
    text_to_learn_sites_list = np.array(text_to_learn_sites_list)
    dom_to_detect_sites_list = np.array(dom_to_detect_sites_list)
    dom_to_learn_sites_list = np.array(dom_to_learn_sites_list)
    Y = np.array(Y)
    skf = StratifiedKFold(Y, kfold)

    # Scan the parameters and output the FPR and TPR.
    text_params = {
        'train_inconsistent': c_text_train,
        'test_inconsistent': c_text_test,
    }
    dom_params = {
        'train_inconsistent': c_dom_train,
        'test_inconsistent': c_dom_test,
    }
    # The context manager guarantees the result file is flushed and closed,
    # even if scoring raises part-way through the sweep.
    with open(outfile + "_text_r_" + str(r_text_start), 'a') as outf:
        for t_r in np.arange(r_text_start, r_text_end, 1):
            text_params['test_diameter'] = t_r
            for d_r in np.arange(r_dom_start, r_dom_end, 1):
                dom_params['test_diameter'] = d_r
                # One slot per fold; sized from kfold rather than a literal.
                tpr = np.zeros(kfold, dtype=float)
                fpr = np.zeros(kfold, dtype=float)
                for fold, (train, test) in enumerate(skf):
                    print("train size: {0}, test size: {1}".format(
                        len(train), len(test)))
                    y_test = Y[test]
                    # Only the test split is scored; the parameters are
                    # fixed by the sweep, so the train split is not needed.
                    text_estimator = Estimator(
                        filt_text_to_detect_sites.config)
                    text_estimator.score(text_to_learn_sites_list[test],
                                         text_to_detect_sites_list[test],
                                         y_test, text_params)
                    # The dom stage receives the text stage's cloaking sites.
                    dom_estimator = Estimator(
                        simhash_config=filt_dom_to_detect_sites.config,
                        cloaking_sites=text_estimator.cloaking_sites)
                    dom_estimator.score(dom_to_learn_sites_list[test],
                                        dom_to_detect_sites_list[test],
                                        y_test, dom_params)
                    tpr[fold] = dom_estimator.rate[0]
                    fpr[fold] = dom_estimator.rate[1]
                # NOTE(review): only the dom parameters are written; the
                # text diameter t_r is not recorded in the row, and the file
                # name carries r_text_start only.  Confirm that downstream
                # consumers do not need t_r before changing the format.
                res_str = ",".join([
                    str(dom_params["train_inconsistent"]),
                    str(dom_params["test_inconsistent"]),
                    str(dom_params["test_diameter"]),
                ]) + "\n"
                res_str += ",".join([str(tpr.mean()), str(fpr.mean())]) + "\n"
                outf.write(res_str)
def cross_validation(to_detect, to_learn, expected, noise, outfile,
                     coefficient=None, radius=None):
    """Fit and score an ``Estimator`` on each of five stratified folds.

    The four input files are read as ``CD.ObservedSites`` protos, the noise
    sites are removed from the detect and learn sets, and every remaining
    detect site is labelled 1 if its name is in the expected set, else 0.
    For each fold a fresh ``Estimator`` is fitted on the train split and
    scored on the test split.  The per-fold ``best_params`` and scores are
    printed once all folds have run.

    Args:
        to_detect: path of the proto file holding the sites to detect.
        to_learn: path of the proto file holding the sites to learn from.
        expected: path of the proto file holding the expected sites.
        noise: path of the proto file holding the noise sites.
        outfile: forwarded unchanged to the ``Estimator`` constructor.
        coefficient: forwarded unchanged to the ``Estimator`` constructor.
        radius: forwarded unchanged to the ``Estimator`` constructor.

    Raises:
        KeyError: if a filtered detect site has no learn site of the same
            name.
    """
    learn_proto = CD.ObservedSites()
    read_proto_from_file(learn_proto, to_learn)
    expected_proto = CD.ObservedSites()
    read_proto_from_file(expected_proto, expected)
    print("number of expected_sites")
    print(len(expected_proto.site))
    detect_proto = CD.ObservedSites()
    read_proto_from_file(detect_proto, to_detect)
    noise_proto = CD.ObservedSites()
    read_proto_from_file(noise_proto, noise)

    clean_detect = remove_noise(detect_proto, noise_proto)
    clean_learn = remove_noise(learn_proto, noise_proto)
    expected_names = sites_name_set(expected_proto)

    # Stratified K-fold setup.  The detect set defines the site order.
    n_folds = 5
    ordered_names = [entry.name for entry in clean_detect.site]
    detect_by_name = dict((entry.name, entry) for entry in clean_detect.site)
    learn_by_name = dict((entry.name, entry) for entry in clean_learn.site)

    # Prepare the aligned train/test material: row i of every array refers
    # to the same site name.
    detect_rows = np.array([detect_by_name[key] for key in ordered_names])
    learn_rows = np.array([learn_by_name[key] for key in ordered_names])
    labels = np.array(
        [1 if key in expected_names else 0 for key in ordered_names])
    folds = StratifiedKFold(labels, n_folds)

    fold_scores = []
    fold_best_params = []
    for train_idx, test_idx in folds:
        print("train size: {0}, test size: {1}".format(
            len(train_idx), len(test_idx)))
        model = Estimator(clean_detect.config, outfile, coefficient, radius)
        model.fit(learn_rows[train_idx], detect_rows[train_idx],
                  labels[train_idx])
        fold_best_params.append(model.best_params)
        fold_scores.append(
            model.score(learn_rows[test_idx], detect_rows[test_idx],
                        labels[test_idx]))

    print("best_parameters")
    print(fold_best_params)
    print(fold_scores)