def score_learned(self, learned_sites, detect_test, y_test, test_params):
    """
    Run cloaking detection with learned_sites on detect_test and score it.

    @parameter
    learned_sites: CD.LearnedSites produced by a previous learning phase
    detect_test: CD.ObservedSites (or list form) to run detection on
    y_test: selection labels; also defines total_size for the metrics
    test_params: dict with 'test_diameter', 'test_inconsistent',
        'train_inconsistent' used to configure detection and the log line
    @return
    the objective value computed from the detection errors

    Side effects: sets self.rate / self.pr / self.errors, may initialize
    self.cloaking_sites (integrated-testing handshake, see below), and
    appends a CSV summary to self.outf when it is set.
    """
    valid_instance(learned_sites, CD.LearnedSites)
    valid_instance(detect_test, CD.ObservedSites)
    total_size = len(y_test)
    # if X_test is ObservedSites, list_to_sites does nothing or
    # selection (y_test not None)
    X_test = list_to_sites(detect_test, self.simhash_config)
    X_expected = list_to_sites(detect_test, self.simhash_config, y_test)
    # Build the detection configuration from the sweep parameters.
    detection_config = CD.DetectionConfig()
    detection_config.algorithm = CD.DetectionConfig.INCONSISTENT_COEFFICIENT
    detection_config.min_radius = test_params['test_diameter']
    detection_config.inconsistent_coefficient = test_params['test_inconsistent']
    detection_config.simhash_type = self.simhash_config.simhash_type
    detector = CloakingDetection(detection_config, learned_sites)
    cloaking_sites = detector.detect(X_test)
    if self.cloaking_sites:
        # A previous pass (e.g. text detection) already recorded cloaking
        # sites: score only the intersection of the two result sets.
        cloaking_sites = intersect_observed_sites_util(self.cloaking_sites,
                cloaking_sites)
    else:
        """
        This is only useful for integrated testing, text detection set
        cloaking_sites, dom detection read this and intersect its
        cloaking_sites with the one from text and evaluate it.
        """
        self.cloaking_sites = CD.ObservedSites()
        self.cloaking_sites.CopyFrom(cloaking_sites)
    rate, pr, errors = compute_metrics(cloaking_sites, X_expected, total_size)
    logger = logging.getLogger("global")
    logger.warning(test_params)
    logger.warning(rate)
    logger.warning(pr)
    logger.warning(errors)
    """
    Record the latest test scores for further analysis
    """
    self.rate = rate
    self.pr = pr
    self.errors = errors
    # One CSV line of parameters, one of the rate pair, appended to outf.
    res_str = ",".join([
        str(test_params["train_inconsistent"]),
        str(test_params["test_inconsistent"]),
        str(test_params["test_diameter"])
    ]) + "\n"
    res_str += ",".join([
        str(rate[0]),
        str(rate[1])
    ]) + "\n"
    if self.outf:
        self.outf.write(res_str)
    return self.objective(errors)
def learn(self, observed_sites_filenames, cluster_config=None): """ Learn clusters from observed_sites_filenames using cluster_config @parameter observed_sites_filenames: list of observed_sites to be learned cluster_config: configuration for clustering @return learned_sites: the learned clusters """ if (not cluster_config) and (not self.cluster_config): raise Exception("Cluster config missing") elif cluster_config and valid_instance(cluster_config, CD.ClusterConfig): self.cluster_config = CD.ClusterConfig() self.cluster_config.CopyFrom(cluster_config) # learn the clusters if isinstance(observed_sites_filenames, list): observed_sites = merge_observed_sites(observed_sites_filenames) elif valid_instance(observed_sites_filenames, CD.ObservedSites): observed_sites = observed_sites_filenames else: raise Exception("Wrong argument for learn!") print "in learning phase" print "before de noise {0}".format(len(observed_sites.site)) de_noise_config = CD.DeNoiseConfig() observed_sites = de_noise(observed_sites, de_noise_config) print "after de noise {0}".format(len(observed_sites.site)) learned_sites = CD.LearnedSites() cluster_config.simhash_type = observed_sites.config.simhash_type for observed_site in observed_sites.site: # either TEXT or DOM can be handled now. TEXT_DOM is not supported. 
if cluster_config.algorithm.name == CD.Algorithm.HAMMING_THRESHOLD: result = HammingTreshold(cluster_config, observed_site) if cluster_config.algorithm.name == CD.Algorithm.K_MEANS: result = KMeans(cluster_config, observed_site) if cluster_config.algorithm.name == CD.Algorithm.SPECTRAL_CLUSTERING: result = SpectralClustering(cluster_config, observed_site) if cluster_config.algorithm.name == CD.Algorithm.HIERARCHICAL_CLUSTERING: # result = HierarchicalClustering(cluster_config, observed_site) result = ScipyHierarchicalClustering(cluster_config, observed_site) # If no pattern can be extracted, return None if result: learned_site = learned_sites.site.add() learned_site.CopyFrom(result) return learned_sites
def compute_simhash(self, site_list_filenames, simhash_config):
    """
    Compute simhash for site_list_filenames using configuration simhash_config
    @parameter
    site_list_filenames: a list of site_list_filename (CrawlLog)
    simhash_config: the configuration for simhash computing
    @return
    observed_sites: site observations aggregated by site
    """
    # Input is a list of site_list_filename
    valid_instance(simhash_config, CD.SimhashConfig)
    if simhash_config.crawl_log_landing_url_as_observation_landing_url:
        url_field = "landing_url"
    else:
        url_field = "url"
    observed_sites, path_list = load_observed_sites(site_list_filenames,
            url_field)
    simhash_computer = HtmlSimhashComputer(simhash_config)
    thread_computer = ThreadComputer(simhash_computer, 'compute_simhash',
            path_list)
    # Map each file path to its computed simhash result.
    path_simhash_dict = dict()
    for p, s in thread_computer.result:
        path_simhash_dict[p] = s
    observed_sites.config.CopyFrom(simhash_config)
    for site in observed_sites.site:
        # BUGFIX: walk indices in reverse so deleting an observation cannot
        # skip the next one. The previous code deleted from the repeated
        # field while iterating it with a for-loop and a separate counter,
        # which silently skipped elements after each deletion.
        for index in reversed(range(len(site.observation))):
            observation = site.observation[index]
            if not observation.file_path in path_simhash_dict:
                # If simhash computation of this observation failed,
                # just ignore this one. Because we don't have sample
                # and it is not marked as failure.
                del site.observation[index]
                continue
            result = path_simhash_dict[observation.file_path]
            # result[0] holds the text simhash, result[-1] the dom simhash
            # (they coincide when only one type was computed).
            if simhash_config.simhash_type in [CD.TEXT, CD.TEXT_DOM]:
                observation.text_simhash = result[0][0].value
                observation.text_feature_count = result[0][1]
            if simhash_config.simhash_type in [CD.DOM, CD.TEXT_DOM]:
                observation.dom_simhash = result[-1][0].value
                observation.dom_feature_count = result[-1][1]
    if not simhash_config.discard_failure:
        observed_sites = add_failure(observed_sites, site_list_filenames)
    return observed_sites
def build_by_dom(self, html_dom):
    """Simhash the DOM n-gram features enabled by the configuration.

    Collects node / bi_node / tri_node feature names from the CD.HtmlDom
    message according to the gram / bi_gram / tri_gram usage switches,
    then returns [simhash, feature_count].
    """
    if valid_instance(html_dom, CD.HtmlDom):
        usage = self.simhash_config.usage
        # Pair each usage switch with its corresponding feature group.
        groups = [
            (usage.gram, html_dom.node),
            (usage.bi_gram, html_dom.bi_node),
            (usage.tri_gram, html_dom.tri_node),
        ]
        features = [node.name for enabled, nodes in groups
                    if enabled for node in nodes]
        return [self.build_by_features(features), len(features)]
def build_by_text(self, html_text):
    """Simhash the text n-gram features enabled by the configuration.

    Collects word / bi_gram / tri_gram feature names from the CD.HtmlText
    message according to the usage switches, then returns
    [simhash, feature_count]. Weighted features are not supported by now.
    """
    if valid_instance(html_text, CD.HtmlText):
        usage = self.simhash_config.usage
        # Pair each usage switch with its corresponding feature group.
        groups = [
            (usage.gram, html_text.word),
            (usage.bi_gram, html_text.bi_gram),
            (usage.tri_gram, html_text.tri_gram),
        ]
        features = [gram.name for enabled, grams in groups
                    if enabled for gram in grams]
        return [self.build_by_features(features), len(features)]
def build_by_features(self, features):
    """Build a Simhash from an iterable of feature strings.

    Returns None when features is not a valid iterable.
    """
    if not valid_instance(features, collections.Iterable):
        return None
    return Simhash(features)
def __init__(self, simhash_config):
    # Keep a reference to the validated simhash configuration.
    # NOTE(review): valid_instance is used as a bare statement elsewhere in
    # this file, so it presumably raises on a type mismatch; if it instead
    # returns False, self.simhash_config would stay unset -- confirm.
    if valid_instance(simhash_config, CD.SimhashConfig):
        self.simhash_config = simhash_config
def __init__(self, cluster_config=None):
    """Optionally seed the learner with a private copy of cluster_config.

    With no config the attribute is explicitly set to None; a valid
    CD.ClusterConfig is defensively copied so later callers cannot
    mutate it from outside.
    """
    if cluster_config:
        if valid_instance(cluster_config, CD.ClusterConfig):
            config_copy = CD.ClusterConfig()
            config_copy.CopyFrom(cluster_config)
            self.cluster_config = config_copy
    else:
        self.cluster_config = None