def fetch_clustered_data(self):
    # Load the clustered ad objects and their cluster assignments for both
    # the main crawl and the residential-proxy crawl.
    self.clustered_ad_objs = ad_object.parse_ad_objects(
        CLUSTERED_ADOBJECTS_PATH)
    self.res_clustered_ad_objs = ad_object.parse_ad_objects(
        RES_CLUSTERED_ADOBJECTS_PATH)
    with open(CLUSTERS_PATH) as f:
        self.clusters = json.loads(f.readline())
    with open(RES_CLUSTERS_PATH) as f:
        self.res_clusters = json.loads(f.readline())
    print len(self.clusters)
    print len(self.clustered_ad_objs)
def fetch_data(self):
    # Load every ad-object file under ADOBJECTS_PATH into memory.
    ad_obj_files = os.listdir(ADOBJECTS_PATH)
    for ad_obj_file in ad_obj_files:
        ad_obj_fpath = os.path.join(ADOBJECTS_PATH, ad_obj_file)
        ad_objs = ad_object.parse_ad_objects(ad_obj_fpath)
        # extend() avoids rebuilding the whole list for every file.
        self.ad_objects.extend(ad_objs)
    print "Done loading all ad_objects into memory; %s ads" % (
        len(self.ad_objects),)
def main():
    # Fetch one representative image for each cluster.
    clustered_ad_objects = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
    with open(CLUSTERS_PATH) as f:
        clusters = json.loads(f.readline())
    for label in clusters:
        fetch_an_image(label, clustered_ad_objects, clusters)
def identify_ad_network_mp(ad_path, ad_dict, mal_ad_dict, mal_ad_hashes):
    # Multiprocessing worker: tally the ad network behind each ad object in
    # one file, counting ads flagged as malicious (by screenshot hash)
    # separately.
    ad_objs = ad_object.parse_ad_objects(ad_path)
    for ad_obj in ad_objs:
        ani = AdNetworkIdentifier(ad_obj)
        an = ani.identify()
        if ad_obj.screenshot_hash in mal_ad_hashes:
            mal_ad_dict[an] = mal_ad_dict[an] + 1
        ad_dict[an] = ad_dict[an] + 1
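# A minimal sketch of how identify_ad_network_mp() could be driven in
# parallel, reconstructed from the abandoned multiprocessing variant inside
# fetch_ad_network_counts() below: Manager() proxy dicts carry the counts
# across processes, and the malicious hashes travel as dict keys because
# Manager() offers no set proxy. run_identification_pool itself is
# illustrative, not part of the original pipeline; N_JOBS and
# ADOBJECTS_PATH are the module-level settings. Note that the worker's
# read-modify-write increments are not atomic across processes, so
# concurrent updates can undercount.
def run_identification_pool(ad_network_names, mal_ad_hashes):
    pm = multiprocessing.Manager()
    ad_counts = pm.dict()
    mal_ad_counts = pm.dict()
    for an in ad_network_names:
        ad_counts[an] = 0
        mal_ad_counts[an] = 0
    mal_ad_hashes_dict = pm.dict()
    for ih in mal_ad_hashes:
        mal_ad_hashes_dict[ih] = 1
    pool = multiprocessing.Pool(N_JOBS)
    results = []
    for ad_obj_file in os.listdir(ADOBJECTS_PATH):
        ad_obj_fpath = os.path.join(ADOBJECTS_PATH, ad_obj_file)
        r = pool.apply_async(identify_ad_network_mp,
                             (ad_obj_fpath, ad_counts, mal_ad_counts,
                              mal_ad_hashes_dict))
        results.append(r)
    for r in results:
        r.wait()
    pool.close()
    pool.join()
    return dict(ad_counts), dict(mal_ad_counts)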
def __init__(self):
    clustered_ad_objects = ad_object.parse_ad_objects(
        CLUSTERED_ADOBJECTS_PATH)
    with open(CLUSTERS_PATH) as f:
        clusters = json.loads(f.readline())
    # hash_of_sample_member --> (label, all_hashes_list) for a cluster
    self.sample_hashes_dict = {}
    # Maps each screenshot hash to its cluster label.
    self.hash_cluster_dict = {}
    for label, ad_idx_list in clusters.iteritems():
        img_hash_set = set()
        for ad_idx in ad_idx_list:
            img_hash = clustered_ad_objects[ad_idx].screenshot_hash
            self.hash_cluster_dict[img_hash] = label
            img_hash_set.add(img_hash)
        # Key the cluster on an arbitrary member's hash.
        self.sample_hashes_dict[list(img_hash_set)[0]] = (
            label, list(img_hash_set))
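# A possible companion lookup for the dicts built above, matching how
# residential_clusters() calls ClusterMatcher below: return the cluster
# label for a known screenshot hash, or -1 when the hash was never
# clustered. This exact-match version is only a sketch; the real matcher
# may instead compare perceptual-hash distances against the samples in
# sample_hashes_dict.
def find_matching_cluster(self, img_hash):
    return self.hash_cluster_dict.get(img_hash, -1)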
def residential_clusters():
    # Assign ads collected through residential proxies to the existing
    # clusters by matching screenshot hashes.
    total_ads = 0
    cao_idx = 0
    clustered_ad_objects = []
    clusters = collections.defaultdict(list)
    cm = find_matching_cluster.ClusterMatcher()
    ad_obj_files = os.listdir(RES_ADOBJECTS_PATH)
    for ad_obj_file in ad_obj_files:
        ad_obj_fpath = os.path.join(RES_ADOBJECTS_PATH, ad_obj_file)
        ad_objs = ad_object.parse_ad_objects(ad_obj_fpath)
        for ad in ad_objs:
            total_ads += 1
            mr = cm.find_matching_cluster(ad.screenshot_hash)
            if mr != -1:
                clusters[mr].append(cao_idx)
                clustered_ad_objects.append(ad)
                cao_idx += 1
    print len(clustered_ad_objects), len(clusters), total_ads
    ad_object.dump_ad_objects(RES_CLUSTERED_ADOBJECTS_PATH,
                              clustered_ad_objects)
    with open(RES_CLUSTERS_PATH, "wb") as f:
        f.write(json.dumps(clusters))
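# For reference, the clusters file written above (and read back with
# json.loads(f.readline()) elsewhere in this module) is a single JSON line
# mapping cluster labels to indices into the dumped ad-object list, e.g.
# {"12": [0, 3, 7], "40": [1, 2]} -- the labels and indices here are
# illustrative. json.dumps serializes the defaultdict like a plain dict.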
def main():
    # Collect, per clustered ad, the ad domain, the milking URL and its
    # domain, and any upstream domains; dump the de-duplicated results.
    tld_extractor = tldextract.TLDExtract(suffix_list_urls=None)
    ad_objs = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
    ad_domains = set()
    image_hashes = set()
    milking_url_domains = set()
    upstream_domains = set()
    milking_urls = []
    for ad in ad_objs:
        (ad_domain, milking_url, milking_domain,
         curr_upstream_domains) = get_milking_url(ad, tld_extractor)
        # Keep one milking URL per registered domain.
        if milking_domain and milking_domain not in milking_url_domains:
            milking_url_domains.add(milking_domain)
            milking_urls.append(milking_url)
        image_hashes.add(ad.screenshot_hash)
        ad_domains.add(ad_domain)
        upstream_domains = upstream_domains.union(curr_upstream_domains)
    print len(ad_objs)
    print "# Ad domains: ", len(ad_domains)
    print "# Image hashes: ", len(image_hashes)
    print "# Milking URLs: ", len(milking_urls)
    print "# Upstream domains: ", len(upstream_domains)
    print "# All domains:", len(
        ad_domains.union(milking_url_domains).union(upstream_domains))
    dump_object = {
        "ad_domains": list(ad_domains),
        "image_hashes": list(image_hashes),
        "milking_urls": milking_urls,
        "upstream_domains": list(upstream_domains)
    }
    with open(MILKING_URLS_PATH, "wb") as f:
        f.write(json.dumps(dump_object))
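# Note on the extractor above: passing suffix_list_urls=None keeps
# tldextract from fetching the public-suffix list over the network and
# makes it fall back to its bundled snapshot. A quick illustration (the
# domain is made up):
#   ex = tldextract.TLDExtract(suffix_list_urls=None)
#   ex("http://ads.example.co.uk/landing").registered_domain
#   # -> 'example.co.uk'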
def fetch_ad_network_counts(self):
    ans = [
        "RevenueHits", "PopCash", "AdCash", "PopAds", "PopMyAds", "AdSterra",
        "HilltopAds", "Clicksor", "Propeller", "Clickadu", "AdMaven",
        "Unknown"
    ]
    self.ad_counts = {}
    self.mal_ad_counts = {}
    for an in ans:
        self.ad_counts[an] = 0
        self.mal_ad_counts[an] = 0
    # An abandoned multiprocessing variant of this loop drove
    # identify_ad_network_mp() through a Pool with Manager() dicts; the
    # single-process version below is what runs.
    ad_net_domains = collections.defaultdict(set)
    for ad_obj_fpath in all_ad_obj_paths:
        ad_objs = ad_object.parse_ad_objects(ad_obj_fpath)
        for ad in ad_objs:
            an, ad_net_domain = identify_ad_network(ad)
            if an:
                if an != "Unknown":
                    ad_net_domains[an].add(ad_net_domain)
                if ad.screenshot_hash in self.mal_ad_hashes:
                    self.mal_ad_counts[an] = self.mal_ad_counts[an] + 1
                self.ad_counts[an] = self.ad_counts[an] + 1
    print "Ad net domains:"
    for ad_net, domains in ad_net_domains.iteritems():
        if len(domains) > 10:
            print ad_net, len(domains), random.sample(domains, 10)
        else:
            print ad_net, len(domains), domains
    print "Malicious ad network counts:", self.mal_ad_counts
    print "Ad network counts:", self.ad_counts
    print "Total # of SE attacks", sum(self.mal_ad_counts.values())
    print "Total # of landing pages", sum(self.ad_counts.values())
def explore(fpath):
    # Pretty-print every ad object in a dump file, separated by a rule.
    ad_objects = ad_object.parse_ad_objects(fpath)
    for ad in ad_objects:
        print ad
        print "*" * 50
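# A minimal command-line hook for explore(), assuming sys is imported at
# the top of this module; the argument is the path to one ad-object dump.
if __name__ == "__main__":
    explore(sys.argv[1])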
gsb_key = ''
# gsb_db_path = '/home/phani/se-hunter/milking/gsb_vt/gsb_v4.db'
gsb_db_path = 'gsb_v4.db'

CLUSTERED_ADOBJECTS_PATH = "/home/phani/se_hunter_results/clustered_ad_objects_v4.json"
CLUSTERS_PATH = "/home/phani/se_hunter_results/filtered_clusters_v4.json"
RES_CLUSTERED_ADOBJECTS_PATH = "/home/phani/se_hunter_results/res_clustered_ad_objects_v4.json"
RES_CLUSTERS_PATH = "/home/phani/se_hunter_results/res_clusters_v4.json"
CATEGORY_DICT = "/home/phani/se-hunter/selenium/job_handling/categories/category_dict.json"
ALL_OBJECT_STATS = "/home/phani/se_hunter_results/all_object_dump.json"
POPULARITY_RANKS = "/home/phani/se-hunter/seeds/seed_rankings.txt"
PUBLISHER_SITES_ATTEMPTED = "/home/phani/se-hunter/selenium/job_handling/done_jobs_domains.txt"
POPULARITY_GRAPH_DATA = "/home/phani/se_hunter_results/popularity_graph_data.json"
SEED_DOMAIN_DATA = "/home/phani/se_hunter_results/misc_ad_object_info.txt"
SEED_DOMAIN_DATA_2 = "/home/phani/se_hunter_results/misc_ad_object_info_2.txt"

# Most of the results files store their whole JSON payload on a single
# line, hence the json.loads(f.readline()) pattern.
clustered_ad_objects = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
with open(CATEGORY_DICT) as f:
    categories_dict = json.loads(f.readline())
with open(CLUSTERS_PATH) as f:
    clusters = json.loads(f.readline())
res_clustered_ad_objects = ad_object.parse_ad_objects(
    RES_CLUSTERED_ADOBJECTS_PATH)
with open(RES_CLUSTERS_PATH) as f:
    res_clusters = json.loads(f.readline())
with open(ALL_OBJECT_STATS) as f:
    all_object_stats = json.loads(f.readline())
with open(POPULARITY_RANKS) as f:
    popularity_ranks = json.load(f)
with open(PUBLISHER_SITES_ATTEMPTED) as f:
    publisher_sites_attempted = json.loads(f.readline())
with open(SEED_DOMAIN_DATA) as f:
    seed_domain_data = json.load(f)