def fetch_clustered_data(self):
    self.clustered_ad_objs = ad_object.parse_ad_objects(
        CLUSTERED_ADOBJECTS_PATH)
    self.res_clustered_ad_objs = ad_object.parse_ad_objects(
        RES_CLUSTERED_ADOBJECTS_PATH)
    with open(CLUSTERS_PATH) as f:
        self.clusters = json.loads(f.readline())
    with open(RES_CLUSTERS_PATH) as f:
        self.res_clusters = json.loads(f.readline())
    print len(self.clusters)
    print len(self.clustered_ad_objs)
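# parse_ad_objects itself is not shown on this page. A minimal sketch of the
# shape these examples assume, where each *_ad_objects file stores one JSON
# document per ad (the line-per-ad layout and the field names are assumptions,
# not the real schema):
import json

class AdObject(object):
    def __init__(self, attrs):
        # Expose every serialized field as an attribute, e.g.
        # ad.screenshot_hash, ad.landing_url, ...
        self.__dict__.update(attrs)

def parse_ad_objects(fpath):
    ad_objs = []
    with open(fpath) as f:
        for line in f:
            line = line.strip()
            if line:
                ad_objs.append(AdObject(json.loads(line)))
    return ad_objs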
Example #2
def fetch_data(self):
    ad_obj_files = os.listdir(ADOBJECTS_PATH)
    for ad_obj_file in ad_obj_files:
        ad_obj_fpath = os.path.join(ADOBJECTS_PATH, ad_obj_file)
        ad_objs = ad_object.parse_ad_objects(ad_obj_fpath)
        self.ad_objects = self.ad_objects + ad_objs
    print "Done loading all ad_objects into memory; %s ads" % (len(self.ad_objects),)
Example #3
def main():
    clustered_ad_objects = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
    with open(CLUSTERS_PATH) as f:
        clusters = json.loads(f.readline())
    # import ipdb; ipdb.set_trace()  # leftover debug breakpoint
    for label in clusters:
        fetch_an_image(label, clustered_ad_objects, clusters)
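# fetch_an_image is not defined on this page. A plausible sketch: clusters
# maps a label to indices into clustered_ad_objects, so grab one member and
# write out its screenshot. The screenshot field and its base64 encoding are
# assumptions, not the real schema:
import base64

def fetch_an_image(label, clustered_ad_objects, clusters):
    ad = clustered_ad_objects[clusters[label][0]]  # any cluster member will do
    out_path = "%s.png" % ad.screenshot_hash
    with open(out_path, "wb") as f:
        f.write(base64.b64decode(ad.screenshot))
    return out_path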
Example #4
def identify_ad_network_mp(ad_path, ad_dict, mal_ad_dict, mal_ad_hashes):
    ad_objs = ad_object.parse_ad_objects(ad_path)
    for ad_obj in ad_objs:
        ani = AdNetworkIdentifier(ad_obj)
        an = ani.identify()
        if ad_obj.screenshot_hash in mal_ad_hashes:
            mal_ad_dict[an] = mal_ad_dict[an] + 1
        ad_dict[an] = ad_dict[an] + 1
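# A sketch of how identify_ad_network_mp could be driven, assuming the shared
# counters are multiprocessing.Manager() dicts (this mirrors the commented-out
# pool code in Example #8 below; note the increments are read-then-write, so
# counts can race without an extra lock):
import multiprocessing

def count_ad_networks(ad_paths, networks, mal_ad_hashes, n_jobs=4):
    pm = multiprocessing.Manager()
    ad_dict = pm.dict([(an, 0) for an in networks])
    mal_ad_dict = pm.dict([(an, 0) for an in networks])
    mal_hashes = pm.dict([(h, 1) for h in mal_ad_hashes])  # Manager() has no set
    pool = multiprocessing.Pool(n_jobs)
    results = [pool.apply_async(identify_ad_network_mp,
                                (p, ad_dict, mal_ad_dict, mal_hashes))
               for p in ad_paths]
    for r in results:
        r.wait()
    pool.close()
    pool.join()
    return dict(ad_dict), dict(mal_ad_dict)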
Example #5
def __init__(self):
    clustered_ad_objects = ad_object.parse_ad_objects(
        CLUSTERED_ADOBJECTS_PATH)
    with open(CLUSTERS_PATH) as f:
        clusters = json.loads(f.readline())

    # print len(clusters)
    # print sorted([int(x) for x in clusters.keys()])
    # hash_sample_member --> (label, all_hashes_list) for a cluster
    self.sample_hashes_dict = {}

    # Maps a screenshot hash to its cluster label
    self.hash_cluster_dict = {}
    for label, ad_idx_list in clusters.iteritems():
        img_hash_set = set()
        for ad_idx in ad_idx_list:
            img_hash = clustered_ad_objects[ad_idx].screenshot_hash
            self.hash_cluster_dict[img_hash] = label
            img_hash_set.add(img_hash)
        # An arbitrary member hash serves as the cluster's representative key
        self.sample_hashes_dict[list(img_hash_set)[0]] = (
            label, list(img_hash_set))
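# ClusterMatcher's lookup method is off-page, but given the two indexes built
# above, a minimal sketch of find_matching_cluster (used in
# residential_clusters below) could be an exact-hash lookup; the real matcher
# may instead do near-duplicate matching on perceptual hashes:
def find_matching_cluster(self, img_hash):
    # Return the cluster label for a known screenshot hash, -1 otherwise.
    return self.hash_cluster_dict.get(img_hash, -1)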
Example #6
def residential_clusters():
    total_ads = 0
    cao_idx = 0
    clustered_ad_objects = []
    clusters = collections.defaultdict(list)
    cm = find_matching_cluster.ClusterMatcher()
    ad_obj_files = os.listdir(RES_ADOBJECTS_PATH)
    for ad_obj_file in ad_obj_files:
        ad_obj_fpath = os.path.join(RES_ADOBJECTS_PATH, ad_obj_file)
        ad_objs = ad_object.parse_ad_objects(ad_obj_fpath)
        for ad in ad_objs:
            total_ads += 1
            mr = cm.find_matching_cluster(ad.screenshot_hash)
            if mr != -1:
                clusters[mr].append(cao_idx)
                clustered_ad_objects.append(ad)
                cao_idx += 1
    print len(clustered_ad_objects), len(clusters), total_ads
    ad_object.dump_ad_objects(RES_CLUSTERED_ADOBJECTS_PATH,
                              clustered_ad_objects)
    with open(RES_CLUSTERS_PATH, "wb") as f:
        f.write(json.dumps(clusters))
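# One caveat when this file is read back: JSON object keys are always strings,
# so integer cluster labels (as the -1 sentinel suggests they are) come out as
# strings; the commented-out sorted([int(x) for x in clusters.keys()]) in
# Example #5 hints at the same thing. A hedged sketch of normalizing labels
# after reload:
with open(RES_CLUSTERS_PATH) as f:
    res_clusters = dict((int(label), idx_list)
                        for label, idx_list in json.loads(f.readline()).items())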
Example #7
def main():
    tld_extractor = tldextract.TLDExtract(suffix_list_urls=None)
    ad_objs = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
    ad_domains = set()
    image_hashes = set()
    milking_url_domains = set()
    upstream_domains = set()
    milking_urls = []
    for ad in ad_objs:
        ad_domain, milking_url, milking_domain, curr_upstream_domains = get_milking_url(
            ad, tld_extractor)
        if milking_domain and milking_domain not in milking_url_domains:
            # print "Milking url: ", milking_url
            # import ipdb; ipdb.set_trace()
            milking_url_domains.add(milking_domain)
            milking_urls.append(milking_url)
        image_hashes.add(ad.screenshot_hash)
        ad_domains.add(ad_domain)
        upstream_domains = upstream_domains.union(curr_upstream_domains)
        # home_domain = self.tld_extract(ad).registered_domain
    print "# Ads:", len(ad_objs)
    print "# Ad domains: ", len(ad_domains)
    print "# Image hashes: ", len(image_hashes)
    print "# Milking URLs: ", len(milking_urls)
    print "# Upstream domains: ", len(upstream_domains)
    print "# All domains:", len(
        ad_domains.union(milking_url_domains).union(upstream_domains))
    dump_object = {
        "ad_domains": list(ad_domains),
        "image_hashes": list(image_hashes),
        "milking_urls": milking_urls,
        "upstream_domains": list(upstream_domains)
    }
    with open(MILKING_URLS_PATH, "wb") as f:
        dump_str = json.dumps(dump_object)
        f.write(dump_str)
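# get_milking_url is not shown here. A rough sketch of its presumed shape,
# assuming each AdObject records the redirect chain from the ad's landing
# page to the final "milking" (attack) page; every field name below is a
# guess, not the real schema:
def get_milking_url(ad, tld_extractor):
    ad_domain = tld_extractor(ad.landing_url).registered_domain
    chain = getattr(ad, "redirect_chain", [])  # assumed: list of URLs
    upstream_domains = set(tld_extractor(u).registered_domain
                           for u in chain[:-1])
    milking_url = chain[-1] if chain else None
    milking_domain = (tld_extractor(milking_url).registered_domain
                      if milking_url else None)
    return ad_domain, milking_url, milking_domain, upstream_domains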
Example #8
def fetch_ad_network_counts(self):
    ans = [
        "RevenueHits", "PopCash", "AdCash", "PopAds", "PopMyAds",
        "AdSterra", "HilltopAds", "Clicksor", "Propeller", "Clickadu",
        "AdMaven", "Unknown"
    ]

    self.ad_counts = {}
    self.mal_ad_counts = {}
    # multiprocessing variant, kept for reference:
    # pm = multiprocessing.Manager()
    # self.ad_counts = pm.dict()
    # self.mal_ad_counts = pm.dict()
    for an in ans:
        self.ad_counts[an] = 0
        self.mal_ad_counts[an] = 0
    # self.mal_ad_hashes_dict = pm.dict()  # for use with mp, as Manager()
    #                                        offers no sets
    # for ih in self.mal_ad_hashes:
    #     self.mal_ad_hashes_dict[ih] = 1

    # pool = multiprocessing.Pool(N_JOBS)
    # results = []
    # for ad_obj_file in ad_obj_files:
    #     ad_obj_fpath = os.path.join(ADOBJECTS_PATH, ad_obj_file)
    #     r = pool.apply_async(identify_ad_network_mp,
    #                          (ad_obj_fpath, self.ad_counts,
    #                           self.mal_ad_counts,
    #                           self.mal_ad_hashes_dict))
    #     r = identify_ad_network_mp(ad_obj_fpath, self.ad_counts,
    #                                self.mal_ad_counts,
    #                                self.mal_ad_hashes_dict)
    #     results.append(r)

    # ad_objs = ad_object.parse_ad_objects(ad_path)
    # for ad_obj in ad_objs:
    #     ani = AdNetworkIdentifier(ad_obj)
    #     an = ani.identify()
    # for r in results:
    #     r.wait()

    # single processing; all_ad_obj_paths is assumed to be built elsewhere
    ad_net_domains = collections.defaultdict(set)
    for ad_obj_fpath in all_ad_obj_paths:
        ad_objs = ad_object.parse_ad_objects(ad_obj_fpath)
        # if len(ad_objs) > 1:
        #     print len(ad_objs)
        for ad in ad_objs:
            an, ad_net_domain = identify_ad_network(ad)
            if an:
                if an != "Unknown":
                    ad_net_domains[an].add(ad_net_domain)
                if ad.screenshot_hash in self.mal_ad_hashes:
                    self.mal_ad_counts[an] = self.mal_ad_counts[an] + 1
                self.ad_counts[an] = self.ad_counts[an] + 1
    print "Ad net domains:"
    for ad_net, domains in ad_net_domains.iteritems():
        if len(domains) > 10:
            print ad_net, len(domains), random.sample(domains, 10)
        else:
            print ad_net, len(domains), domains
    # print "Ad net domains:", pprint.pprint(ad_net_domains)

    # print "Ad network:", an
    # import ipdb; ipdb.set_trace()
    print "Malicious ad network counts:", self.mal_ad_counts
    print "Ad network counts:", self.ad_counts
    print "Total # of SE attacks:", sum(self.mal_ad_counts.values())
    print "Total # of landing pages:", sum(self.ad_counts.values())
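# identify_ad_network is also off-page. A minimal sketch, assuming it matches
# the domains an ad contacted against known ad-network domains; the pattern
# table and the requested_urls field are illustrative, not the real ones:
AD_NETWORK_DOMAINS = {
    "popads.net": "PopAds",
    "popcash.net": "PopCash",
    "adcash.com": "AdCash",
}

def identify_ad_network(ad):
    extractor = tldextract.TLDExtract(suffix_list_urls=None)
    for url in getattr(ad, "requested_urls", []):
        domain = extractor(url).registered_domain
        if domain in AD_NETWORK_DOMAINS:
            return AD_NETWORK_DOMAINS[domain], domain
    return "Unknown", None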
Example #9
def explore(fpath):
    ad_objects = ad_object.parse_ad_objects(fpath)
    for ad in ad_objects:
        print ad
        print "*" * 50
Example #10
gsb_key = ''
# gsb_db_path = '/home/phani/se-hunter/milking/gsb_vt/gsb_v4.db'
gsb_db_path = 'gsb_v4.db'
CLUSTERED_ADOBJECTS_PATH = "/home/phani/se_hunter_results/clustered_ad_objects_v4.json"
CLUSTERS_PATH = "/home/phani/se_hunter_results/filtered_clusters_v4.json"
RES_CLUSTERED_ADOBJECTS_PATH = "/home/phani/se_hunter_results/res_clustered_ad_objects_v4.json"
RES_CLUSTERS_PATH = "/home/phani/se_hunter_results/res_clusters_v4.json"
CATEGORY_DICT = "/home/phani/se-hunter/selenium/job_handling/categories/category_dict.json"
ALL_OBJECT_STATS = "/home/phani/se_hunter_results/all_object_dump.json"
POPULARITY_RANKS = "/home/phani/se-hunter/seeds/seed_rankings.txt"
PUBLISHER_SITES_ATTEMPTED = "/home/phani/se-hunter/selenium/job_handling/done_jobs_domains.txt"
POPULARITY_GRAPH_DATA = "/home/phani/se_hunter_results/popularity_graph_data.json"
SEED_DOMAIN_DATA = "/home/phani/se_hunter_results/misc_ad_object_info.txt"
SEED_DOMAIN_DATA_2 = "/home/phani/se_hunter_results/misc_ad_object_info_2.txt"

clustered_ad_objects = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
with open(CATEGORY_DICT) as f:
    categories_dict = json.loads(f.readline())
with open(CLUSTERS_PATH) as f:
    clusters = json.loads(f.readline())
res_clustered_ad_objects = ad_object.parse_ad_objects(RES_CLUSTERED_ADOBJECTS_PATH)
with open(RES_CLUSTERS_PATH) as f:
    res_clusters = json.loads(f.readline())
with open(ALL_OBJECT_STATS) as f:
    all_object_stats = json.loads(f.readline())
with open(POPULARITY_RANKS) as f:
    popularity_ranks = json.load(f)
with open(PUBLISHER_SITES_ATTEMPTED) as f:
    publisher_sites_attempted = json.loads(f.readline())
with open(SEED_DOMAIN_DATA) as f:
    seed_domain_data = json.load(f)
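# The loads above work because json.dumps writes the whole document on one
# line, so json.loads(f.readline()) sees all of it; json.load(f) reads the
# same files and is layout-independent. A small helper, as a sketch, to make
# the loading uniform:
def load_json(path):
    with open(path) as f:
        return json.load(f)  # handles one-line dumps and pretty-printed files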