Example #1
    def __init__(self, dataset, date, entry, prefix, eps, cluster_rank, crawl_size, rank_algo="bfs"):
        self.dataset = dataset
        self.date = date
        self.eps = eps
        self.cluster_rank = cluster_rank
        self.rank_algo = rank_algo
        self.crawl_size = crawl_size
        self.rules = self.get_rules()
        self.entry, self.prefix = entry, prefix
        self.history_set = set()
        self.path_prefix = "../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))
        self.folder_path = [self.path_prefix]
        self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
        self.full_folder = "../../Crawler/full_data/" + dataset
        self.trans = {}
        self.queue = {}
        self.crawled_cluster_count = defaultdict(int)
        self.trans_dict = read_trans_dict(dataset, date)


        #self.cluster_dict = get_cluster_dict(dataset,date)
        #self.cluster_num = int(self.sitemap.DBSCAN(eps_val=self.eps))
        self.cluster_num = int(self.sitemap.DBSCAN())  # number of page clusters found; the default call ignores self.eps
        self.build_gold_cluster_dict()
        self.cluster_xpath_trans = self.get_xpath_transition()
        self.trans_prob_mat = self.calculate_trans_prob_mat()
        self.max_score = 500
        self.target_cluster = self.get_sample_cluster()
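
For context, here is a minimal instantiation sketch. The enclosing class name ("ClusterCrawler") and every argument value are hypothetical, since the snippet shows only the __init__ body, not the class statement or a real call site:

# Hypothetical usage sketch -- class name and argument values are assumptions.
crawler = ClusterCrawler(
    dataset="new_youtube",       # dataset folder name; the "new_" prefix is stripped when building paths
    date="2015",                 # sample-date component of the crawler paths
    entry="http://example.com/",
    prefix="http://example.com",
    eps=0.2,                     # stored, but the default DBSCAN() call ignores it
    cluster_rank=1,
    crawl_size=1000,
    rank_algo="bfs",             # default ranking strategy
)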
Example #2
    def get_xpath_transition(self):
        # Learn, from the sampled (gold) pages, how strongly each
        # (cluster, xpath) pair links to each destination cluster.
        sampled_urls = set(self.gold_dict.keys())
        counts_dict = defaultdict(int)  # (cluster_id, xpath) -> number of pages it appears on
        xpath_counts_dict = defaultdict(lambda: defaultdict(float))  # (cluster_id, xpath) -> {destination cluster_id: weight}

        trans_dict = read_trans_dict(self.dataset, self.date)  # [page][xpath] = [url list] -> [cluster][xpath] = {probability list}
        print "sample_url", sampled_urls
        for page_path, trans in trans_dict.iteritems():
            page_url = page_path.replace(".html", "").replace("_", "/")
            if page_url not in self.gold_dict:
                # the page was never sampled, so its cluster id is unknown
                continue
            cluster_id = self.cluster_dict[page_url]

            for xpath, url_list in trans.iteritems():
                # count how many outlinks under this xpath lead to sampled pages
                length = len(url_list)
                count = 0
                for path in url_list:
                    url = path.replace(".html", "").replace("_", "/")
                    if url in sampled_urls:
                        count += 1

                if count == 0:
                    continue

                key = (cluster_id, xpath)
                counts_dict[key] += 1
                # weight each sampled outlink by length/count so the sampled
                # subset extrapolates to the full outlink list
                ratio = float(length) / float(count)
                for path in url_list:
                    url = path.replace(".html", "").replace("_", "/")
                    if url in sampled_urls:
                        destination_id = self.cluster_dict[url]
                        xpath_counts_dict[key][destination_id] += ratio

        # average the accumulated weights over the number of pages each
        # (cluster, xpath) pair was observed on
        for key, count in counts_dict.iteritems():
            for destination_id in xpath_counts_dict[key]:
                xpath_counts_dict[key][destination_id] /= count

        print "Micro average entropy is " + str(self.entropy(xpath_counts_dict))
        print "=========== end of training ============"
        self.xpath_counts_dict = xpath_counts_dict
        return xpath_counts_dict
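
To make the extrapolate-then-average step concrete, here is a self-contained toy sketch of the same computation. The helper name xpath_transition_weights and all toy values are hypothetical, not part of the original codebase:

from collections import defaultdict

def xpath_transition_weights(pages, cluster_of):
    # pages: list of (source_cluster, xpath, outlink_urls)
    # cluster_of: maps sampled URLs to their cluster id
    counts = defaultdict(int)
    weights = defaultdict(lambda: defaultdict(float))
    for src_cluster, xpath, outlinks in pages:
        sampled = [u for u in outlinks if u in cluster_of]
        if not sampled:
            continue
        key = (src_cluster, xpath)
        counts[key] += 1
        # scale each sampled outlink by len(outlinks)/len(sampled) so the
        # sampled subset stands in for the full outlink list
        ratio = float(len(outlinks)) / len(sampled)
        for u in sampled:
            weights[key][cluster_of[u]] += ratio
    # average over the pages each (cluster, xpath) pair was seen on
    for key, n in counts.items():
        for dest in weights[key]:
            weights[key][dest] /= n
    return weights

# Two cluster-0 pages expose xpath "/a"; only some outlinks were sampled.
cluster_of = {"u1": 1, "u2": 2}
pages = [
    (0, "/a", ["u1", "u3", "u4"]),  # 1 of 3 sampled -> u1 gets weight 3
    (0, "/a", ["u1", "u2"]),        # 2 of 2 sampled -> each gets weight 1
]
print({k: dict(v) for k, v in xpath_transition_weights(pages, cluster_of).items()})
# -> {(0, '/a'): {1: 2.0, 2: 0.5}}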
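
The training pass also reports a micro-averaged entropy via self.entropy, whose implementation is not shown in these snippets. A plausible stand-in, assuming "micro average" means weighting each (cluster, xpath) row's Shannon entropy by that row's total outgoing weight:

import math

def micro_average_entropy(xpath_counts_dict):
    # Hypothetical sketch of the unshown self.entropy helper: normalize each
    # (cluster, xpath) row into a probability distribution, then average the
    # per-row Shannon entropies weighted by each row's total weight.
    total_weight = 0.0
    weighted_entropy = 0.0
    for key, dests in xpath_counts_dict.items():
        row_sum = float(sum(dests.values()))
        if row_sum <= 0:
            continue
        row_entropy = -sum((w / row_sum) * math.log(w / row_sum, 2)
                           for w in dests.values() if w > 0)
        weighted_entropy += row_sum * row_entropy
        total_weight += row_sum
    return weighted_entropy / total_weight if total_weight else 0.0

Whatever the exact weighting, lower values mean each (cluster, xpath) pair points mostly at a single destination cluster, which is what makes the learned transition matrix useful for steering the crawl.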