Example #1
0
    def __init__(self, site_url, html):
        """Parse ``html`` and record link, script and character statistics.

        Args:
            site_url: Forwarded to ``gt.get_links`` together with the parsed
                tags (presumably the page's own URL -- confirm against
                ``gt``).
            html: Markup handed to ``gt.get_tags`` for parsing.

        Attributes set:
            title, outgoing_links, scripts: Values returned by the
                corresponding ``gt`` helpers.
            outgoing_link_count, number_of_scripts: Lengths of the link
                and script collections.
            incoming_link_count: Initialised to 0.
            langs: Maps each label returned by
                ``stats_lib.check_ranges_from_table`` to the number of
                characters of the page text that received that label.
            alphabets_on_site: Labels whose character count is greater
                than 1% of the page text length.
        """
        range_table = stats_lib.set_up_character_ranges_table()

        parsed = gt.get_tags(html)
        self.title = gt.get_title(parsed)

        self.outgoing_links = gt.get_links(site_url, parsed)
        self.outgoing_link_count = len(self.outgoing_links)

        self.scripts = gt.get_scripts(parsed)
        self.number_of_scripts = len(self.scripts)

        self.incoming_link_count = 0

        page_text = parsed.getText()

        # Tally how many characters of the page text fall under each label.
        counts = {}
        self.langs = counts
        for symbol in page_text:
            label = stats_lib.check_ranges_from_table(symbol, range_table)
            counts[label] = counts.get(label, 0) + 1

        # A label counts as "present" only when it covers more than 1% of
        # the text; dict keys are unique, so no duplicate check is needed.
        cutoff = len(page_text) / 100
        self.alphabets_on_site = [
            label for label, seen in counts.items() if seen > cutoff
        ]
Example #2
0
    def __init__(self, site_url, html):
        """Parse ``html`` and record link, script and text statistics.

        Args:
            site_url: Stored as ``url``, hashed with ``make_hash`` and
                forwarded to ``gt.get_links`` (presumably the page's own
                URL -- confirm against the caller).
            html: Markup handed to ``gt.get_tags``; also kept unchanged
                as ``raw_html``.

        Attributes set:
            url, hash, raw_html: The URL, ``make_hash(site_url)`` and the
                original ``html`` argument.
            title, outgoing_links, scripts: Values returned by the
                corresponding ``gt`` helpers.
            outgoing_link_count, number_of_scripts: Lengths of the link
                and script collections.
            body: Page text after the ``script``/``style`` elements have
                been extracted, passed through
                ``strip_non_space_whitespace``.
            langs: Maps each label returned by
                ``stats_lib.check_ranges_from_table`` to the number of
                characters of ``body`` that received that label.
            alphabets_on_site: Labels whose character count is greater
                than 1/70 of the length of ``body``.
            n_grams, symbol_freq, symbol_entropy: Results of the
                ``stats_lib`` helpers applied to ``body`` (n-grams are
                requested with the argument 5, presumably n = 5).
        """
        range_table = stats_lib.set_up_character_ranges_table()

        parsed = gt.get_tags(html)
        self.url = site_url
        self.title = gt.get_title(parsed)

        self.outgoing_links = gt.get_links(site_url, parsed)
        self.outgoing_link_count = len(self.outgoing_links)

        # Scripts are collected before the script/style elements are
        # extracted from the parsed tree below.
        self.scripts = gt.get_scripts(parsed)
        self.number_of_scripts = len(self.scripts)

        self.hash = make_hash(site_url)

        # NOTE(review): calling the parsed tree with tag names looks like
        # BeautifulSoup's find_all shorthand -- confirm against gt.get_tags.
        for element in parsed(['script', 'style']):
            element.extract()
        self.body = strip_non_space_whitespace(parsed.getText())
        page_text = self.body

        # Tally how many characters of the body fall under each label.
        counts = {}
        self.langs = counts
        for symbol in page_text:
            label = stats_lib.check_ranges_from_table(symbol, range_table)
            counts[label] = counts.get(label, 0) + 1

        # A label counts as "present" only when it covers more than 1/70 of
        # the text; dict keys are unique, so no duplicate check is needed.
        cutoff = len(page_text) / 70
        self.alphabets_on_site = [
            label for label, seen in counts.items() if seen > cutoff
        ]

        self.n_grams = stats_lib.count_n_grams(self.body, 5)
        self.symbol_freq = stats_lib.count_symbol_frequency(self.body)
        self.symbol_entropy = stats_lib.calculate_symbol_entropy(self.symbol_freq)
        self.raw_html = html