def __init__(self, site_url, html):
    """Parse *html* and record link, script and alphabet statistics.

    Args:
        site_url: URL of the page; passed to ``gt.get_links`` together
            with the parsed tags.
        html: Markup handed to ``gt.get_tags`` for parsing.

    Attributes set:
        title, outgoing_links, outgoing_link_count, scripts,
        number_of_scripts: Values obtained from the ``gt`` helpers (and
            the lengths of the link / script collections).
        langs: Maps each category returned by
            ``stats_lib.check_ranges_from_table`` to the number of
            characters of the page text that fell into it.
        alphabets_on_site: Categories whose character count exceeds
            ``len(text) / 100``.
        incoming_link_count: Initialised to 0.
    """
    char_ranges = stats_lib.set_up_character_ranges_table()
    tags = gt.get_tags(html)

    self.title = gt.get_title(tags)
    self.outgoing_links = gt.get_links(site_url, tags)
    self.outgoing_link_count = len(self.outgoing_links)
    self.scripts = gt.get_scripts(tags)
    self.number_of_scripts = len(self.scripts)
    self.langs = dict()
    self.alphabets_on_site = []
    self.incoming_link_count = 0

    text = tags.getText()

    # Tally characters per category reported by the range table.
    for ch in text:
        category = stats_lib.check_ranges_from_table(ch, char_ranges)
        self.langs[category] = self.langs.get(category, 0) + 1

    # Keep only categories that make up more than 1/100 of the text.
    cutoff = len(text) / 100
    for category, count in self.langs.items():
        if count > cutoff:
            self.alphabets_on_site.append(category)
def all_alphabets(robot):
    """Count, per alphabet, how many stored sites use it noticeably.

    Reads the ``url`` and ``body_text`` columns of the ``fish_sites``
    table through *robot*, classifies every character of each row's text
    with ``stats_lib.check_ranges_from_table`` and treats a category as
    present on a site when it accounts for more than 1/50 of that text.

    Args:
        robot: Object providing ``get_data_from_table(table, columns)``,
            which returns an iterable of rows, and ``close_connection()``.

    Returns:
        dict mapping each category to the number of rows in which it
        exceeded the 1/50 share.

    Note:
        ``robot.close_connection()`` is always called, including when
        reading or classifying a row raises. Rows whose text is empty or
        ``None`` (e.g. a NULL column) contribute nothing and are skipped
        instead of aborting the whole scan.
    """
    import stats_lib

    alphabets = dict()
    try:
        language_table = stats_lib.set_up_character_ranges_table()
        cursor = robot.get_data_from_table('fish_sites', 'url,body_text')
        for row in cursor:
            # Second requested column: the text to classify.
            text = row[1]
            if not text:
                continue

            # Per-site tally of characters in each category.
            langs = dict()
            for ch in text:
                lang = stats_lib.check_ranges_from_table(ch, language_table)
                langs[lang] = langs.get(lang, 0) + 1

            # A category counts for this site only above a 2% share.
            threshold = len(text) / 50.0
            for key, count in langs.items():
                if count > threshold:
                    alphabets[key] = alphabets.get(key, 0) + 1
    finally:
        # Release the connection even if a row could not be processed.
        robot.close_connection()
    return alphabets
def __init__(self, site_url, html):
    """Parse *html* and compute text statistics for the page.

    Args:
        site_url: URL of the page; stored as ``url``, passed to
            ``make_hash`` and to ``gt.get_links``.
        html: Markup handed to ``gt.get_tags``; also kept unchanged in
            ``raw_html``.

    Attributes set:
        url, hash, title, outgoing_links, outgoing_link_count, scripts,
        number_of_scripts: Identification plus values obtained from the
            ``gt`` helpers.
        body: Text of the parsed document after ``script`` and ``style``
            elements are extracted, passed through
            ``strip_non_space_whitespace``.
        langs: Maps each category returned by
            ``stats_lib.check_ranges_from_table`` to the number of
            characters of ``body`` that fell into it.
        alphabets_on_site: Categories whose character count exceeds
            ``len(body) / 70``.
        n_grams, symbol_freq, symbol_entropy: Results of the matching
            ``stats_lib`` functions applied to ``body`` (n-grams are
            requested with the argument 5).
        raw_html: The original *html* argument.
    """
    char_ranges = stats_lib.set_up_character_ranges_table()
    tags = gt.get_tags(html)

    self.url = site_url
    self.title = gt.get_title(tags)
    self.outgoing_links = gt.get_links(site_url, tags)
    self.outgoing_link_count = len(self.outgoing_links)
    self.scripts = gt.get_scripts(tags)
    self.number_of_scripts = len(self.scripts)
    self.langs = dict()
    self.hash = make_hash(site_url)
    self.alphabets_on_site = []

    # Scripts were collected above; now remove script/style elements so
    # they do not end up in the extracted text.
    for node in tags(['script', 'style']):
        node.extract()
    self.body = strip_non_space_whitespace(tags.getText())
    text = self.body

    # Tally characters per category reported by the range table.
    for ch in text:
        category = stats_lib.check_ranges_from_table(ch, char_ranges)
        self.langs[category] = self.langs.get(category, 0) + 1

    # Keep only categories that make up more than 1/70 of the text.
    cutoff = len(text) / 70
    for category, count in self.langs.items():
        if count > cutoff:
            self.alphabets_on_site.append(category)

    self.n_grams = stats_lib.count_n_grams(self.body, 5)
    self.symbol_freq = stats_lib.count_symbol_frequency(self.body)
    self.symbol_entropy = stats_lib.calculate_symbol_entropy(self.symbol_freq)
    self.raw_html = html