    def __init__(self, site_url, html):
        language_table = stats_lib.set_up_character_ranges_table()
        tags = gt.get_tags(html)

        self.url = site_url
        self.title = gt.get_title(tags)
        self.outgoing_links = gt.get_links(site_url, tags)
        self.outgoing_link_count = len(self.outgoing_links)
        self.scripts = gt.get_scripts(tags)
        self.number_of_scripts = len(self.scripts)
        self.langs = dict()
        self.hash = make_hash(site_url)
        self.alphabets_on_site = []

        # drop <script> and <style> elements so their contents don't
        # pollute the visible-text statistics
        for script in tags(['script', 'style']):
            script.extract()
        self.body = strip_non_space_whitespace(tags.getText())

        # tally, per writing system, how many characters of the visible
        # text fall into that system's Unicode ranges
        for ch in self.body:
            lang = stats_lib.check_ranges_from_table(ch, language_table)
            self.langs[lang] = self.langs.get(lang, 0) + 1

        # an alphabet counts as present on the site if it accounts for
        # more than 1/70th of the visible text
        for key in self.langs:
            if self.langs[key] > len(self.body) / 70 and key not in self.alphabets_on_site:
                self.alphabets_on_site.append(key)

        self.n_grams = stats_lib.count_n_grams(self.body, 5)
        self.symbol_freq = stats_lib.count_symbol_frequency(self.body)
        self.symbol_entropy = stats_lib.calculate_symbol_entropy(self.symbol_freq)
        self.raw_html = html
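# stats_lib.set_up_character_ranges_table and check_ranges_from_table are not
# defined in this file. A minimal sketch of the assumed mechanism -- mapping a
# character's code point onto named Unicode ranges -- follows; the ranges and
# names here are illustrative placeholders, not the module's actual table.

def set_up_character_ranges_table():
    # (inclusive start, inclusive end, writing system) triples
    return [
        (0x0041, 0x007A, 'latin'),
        (0x0400, 0x04FF, 'cyrillic'),
        (0x4E00, 0x9FFF, 'cjk'),
    ]

def check_ranges_from_table(ch, table):
    # linear scan; return the first range the code point falls into
    code_point = ord(ch)
    for start, end, name in table:
        if start <= code_point <= end:
            return name
    return 'other'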
    def increment_link_counter(self, table_name, url):
        # Bind the hash as a query parameter so quoting inside the URL can
        # never break the statement (this replaces the old manual
        # url.replace("'", '') workaround). Table names cannot be bound,
        # so table_name is still interpolated directly; the '?' placeholder
        # assumes a sqlite3-style DB-API cursor.
        request = ("UPDATE " + table_name +
                   " SET n_incoming_links = n_incoming_links + 1"
                   " WHERE hash=?;")
        self.cursor.execute(request, (make_hash.make_hash(url),))
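# increment_link_counter presupposes a DB-API cursor in self.cursor and an
# existing table with hash / n_incoming_links columns. A minimal, assumed
# setup sketch (the database filename and table schema here are hypothetical,
# chosen only to match the column names used above):

import sqlite3

conn = sqlite3.connect('crawler.db')  # hypothetical database file
cursor = conn.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS sites ("
               "  hash TEXT PRIMARY KEY,"
               "  n_incoming_links INTEGER DEFAULT 0)")
conn.commit()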
def validate_with(validate_func, url):
    return validate_func(url)


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(
        description="Get the raw HTML content of a site.")
    argparser.add_argument('-u', dest='url', default=None,
                           help="The URL to try.")
    argparser.add_argument('--output', dest='output', default='stdout',
                           help="The type of output: stdout or file.")
    argparser.add_argument('-f', dest='filename',
                           help="The filename to write to.")
    args = argparser.parse_args()

    # set output type: file or stdout
    if args.output == 'file':
        filename = args.filename if args.filename else make_hash(args.url)
        args.output = open(DATADIR + filename + '.pu', 'w')
    else:
        args.output = sys.stdout

    # visit the site, then write out
    if validate_with(is_tip_top_level, args.url):
        uni_version = get_html(args.url)
        # normalize to NFC, then escape anything outside ASCII as XML
        # character references; decode back to str so it can be written
        # to a text-mode stream
        uni_version = (unicodedata.normalize('NFC', uni_version)
                       .encode('ascii', 'xmlcharrefreplace')
                       .decode('ascii'))
        args.output.write(args.url + '\n')
        args.output.write(uni_version)
    else:
        print("Sorry, requires URLs of the form "
              "'http://www.<somesite>.<domain>' for now.")
        sys.exit(-2)

    # close the output file if we opened one
    if args.output is not sys.stdout:
        args.output.close()
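# is_tip_top_level is referenced above but defined elsewhere. Judging by the
# error message, it accepts only URLs shaped like
# 'http://www.<somesite>.<domain>'; a sketch of such a check follows. The
# regex is an assumption, not necessarily the project's actual rule.

import re

def is_tip_top_level(url):
    # require an http(s) scheme, a 'www.' host with exactly one more
    # domain label, and no path component
    return re.match(r'^https?://www\.[^./]+\.[^./]+/?$', url) is not None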