Ejemplo n.º 1
0
    def __init__(self, site_url, html):
        """Populate this page record from a URL and its raw HTML.

        The markup is parsed through ``gt``; title, outgoing links and
        scripts are collected first, then <script>/<style> nodes are removed
        so the remaining text can be analysed with ``stats_lib``.

        Args:
            site_url: address of the page; stored as ``self.url`` and hashed.
            html: raw markup of the page; stored unchanged as ``self.raw_html``.
        """
        language_table = stats_lib.set_up_character_ranges_table()

        tags = gt.get_tags(html)
        self.url = site_url
        self.title = gt.get_title(tags)
        self.outgoing_links = gt.get_links(site_url, tags)
        self.outgoing_link_count = len(self.outgoing_links)
        # Scripts are gathered before the extraction step below removes them.
        self.scripts = gt.get_scripts(tags)
        self.number_of_scripts = len(self.scripts)
        self.langs = dict()
        self.hash = make_hash(site_url)
        self.alphabets_on_site = []

        # Remove script and style nodes so only the remaining text is analysed.
        for node in tags(['script', 'style']):
            node.extract()
        self.body = strip_non_space_whitespace(tags.getText())

        # Tally how many characters fall into each range reported by the table.
        tally = self.langs
        for ch in self.body:
            lang = stats_lib.check_ranges_from_table(ch, language_table)
            tally[lang] = tally.get(lang, 0) + 1

        # Keep only the entries whose count exceeds 1/70th of the text length.
        # Dict keys are unique, so no duplicate check is needed when adding.
        threshold = len(self.body)/70
        self.alphabets_on_site.extend(
            key for key in tally if tally[key] > threshold
        )

        self.n_grams = stats_lib.count_n_grams(self.body, 5)
        self.symbol_freq = stats_lib.count_symbol_frequency(self.body)
        self.symbol_entropy = stats_lib.calculate_symbol_entropy(self.symbol_freq)
        self.raw_html = html
Ejemplo n.º 2
0
 def increment_link_counter(self, table_name, url):
     """Add one to ``n_incoming_links`` for the row matching ``url``'s hash.

     Args:
         table_name: name of the table to update; inserted into the SQL
             text as-is.
         url: address whose hash (via ``make_hash.make_hash``) selects the row.
     """
     # NOTE(review): the statement is assembled as text; the hash value is
     # embedded between single quotes rather than bound as a parameter.
     url_hash = make_hash.make_hash(url)
     request = "".join([
         "UPDATE ",
         table_name,
         " SET n_incoming_links = n_incoming_links + 1 WHERE hash='",
         url_hash,
         "';",
     ])
     self.cursor.execute(request)
Ejemplo n.º 3
0
    return validate_func(url)

if __name__ == "__main__":
    # Command-line entry point: fetch one URL and write its HTML out.
    argparser = argparse.ArgumentParser(
        description="get the raw html content of a site.")
    argparser.add_argument(
        '-u', dest='url', default='None', help="The URL to try.")
    argparser.add_argument(
        '--output', dest='output', default='stdout',
        help="the type of ouptut. stdout or file")
    argparser.add_argument(
        "-f", dest='filename', help='the filename to write to')
    args = argparser.parse_args()

    # Replace the --output keyword with the actual stream to write to.
    if args.output != 'file':
        args.output = sys.stdout
    else:
        # Fall back to a hash of the URL when no filename was supplied.
        filename = args.filename or make_hash(args.url)
        args.output = open(DATADIR + filename + '.pu', 'w')

    # Refuse URLs that do not pass the top-level check.
    if not validate_with(is_tip_top_level, args.url):
        print("Sorry, requires URLS of the form 'http://www.<somesite>.<domain>' for now.")
        exit(-2)

    # Visit the page, then write the URL line followed by the content.
    uni_version = get_html(args.url)
    # Normalise to NFC and encode as ASCII; characters that cannot be
    # encoded are emitted as XML character references.
    uni_version = unicodedata.normalize('NFC', uni_version).encode(
        "ascii", 'xmlcharrefreplace')
    args.output.write(args.url + '\n')
    args.output.write(uni_version)

    ## close everything out.