# Example 1
def test_one(url):
    """Fetch *url*, run it through the HTML/style parsers, and print each
    styled object keyed by its selector.

    Relies on module-level helpers: HTMLObject, myParser, getDomain,
    main_style_parse.
    """
    my_html = HTMLObject()
    my_html.getHTML(url)
    parser = myParser(myurl=url, domain=getDomain(url))
    parser.parse(my_html.page)
    # main_style_parse splits the parsed objects into styled / unstyled sets.
    styled_objs, no_style_objs = main_style_parse(parser.styled_objects, parser.full_style, True)

    # Parenthesized print form works under both Python 2 and Python 3;
    # iterate items() instead of keys()+lookup.
    for key, value in styled_objs.items():
        print(key + ' ==> ' + str(value))
# Example 2
    def parsePage(self, url):
        """Download *url*, queue its in-domain links for crawling, and persist
        the page title/content through the sqlite helper.

        Reads: self.divClass, self.nofollowList, self.domainName,
        self.linkQueue, self.linkDone.
        Mutates: self.linkQueue (appends newly discovered links).
        Returns None early when the parser extracts no content.
        """
        # Fetch raw bytes; the charset is sniffed from the markup itself,
        # preferring GBK/GB2312 (target sites are evidently Chinese), then
        # UTF-8, and falling back to GBK when nothing is declared.
        webpageBytes = urllib.request.urlopen(url).read()
        # NOTE(review): non-raw regex strings ('\s') raise invalid-escape
        # warnings on modern Python — consider r'' literals.
        gbkCharset = re.compile('charset\s*=\s*(gb2312|gbk)',re.IGNORECASE)
        mGbk = re.search(gbkCharset, str(webpageBytes))
        if mGbk is not None:
            webpage = webpageBytes.decode('gbk','ignore')
        else:
            utf8Charset = re.compile('charset\s*=\s*utf-8',re.IGNORECASE)
            mUtf8 = re.search(utf8Charset, str(webpageBytes))
            if mUtf8 is not None:
                webpage = webpageBytes.decode('utf-8','ignore')
            else:
                # No recognizable charset declaration: assume GBK.
                webpage = webpageBytes.decode('gbk','ignore')
    #===============================================================================
    #    webpage = re.sub(r"\\'", r"'", webpage)
    #    webpage = re.sub(r'\\[rn]', r' ', webpage)
    #===============================================================================
        # Collapse every whitespace run to a single space.
        webpage = re.sub('\s+',' ', webpage)
        
        # Rewrite an inner "..." pair nested inside an outer double-quoted
        # span to '...' — presumably so the HTML parser does not mis-split
        # attribute values. NOTE(review): intent inferred from the pattern;
        # confirm against the pages being crawled.
        doubleQuotation = re.compile(r'("[^=><"]*)"([^=><"]*)"([^=><"]*")')
        oldList = doubleQuotation.findall(webpage)
        for oldItem in oldList:
            webpage = webpage.replace(oldItem[0]+'"'+oldItem[1]+'"'+oldItem[2], oldItem[0]+"'"+oldItem[1]+"'"+oldItem[2])
        
        tp = myParser(self.divClass)
        tp.feed(webpage)
        newlink = ''
        for newlink in tp.getlinklist():
            # Drop links containing any blacklisted substring.
            noFollow = 0 
            for nf in self.nofollowList:
                if newlink.find(nf) != -1:
                    noFollow = 1
            if noFollow:
                continue
            # Skip mailto: links (case-insensitive).
            if str.find(str.lower(newlink), 'mailto:') != -1:
                continue
            #other domain
            if newlink[:7] == 'http://' and newlink.find(self.domainName) == -1:
                continue
            # Skip bare relative links like 'www.xxx.com' — prefixing the
            # domain would yield a broken 'http://www.abc.com/www.xxx.com'.
            if newlink[:7] != 'http://' and newlink[0] != '/':
                continue
            # Site-relative path: make it absolute with our domain.
            if newlink[:4] != 'http' and newlink.find('://') == -1:
                newlink = 'http://'+self.domainName+newlink
            # Enqueue only links not already queued or crawled.
            if newlink not in self.linkQueue and newlink not in self.linkDone:
                self.linkQueue.append(newlink)
                print(newlink)

        # No extractable content: log the URL and bail out.
        # NOTE(review): fNothing is never closed — consider a 'with' block.
        if len(tp.getcontent()) == 0:
            fNothing = open('../nocontent.log','a+')
            fNothing.write(url+'\n')
            print("nothing!")
            return None
        # Persist the extracted title and content.
        sqlitehandle = sqlite.sqlite()
        sqlitehandle.insertContent(url, tp.gettitle(), tp.getcontent() )
#===============================================================================
#        f = open('../content.txt', 'a+')
#        f.write(url+'\n')
#        f.write(tp.gettitle()+'\n')
#        f.write(tp.getcontent()+'\n\n')
#===============================================================================
        print(tp.gettitle())
        print(tp.getcontent())