def test_one(url):
    """Fetch and parse a single page, then print its styled objects.

    The page at ``url`` is downloaded through ``HTMLObject``, parsed with
    ``myParser``, and the parser's styled objects are passed to
    ``main_style_parse``.  Every key of the first returned mapping is
    printed as ``key ==> value``.

    Args:
        url: Address of the page to fetch; also passed to ``getDomain``
            to derive the ``domain`` argument of the parser.
    """
    myHTML = HTMLObject()
    myHTML.getHTML(url)

    parser = myParser(myurl=url, domain=getDomain(url))
    parser.parse(myHTML.page)

    # NOTE(review): the meaning of the trailing ``True`` flag is defined by
    # main_style_parse, which is not visible here -- confirm against it.
    # The second return value is not used by this function.
    styled_objs, _no_style_objs = main_style_parse(
        parser.styled_objects, parser.full_style, True)

    for key in styled_objs.keys():
        # Single-argument print() call: valid on Python 3 and produces the
        # same output as the old Python 2 print statement.
        print(key + ' ==> ' + str(styled_objs[key]))
def parsePage(self, url: str) -> None:
    """Download one page, queue its in-domain links and store its content.

    Steps:
      1. Fetch ``url`` and decode it as GBK or UTF-8 depending on the
         ``charset=`` declaration found in the raw bytes (GBK is the
         fallback when neither is declared).
      2. Collapse whitespace and turn double quotes nested inside a
         double-quoted run into single quotes before feeding the parser.
      3. Append every acceptable link that is not already in
         ``self.linkQueue`` or ``self.linkDone`` to ``self.linkQueue``.
      4. If the parser extracted no content, append ``url`` to
         ``../nocontent.log`` and stop; otherwise insert the url, title
         and content through ``sqlite.sqlite().insertContent``.

    Args:
        url: Address of the page to fetch.

    Returns:
        None in every case.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on network/HTTP errors;
        nothing is caught here.
    """
    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(url) as response:
        raw_page = response.read()

    # Sniff the declared charset directly on the bytes.  Searching the
    # bytes (instead of their repr) lets ``\s`` match real whitespace.
    if re.search(rb'charset\s*=\s*(gb2312|gbk)', raw_page, re.IGNORECASE):
        encoding = 'gbk'
    elif re.search(rb'charset\s*=\s*utf-8', raw_page, re.IGNORECASE):
        encoding = 'utf-8'
    else:
        encoding = 'gbk'
    webpage = raw_page.decode(encoding, 'ignore')

    webpage = re.sub(r'\s+', ' ', webpage)

    # Rewrite  "..."..."..."  as  "...'...'..."  -- presumably so that
    # double quotes nested in an attribute value do not confuse the parser.
    nested_quotes = re.compile(r'("[^=><"]*)"([^=><"]*)"([^=><"]*")')
    for head, middle, tail in nested_quotes.findall(webpage):
        webpage = webpage.replace(
            head + '"' + middle + '"' + tail,
            head + "'" + middle + "'" + tail)

    tp = myParser(self.divClass)
    tp.feed(webpage)

    for newlink in tp.getlinklist():
        # Skip links containing any of the configured no-follow fragments.
        if any(fragment in newlink for fragment in self.nofollowList):
            continue
        if 'mailto:' in newlink.lower():
            continue
        # Absolute link pointing at another domain.
        if newlink.startswith('http://') and self.domainName not in newlink:
            continue
        # Only absolute 'http://' links and root-relative '/...' links are
        # followed.  A bare 'www.xxx.com' would otherwise be glued onto our
        # own domain and yield a bogus URL.  startswith() also makes an
        # empty href a plain skip instead of an IndexError.
        # NOTE(review): this rule also drops 'https://' links -- unchanged
        # from the original behaviour.
        if not newlink.startswith('http://') and not newlink.startswith('/'):
            continue
        # Root-relative link: make it absolute on our own domain.
        if not newlink.startswith('http') and '://' not in newlink:
            newlink = 'http://' + self.domainName + newlink
        if newlink not in self.linkQueue and newlink not in self.linkDone:
            self.linkQueue.append(newlink)
            print(newlink)

    content = tp.getcontent()
    if len(content) == 0:
        # Record pages that yielded nothing; ``with`` closes the log file.
        with open('../nocontent.log', 'a+') as no_content_log:
            no_content_log.write(url + '\n')
        print("nothing!")
        return None

    title = tp.gettitle()
    sqlitehandle = sqlite.sqlite()
    sqlitehandle.insertContent(url, title, content)
    print(title)
    print(content)