def __init__(self):
    """Create the spider's collaborators: URL manager, HTML loader, HTML parser.

    A construction failure is logged and then re-raised.  The original
    code swallowed the exception after logging, which left the instance
    half-initialized and deferred the crash to a confusing
    ``AttributeError`` at first use.
    """
    try:
        self.urlManager = urlManager.UrlManager()
        self.htmlLoader = htmlLoader.HtmlLoader()
        self.htmlParser = htmlParser.HtmlParser()
    except Exception as e:
        SpiderLogger.log(e)
        raise  # do not hide the failure from the caller
def __init__(self,root_url,category_Nums,categories_Name,path):
    """Build the search URLs, download every results page and export to Excel.

    NOTE(review): this constructor performs all of the network I/O and
    the Excel export itself -- presumably intentional (fire-and-forget
    crawl), but confirm callers expect the whole crawl to run at
    construction time.

    root_url        -- base search URL; "word=" and "&pn=" query parts
                       are appended to it, so it presumably ends with
                       "?" or "&" -- TODO confirm against callers.
    category_Nums   -- total number of result entries; one page is
                       queued per 10 entries.
    categories_Name -- search keyword inserted as the "word=" value.
    path            -- destination path handed to DataOutput.output_excel.
    """
    self.searchUrlsManger=urlM.UrlManager()
    # NOTE(review): crawlUrlsManger is created but never used in this
    # method -- presumably consumed elsewhere on the instance; verify.
    self.crawlUrlsManger=urlM.UrlManager()
    self.htmlParser = htmlP.HtmlParser()
    ## construct the search urls: one results page per 10 entries,
    ## paginated via the "pn" query parameter.
    for pagesNum in range(int(category_Nums/10)):
        self.searchUrlsManger.add_new_url(root_url+"word="+categories_Name+"&pn="+str(pagesNum))
    htmlDownloader = htmlD.HtmlDownloader()
    ## search all the questions: download each queued page, parse it,
    ## and append the extracted rows to the Excel file at `path`.
    dataOutput = DataOutput()
    for i in range(self.searchUrlsManger.get_urls_num()):
        tmp_searchUrl=self.searchUrlsManger.get_new_url()
        tmp_content=htmlDownloader.download(tmp_searchUrl)
        # `i` is forwarded to the parser -- presumably a page index
        # used for row offsets; confirm in HtmlParser.parse.
        tmp_datas=self.htmlParser.parse(tmp_content,i)
        dataOutput.output_excel(tmp_datas,path)
def __init__(self):
    """Wire up every component this spider depends on.

    Instantiation order is kept exactly as before: mail client first,
    then loader, URL manager and parser.
    """
    self.smtp = mysendmail.MyMail()            # outgoing-mail client
    self.htmlLoader = htmlLoader.HtmlLoader()  # fetches raw pages
    self.urlManager = urlManager.UrlManager()  # URL queue / bookkeeping
    self.htmlParser = htmlParser.HtmlParser()  # extracts data from HTML
def __init__(self):
    """Assemble the crawl pipeline: URL queue, downloader and parser."""
    # Collaborators are created in the same order as before.
    self.urls = urlManager.UrlManager()          # pending/seen URL bookkeeping
    self.downloader = htmlDown.HtmlDownloader()  # fetches page content
    self.parser = htmlParser.HtmlParser()        # extracts data / new links
def __init__(self):
    """Create the four crawler components: URL queue, downloader,
    parser and output writer.

    Idiom fix only: the original terminated every statement with a
    trailing semicolon (flagged by PEP 8 / linters); behavior is
    unchanged.
    """
    self.urls = urlManager.UrlManager()                # pending/seen URL bookkeeping
    self.downloader = htmlDownloader.HtmlDownloader()  # fetches page content
    self.parser = htmlParser.HtmlParser()              # extracts data / new links
    self.outputer = htmlOutputer.HtmlOutputer()        # collects and writes results