def __init__(self, glbl):
    """crawler constructor

    1. register signals to suspend/resume and stop the crawler
    2. init the downloader, parser, urlmanager and monitor
    3. use lists to manage downloader, parser, urlmanager, outputer and monitor workers
    4. init a Lock class to control multithreading or multiprocessing
    5. use Queues to pass urls and htmls between the threads/processes
    """
    self.glbl = glbl
    self.isSuspend = False
    self.isStop = False
    # register signal handlers
    signal.signal(signal.SIGINT, self.suspendResume)
    signal.signal(signal.SIGTSTP, self.stop)
    # init downloader, parser and urlmanager objects
    self.downloader = dl.Downloader()
    self.parser = ps.Parser(cfg.urlREs, cfg.exceptUrlREs)
    self.urlmanager = um.UrlManager()
    # init lists to manage downloader, parser, urlmanager, outputer and monitor workers
    self.downloaderList = []
    self.parserList = []
    self.outputerList = []
    self.urlmanagerList = []
    self.monitorList = []
    # pick thread or process primitives
    if cfg.isMultiProcess:
        self.Concurrency = multiprocessing.Process
        self.Lock = multiprocessing.Lock
        self.Queue = multiprocessing.Queue
    else:
        self.Concurrency = threading.Thread
        self.Lock = threading.Lock
        self.Queue = Queue.Queue
    # Queues shared by the workers
    self.inUrlQueue = self.Queue()
    self.outUrlQueue = self.Queue()
    self.htmlQueue = self.Queue()
    self.contentQueue = self.Queue()
    # seed the start urls
    for url in cfg.startUrls:
        self.inUrlQueue.put(url)
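# A minimal sketch of the two signal handlers registered above and of one
# downloader worker consuming the queues; none of these methods are shown in
# the original, so the bodies below are assumptions based on the constructor's
# docstring and attributes (including the Downloader.download method name).
def suspendResume(self, signum, frame):
    # SIGINT toggles the suspend flag; worker loops are assumed to poll it
    self.isSuspend = not self.isSuspend

def stop(self, signum, frame):
    # SIGTSTP requests a stop; worker loops are assumed to poll self.isStop
    self.isStop = True

def downloadWorker(self):
    # hypothetical Concurrency target: pull a url, download it, pass the html on
    while not self.isStop:
        if self.isSuspend:
            continue  # a real worker would sleep here instead of spinning
        url = self.inUrlQueue.get()
        html = self.downloader.download(url)
        self.htmlQueue.put(html)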
def __init__(self):
    self.urls = urlmanager.UrlManager()
    self.downloader = htmldownloader.HtmlDownloader()
    self.outputer = html_outputer.Html_Outputer()
    self.parser = html_parser.HtmlParser()

def __init__(self):
    self.root_url = r'https://sh.zu.anjuke.com/?pi=baidu-cpchz-sh-hexin1&kwid=63651556880&utm_term=%E7%A7%9F%E6%88%BF'
    self.htmlDownloader = htmldownloader.HtmlDownloader()
    self.urlmanager = urlmanager.UrlManager()
    self.anjukeParser = htmlparser.HtmlParser()
    self.log = open('log.txt', 'w', encoding='utf-8')

def __init__(self):
    self.urls = urlmanager.UrlManager()
    self.download = htmldownload.HtmlDownLoad()
    self.parser = htmlparser.HtmlParser()
    self.output = htmlout.HtmlOut()

def __init__(self):
    self.urlmanager = urlmanager.UrlManager()
    self.htmldownloader = htmldownload.HtmlDownload()
    self.htmlparser = htmlparser.HtmlParser()
    self.outputer = outputer.Outputer()

def __init__(self):
    self.manager = urlmanager.UrlManager()
    self.download = htmldownload.HtmlDownload()
    self.dataoutput = dataoutput.DataOutput()
    self.parser = htmlparser.HtmlParser()
def __init__(self): """初始化你每个模块的实例""" self.manager = urlmanager.UrlManager() self.parse = htmlparser.HtmlParse() self.downLoader = htmldownload.HtmlDownLoad() self.dataOut = dataoutput.DataOutput()
# landCrawler.close_cur_tab()
# landCrawler.get_main_page()
# start_date = input(u"Enter the start date (yyyy-mm-dd), then press Enter: ".encode("gbk"))
# terminal_date = input(u"Enter the end date (yyyy-mm-dd), then press Enter: ".encode("gbk"))
start_date = '2019-1-1'
terminal_date = '2019-1-10'
# try:
#     indent_days = int(input(u"Enter the initial time span in days, then press Enter; press Enter alone to keep the default: ".encode("gbk")))
# except ValueError:
#     pass
indent_days = 10  # assumption: the prompt above is commented out, so a default value is needed here
# ip = input(u"Enter the database ip, then press Enter; press Enter alone to use the local database: ".encode("gbk"))
# if (not isinstance(ip, str)) or ip == "":
#     ip = "localhost"

um = urlmanager.UrlManager()
# mysqlhelper.DAO(ip)
d = excelhelper.DAO()
tr = timeranger.TimeRanger(indent_days, terminal_date)

# redirect stdout and stderr to files
redirect = redirectstd.RedirectStd()
starttime = str(time.time())
redirect.outToFile("out" + starttime + ".txt")
redirect.errToFile("err" + starttime + ".txt")

landCrawler = LandCrawler(um, d, tr)
tl = tool.Tool()
try:
    landCrawler.get_main_page()
except Exception:
    # the original snippet is truncated after the try; re-raising here is an
    # assumption so the block parses without swallowing errors
    raise
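# A minimal sketch of what the RedirectStd helper used above likely does; the
# class itself is not shown in the original, so this implementation is an
# assumption: it simply points sys.stdout/sys.stderr at log files.
import sys

class RedirectStd(object):
    def outToFile(self, path):
        # send print output to a log file instead of the console
        self.out = open(path, "w")
        sys.stdout = self.out

    def errToFile(self, path):
        # send tracebacks and warnings to a separate file
        self.err = open(path, "w")
        sys.stderr = self.err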