Example #1
0
 def __init__(self, glbl):
     """Crawler constructor.

     1. Registers signal handlers so the crawler can be suspended/resumed
        (SIGINT) and stopped (SIGTSTP).
     2. Builds the downloader, parser and url-manager components.
     3. Keeps one list per worker role for bookkeeping.
     4. Picks process- or thread-based primitives from configuration.
     5. Creates the queues that carry urls and htmls between workers.
     """
     self.glbl = glbl
     self.isSuspend = False
     self.isStop = False
     # Ctrl-C toggles suspend/resume; SIGTSTP requests a stop.
     signal.signal(signal.SIGINT, self.suspendResume)
     signal.signal(signal.SIGTSTP, self.stop)
     # Core pipeline components.
     self.downloader = dl.Downloader()
     self.parser = ps.Parser(cfg.urlREs, cfg.exceptUrlREs)
     self.urlmanager = um.UrlManager()
     # Bookkeeping lists for the workers spawned later on.
     self.downloaderList = []
     self.parserList = []
     self.outputerList = []
     self.urlmanagerList = []
     self.monitorList = []
     # Select the concurrency primitives once, up front.
     if cfg.isMultiProcess:
         self.Concurrency, self.Lock, self.Queue = (
             multiprocessing.Process,
             multiprocessing.Lock,
             multiprocessing.Queue,
         )
     else:
         self.Concurrency, self.Lock, self.Queue = (
             threading.Thread,
             threading.Lock,
             Queue.Queue,
         )
     # Queues linking the pipeline stages together.
     make_queue = self.Queue
     self.inUrlQueue = make_queue()
     self.outUrlQueue = make_queue()
     self.htmlQueue = make_queue()
     self.contentQueue = make_queue()
     # Seed the pipeline with the configured start urls.
     for seed in cfg.startUrls:
         self.inUrlQueue.put(seed)
Example #2
0
 def __init__(self):
     """Wire up the crawler's four collaborators, one instance each."""
     # Order is irrelevant here: the components are independent objects.
     self.urls = urlmanager.UrlManager()
     self.parser = html_parser.HtmlParser()
     self.downloader = htmldownloader.HtmlDownloader()
     self.outputer = html_outputer.Html_Outputer()
Example #3
0
 def __init__(self):
     """Set up the Anjuke crawler: entry url, components, and a log file."""
     # Entry page the crawl starts from.
     self.root_url = r'https://sh.zu.anjuke.com/?pi=baidu-cpchz-sh-hexin1&kwid=63651556880&utm_term=%E7%A7%9F%E6%88%BF'
     self.urlmanager = urlmanager.UrlManager()
     self.anjukeParser = htmlparser.HtmlParser()
     self.htmlDownloader = htmldownloader.HtmlDownloader()
     # NOTE(review): handle stays open for the object's lifetime —
     # confirm it is closed elsewhere.
     self.log = open('log.txt', 'w', encoding='utf-8')
Example #4
0
 def __init__(self):
     """Create the crawler's component objects (independent of each other)."""
     self.output = htmlout.HtmlOut()
     self.parser = htmlparser.HtmlParser()
     self.download = htmldownload.HtmlDownLoad()
     self.urls = urlmanager.UrlManager()
Example #5
0
 def __init__(self):
     """Instantiate the four pipeline components used by this crawler."""
     self.outputer = outputer.Outputer()
     self.htmlparser = htmlparser.HtmlParser()
     self.htmldownloader = htmldownload.HtmlDownload()
     self.urlmanager = urlmanager.UrlManager()
Example #6
0
 def __init__(self):
     """Build one instance of each crawler module."""
     self.parser = htmlparser.HtmlParser()
     self.dataoutput = dataoutput.DataOutput()
     self.download = htmldownload.HtmlDownload()
     self.manager = urlmanager.UrlManager()
Example #7
0
 def __init__(self):
     """Initialise an instance of each of the crawler's modules."""
     self.dataOut = dataoutput.DataOutput()
     self.downLoader = htmldownload.HtmlDownLoad()
     self.parse = htmlparser.HtmlParse()
     self.manager = urlmanager.UrlManager()
Example #8
0
    # Commented-out interactive setup: re-open the main page and prompt
    # (Chinese, gbk-encoded) for the start/end dates instead of hard-coding.
    # landCrawler.close_cur_tab()
    # landCrawler.get_main_page()
    # start_date = input(u"请输入开始日期yyyy-mm-dd,以回车结束:".encode("gbk"))
    # terminal_date = input(u"请输入结束日期yyyy-mm-dd,以回车结束:".encode("gbk"))
    # Hard-coded crawl window (yyyy-m-d).
    start_date = '2019-1-1'
    terminal_date = '2019-1-10'
    # Commented-out prompt (Chinese) for the initial time span in days.
    # try:
    #  indent_days = int(input(u"请输入初始时间跨度(天),以回车结束;直接敲回车将使用默认时间间隔:".encode("gbk")))
    # except ValueError:
    #  pass

    # Commented-out prompt (Chinese) for a database ip, defaulting to localhost.
    #ip = input(u"请输入数据库ip,以回车结束;直接敲回车将使用本地数据库:".encode("gbk"))
    # if (not isinstance(ip, str)) or ip == "":
    #  ip = "localhost"

    um = urlmanager.UrlManager()
    # mysqlhelper.DAO(ip)
    # Excel-backed data-access object (the MySQL variant above is disabled).
    d = excelhelper.DAO()
    # NOTE(review): indent_days is used here, but its only visible assignment
    # above is commented out — confirm it is defined earlier in the file.
    tr = timeranger.TimeRanger(indent_days, terminal_date)
    # Redirect stdout and stderr to timestamped files.
    redirect = redirectstd.RedirectStd()
    starttime = str(time.time())

    redirect.outToFile("out" + starttime + ".txt")
    redirect.errToFile("err" + starttime + ".txt")

    # NOTE(review): start_date is assigned above but not used in the visible
    # span — presumably consumed later in the file.
    landCrawler = LandCrawler(um, d, tr)
    tl = tool.Tool()

    try:
        landCrawler.get_main_page()