import threading
import time

import init
# Crawler, LinkParser and the constants used below (FLAG_OUTPUT2DISK, CRAWL_AMOUNT_LIMIT,
# LOG_OF_CRAWLED_URL, LOG_OF_CRAWLED_CONTENT, GOOGLE, THREAD_NUM) are assumed to be
# imported from the project's crawler, parser and config modules.


def crawl():
    # Worker entry point: each thread uses its own Crawler instance and crawls the
    # collected URLs, with the output-to-disk flag and the crawl amount limit.
    crawler = Crawler()
    crawler.crawlAllUrl(FLAG_OUTPUT2DISK, CRAWL_AMOUNT_LIMIT)
def main():
    initResult = init.initGlobal()
    if initResult != False:
        # Read the search keyword from the user and encode spaces for the query URL.
        print("Please enter your keyword")
        keyword = raw_input()
        keyword = keyword.replace(' ', '+')

        # Start crawling from the search engine results page.
        crawler = Crawler()
        startTime = time.time()
        crawler.loadRecord(LOG_OF_CRAWLED_URL)
        crawler.loadRecord(LOG_OF_CRAWLED_CONTENT)
        crawler.addSearchEngineUrl(keyword)
        htmlcode = crawler.crawlUrl(GOOGLE)

        # Extract the top result links from the search page and queue them for crawling.
        parser = LinkParser()
        parser.setFlag(GOOGLE)
        parser.feed(htmlcode)
        top10 = parser.hrefsList
        crawler.addUrlList(top10, GOOGLE)
        parser.close()

        # Run the crawl with THREAD_NUM worker threads.
        threadPool = []
        for _ in range(THREAD_NUM):
            threadPool.append(threading.Thread(target=crawl))
        for th in threadPool:
            th.start()
        for th in threadPool:
            th.join()

        crawler.flush()
        endTime = time.time()
        print("time used:")
        print(endTime - startTime)

        # Wait for a final keypress so the console output stays visible before exiting.
        raw_input()
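
# Assumed script entry point (not shown in the original snippet; the project may
# already provide it elsewhere): run main() when this file is executed directly.
if __name__ == '__main__':
    main()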