import sys
import time
import datetime
import threading
import urllib2
import robotparser
# PQ is assumed to be Queue.PriorityQueue, based on q.put((priority, item));
# JColors, JLogger, JStats and the helper functions are defined elsewhere
# in this module.
from Queue import Queue, PriorityQueue as PQ


def main():
    # parse command-line arguments
    global TOTSIZE, THREADCOUNT, TOTAL404, keyWords, CRAWEDSIZE
    try:
        keyWords = sys.argv[1:-1]
        TOTSIZE = int(sys.argv[-1])
    except Exception:
        print JColors.FAIL + "Input argument format: keyword1 keyword2 ... seedSize"
        sys.exit(0)
    seeds = getSeeds(keyWords)
    # put the seed URLs into a priority queue
    q = PQ()
    for s in seeds:
        item = dict()
        item["url"] = s
        lst = s.split("/")
        item["priority"] = 0
        item["domain"] = lst[0] + "//" + lst[2]
        # put() on an unbounded queue cannot fail, so no try/except is needed
        q.put((item["priority"], item))
    # queue of downloaded pages waiting to be parsed
    pq = Queue()
    stat = JStats(TOTSIZE)
    parseThread = threading.Thread(target=parsePage, args=(q, pq))
    parseThread.start()
    # parseThread.join()
    # start all download threads first, then join them; joining inside the
    # start loop would run the downloads one at a time
    downloadThreads = []
    for i in range(1, THREADCOUNT):
        t = threading.Thread(target=downloadPage, args=(q, pq))
        t.start()
        downloadThreads.append(t)
    for t in downloadThreads:
        t.join()
    JLogger.log(JColors.OKBLUE + "Parser: got enough pages, writing pages in queue to REPOFILE...")
    # drain any downloaded pages that were never parsed
    while not pq.empty():
        page = pq.get()
        if page["score"] == -1:
            calcScore(page)
        writeRepo(page, keyWords)
        CRAWEDSIZE += 1
    stat.report(TOTAL404, RELCNT, keyWords)
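
# A minimal sketch of getSeeds(), which main() calls to obtain the initial
# crawl URLs but which is not shown in this section. Reading one absolute URL
# per line from a local "SEEDFILE" (hypothetical name) is an assumption; the
# real version presumably derives the seeds from the keywords, e.g. via a
# search engine, which is why it takes kws as an argument.
def getSeeds(kws):
    seeds = []
    try:
        f = open("SEEDFILE")
        for ln in f:
            ln = ln.strip()
            # keep only absolute URLs so main() can split out the domain part
            if ln.startswith("http"):
                seeds.append(ln)
        f.close()
    except IOError:
        print "Failed to open seed file!"
    return seeds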
def downloadPage(q, pq):
    """
    @param q : priority queue of URL items; the priority of a new URL is
               pri = pri_of_current_page + 10 - #keywords_appearing_in_url
    @param pq: queue of downloaded page items

    Fetches a URL from the priority queue and checks that
      1. it has not been visited,
      2. the response is not a 404,
      3. the MIME type is text/html,
    then downloads the page when all conditions hold. This function runs in
    multiple threads, so synchronized queues are used throughout.
    """
    global CRAWEDSIZE, TOTSIZE, TOTAL404, keyWords
    while True:
        # sleep 3 seconds if the queue has no more URLs
        if q.qsize() < 1:
            print JColors.BOLD + "Download thread: No more URLs to download, going to sleep..."
            time.sleep(3)
            continue
        # stop once enough pages have been downloaded
        if CRAWEDSIZE + pq.qsize() >= TOTSIZE:
            return
        print JColors.OKBLUE + "Downloader: Start fetching from URL..."
        curUrl = dict()
        # fetch the next unvisited URL
        while q.qsize() > 0:
            curUrl = q.get()[1]
            if not isVisited(curUrl):
                break
        # start downloading
        if curUrl and not isVisited(curUrl):
            try:
                # 5-second timeout avoids getting stuck on unresponsive pages
                response = urllib2.urlopen(curUrl["url"], timeout=5)
                # only keep pages with code 200 and MIME type text/html
                if response.code == 200 and response.info().type == "text/html":
                    page_item = dict()
                    page_item["url"] = curUrl["url"]
                    page_item["time"] = str(datetime.datetime.now())
                    page_item["data"] = response.read()
                    response.close()
                    page_item["domain"] = curUrl["domain"]
                    page_item["priority"] = curUrl["priority"]
                    page_item["score"] = -1  # -1 means "score not calculated yet"
                    pq.put(page_item, False)
                    JLogger.log(JColors.OKGREEN + "Download " + curUrl["url"] + " succeeded!")
                    print "pq length: " + str(pq.qsize())
            except urllib2.HTTPError as e:
                # urlopen raises HTTPError for error statuses, so 404s must be
                # counted here rather than by checking response.code
                if e.code == 404:
                    TOTAL404 += 1
                    JLogger.log("Got a 404 response!")
                JLogger.log(JColors.WARNING + "Download " + curUrl["url"] + " failed!")
            except Exception:
                JLogger.log(JColors.WARNING + "Download " + curUrl["url"] + " failed!")
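
# A minimal sketch of the isVisited() predicate used by downloadPage(); its
# real implementation is not part of this section. downloadPage() calls it
# twice per URL, so it must be a pure membership test that does not mark the
# URL itself; the lock-guarded set and the separate markVisited() helper
# (hypothetical name, to be called once a page is actually stored) are
# assumptions.
_visited = set()
_visitedLock = threading.Lock()

def isVisited(urlItem):
    # pure check: safe to call repeatedly from multiple threads
    with _visitedLock:
        return urlItem["url"] in _visited

def markVisited(urlItem):
    # record a URL as claimed; assumed to be called after a successful store
    with _visitedLock:
        _visited.add(urlItem["url"])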
def parsePage(q, pq):
    """
    @param q : priority queue of URL items; the priority of a new URL is
               pri = pri_of_current_page + 10 - #keywords_appearing_in_url
    @param pq: queue of downloaded page items

    1. parse URLs out of a downloaded page
    2. save the page and its metadata into the repo file
    3. calculate the actual score of a page as
       sum(# of keyword appearances in the page)
    """
    global CRAWEDSIZE, TOTSIZE, keyWords
    while CRAWEDSIZE + pq.qsize() < TOTSIZE:
        # fetch page data from pq to parse; sleep 3 seconds if pq is empty
        if pq.qsize() < 1:
            print JColors.BOLD + "Parser thread: No more pages to parse, going to sleep..."
            time.sleep(3)
            continue
        if CRAWEDSIZE + pq.qsize() >= TOTSIZE or q.qsize() > 1.5 * TOTSIZE:
            return
        print JColors.OKBLUE + "Parser: fetching and parsing page..."
        curPage = pq.get()
        data = curPage["data"]
        # ===================================================
        # for test & debug: save the page being processed to a file
        try:
            temp = open("PROCESSINGFILE", "w")
            temp.write(data)
            temp.close()
        except IOError:
            print "Failed to open PROCESSINGFILE"
        # ===================================================
        lines = data.splitlines()
        score = 0
        # process the page line by line
        for line in lines:
            # add the line's keyword count to the page score
            for wd in keyWords:
                score += line.count(wd)
            n = line.find("href")
            if CRAWEDSIZE + pq.qsize() < TOTSIZE and n != -1:
                ll = line[n:-1].split('"')
                if len(ll) > 2:
                    url = ll[1]
                else:
                    continue
                urlItem = dict()
                if url.find("http") == -1:
                    # relative link: prepend the current page's domain
                    url = curPage["domain"] + url
                    urlItem["domain"] = curPage["domain"]
                else:
                    lst = url.split("/")
                    try:
                        urlItem["domain"] = lst[0] + "//" + lst[2]
                    except Exception:
                        # malformed URL: no domain to query robots.txt on, skip it
                        print lst
                        continue
                # honor robots.txt before queueing the URL
                rp = robotparser.RobotFileParser()
                rp.set_url(urlItem.get("domain") + "/robots.txt")
                try:
                    rp.read()
                    if not rp.can_fetch("*", url):
                        print JColors.WARNING + url + " forbidden by robots.txt, skipped!"
                        continue
                except Exception:
                    print "Loading robots.txt failed"
                urlItem["url"] = url
                # calculate URL priority from the keyword count in the URL itself:
                # more keywords means a smaller number, i.e. a higher priority
                url_priority = curPage["priority"] + 10
                for wd in keyWords:
                    url_priority -= url.count(wd)
                urlItem["priority"] = url_priority
                if CRAWEDSIZE + pq.qsize() < TOTSIZE:
                    q.put((url_priority, urlItem))
                    print q.qsize()
                    if q.qsize() > 1.5 * TOTSIZE:
                        return
                    JLogger.log(
                        JColors.OKGREEN + "Parser: new URL " + url
                        + " added to URL queue! Priority: " + str(url_priority)
                    )
        # write the page to file and increase the counter
        curPage["score"] = score
        if CRAWEDSIZE + pq.qsize() < TOTSIZE:
            JLogger.log(JColors.OKBLUE + "Parser: writing processed page...")
            writeRepo(curPage, keyWords)
            CRAWEDSIZE = CRAWEDSIZE + 1
            JLogger.log(JColors.OKBLUE + "Parser: Current crawl status "
                        + str(CRAWEDSIZE) + "/" + str(TOTSIZE))
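
# Minimal sketches of calcScore() and writeRepo(), which parsePage() and
# main() call but which are defined elsewhere. calcScore() follows the scoring
# rule stated in the docstring above (sum of keyword occurrences in the page);
# the "REPOFILE_" file-naming convention and the record layout in writeRepo()
# are assumptions.
def calcScore(page):
    # score = sum of keyword appearances in the page body; mutates the page
    # item in place, matching how main() calls it
    page["score"] = sum(page["data"].count(wd) for wd in keyWords)

def writeRepo(page, kws):
    # append the page plus its metadata to the shared repo file
    try:
        repo = open("REPOFILE_" + "&".join(kws), "a")
        repo.write(page["url"] + "\n")
        repo.write(page["time"] + "\n")
        repo.write("score: " + str(page["score"]) + "\n")
        repo.write(page["data"] + "\n")
        repo.close()
    except IOError:
        print "Failed to write page to repo file!"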
def report(self, TOTAL404, RELCNT, keyWords):
    keyString = "&".join(keyWords)
    # right-shifting by 20 bits converts bytes to megabytes
    fsize = str(self.get_size(keyString) >> 20) + "MB"
    elapsed_time = time.time() - self.start
    JLogger.log(JColors.BOLD + "Total crawled file size:")
    JLogger.log(JColors.BOLD + fsize)
    print "Total 404s encountered: " + str(TOTAL404)
    JLogger.log(JColors.BOLD + "Total crawling time:")
    JLogger.log(str(elapsed_time))
    JLogger.log(JColors.BOLD + "Avg time per page:")
    JLogger.log(str(elapsed_time / self.TOTSIZE))
    JLogger.log(JColors.BOLD + "Total related page count:")
    JLogger.log(str(RELCNT))
    JLogger.log(JColors.BOLD + "Precision:")
    JLogger.log(str(RELCNT / (self.TOTSIZE * 1.0)))
    try:
        repoList = open("REPOLIST_" + keyString, "a")
        repoList.write(
            "\n".join(
                [
                    "Total crawled file size:", fsize,
                    "Total 404s encountered:", str(TOTAL404),
                    "Total crawling time:", str(elapsed_time),
                    "Avg time per page:", str(elapsed_time / self.TOTSIZE),
                    "Total related page count:", str(RELCNT),
                    "Precision:", str(RELCNT / (self.TOTSIZE * 1.0)),
                ]
            )
        )
        repoList.close()
    except IOError:
        print "Failed to write statistics to repolist file!"
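
# A minimal sketch of the rest of the JStats class that report() above belongs
# to, covering only the attributes report() touches: self.TOTSIZE, self.start,
# and get_size(). Measuring the on-disk size of a per-keyword repo file named
# "REPOFILE_" + keyString is an assumption about the naming convention.
import os

class JStats(object):
    def __init__(self, TOTSIZE):
        # constructor signature assumed from stat = JStats(TOTSIZE) in main()
        self.TOTSIZE = TOTSIZE    # target crawl size, used for per-page averages
        self.start = time.time()  # wall-clock start time for elapsed-time stats

    def get_size(self, keyString):
        # total size in bytes of the repo file for this keyword set; report()
        # shifts the result right by 20 bits to display megabytes
        path = "REPOFILE_" + keyString
        if os.path.exists(path):
            return os.path.getsize(path)
        return 0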