Example 1
def main():
    # get args
    global TOTSIZE, THREADCOUNT, TOTAL404, keyWords, CRAWEDSIZE
    try:
        keyWords = sys.argv[1:-1]
        TOTSIZE = int(sys.argv[-1])
    except Exception:
        print JColors.FAIL + "input arguments format: keyword1 keyword2 ... seedSize"
        sys.exit(0)

    seeds = getSeeds(keyWords)

    # put em in a queue
    q = PQ()
    for s in seeds:
        item = dict()
        item["url"] = s
        # item['status']='unvisited'
        lst = s.split("/")
        item["priority"] = 0
        item["domain"] = lst[0] + "//" + lst[2]
        try:
            q.put((item["priority"], item))
        except Exception:
            print "This is never gonna happen..."

    # downloaded pages queue
    pq = Queue()

    stat = JStats(TOTSIZE)
    parseThread = threading.Thread(target=parsePage, args=(q, pq))
    parseThread.start()
    # start all download threads first, then join them, so downloads run in parallel
    downloadThreads = []
    for i in range(1, THREADCOUNT):
        downloadThread = threading.Thread(target=downloadPage, args=(q, pq))
        downloadThread.start()
        downloadThreads.append(downloadThread)
    for t in downloadThreads:
        t.join()
    parseThread.join()

    JLogger.log(JColors.OKBLUE + "Main: crawled enough pages, writing remaining pages in queue to REPOFILE...")
    while not pq.empty():
        page = pq.get()
        if page["score"] == -1:
            calcScore(page)
        writeRepo(page, keyWords)
        CRAWEDSIZE += 1
    stat.report(TOTAL404, RELCNT, keyWords)
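main relies on Python's Queue.PriorityQueue (imported as PQ here) handing back the tuple with the smallest first element first: seeds enter at priority 0, and parsePage lowers the priority number of keyword-rich URLs so they are downloaded earlier. A minimal, self-contained sketch of that ordering, using made-up URLs:

from Queue import PriorityQueue  # Python 2; the module is named `queue` in Python 3

q = PriorityQueue()
q.put((10, {"url": "http://example.com/unrelated"}))
q.put((0, {"url": "http://example.com/seed"}))             # seeds start at priority 0
q.put((8, {"url": "http://example.com/python-tutorial"}))  # keyword hits lower the number

# The lowest priority number is dequeued first: the seed, then the keyword
# match, then the unrelated URL.
while not q.empty():
    priority, item = q.get()
    print priority, item["url"]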
Example 2
def downloadPage(q, pq):
    """
    @para q : Priority queue storing url objects, priority is calculated by : pri=pri_of_current_page+10-#keywords_appeared_in_url
    @para pq: queue storing downloaded page objects
    This function fetched url from priority queue, checks 1. if it's visited 2. check if url returns 404 3. check MIME type
    4.download page if previous conditions are satisfied.
    This function will be put into multiple threads, so a synchronized priority queue is used.
    """
    global CRAWEDSIZE, TOTSIZE, TOTAL404, keyWords
    while True:
        # check if q has more urls to get, sleep 3 seconds if q is empty
        if q.qsize() < 1:
            print JColors.BOLD + "Download thread: No more URL to download, go to sleep..."
            time.sleep(3)
            continue
        # stop if enough pages are downloaded
        if CRAWEDSIZE + pq.qsize() >= TOTSIZE:
            return
        print JColors.OKBLUE + "Downloader: Start fetching from URL..."
        curUrl = dict()
        # fetch next un-visited url
        while q.qsize() > 0:
            curUrl = q.get()[1]
            if not isVisited(curUrl):
                break
        # start downloading
        if curUrl and not isVisited(curUrl):
            try:
                # 5-second timeout to avoid getting stuck on unresponsive pages
                response = urllib2.urlopen(curUrl["url"], timeout=5)
                if (
                    response.code == 200 and response.info().type == "text/html"
                ):  # only keep pages with code 200 and MIME type text/html
                    page_item = dict()
                    page_item["url"] = curUrl["url"]
                    page_item["time"] = str(datetime.datetime.now())
                    page_item["data"] = response.read()
                    response.close()
                    page_item["domain"] = curUrl["domain"]
                    page_item["priority"] = curUrl["priority"]
                    page_item["score"] = -1
                    pq.put(page_item, False)
                    JLogger.log(JColors.OKGREEN + "Download " + curUrl["url"] + " succeeded!")
                    print "pq length:" + str(pq.qsize())
            except urllib2.HTTPError as e:
                # urlopen raises HTTPError for 4xx/5xx responses, so 404s must be counted here
                if e.code == 404:
                    TOTAL404 += 1
                    JLogger.log("Got a 404 response!")
                JLogger.log(JColors.WARNING + "Download " + curUrl["url"] + " failed!")
            except Exception:
                JLogger.log(JColors.WARNING + "Download " + curUrl["url"] + " failed!")
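downloadPage calls an isVisited helper that is not shown in these examples. A minimal sketch of one way it could look, assuming a module-level set of already-handled URLs guarded by a lock (hypothetical names; the real implementation may differ):

import threading

VISITED = set()                # assumed shared record of URLs already handled
VISITED_LOCK = threading.Lock()

def isVisited(urlItem):
    # Hypothetical sketch; kept as a pure membership check so the two calls in
    # downloadPage agree with each other. URLs are assumed to be added to
    # VISITED elsewhere, e.g. when a page is written to the repository.
    with VISITED_LOCK:
        return urlItem["url"] in VISITED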
Example 3
def parsePage(q, pq):
    """
    @para q : Priority queue storing url objects, priority is calculated by : pri=pri_of_current_page+10-#keywords_appeared_in_url
    @para pq: queue storing downloaded page objects
    1. parse URL from page
    2. save page and meta data into repofile
    3. calculate actual score of a page by : Sum(# of key word appearance in a page)
    """
    global CRAWEDSIZE, TOTSIZE, keyWords
    while CRAWEDSIZE + pq.qsize() < TOTSIZE:
        # fetch page data from pq to parse, go to sleep if pq is empty
        if pq.qsize() < 1:
            print JColors.BOLD + "Parser thread: No more page to parse, go to sleep..."
            time.sleep(3)
            continue
        if CRAWEDSIZE + pq.qsize() >= TOTSIZE or q.qsize() > 1.5 * TOTSIZE:
            return
        print JColors.OKBLUE + "Parser: fetching and parsing page..."
        curPage = pq.get()
        data = curPage["data"]

        # ===================================================
        # for test & debug: save current processing page to file
        try:
            temp = open("PROCESSINGFILE", "w")
            temp.write(data)
            temp.close()
        except IOError:
            print "Failed to open PROCESSINGFILE"
        # ===================================================

        lines = data.splitlines()
        score = 0
        # process line by line
        for line in lines:
            # add line score to page score
            for wd in keyWords:
                score += line.count(wd)
            n = line.find("href")
            if CRAWEDSIZE + pq.qsize() < TOTSIZE and n != -1:
                ll = line[n:-1].split('"')
                if len(ll) > 2:
                    url = ll[1]
                else:
                    continue
                urlItem = dict()
                if url.find("http") == -1:
                    url = curPage["domain"] + url
                    urlItem["domain"] = curPage["domain"]
                else:
                    lst = url.split("/")
                    try:
                        urlItem["domain"] = lst[0] + "//" + lst[2]
                    except Exception:
                        # malformed absolute URL; skip this link
                        print lst
                        continue
                # parse robots.txt
                rp = robotparser.RobotFileParser()
                rp.set_url(urlItem.get("domain") + "/robots.txt")
                try:
                    rp.read()
                    if not rp.can_fetch("*", url):
                        print JColors.WARNING + url + " Forbidden by robots.txt, skipped!"
                        continue
                except Exception:
                    print "Load robot failed"
                urlItem["url"] = url
                # calculate url priority : according to keyword count in url itself
                url_priority = curPage["priority"] + 10
                for wd in keyWords:
                    url_priority -= url.count(wd)
                urlItem["priority"] = url_priority
                if CRAWEDSIZE + pq.qsize() < TOTSIZE:
                    q.put((url_priority, urlItem))
                    print q.qsize()
                    if q.qsize() > 1.5 * TOTSIZE:
                        return
                    JLogger.log(
                        JColors.OKGREEN
                        + "Parser: new URL "
                        + url
                        + " added to URL queue! Priority:"
                        + str(url_priority)
                    )
        # write page to file and increment the counter
        curPage["score"] = score
        if CRAWEDSIZE + pq.qsize() < TOTSIZE:
            JLogger.log(JColors.OKBLUE + "Parser: writing processed page...")
            writeRepo(curPage, keyWords)
            CRAWEDSIZE = CRAWEDSIZE + 1
            JLogger.log(JColors.OKBLUE + "Parser: Current craw status " + str(CRAWEDSIZE) + "/" + str(TOTSIZE))
Example 4
 def report(self, TOTAL404, RELCNT, keyWords):
     keyString = "&".join(keyWords)
     fsize = str(self.get_size(keyString) >> 20) + "MB"
     elapsed_time = time.time() - self.start
     JLogger.log(JColors.BOLD + "Total crawled file size:")
     JLogger.log(JColors.BOLD + fsize)
     print "Total 404 encountered: " + str(TOTAL404)
     JLogger.log(JColors.BOLD + "Total crawling time:")
     JLogger.log(str(elapsed_time))
     JLogger.log(JColors.BOLD + "Avg time per page:")
     JLogger.log(str(elapsed_time / self.TOTSIZE))
     JLogger.log(JColors.BOLD + "Total related page count:")
     JLogger.log(str(RELCNT))
     JLogger.log(JColors.BOLD + "Precision :")
     JLogger.log(str(RELCNT / (self.TOTSIZE * 1.0)))
     try:
         repoList = open("REPOLIST_" + keyString, "a")
         repoList.write(
             "\n".join(
                 [
                     "Total crawled file size:",
                     fsize,
                     "Total 404 encountered:",
                     str(TOTAL404),
                     "Total crawling time:",
                     str(elapsed_time),
                     "Avg time per page:",
                     str(elapsed_time / self.TOTSIZE),
                     "Total related page count:",
                     str(RELCNT),
                     "Precision :",
                     str(RELCNT / (self.TOTSIZE * 1.0)),
                 ]
             )
         )
         repoList.close()
     except IOError:
         print "Failed to write statistics to repolist file!"