Example #1
    def run(self):
        global queue_href, mutex_href_get, mutex_href_put
        mutex_href_get.acquire()

        while queue_href.qsize() > 0:
            # Take a link from the shared queue
            viewHref = str(queue_href.get())
            mutex_href_get.release()

            # Call get_page to fetch and parse the page
            result = get_page(viewHref)
            print('1111111111111111111111111111111111111122222222222222')
            print(result)
            print('111' * 10)
            try:
                mutex_href_put.acquire()
                print(type(result))
                if isinstance(result, list):
                    # Save the parsed records
                    print(len(result))
                    saveUrls(result)
                elif result == 1:  # connection error
                    logUrlConnectError(viewHref)
                elif result == 2:  # format error
                    logUrlFormError(viewHref)

                mutex_href_put.release()
            except Exception:
                traceback.print_exc()
                print('shittttttttttttttttttt')
                mutex_href_put.release()
                mutex_href_get.acquire()
                continue
            mutex_href_get.acquire()
        mutex_href_get.release()
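Note that queue.Queue is already thread-safe, so the same consumer loop can be written without the manual get/put locks. A minimal sketch under that assumption, relying on the module-level queue_href and the helpers get_page, saveUrls, logUrlConnectError and logUrlFormError from the example being in scope; if saveUrls itself is not thread-safe, it would still need the put lock (mutex_href_put in the original):

import queue
import threading
import traceback


class HrefWorker(threading.Thread):
    """Consumer thread: drains queue_href and stores or logs each result."""

    def run(self):
        while True:
            try:
                # Queue.get_nowait() is thread-safe; no extra lock needed here.
                viewHref = str(queue_href.get_nowait())
            except queue.Empty:
                break  # nothing left to crawl
            try:
                result = get_page(viewHref)
                if isinstance(result, list):
                    saveUrls(result)              # parsed records
                elif result == 1:
                    logUrlConnectError(viewHref)  # connection error
                elif result == 2:
                    logUrlFormError(viewHref)     # format error
            except Exception:
                traceback.print_exc()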
Example #2
    def run(self):
        global queue_date, mutex_date_get, mutex_date_put
        mutex_date_get.acquire()

        while queue_date.qsize() > 0:
            # Take a crawl date from the shared queue
            crawldate = str(queue_date.get())
            mutex_date_get.release()

            # Call get_page to fetch and parse the page for that date
            result = get_page(crawldate)
            print('1111111111111111111111111111111111111122222222222222')
            print(result)
            print('111' * 10)
            try:
                mutex_date_put.acquire()
                print(type(result))
                if isinstance(result, list):
                    # Save the parsed records
                    print(len(result))
                    saveUrls(result)
                elif result == 1:  # connection error
                    logUrlConnectError(crawldate)
                mutex_date_put.release()
            except Exception:
                traceback.print_exc()
                print('shittttttttttttttttttt')
                mutex_date_put.release()
                mutex_date_get.acquire()
                continue
            mutex_date_get.acquire()
        mutex_date_get.release()
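Example #2 drains a queue of date strings rather than links, but the code that fills queue_date is not shown. A minimal, self-contained sketch of one plausible setup, assuming ISO-formatted date strings and the same module-level names; fill_date_queue and the date range are illustrative, not from the original:

import queue
import threading
from datetime import date, timedelta

queue_date = queue.Queue()
mutex_date_get = threading.Lock()
mutex_date_put = threading.Lock()


def fill_date_queue(start, end):
    """Put one ISO-formatted date string per day into queue_date."""
    day = start
    while day <= end:
        queue_date.put(day.isoformat())
        day += timedelta(days=1)


fill_date_queue(date(2023, 1, 1), date(2023, 1, 31))
print(queue_date.qsize())  # 31 entries, one per day

Worker threads built around the run() method in Example #2 would then be started and joined against this queue.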
Example #3
def get_new(spiderUrl, localLatestDate):
    pageNum = 1
    nowPageDate = "1970-01-01"  # initial publish-date sentinel
    errCount = 0  # consecutive connection-error count
    while errCount <= 3:  # quit once the error count exceeds three
        time.sleep(2)
        viewHref = spiderUrl + str(pageNum)
        result = get_page(viewHref)
        if isinstance(result, list):
            print(len(result))
            nowPageDate = result[-1][-2]  # publish date of the last record on this page
            print(nowPageDate)
            saveUrls(result)

            errCount = 0
            if nowPageDate < localLatestDate:  # already back to the locally stored latest date; stop crawling
                break
            else:
                pageNum = pageNum + 1  # move on to the next page
        elif result == 1:  # connection error
            logUrlConnectError(viewHref)
            errCount = errCount + 1
        elif result == 2:  # format error
            logUrlFormError(viewHref)
            errCount = 0
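Example #3 depends on get_page's return contract: a list of parsed records on success, 1 on a connection error, and 2 on a page-format error. (The date comparison on nowPageDate works lexicographically because the dates are ISO-formatted strings.) A minimal sketch of a function honouring that contract, assuming requests is used for the HTTP fetch; parse_records is a hypothetical, site-specific parser and only stands in for the real parsing step:

import requests


def get_page(viewHref):
    """Fetch viewHref; return a list of records, 1 on a connection error,
    or 2 on an unexpected page format."""
    try:
        resp = requests.get(viewHref, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return 1  # connection error
    try:
        records = parse_records(resp.text)  # hypothetical site-specific parser
        if not records:
            return 2
        return records
    except (ValueError, IndexError, AttributeError):
        return 2  # format error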
Example #4
def getErrorUrlAgain():
    datelist = readErrorUrl()
    for crawldate in datelist:
        result = get_page(crawldate)
        try:
            if isinstance(result, list):
                # Save the parsed records
                saveUrls(result)
                deleteErrorUrl(crawldate)
            elif result == 1:
                logUrlConnectError(crawldate)
        except Exception:
            traceback.print_exc()
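getErrorUrlAgain assumes readErrorUrl and deleteErrorUrl keep the failed crawl dates somewhere persistent. A minimal file-backed sketch of those two helpers, assuming one date per line in a hypothetical error_urls.txt (the real project may well use a database instead):

import os

ERROR_FILE = 'error_urls.txt'  # assumed storage location


def readErrorUrl():
    """Return every previously failed crawl date, one per line."""
    if not os.path.exists(ERROR_FILE):
        return []
    with open(ERROR_FILE, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def deleteErrorUrl(crawldate):
    """Drop crawldate from the error list once it has been re-crawled."""
    remaining = [d for d in readErrorUrl() if d != crawldate]
    with open(ERROR_FILE, 'w', encoding='utf-8') as f:
        f.write('\n'.join(remaining) + ('\n' if remaining else ''))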