Example #1
def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Filter out a website's content URLs
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False

        hrefs = filterHrefs(website_url, xpath, html_selector)
        if len(hrefs) == 0:
            return False

        flag = False
        for href in hrefs:
            if not Cache.listItemExist(cache.oldContent_list, href) and \
                    not Cache.listItemExist(cache.unrecognized_contentUrl_dict, href):
                Cache.putQueue(cache.freshContentUrl_queue, (website_id, href))
                flag = True
        if not flag:
            # if no new data was found, push the next crawl back by 15 minutes (900 s)
            incrDelay_time(website_id, 900)
        return True
    except Exception as e:
        log.logMsg(LogType.error, "[FilterContentUrlThread] %s %s" % (website_url, traceback.format_exc()))
    return False
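The Cache module these examples lean on is not shown. A minimal Redis-backed sketch of the subset used here (hypothetical; the real helper, its key layout and serialisation are assumptions) could look like:

import json

import redis

r = redis.Redis()  # assumed connection; the real module presumably configures this

class Cache:
    @staticmethod
    def listItemExist(list_key, item):
        # membership test on a Redis list; O(n), a set would be cheaper
        return item.encode() in r.lrange(list_key, 0, -1)

    @staticmethod
    def putQueue(queue_key, item):
        # queues are modelled as Redis lists holding JSON-encoded tuples
        r.rpush(queue_key, json.dumps(item))

    @staticmethod
    def appendList(list_key, item):
        r.rpush(list_key, item)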
Example #2
def saveThumbnail(url, imgname):
    try:
        pil_img = _getPilImg(url)
        width, height = pil_img.size
        box = ()
        if width >= height:
            scale = height / width
            if scale == thumbnail_standard:
                box = (0, 0, width, height)
            else:
                leng = height / 3 if scale < thumbnail_standard else width / 4
        else:
            # a 4:3 landscape crop from a portrait image is always limited by
            # its width (width / 4 < height / 3 whenever width < height)
            leng = width / 4

        if not box:
            x_padding = (width - leng * 4) / 2
            y_padding = (height - leng * 3) / 2
            box = (x_padding, y_padding, leng * 4 + x_padding,
                   leng * 3 + y_padding)

        region = pil_img.crop(box)
        region.thumbnail(thumbnail_size, Image.LANCZOS)  # LANCZOS is the successor of the removed ANTIALIAS constant
        region.save(os.path.join(Thumbnail_DIR, imgname))
        return True
    except Exception as e:
        log.logMsg(log.LogType.error, "[FormThumbnail] %s" % repr(e))
    return False
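Assuming thumbnail_standard is the 4:3 ratio 0.75 (an assumption; the constant is defined elsewhere), the centring arithmetic above picks the largest 4:3 box that fits. A worked example for a 1000×600 source:

# worked example of the box computation, assuming thumbnail_standard = 3 / 4
width, height = 1000, 600
scale = height / width                  # 0.6 < 0.75 -> crop is height-limited
leng = height / 3                       # 200.0
x_padding = (width - leng * 4) / 2      # (1000 - 800) / 2 = 100.0
y_padding = (height - leng * 3) / 2     # 0.0
box = (x_padding, y_padding, leng * 4 + x_padding, leng * 3 + y_padding)
print(box)                              # (100.0, 0.0, 900.0, 600.0): a centred 800x600 region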
Example #3
def saveWebsiteDelaytime():
    """
    @summary: Save each website's crawl delay to the database
    """
    try:
        for website_id in Cache.keys(cache.websiteDelay_dict):
            delaytime = Cache.getDict(cache.websiteDelay_dict, website_id)
            db.saveDelay_time(website_id, delaytime)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
Example #4
    def run(self):
        while not global_EXIT:
            url = ""
            try:
                website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
                res = filterContentInfoFunc(website_id, url)
                if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                    Cache.appendList(cache.oldContent_list, url)
                else:
                    Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
            except Exception as e:
                if not isinstance(e, queue.Empty):
                    log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
Example #5
    def run(self):
        while not global_EXIT:
            website_url = ""
            try:
                website_id, website_url, xpath = Cache.getQueue(cache.websiteUrl_queue, False)
                if not filterContentUrlFunc(website_id, website_url, xpath):
                    Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath))
            except Exception as e:
                if not isinstance(e, queue.Empty):
                    log.logMsg(LogType.error, "[FilterContentUrlThread.freshHandler] %s %s" % (website_url, traceback.format_exc()))
                else:
                    for i in range(10):
                        if global_EXIT: break
                        time.sleep(1)
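Several of these run loops sleep in one-second slices so they can notice global_EXIT quickly. A small helper (not part of the original module) would factor that pattern out:

import time

def sleepUnlessExit(seconds):
    """Sleep up to `seconds`, returning early once global_EXIT is set."""
    for _ in range(int(seconds)):
        if global_EXIT:          # the module-level shutdown flag used above
            break
        time.sleep(1)

The else branch above would then shrink to a single sleepUnlessExit(10) call.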
Example #6
def resetDelay_time():
    """
    @summary: Reset each website's crawl delay
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            record = Cache.getDict(cache.websiteDelay_dict, website_id)
            Cache.setDict(cache.websiteDelay_dict, website_id, (record[0], 0))
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % (repr(e)))
    finally:
        if db: db.dispose()
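incrDelay_time (called in Example #1) is not shown. A sketch consistent with the (record, delay) tuples that resetDelay_time stores, hypothetical in every detail:

def incrDelay_time(website_id, seconds):
    # websiteDelay_dict appears to map website_id -> (record, delay_seconds),
    # given the (record[0], 0) reset above; bump the delay by `seconds`
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    if record:
        Cache.setDict(cache.websiteDelay_dict, website_id,
                      (record[0], record[1] + seconds))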
Example #7
def main():
    thread_count = 3
    pre_threads = []

    initdb()                                            # initialize the Redis database
    initGlobalArgs()
    initContentUrl_dict()                               # initialize the de-duplication table

    log_thread = log.LogThread()                        # start the logging thread
    log_thread.start()

    QueryWebsiteUrl_thread = QueryWebsiteUrlThread()    # start the thread that reads website addresses
    QueryWebsiteUrl_thread.start()
    pre_threads.append(QueryWebsiteUrl_thread)

    filterContentUrl_thread = FilterContentUrlThread()  # start the thread that crawls content URLs
    filterContentUrl_thread.start()
    pre_threads.append(filterContentUrl_thread)

    for i in range(thread_count):
        thread = FilterContentInfoThread()
        thread.start()
        pre_threads.append(thread)

    unrecognizedWebsiteUrl_thread = UnrecognizedWebsiteUrl_Thread()
    unrecognizedWebsiteUrl_thread.start()
    pre_threads.append(unrecognizedWebsiteUrl_thread)

    unrecognizedContentUrl_thread = UnrecognizedContentUrl_Thread()
    unrecognizedContentUrl_thread.start()
    pre_threads.append(unrecognizedContentUrl_thread)


    while not global_EXIT: time.sleep(1)    # idle the main thread without spinning

    time.sleep(5)

    saveWebsiteDelaytime()              # save each website's delay time

    for t in pre_threads:
        t.join()

    log.logMsg(LogType.success, "--------------------bye---------------------\n")
    while not Cache.qempty(cache.log_queue): time.sleep(0.1)  # wait until every log entry is written to file
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", True)
    log_thread.join()

    if db: db.dispose()
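Nothing in main() itself sets global_EXIT, so shutdown is presumably triggered elsewhere. One hedged possibility is a signal hook like the following (an assumption, not shown in the original):

import signal

def _requestExit(signum, frame):
    # flip the module-level flag; every worker loop polls it and winds down
    global global_EXIT
    global_EXIT = True

signal.signal(signal.SIGINT, _requestExit)   # Ctrl-C
signal.signal(signal.SIGTERM, _requestExit)  # kill <pid>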
Example #8
    def run(self):
        while not global_EXIT:
            url = ""
            try:
                url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
                if url:
                    website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                    res = filterContentInfoFunc(website_id, url)
                    if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                        Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                        Cache.appendList(cache.oldContent_list, url)
                for i in range(300):
                    if global_EXIT: break
                    time.sleep(1)
            except Exception as e:
                log.logMsg(LogType.error, "[FilterContentInfoThread.unrecognizedHandler] %s %s" % (url, traceback.format_exc()))
Example #9
    def run(self):
        while not global_EXIT:
            try:
                if Cache.qempty(cache.websiteUrl_queue):
                    records = mysql.Mysql.queryWebsiteUrl()
                    for record in records:  # record: id,url,xpath,detail,delay_time
                        record = [str(item) for item in record]
                        self.initWebsite_delay_dict(record)
                        t = threading.Thread(target=self.putRecord, args=(record,))
                        t.daemon = True   # setDaemon() is deprecated; same effect
                        t.start()

            except Exception as e:
                log.logMsg(LogType.error, "[QueryWebsiteUrlThread] %s" % (traceback.format_exc()))
            for i in range(60):
                if global_EXIT: break
                time.sleep(1)
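putRecord is not shown. Since each record carries a per-site delay_time and gets its own daemon thread, a plausible sketch (an assumption, not the original code) sleeps out the delay before queueing, so one slow site never stalls the query loop:

def putRecord(self, record):
    # record: id, url, xpath, detail, delay_time (all stringified above)
    website_id, url, xpath, _detail, delay_time = record
    for _ in range(int(float(delay_time))):  # honour the per-site crawl delay
        if global_EXIT:
            return
        time.sleep(1)
    # consumers (Example #5) unpack (website_id, website_url, xpath)
    Cache.putQueue(cache.websiteUrl_queue, (website_id, url, xpath))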
Example #10
    def run(self):
        while not global_EXIT:
            website_url = ""
            if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
                try:
                    website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                    if not website_id:
                        # no key available yet: wait up to 30 s, then retry
                        for i in range(30):
                            if global_EXIT: break
                            time.sleep(1)
                        continue

                    website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                    if website_id and website_url and xpath:
                        Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)

                except Exception as e:
                    log.logMsg(LogType.error, "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))
            else:
                time.sleep(1)
Example #11
def filterContentInfoFunc(website_id, content_url):
    """
    @summary: 筛选内容中的信息
    """
    try:
        xpaths = mysql.Mysql.queryContentXpath(website_id)  # 返回字段循序: title_xpath,author_xpath,time_xpath,content_xpath,id
        contents = ['', '', '', '']                     # 记录 title, author, time, content内容的列表

        spiderRes = Spider().chromedriver(content_url) if global_Chrome else Spider().urllib(content_url)
        html_selector = spiderRes.selector
        if not html_selector:
            log.logMsg(LogType.htmlSelectorNone, "[filterContentInfoFunc] html_selector for %s is None" % content_url)
            return SpiderResType.htmlSelctorNone


        for xpath in xpaths:    # keep the most complete extraction
            items = filterPureTag(html_selector, xpath)
            if items[0] and items[3]:
                # prefer a longer content body; on ties, prefer richer metadata
                better_content = len(items[3]) > len(contents[3])
                same_content = len(items[3]) == len(contents[3])
                better_meta = (len(items[0]) > len(contents[0])
                               or len(items[1]) > len(contents[1])
                               or len(items[2]) > len(contents[2]))
                if better_content or (same_content and better_meta):
                    contents = list(items)
                    contents.append(xpath[-1])

        if contents[0] and contents[3]:
            if db.cursorIsClose():
                log.logMsg(LogType.error, "[filterContentInfoFunc] cursor already closed")
                return SpiderResType.cursorClose

            contents[3] = imgSrcHandler(content_url, contents[3])  # fix up where image src attributes point
            contents[3] = hrefHandler(content_url, contents[3])    # fix up where hyperlinks point

            imgs = randomImg(contents[3])                        # pick the first three images
            contents = [*contents[:4], *imgs, brief(contents[3]), content_url, contents[4]]
            contents = spaceHandler(contents, [0, 1, 2, 7])

            if db.cursorIsClose():
                log.logMsg(LogType.error, "[filterContentInfoFunc] cursor already closed")
                return SpiderResType.cursorClose

            if contents[4]:
                imgname = str(uuid.uuid1()) + ".jpg"
                if thumbnail.saveThumbnail(contents[4], imgname):
                    contents[4] = imgname
                else:
                    contents[4] = ""

            db.callproc('dataInsert_proc', contents)
            log.logMsg(LogType.success, "[filterContentInfoFunc] %s" % content_url)
        else:
            log.logMsg(LogType.partialNone, "[filterContentInfoFunc] %s's title or content is None." % content_url, website_id, content_url)
            return SpiderResType.unrecognized
    except Exception as e:
        if not isinstance(e, queue.Empty):
            log.logMsg(LogType.error, "[filterContentInfoFunc] %s %s" % (content_url, traceback.format_exc()))
            return SpiderResType.otherError
    return SpiderResType.success