Ejemplo n.º 1
0
def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Collect the content URLs of a website page and queue the unseen ones.

    The page is loaded through ``Spider().chromedriver``, candidate links are
    extracted with ``filterHrefs``, and every link found in neither
    ``cache.oldContent_list`` nor ``cache.unrecognized_contentUrl_dict`` is
    pushed onto ``cache.freshContentUrl_queue`` as ``(website_id, href)``.

    :param website_id:   identifier stored alongside each queued URL
    :param website_url:  URL of the page to load
    :param xpath:        selector expression forwarded to ``filterHrefs``
    :return:             True when the page yielded at least one href (queued
                         or not); False when the selector is None, no hrefs
                         were found, or an exception was raised (it is logged).
    """
    try:
        page = Spider().chromedriver(website_url)
        selector = page.selector
        if selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False

        candidates = filterHrefs(website_url, xpath, selector)
        if not len(candidates):
            return False

        queued_any = False
        for href in candidates:
            # Skip links that were already processed ...
            if Cache.listItemExist(cache.oldContent_list, href):
                continue
            # ... or that were previously recorded as unrecognized.
            if Cache.listItemExist(cache.unrecognized_contentUrl_dict, href):
                continue
            Cache.putQueue(cache.freshContentUrl_queue, (website_id, href))
            queued_any = True

        if not queued_any:
            # Nothing new on this page: push the next crawl back by 15 minutes.
            incrDelay_time(website_id, 900)
        return True
    except Exception:
        # Best-effort worker: record the traceback and report failure.
        log.logMsg(LogType.error, "[FilterContentUrlThread] %s %s" % (website_url, traceback.format_exc()))
    return False
Ejemplo n.º 2
0
 def putRecord(self, record):
     """
     @summary: Queue a website record for crawling and hold it as "working"
               for the duration of its configured delay.

     The first three items of ``record`` are taken as
     ``(website_id, website_url, xpath)``. Nothing happens if the id is
     already in ``cache.workingWebsite_list`` or is a key of
     ``cache.unrecognized_websiteUrl_dict``. Otherwise the id is marked as
     working, the record is queued, and this call blocks one second at a
     time for the delay read from ``cache.websiteDelay_dict`` before
     removing the id from the working list again.
     """
     website_id, website_url, xpath = record[:3]

     if Cache.listItemExist(cache.workingWebsite_list, website_id):
         return
     if Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
         return

     Cache.appendList(cache.workingWebsite_list, website_id)
     Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))

     remaining = int(Cache.getDict(cache.websiteDelay_dict, website_id))
     # Sleep in one-second slices so the exit flag is noticed promptly.
     while remaining > 0:
         if global_EXIT:
             # Leaves the id in the working list, as the original code does.
             return
         time.sleep(1)
         remaining -= 1

     Cache.removeList(cache.workingWebsite_list, website_id)
Ejemplo n.º 3
0
def logMsg(logType, msg, website_id="", content_url=""):
    """
    @summary:               Prefix a log message according to its type and push
                            it onto ``cache.log_queue`` (described by the
                            original author as redis-backed). ``partialNone``
                            entries are additionally written through
                            ``Mysql.writeWebsiteMsg``.
    :param logType:         log type (a ``LogType`` member)
    :param msg:             log text; an error with an empty message is
                            labelled as "Other"
    :param website_id:      website id, forwarded to ``Mysql.writeWebsiteMsg``
    :param content_url:     content URL, forwarded to ``Mysql.writeWebsiteMsg``
    :return:                None
    """
    # Pick the label template first, then format once below.
    if logType == LogType.error and msg:
        template = "》Error《:%s"
    elif logType == LogType.htmlSelectorNone or logType == LogType.partialNone:
        template = "?Warning?:%s"
    elif logType == LogType.success:
        template = "【Success】:%s"
    else:
        template = "--Other--:%s"
    tagged = template % msg

    if logType == LogType.partialNone:
        Mysql.writeWebsiteMsg(website_id, content_url)
    Cache.putQueue(cache.log_queue, tagged)