Example #1
    def _requests_getPagesource(page_source_q,
                                url,
                                method,
                                data,
                                use_proxy=False):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            proxies, ip, port = None, None, None
            try:
                headers = {"User-agent": user_agent()}
                if use_proxy:
                    proxies, ip, port = Spider._getproxy()

                if method == "POST":
                    res = requests.post(url,
                                        data=data,
                                        proxies=proxies,
                                        headers=headers)
                elif method == "GET":
                    res = requests.get(url,
                                       data=data,
                                       proxies=proxies,
                                       headers=headers)
                else:
                    continue  # skip unsupported HTTP methods

                if res.status_code == 200 and Spider._pagesourceLegal(
                        res.text):
                    page_source_q.put(res.text)
            except Exception as e:
                print(e)
                if ip: redis_client.delete(ip)  # drop the failing proxy from the pool
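The fetcher above loops until either the global exit flag is set or the result queue holds a page source, so it is normally run in a worker thread while the caller blocks on the queue. A minimal sketch of that pattern using only the standard library (assuming _requests_getPagesource is exposed as a static method of Spider, as Spider._getproxy suggests; the URL is just a placeholder):

import queue
import threading

page_source_q = queue.Queue()
worker = threading.Thread(
    target=Spider._requests_getPagesource,       # assumed staticmethod of Spider
    args=(page_source_q, "http://example.com", "GET", None),
    kwargs={"use_proxy": False},
    daemon=True)
worker.start()

try:
    page_source = page_source_q.get(timeout=60)  # first legal page source wins
except queue.Empty:
    page_source = None                           # no usable response within 60 s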
Example #2
def saveWebsiteDelaytime():
    """
    @summary: Save each website's crawl delay to the database.
    """
    try:
        for website_id in Cache.keys(cache.websiteDelay_dict):
            delaytime = Cache.getDict(cache.websiteDelay_dict, website_id)
            db.saveDelay_time(website_id, delaytime)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
Example #3
def show_delay_time():
    """
    @summary: Show the crawl delay for each website.
    """
    records = []
    keys = Cache.keys(cache.websiteDelay_dict) or []
    for website_id in keys:
        record = mysql.Mysql.queryWebsiteUrl(website_id)    # id,url,xpath,detail,delay_time
        records.append((record[0][0], record[0][3] or record[0][1], Cache.getDict(cache.websiteDelay_dict, website_id)))
    headers = ["id", "url", "delay-time(s)"]
    print(tabulate(records, headers=headers))
Example #4
 def putRecord(self, record):
     """
     @summary: Add the record to the queue of websites waiting to be crawled.
     """
     website_id, website_url, xpath = record[:3]
     if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
             not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
         Cache.appendList(cache.workingWebsite_list, website_id)
         Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
         sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
         for i in range(int(sleep_time)):
             if global_EXIT: return
             time.sleep(1)
         Cache.removeList(cache.workingWebsite_list, website_id)
Example #5
def resetDelay_time():
    """
    @summary: Reset each website's crawl delay to zero.
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            Cache.setDict(cache.websiteDelay_dict, website_id, 0)
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % (repr(e)))
    finally:
        if db: db.dispose()
Example #6
 def run(self):
     while not global_EXIT:
         url = ""
         try:
             url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
             if url:
                 website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                 res = filterContentInfoFunc(website_id, url)
                 if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                     Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                     Cache.appendList(cache.oldContent_list, url)
             for i in range(300):
                 if global_EXIT: break
                 time.sleep(1)
         except Exception as e:
             log.logMsg(LogType.error, "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))
Example #7
 def run(self):
     while not Cache.getDict(cache.globalArgs_dict, "LogThread_EXIT"):
         try:
             info = Cache.getQueue(cache.log_queue, False)
             if os.path.exists(self.getFilename()):
                 log_size = os.path.getsize(
                     self.getFilename()) / 1024 / 1024  # roll over to a new log file once the current one exceeds 1 MB
                 if log_size > 1:
                     self.index += 1
             with open(self.getFilename(), 'a') as f:
                 info += '<%s>\n' % (
                     datetime.datetime.now().strftime("%H:%M:%S"))
                 f.write(info)
         except Exception as e:
             if type(e) is not queue.Empty:
                 print("Log Error: %s" % e)
Example #8
    def run(self):
        while not global_EXIT:
            website_url = ""
            if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
                try:
                    website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                    if not website_id:
                        # nothing to process yet: wait up to 30 s, then retry the outer loop
                        for i in range(30):
                            if global_EXIT: break
                            time.sleep(1)
                        continue

                    website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                    if website_id and website_url and xpath:
                        Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)

                except Exception as e:
                    log.logMsg(LogType.error, "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))
Example #9
    def _chrome_getPagesource(page_source_q, url, timeout):
        driver, ip, port = None, None, None
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            try:
                if system == "Linux":
                    chrome_options = Options()
                    chrome_options.add_argument('--headless')
                    chrome_options.add_argument('--disable-gpu')
                else:
                    os.environ["webdriver.chrome.driver"] = chromedriver
                    chrome_options = webdriver.ChromeOptions()

                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()

                if ip and port:
                    chrome_options.add_argument("--proxy-server=http://%s:%s" %
                                                (ip, port))

                if system == "Linux":
                    driver = webdriver.Chrome(chrome_options=chrome_options)
                else:
                    driver = webdriver.Chrome(chromedriver,
                                              chrome_options=chrome_options)

                driver.get(url)
                time.sleep(timeout)
                js = "document.body.scrollTop=1000"
                driver.execute_script(js)
                time.sleep(3)
                page_source = driver.page_source
                driver.close()
                driver.quit()
                if page_source and Spider._pagesourceLegal(page_source):
                    page_source_q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
                if driver:
                    driver.close()
                    driver.quit()
Example #10
    def _urllib_getPagesource(q, url):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and q.empty():
            proxies, ip, port = None, None, None
            try:
                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()
                if proxies:
                    proxy_handler = urllib.request.ProxyHandler(proxies)
                    opener = urllib.request.build_opener(proxy_handler)
                    opener.addheaders = [('User-agent', user_agent())]
                    res = opener.open(url, timeout=5)
                    page_source = res.read().decode("utf8")
                else:
                    req = urllib.request.Request(
                        url, headers={"User-agent": user_agent()})
                    resp = urllib.request.urlopen(req)
                    page_source = resp.read().decode("utf8")

                if page_source and Spider._pagesourceLegal(page_source):
                    q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
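Spider._getproxy() is not shown in these examples; for urllib.request.ProxyHandler here, and for the proxies= argument in the requests-based fetcher in Example #1, it would need to return a scheme-keyed mapping alongside the raw ip and port. The expected shape, as an assumption rather than code from the source:

proxies = {
    "http": "http://%s:%s" % (ip, port),
    "https": "http://%s:%s" % (ip, port),
}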
Example #11
def incrDelay_time(website_id, timeout):
    """
    @summary: Increase the website's crawl delay by timeout seconds.
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record or 0) + timeout)  # treat a missing delay as 0
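All of the delay helpers (incrDelay_time, saveWebsiteDelaytime, show_delay_time, resetDelay_time) share the same small Cache contract: keys, getDict, and setDict over a named dict. A minimal in-memory stand-in for that contract, useful only as an illustration (the real project appears to back Cache with Redis, given redis_client in the fetchers):

class Cache:
    """Illustrative dict-of-dicts stand-in for the cache API used by the delay helpers."""
    _store = {}

    @classmethod
    def keys(cls, name):
        return list(cls._store.get(name, {}).keys())

    @classmethod
    def getDict(cls, name, key):
        return cls._store.get(name, {}).get(key)

    @classmethod
    def setDict(cls, name, key, value):
        cls._store.setdefault(name, {})[key] = value

With this stand-in, incrDelay_time("site-1", 5) reads a missing delay as 0 and stores 5; a second call stores 10, which saveWebsiteDelaytime would then persist.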