Exemple #1
0
def redis_fetch_dailylist(date):
    """Download the daily news list for ``date`` and cache it in Redis.

    The request URL is the Zhihu Daily "news/before" API prefix with
    ``date`` appended verbatim. Whatever ``fetch_url_content`` returns is
    stored under the key ``date`` in database 0 of the Redis server at
    localhost:6379.

    Args:
        date: String appended to the API prefix and used as the cache key
            (presumably a date such as ``YYYYMMDD`` -- confirm with callers).

    Returns:
        True if the fetched content was truthy and has been written to the
        cache, False otherwise (nothing is written in that case).
    """
    store = redis.Redis(host='localhost', port=6379, db=0)
    target = 'http://news.at.zhihu.com/api/4/news/before/' + date
    body = fetch_url_content(url=target, port=80, timeout=15)

    # Nothing usable came back: leave the cache untouched.
    if not body:
        return False

    store.set(date, body)
    return True
Exemple #2
0
def redis_fetch_article(article_id):
    """Download one article and cache it in Redis under its id.

    ``article_id`` is converted to a string first; that string is both the
    argument passed to ``gen_article_url`` and the cache key. The fetched
    content is stored in database 1 of the Redis server at localhost:6379.

    Args:
        article_id: Identifier of the article; any value accepted by
            ``str()``.

    Returns:
        True if the fetched content was truthy and has been written to the
        cache, False otherwise (nothing is written in that case).
    """
    key = str(article_id)
    store = redis.Redis(host='localhost', port=6379, db=1)
    body = fetch_url_content(url=gen_article_url(key), port=80, timeout=15)

    # Nothing usable came back: leave the cache untouched.
    if not body:
        return False

    store.set(key, body)
    return True
Exemple #3
0
def redis_fetch_dailylist(date):
    """Download the daily news list for ``date`` and cache it in Redis.

    The request URL is the Zhihu Daily "news/before" API prefix with
    ``date`` appended verbatim. Whatever ``fetch_url_content`` returns is
    stored under the key ``date`` in database 0 of the Redis server at
    localhost:6379.

    Args:
        date: String appended to the API prefix and used as the cache key
            (presumably a date such as ``YYYYMMDD`` -- confirm with callers).

    Returns:
        True if the fetched content was truthy and has been written to the
        cache, False otherwise (nothing is written in that case).
    """
    store = redis.Redis(host='localhost', port=6379, db=0)
    target = 'http://news.at.zhihu.com/api/4/news/before/' + date
    body = fetch_url_content(url=target, port=80, timeout=15)

    # Nothing usable came back: leave the cache untouched.
    if not body:
        return False

    store.set(date, body)
    return True
Exemple #4
0
def redis_fetch_article(article_id):
    """Download one article and cache it in Redis under its id.

    ``article_id`` is converted to a string first; that string is both the
    argument passed to ``gen_article_url`` and the cache key. The fetched
    content is stored in database 1 of the Redis server at localhost:6379.

    Args:
        article_id: Identifier of the article; any value accepted by
            ``str()``.

    Returns:
        True if the fetched content was truthy and has been written to the
        cache, False otherwise (nothing is written in that case).
    """
    key = str(article_id)
    store = redis.Redis(host='localhost', port=6379, db=1)
    body = fetch_url_content(url=gen_article_url(key), port=80, timeout=15)

    # Nothing usable came back: leave the cache untouched.
    if not body:
        return False

    store.set(key, body)
    return True
Exemple #5
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import redis
from crawlerutils import fetch_url_content
from crawlerutils import gen_list_url


def get_invalid_dates(host, port, db):
    """List cache entries whose stored value is shorter than 10.

    Connects to the Redis database identified by ``host``, ``port`` and
    ``db``, looks up the value of every key, and keeps those whose
    ``len()`` is below 10.

    Args:
        host: Redis server host name.
        port: Redis server port.
        db: Redis database index.

    Returns:
        A list of ``[key, value]`` two-element lists, one per short entry
        (keys are presumably dates in the daily-list cache -- confirm).
    """
    client = redis.Redis(host=host, port=port, db=db)
    # Lazy pairs: each key is looked up exactly once, in key order.
    entries = ((key, client[key]) for key in client.keys())
    return [[key, value] for key, value in entries if len(value) < 10]


if __name__ == '__main__':
    # Diagnostic script: report the too-short entries of the daily-list
    # cache (db 0) and print what a fresh fetch returns for each of them.
    # The refetched content is only printed; it is not written back.
    invalidset = get_invalid_dates(host='localhost', port=6379, db=0)

    # Single-argument, parenthesised print calls behave the same under the
    # Python 2 print statement and the Python 3 print function. The two
    # spaces after the colon reproduce the separator that
    # `print 'invalidset: ', invalidset` emitted.
    print('invalidset:  ' + str(invalidset))

    # Each entry is a [date, content] pair; only the date is needed here.
    for date, _stale_content in invalidset:
        url = gen_list_url(date)
        content = fetch_url_content(url=url, port=80, timeout=15)
        print('refetch content:{content}'.format(content=content))