Example #1
def read_task():
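    # Restore the persisted URL set and crawl history, then queue every URL that has not been visited yet.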
    # _list_exec = json.loads(file.read(task_path + "task-exec-" + util.get_hash(__filename) + ".tsk"))
    _list_set = json.loads(file.read(task_path + "task-set-" + util.get_hash(__filename) + ".tsk"))
    _list_his = json.loads(file.read(task_path + "task-history-" + __filename + ".tsk"))
    history_list.extend(_list_his)
    next_set.update(_list_set)
    next_task.update(next_set - set(history_list))
Example #2
def crawl(_dst_url: str):
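    # Crawl a single page: serve the HTML from the local cache when possible, collect its
    # image and link URLs, and return the normalized set of links found on the page.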
    global __count
    history_list.append(_dst_url)
    __count = __count + 1
    filename = util.get_hash(_dst_url)
    cache_html_path = cache_page_path + filename + ".html"
    if file.exist(cache_html_path):
        html = file.read(cache_html_path)
        print("缓存读取----->" + filename)
    else:
        print("请求网络----->" + filename)
        html = spider.get_html(_dst_url)
        file.write(cache_html_path, html)
    links = spider.get_links(html)
    img_list = spider.get_img(html)
    # print(img_list)

    __image_list.extend(img_list)

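    # Cache this page's image URLs as a JSON file keyed by the hash of the page URL.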
    json_str = json.dumps(img_list, ensure_ascii=False)

    file.write(cache_link_path + filename + ".json", json_str)

    global_list.extend(links)

    global_set = set(global_list)

    new_set = set()

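    # Normalize every link seen so far: drop URL fragments, keep in-domain links,
    # and resolve relative paths against the base URL.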
    for x in global_set:
        v = str(x).split("#")[0]
        if _root in v:
            new_set.add(v)
        else:
            if str(v).find('http') == -1:
                new_set.add(urljoin.url_path_join(_base_url, v))

    return new_set
Example #3
def store():
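    # Persist the pending task queue, the full URL set, and the crawl history so a later run can resume.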
    file.write(task_path + "task-exec-" + util.get_hash(__filename) + ".tsk",
               json.dumps(list(next_task), ensure_ascii=False))
    file.write(task_path + "task-set-" + util.get_hash(__filename) + ".tsk",
               json.dumps(list(next_set), ensure_ascii=False))
    file.write(task_path + "task-history-" + __filename + ".tsk", json.dumps(history_list, ensure_ascii=False))
Example #4
def task_exist():
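    # True when a task-exec file from a previous run already exists for this site.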
    return file.exist(task_path + "task-exec-" + util.get_hash(__filename) + ".tsk")
Example #5
# The original import spells the module "spilder" while crawl() calls it as "spider";
# alias it here so both names resolve to the same module.
from core import spilder as spider, util, db
from urllib.error import HTTPError, URLError
from helper import file
import urljoin
import json  # needed for json.loads/json.dumps in read_task(), crawl() and store()
import re
import time

data_path = "data/"
cache_path = data_path + "cache/"

# _root = 'www.netbian.com'
_root = 'desk.zol.com.cn/'
_base_url = 'http://' + _root

__filename = util.get_hash(_root)

cache_page_path = cache_path + "pages/" + __filename + "/"
cache_link_path = cache_path + "links/" + __filename + "/"
image_path = data_path + "images/" + __filename + "/"
task_path = data_path + "task/" + __filename + "/"

util.check_dirs((cache_link_path, cache_page_path, task_path))
DB = db.DB()

dstUrl = 'http://' + _root  # _root already ends with '/', so appending another slash would double it
next_set = set()
next_task = set()   # pending URLs to crawl; referenced by read_task()/store() but not defined in the original excerpt
global_list = []
history_list = []
__image_list = []   # image URLs collected by crawl(); not defined in the original excerpt
__count = 0         # pages crawled so far, incremented by crawl(); not defined in the original excerpt
start_time = time.time()
next_list = list(next_set)
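The functions above are fragments of one crawler module. The sketch below is not part of the original examples: it is a hypothetical driver, assuming the module-level names from Example #5 (plus next_task), showing how the pieces could be run together: restore or seed the queue, crawl until it is empty, and persist progress after each page. The run() name and loop structure are assumptions, not the author's code.

def run():
    # Resume a previous session if task files exist; otherwise seed with the start URL.
    if task_exist():
        read_task()
    else:
        next_task.add(dstUrl)
        next_set.add(dstUrl)
    while next_task:
        url = next_task.pop()
        if url in history_list:
            continue
        links = crawl(url)
        next_set.update(links)
        next_task.update(links - set(history_list))
        store()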