def __init__(self, url, path, overwrite=False, spider=None):
    """Initialize a single-file download record.

    Args:
        url: Remote URL the file is fetched from.
        path: Local destination path.
        overwrite: Whether an existing file at ``path`` may be replaced.
        spider: Crawler used for HTTP requests. Defaults to a fresh
            ``Crawler()`` per instance; the previous ``spider=Crawler()``
            default was evaluated once at ``def`` time and therefore
            shared one mutable Crawler across every instance.
    """
    self.url = url
    self.path = path
    # Download into a temp file first; rename to `path` on completion.
    self.tmp_path = self.path + '.t'
    self.name = os.path.split(self.path)[-1]
    self.overwrite = overwrite
    self.spider = spider if spider is not None else Crawler()
    self._status = INITIALIZED
    # total = expected byte count, size = bytes received so far.
    self.total = 0
    self.size = 0
def __init__(self, num_thread, segment_size, overwrite=False, spider=None):
    """Initialize the download manager.

    Args:
        num_thread: Number of worker threads for the pool.
        segment_size: Byte size of each download segment.
        overwrite: Whether existing files may be replaced.
        spider: Crawler used for HTTP requests. Defaults to a fresh
            ``Crawler()`` per instance; the previous ``spider=Crawler()``
            default was a mutable default argument shared by all
            instances constructed without an explicit spider.
    """
    self.files = []
    self.pool = ThreadPool(num_thread)
    self.overwrite = overwrite
    self.spider = spider if spider is not None else Crawler()
    self.segment_size = segment_size
def __init__(self, url, path, segment_size=10 * 1024 * 1024, overwrite=False, spider=None):
    """Initialize a segmented download for one remote file.

    Args:
        url: Remote URL the file is fetched from.
        path: Local destination path.
        segment_size: Byte size of each segment (default 10 MiB).
        overwrite: Whether an existing file at ``path`` may be replaced.
        spider: Crawler used for HTTP requests. Defaults to a fresh
            ``Crawler()`` per instance; the previous ``spider=Crawler()``
            default was evaluated once at ``def`` time and shared one
            mutable Crawler across all instances.
    """
    self.url = url
    self.path = path
    self.name = os.path.split(self.path)[-1]
    self.overwrite = overwrite
    self.spider = spider if spider is not None else Crawler()
    self.segment_size = segment_size
    self._status = INITIALIZED
    # Whether the server supports ranged requests; set by _get_head().
    self.segmentable = False
    self.total = 0
    self.segments = []
    # Probe the remote file, then split it into segments up front.
    self._get_head()
    self._segmentation()
import re
import os
import sys
import time
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from utils.crawler import Crawler
from utils.config import Config
from utils.thread import ThreadPool
from utils.common import Task, repair_filename, touch_dir, size_format
from utils.playlist import Dpl
from utils.downloader import FileManager

# Module-level HTTP client shared by all functions in this module.
spider = Crawler()

# Courseware type codes as used by the remote platform's API.
VIDEO, PDF, RICH_TEXT = 1, 3, 4
# Maps a type code to the subdirectory/category name used on disk.
COURSEWARE = {
    VIDEO: 'Video',
    PDF: 'PDF',
    RICH_TEXT: 'Rich_text'
}

# Desktop-browser User-Agent so the site serves the regular pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
}
spider.headers.update(headers)

CONFIG = Config()


def login(username, password):
# -*- coding: utf-8 -*- """网易公开课""" import time from bs4 import BeautifulSoup from Crypto.Cipher import AES from moocs.utils import * from utils.crawler import Crawler name = "open_163" need_cookies = False CANDY = Crawler() CONFIG = {} FILES = {} VIDEOS = [] exports = {} __all__ = ["name", "need_cookies", "start", "exports"] def get_summary(url): """从课程主页面获取信息""" res = CANDY.get(url).text soup = BeautifulSoup(res, 'html.parser') links = [] if re.match(r'https?://open.163.com/special/', url): # 从课程主页解析各课程链接 names = soup.find_all('div', class_='g-container')[1] organization = names.find('a').string.strip()
def __init__(self):
    """Create the backing services: a Redis client and a web crawler.

    Both constructors are presumably side-effectful (opening a Redis
    connection / preparing an HTTP session) — TODO confirm, so their
    order is preserved as written.
    """
    self.redis = RedisClient()
    self.crawler = Crawler()