Example #1
def __init__(self,
             num_thread,
             segment_size,
             overwrite=False,
             spider=Crawler()):
    self.files = []
    self.pool = ThreadPool(num_thread)
    self.overwrite = overwrite
    self.spider = spider
    self.segment_size = segment_size
Example #2
def __init__(self, url, path, overwrite=False, spider=Crawler()):
    self.url = url
    self.path = path
    self.tmp_path = self.path + '.t'
    self.name = os.path.split(self.path)[-1]
    self.overwrite = overwrite
    self.spider = spider
    self._status = INITIALIZED
    self.total = 0
    self.size = 0
Example #3
def __init__(self,
             url,
             path,
             segment_size=10 * 1024 * 1024,
             overwrite=False,
             spider=Crawler()):
    self.url = url
    self.path = path
    self.name = os.path.split(self.path)[-1]
    self.overwrite = overwrite
    self.spider = spider
    self.segment_size = segment_size
    self._status = INITIALIZED
    self.segmentable = False
    self.total = 0
    self.segments = []
    self._get_head()
    self._segmentation()
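The trailing _get_head() and _segmentation() calls suggest this downloader first probes the file's total size, then splits it into segment_size chunks. A minimal sketch of that splitting arithmetic, assuming only that segments are contiguous byte ranges (the helper name and return shape are illustrative, not the class's actual internals):

def byte_ranges(total, segment_size):
    """Yield inclusive (start, end) byte ranges covering `total` bytes."""
    for start in range(0, total, segment_size):
        yield start, min(start + segment_size, total) - 1

# list(byte_ranges(25 * 1024 * 1024, 10 * 1024 * 1024))
# -> [(0, 10485759), (10485760, 20971519), (20971520, 26214399)]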
Example #4
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter starts running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
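A minimal driver sketch for the Getter above, assuming it is run by a periodic scheduler as in typical proxy-pool projects (the interval and wrapper function are assumptions, not part of the source):

import time

CYCLE_INTERVAL = 60  # seconds between refill attempts; illustrative value

def run_getter_forever():
    getter = Getter()
    while True:
        getter.run()  # skips crawling once the pool reaches POOL_UPPER_THRESHOLD
        time.sleep(CYCLE_INTERVAL)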
Example #5
import re
import os
import sys
import time

from urllib.parse import urlencode
from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.thread import ThreadPool
from utils.common import Task, repair_filename, touch_dir, size_format
from utils.playlist import Dpl
from utils.downloader import FileManager

spider = Crawler()
VIDEO, PDF, RICH_TEXT = 1, 3, 4
COURSEWARE = {
    VIDEO: 'Video',
    PDF: 'PDF',
    RICH_TEXT: 'Rich_text'
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
}
spider.headers.update(headers)
CONFIG = Config()


def login(username, password):
Example #6
# -*- coding: utf-8 -*-
"""网易公开课"""

import re
import time

from bs4 import BeautifulSoup
from Crypto.Cipher import AES

from moocs.utils import *
from utils.crawler import Crawler

name = "open_163"
need_cookies = False
CANDY = Crawler()
CONFIG = {}
FILES = {}
VIDEOS = []
exports = {}
__all__ = ["name", "need_cookies", "start", "exports"]


def get_summary(url):
    """从课程主页面获取信息"""

    res = CANDY.get(url).text
    soup = BeautifulSoup(res, 'html.parser')
    links = []
    if re.match(r'https?://open.163.com/special/', url):
        # Parse the individual course links from the course home page
        names = soup.find_all('div', class_='g-container')[1]
        organization = names.find('a').string.strip()
Example #7
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
Example #8
import json
import re
import os

from bs4 import BeautifulSoup
from utils.crawler import Crawler
from utils.config import Config
from utils.db import SQLite, BigintField, StringField, DoubleField, Model
from utils.filer import touch_dir

spider = Crawler()
CONFIG = Config('jd_spider').conf
GLOBAL = Config('jd_spider').glob

GLOBAL['data_dir'] = touch_dir(CONFIG['data_dir'])


class AirConditioning(Model):

    skuid = BigintField('skuid', primary_key=True, not_null=True)
    brand = StringField('brand')
    kind = StringField('kind')
    horsepower = StringField('horsepower')
    mode = StringField('mode')
    EEI = BigintField('EEI')
    EER = DoubleField('EER')
    rfc = BigintField('rfc')
    rfp = BigintField('rfp')
    noise = BigintField('noise')
    price = BigintField('price')
    vip_price = BigintField('vip_price')
Example #9
import hashlib
import os
import sys
import time

from urllib.parse import urlencode
from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.thread import ThreadPool
from utils.common import Task, repair_filename, touch_dir, size_format
from utils.playlist import Dpl
from utils.downloader import FileManager
from utils.ffmpeg import FFmpeg

spider = Crawler()
spider.trust_env = False
VIDEO, PDF, RICH_TEXT = 1, 3, 4
COURSEWARE = {VIDEO: "Video", PDF: "PDF", RICH_TEXT: "Rich_text"}

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36",
}
spider.headers.update(headers)
CONFIG = Config()


def login(username, password):
    """ 登录获取 token """
    pd = hashlib.md5()
Example #10
import json
import os
import time
from urllib import parse

from utils.common import store_cookies
from utils.crawler import Crawler

spider = Crawler()


def get_index(words, start_date, end_date):
    """ 获取在某个时间范围内的指数信息 """
    wordlist = ""
    for n, word in enumerate(words):
        wordlist += '&wordlist%5B{}%5D={}'.format(n, word)
    url = 'http://index.baidu.com/Interface/Newwordgraph/getIndex?region=0&startdate={}&enddate={}{}'\
        .format(start_date, end_date, wordlist)
    res = spider.get(url)
    return res.json()
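The %5B and %5D in the URL template above are percent-encoded square brackets, so each keyword is sent as a wordlist[n]=... query parameter. A hypothetical call (keyword and dates are illustrative only):

# index = get_index(['python'], '20190101', '20190131')
# appends '&wordlist%5B0%5D=python' and returns the decoded JSON payload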


def decrypto(origin, key):
    """ 解密指数信息 """
    s = ''
    for c in origin:
        if c:
            s += key[key.index(c) + len(key) // 2]
    data = []
    for i in s.split(','):
        data.append(int(i))
    return data
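The substitution in decrypto assumes every ciphertext character appears in the first half of key and decodes to the character len(key) // 2 positions later. A toy round-trip with a made-up key (the real key is fetched from Baidu per session):

key = "qwert" + "12,34"        # first half: cipher alphabet; second half: plain alphabet
print(decrypto("qwert", key))  # -> [12, 34]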
Example #11
import json
import hashlib
import re
import os
import sys

from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.filer import repair_filename, touch_dir, Dpl
from utils.thread import ThreadPool
from utils.async_lib.utils import Task

spider = Crawler()
VIDEO, PDF, RICH_TEXT = 1, 3, 4

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
}
srt_types = ["zh-cn", "en"]
spider.headers.update(headers)
CONFIG = Config()


def login(username, password):
    """ 登录获取 token """
    pd = hashlib.md5()
    pd.update(password.encode('utf-8'))
    passwd = pd.hexdigest()
Example #12
import hashlib
import os
import sys
import time

from urllib.parse import urlencode
from bs4 import BeautifulSoup

from utils.crawler import Crawler
from utils.config import Config
from utils.thread import ThreadPool
from utils.common import Task, repair_filename, touch_dir, size_format
from utils.playlist import Dpl
from utils.downloader import FileManager
from utils.ffmpeg import FFmpeg

spider = Crawler()
spider.trust_env = False
VIDEO, PDF, RICH_TEXT = 1, 3, 4
COURSEWARE = {VIDEO: "Video", PDF: "PDF", RICH_TEXT: "Rich_text"}

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36",
}
spider.headers.update(headers)
CONFIG = Config()


def login(username, password):
    """ 登录获取 token """
    pd = hashlib.md5()