Example #1
def test_crawler():
    with TaskScheduler() as ts, requests.Session() as sess:
        ts.start(n_worker=1)
        crawler = Crawler(ts=ts, sess=sess)
        crawler.login()

        slugs = [
            '20cnwm',
            'advanced-modeling',
            'algorithms-part1',
            'complex-analysis',
            'crypto',
            'game-theory-1',
            'genetic-lab',
            'happiness',
            'learning-how-to-learn',
            'lisan-youhua-jianmo-jichupian',
            'ma-ke-si',
            'machine-design1',
            'machine-learning',
            'mathematical-thinking',
            'modern-postmodern-1',
            'networks-illustrated',
            'renqun-wangluo',
            'shengwu-yanhua',
            'understanding-arguments',
            'yoga',
        ]
        crawler.crawl(slug=random.choice(slugs), isSpec=False)
Example #2
def download_ts(dl_tasks, how):
    file_json = _file_json_dl_tasks_failed(how)

    with TaskScheduler() as ts:
        ts.start(n_worker=4)

        _cls_downloader = {
            'builtin': DownloaderBuiltin,
            'curl': DownloaderCurl,
            'aria2': DownloaderAria2
        }[how]
        dl_tasks_failed = _cls_downloader(dl_tasks=dl_tasks, ts=ts).download()

    with open(file_json, 'w', encoding='UTF-8') as ofs:
        json.dump(dl_tasks_failed, ofs)
Example #3
def crawl(cookies_file, slug, isSpec, outdir, n_worker):
    file_pkl = _file_pkl_crawl(outdir, slug)
    if os.path.exists(file_pkl):
        with open(file_pkl, 'rb') as ifs:
            return pickle.load(ifs)

    with requests.Session() as sess:
        Crawler._login(sess, cookies_file=cookies_file)

        # Check whether the specialization/course exists

        if isSpec:
            if 'elements' not in sess.get(URL_SPEC(slug)).json():
                raise SpecNotExistExcepton(slug)
        else:
            if 'elements' not in sess.get(URL_COURSE_1(slug)).json():
                raise CourseNotExistExcepton(slug)

        # Check whether the cookies_file has expired

        course = Course(slug=COURSE_0)
        d = sess.get(URL_COURSE_1(course['slug'])).json()
        course['id'] = d['elements'][0]['id']

        d = sess.get(URL_COURSE_REFERENCES(course['id'])).json()
        if d.get('errorCode') == 'Not Authorized':
            raise CookiesExpiredException()
        assert 'errorCode' not in d

    with TaskScheduler() as ts, requests.Session() as sess:
        ts.start(n_worker=n_worker)
        crawler = Crawler(ts=ts, sess=sess, cookies_file=cookies_file)
        soc = crawler.crawl(slug=slug, isSpec=isSpec)

    with open(file_pkl, 'wb') as ofs:
        pickle.dump(soc, ofs)

    file_json = change_ext(file_pkl, 'json')
    with open(file_json, 'w', encoding='UTF-8') as ofs:
        ofs.write(soc.to_json())

    return soc
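
A call such as the following would exercise the cached crawl above; the cookie-file path and output directory are illustrative assumptions, only the slug is taken from the list in Example #1:

soc = crawl(cookies_file='cookies.txt', slug='machine-learning',
            isSpec=False, outdir='out', n_worker=4)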
Example #4
def download_ts(dl_tasks, slug, outdir, how):
    file_json = _file_json_download_dl_tasks_failed(outdir, slug)
    if os.path.exists(file_json):
        with open(file_json, encoding='UTF-8') as ifs:
            dl_tasks = json.load(ifs)

    if len(dl_tasks) == 0:
        return

    with TaskScheduler() as ts:
        ts.start(n_worker=4)

        _cls_downloader = {
            'builtin': DownloaderBuiltin,
            'curl': DownloaderCurl,
            'aria2': DownloaderAria2
        }[how]
        dl_tasks_failed = _cls_downloader(dl_tasks=dl_tasks, ts=ts).download()

    with open(file_json, 'w', encoding='UTF-8') as ofs:
        json.dump(dl_tasks_failed, ofs)
Example #5
import time
import random
import threading

from dl_coursera.lib.TaskScheduler import TaskScheduler

ts = TaskScheduler()


@ts.register_task
def f(*, s):
    d = ts.d

    if d.get('xxx') is None:
        d['xxx'] = [s] * 3

    for _ in d['xxx']:
        time.sleep(random.uniform(0.5, 1.5))
        print(threading.current_thread().name, '-', _)


def main():
    ts.start(n_worker=3)

    for s in ['Alice', 'Bob', 'Cindy', 'Dave']:
        f(s=s)

    ts.wait()


if __name__ == '__main__':
    main()
Example #6
import time

from dl_coursera.lib.TaskScheduler import TaskScheduler

ts = TaskScheduler()


@ts.register_task
def f(*, n):
    time.sleep(1)
    print('f', n)

    g(n=n+1)
    g(n=n+1)


@ts.register_task
def g(*, n):
    time.sleep(1)
    print('g', n)

    h(n=n+1)
    h(n=n+1)


@ts.register_task
def h(*, n):
    time.sleep(1)
    print('h', n)
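

The example above only registers the chained tasks f, g and h; the driver code is missing. A minimal runner, assuming the same TaskScheduler API shown in Example #5 (start, a direct call to a registered task, then wait), could look like this:

def main():
    ts.start(n_worker=3)

    # Kick off the chain: f submits two g tasks, and each g submits two h tasks.
    f(n=0)

    ts.wait()


if __name__ == '__main__':
    main()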