Example #1
def download_many_4():
    '''Multiprocessing: download all images in parallel, bounded by the process count.
    Uses concurrent.futures.ProcessPoolExecutor().
    Executor.map() uses Futures rather than returning them: it returns an iterator
    whose __next__() calls each Future's result(), so we get the Futures' results,
    not the Futures themselves.

    Note that Executor.map() limits download_one() to a single argument,
    which is why images is a list of dicts.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        image = {'path': down_path, 'linkno': linkno, 'link': link}
        images.append(image)

    # The with statement calls executor.__exit__(), which calls
    # executor.shutdown(wait=True): it blocks the main process until
    # every worker process has finished
    with futures.ProcessPoolExecutor(
            max_workers=16
    ) as executor:  # without max_workers, the pool defaults to os.cpu_count() processes
        # executor.map() works like the built-in map(), except that download_one()
        # is called in parallel across multiple processes.
        # The return value res is an iterator (an itertools.chain object); iterating
        # it later yields each call's return value
        res = executor.map(download_one, images)  # pass a single sequence

    return len(list(res))  # if a worker raised, the exception re-raises here, just as an implicit next() on the iterator would
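
The behaviour described in the docstring is easy to see in isolation. A minimal, self-contained sketch (the square() helper is hypothetical, not part of the example above): results come back in input order, and an exception raised in a worker is re-raised only when the iterator reaches that item.

from concurrent import futures

def square(n):
    if n == 3:
        raise ValueError('no threes allowed')
    return n * n

if __name__ == '__main__':
    with futures.ProcessPoolExecutor(max_workers=2) as executor:
        res = executor.map(square, [1, 2, 3, 4])
    try:
        print(list(res))  # raises once iteration reaches square(3)
    except ValueError as exc:
        print('raised during iteration:', exc)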
Example #2
def download_many_3():
    '''Multiprocessing: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.starmap(download_one_1, images), added in Python 3.3.
    It accepts a sequence of tuples and unpacks each tuple into the function's
    positional arguments.
    Since every image is saved into the same directory, functools.partial() can
    pin that argument in place.
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't pass the same down_path on every call
    download_one_1_partial = partial(download_one_1, down_path)

    images = []
    for linkno, link in enumerate(links, 1):
        images.append((linkno, link))  # each tuple omits the save directory

    with Pool(4) as p:
        p.starmap(download_one_1_partial, images)  # links carry their index

    logger.info('Waiting for all subprocesses to finish...')
    # p.close()
    # p.join()
    logger.info('All subprocesses done.')

    return len(links)
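
How partial() and starmap() combine is worth seeing on its own. A minimal, self-contained sketch (the save() helper is hypothetical): partial() pins the shared first argument, and starmap() unpacks each tuple into the remaining parameters.

from functools import partial
from multiprocessing import Pool

def save(path, linkno, link):
    return '{}/{:03d} <- {}'.format(path, linkno, link)

if __name__ == '__main__':
    save_to_tmp = partial(save, '/tmp')          # path is now fixed
    tasks = [(1, 'http://a'), (2, 'http://b')]   # each tuple -> (linkno, link)
    with Pool(2) as p:
        print(p.starmap(save_to_tmp, tasks))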
Example #3
def download_many_6():
    '''Multiprocessing: download all images in parallel, bounded by the process count.
    Uses concurrent.futures.ProcessPoolExecutor().
    Instead of Executor.map(), uses Executor.submit() plus
    concurrent.futures.as_completed().
    Executor.submit() returns a Future, whereas Executor.map() uses Futures internally.
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't pass the same down_path on every call
    download_one_1_partial = partial(download_one_1, down_path)

    with futures.ProcessPoolExecutor(max_workers=16) as executor:
        to_do = []
        # Create and schedule the Futures
        for linkno, link in enumerate(links, 1):  # links carry their index
            future = executor.submit(download_one_1_partial, linkno, link)
            to_do.append(future)
            logger.debug('Scheduled for No.{} {}: {}'.format(
                linkno, link, future))

        results = []
        # Collect the Futures' results: futures.as_completed(to_do) takes a list of
        # Futures and returns an iterator that yields each Future only once it has finished
        for future in futures.as_completed(
                to_do):  # future is already finished, so future.result() will never block
            res = future.result()
            results.append(res)
            logger.debug('{} result: {!r}'.format(future, res))

    return len(results)
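
as_completed() yields Futures in completion order, not submission order. A minimal sketch (the slow_echo() helper is hypothetical) makes that visible:

from concurrent import futures
import time

def slow_echo(n):
    time.sleep(n * 0.1)
    return n

if __name__ == '__main__':
    with futures.ProcessPoolExecutor(max_workers=3) as executor:
        to_do = [executor.submit(slow_echo, n) for n in (3, 2, 1)]
        for future in futures.as_completed(to_do):
            print(future.result())  # typically prints 1, 2, 3: completion order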
Example #4
def download_many():
    '''Multithreading: download all images concurrently (not in parallel), bounded by the thread count'''
    down_path = setup_down_path()
    links = get_links()

    # Create the task queue
    queue = Queue()

    # Create the worker threads
    for i in range(64):
        worker = ThreadWorker(queue)
        worker.daemon = True  # the main thread can still exit even if workers block waiting for more tasks
        worker.start()  # start the thread

    # Put the tasks onto the queue
    for linkno, link in enumerate(links, 1):  # links carry their index
        logger.info('Queueing No.{} {}'.format(linkno, link))
        queue.put((down_path, linkno, link))

    logger.info('Waiting for all subthreads to finish...')
    # Causes the main thread to wait for the queue to finish processing all the tasks
    queue.join()
    logger.info('All subthreads done.')

    return len(links)
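
The ThreadWorker class itself is not shown in this snippet. A minimal sketch of what such a worker might look like (hypothetical; the real class presumably calls a download helper where this one only prints): it loops forever, pulls task tuples off the queue, and calls task_done() so that queue.join() can return.

import threading
from queue import Queue

class ThreadWorker(threading.Thread):
    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        while True:
            down_path, linkno, link = self.queue.get()
            try:
                print('would download No.{} {} into {}'.format(linkno, link, down_path))
            finally:
                self.queue.task_done()  # one task_done() per get(), or queue.join() never returns

if __name__ == '__main__':
    queue = Queue()
    worker = ThreadWorker(queue)
    worker.daemon = True  # daemon thread: the endless run() loop won't block interpreter exit
    worker.start()
    queue.put(('/tmp', 1, 'http://a'))
    queue.join()  # returns once task_done() has been called for every put()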
Example #5
def download_many():
    '''Download all images sequentially; synchronous and blocking'''
    down_path = setup_down_path()
    links = get_links()

    for linkno, link in enumerate(links, 1):
        image = {
            'path': down_path,
            'linkno': linkno,  # image index, so the log shows which image is downloading
            'link': link
        }
        download_one(image)

    return len(links)
Example #6
def download_many():
    '''Multiprocessing: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.apply_async()
    '''
    down_path = setup_down_path()
    links = get_links()

    p = Pool(4)  # number of processes in the pool
    for linkno, link in enumerate(links, 1):
        image = {'path': down_path, 'linkno': linkno, 'link': link}
        p.apply_async(download_one, args=(image, ))

    logger.info('Waiting for all subprocesses to finish...')
    p.close()  # close the pool to new tasks
    p.join()  # block the main process until every worker in the pool has finished
    logger.info('All subprocesses done.')

    return len(links)
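
apply_async() returns an AsyncResult, which the example above discards. A minimal, self-contained sketch (the double() helper is hypothetical) of keeping them to collect return values:

from multiprocessing import Pool

def double(n):
    return n * 2

if __name__ == '__main__':
    with Pool(4) as p:
        async_results = [p.apply_async(double, args=(n,)) for n in range(5)]
        print([r.get() for r in async_results])  # each .get() blocks until its result is ready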
Example #7
def news_parser(url):
    """News parser: fetch news items from the given URL and store them in Solr."""
    links = common.get_links(url)
    news_list = []
    for link in reversed(links):
        if not urlset.has_url('news', link.url):
            try:
                news_list.append(News(url=link.url,
                                      title=link.title,
                                      content=extract(requests.get(link.url).text),
                                      category=u'默认',  # '默认' means 'default'
                                      update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime())))
            except Exception:
                print('error adding', link.url)
            print('content', link.url)
    try:
        solr.add('news', news_list)
    except Exception:
        print('error adding news')
Example #8
def download_many_5():
    '''Multiprocessing: download all images in parallel, bounded by the process count.
    Uses concurrent.futures.ProcessPoolExecutor().
    If the function passed to Executor.map() takes several arguments, pass
    Executor.map() several sequences.
    See: https://yuanjiang.space/threadpoolexecutor-map-method-with-multiple-parameters
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't pass the same down_path on every call
    download_one_1_partial = partial(download_one_1, down_path)

    # Build the sequence of all linknos
    linknos = list(range(len(links)))

    with futures.ProcessPoolExecutor(max_workers=16) as executor:
        res = executor.map(download_one_1_partial, linknos,
                           links)  # pass several sequences to Executor.map()

    return len(list(res))
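
Passing several sequences to Executor.map() zips them pairwise into each call's arguments. A minimal sketch (the label() helper is hypothetical; a thread pool keeps the demo free of pickling concerns):

from concurrent import futures

def label(no, link):
    return 'No.{} {}'.format(no, link)

if __name__ == '__main__':
    links = ['http://a', 'http://b', 'http://c']
    with futures.ThreadPoolExecutor(max_workers=2) as executor:
        res = executor.map(label, range(1, len(links) + 1), links)  # zipped pairwise
    print(list(res))  # ['No.1 http://a', 'No.2 http://b', 'No.3 http://c']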
Example #9
def download_many_starmap():
    '''
    Multiprocessing: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.starmap(download_one_starmap, images), added in Python 3.3.
    It accepts a sequence of tuples and unpacks each tuple into the function's
    positional arguments.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        images.append((down_path, linkno, link))

    with Pool(4) as p:
        p.starmap(download_one_starmap, images)  # links carry their index

    logger.info('Waiting for all subprocesses to finish...')
    logger.info('All subprocesses done.')

    return len(links)
Example #10
def news_parser(url):
    """News parser: fetch news items from the given URL and store them in Solr."""
    links = common.get_links(url)
    news_list = []
    for link in reversed(links):
        if not urlset.has_url('news', link.url):
            try:
                news_list.append(
                    News(url=link.url,
                         title=link.title,
                         content=extract(requests.get(link.url).text),
                         category=u'默认',  # '默认' means 'default'
                         update_time=time.strftime('%Y-%m-%dT%XZ',
                                                   time.gmtime())))
            except Exception:
                print('error adding', link.url)
            print('content', link.url)
    try:
        solr.add('news', news_list)
    except Exception:
        print('error adding news')
Example #11
def download_many_2():
    '''Multithreading: download all images concurrently (not in parallel), bounded by the thread count.
    Uses concurrent.futures.ThreadPoolExecutor().
    If the function passed to Executor.map() takes several arguments, pass
    Executor.map() several sequences.
    See: https://yuanjiang.space/threadpoolexecutor-map-method-with-multiple-parameters
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't pass the same down_path on every call
    download_one_1_partial = partial(download_one_1, down_path)

    # Build the sequence of all linknos
    linknos = list(range(len(links)))

    workers = min(64, len(links))  # never more threads in the pool than download tasks
    with futures.ThreadPoolExecutor(workers) as executor:
        res = executor.map(download_one_1_partial, linknos,
                           links)  # pass several sequences to Executor.map()

    return len(list(res))
Example #12
def download_many_1():
    '''Multiprocessing: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.map(download_one, images).
    Note that Pool.map() limits download_one() to a single argument,
    which is why images is a list of dicts.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        image = {'path': down_path, 'linkno': linkno, 'link': link}
        images.append(image)

    with Pool(4) as p:
        p.map(download_one, images)  # map the images sequence over download_one()

    logger.info('Waiting for all subprocesses to finish...')
    # p.close()  # the with statement calls Pool.terminate() on exit, and the blocking
    # p.join()   # Pool.map() has already finished, so explicit close()/join() are unnecessary
    logger.info('All subprocesses done.')

    return len(links)
Example #13
def download_many():
    '''Multithreading: download all images concurrently (not in parallel), bounded by the thread count'''
    down_path = setup_down_path()
    links = get_links()

    find_images = len(links)  # total image links found
    ignored_images = 0  # images skipped
    visited_images = 0  # images fetched successfully
    failed_images = 0  # images whose request failed

    images = []
    for linkno, link in enumerate(links, 1):
        image = {
            'path': down_path,
            'linkno': linkno,
            'link': link
        }
        images.append(image)

    workers = min(64, len(links))
    with futures.ThreadPoolExecutor(workers) as executor:
        to_do = [executor.submit(download_one, image) for image in images]
        # Collect the Futures' results: futures.as_completed(to_do) takes a list of
        # Futures and returns an iterator that yields each Future only once it has finished
        done_iter = futures.as_completed(to_do)
        with progressbar.ProgressBar(max_value=len(to_do)) as bar:
            for i, future in enumerate(done_iter):  # future is already finished, so future.result() will never block
                result = future.result()
                if result.get('ignored'):
                    ignored_images += 1
                elif result.get('failed'):
                    failed_images += 1
                else:
                    visited_images += 1
                bar.update(i)

    logger.critical('Found [{}] images, ignored [{}] images, visited [{}] images, failed [{}] images'.format(find_images, ignored_images, visited_images, failed_images))
Example #14
def download_many_1():
    '''Multithreading: download all images concurrently (not in parallel), bounded by the thread count.
    Uses concurrent.futures.ThreadPoolExecutor().
    Executor.map() uses Futures rather than returning them: it returns an iterator
    whose __next__() calls each Future's result(), so we get the Futures' results,
    not the Futures themselves.

    Note that Executor.map() limits download_one() to a single argument,
    which is why images is a list of dicts.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        image = {'path': down_path, 'linkno': linkno, 'link': link}
        images.append(image)

    workers = min(64, len(links))  # never more threads in the pool than download tasks
    # The with statement calls executor.__exit__(), which calls
    # executor.shutdown(wait=True): it blocks the main thread until every worker thread has finished
    with futures.ThreadPoolExecutor(workers) as executor:
        # executor.map() works like the built-in map(), except that download_one()
        # is called concurrently across multiple threads.
        # The return value res is an iterator (an itertools.chain object); iterating
        # it later yields each call's return value
        res = executor.map(download_one, images)  # pass a single sequence

    return len(list(res))  # if a worker thread raised, the exception re-raises here, just as an implicit next() on the iterator would
Example #15
import common

# Prompt for a year range, then download every PDF linked for those years
inizio = int(input("Start year: "))  # inizio = start year
fine = int(input("End year: "))  # fine = end year
for (anno, links) in common.get_links("wpfb-cat-121").items():  # anno = year
    if inizio <= anno <= fine:
        for link in links:
            common.download_file(link[0], link[1] + ".pdf")