Example #1
import multiprocessing

from django.db import connection  # assumed: the close-before-fork call below matches Django's DB connection

# DEFAULT_POOL_SIZE, BUCKET_PER_WORKER, NullProgressMonitor, chunk_it and
# sub_process_parse are module-level helpers from the original project and
# are not shown in this snippet.


def parse_channel(channel, model, pool_size=DEFAULT_POOL_SIZE,
        progress_monitor=NullProgressMonitor(), parse_refs=True):
    """Parse a channel's unparsed entries in a pool of worker processes."""
    manager = multiprocessing.Manager()
    lock = manager.RLock()
    work_units = pool_size * BUCKET_PER_WORKER

    # Prepare Input
    entries = []
    for entry in model.entries:
        if not entry.parsed:
            entries.append((entry.local_paths, entry.url))
            entry.parsed = True
    entries_chunks = chunk_it(entries, work_units)
    inputs = []
    for entry_chunk in entries_chunks:
        inputs.append((channel.parser, channel.pk, entry_chunk, parse_refs,
            lock))

    # Close connection to allow the new processes to create their own
    connection.close()

    progress_monitor.start('Parsing Channel Entries', len(inputs))
    progress_monitor.info('Sending {0} chunks to worker pool'
            .format(len(inputs)))
    pool = multiprocessing.Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs,
            BUCKET_PER_WORKER):
        progress_monitor.work('Parsed a chunk', 1)

    pool.close()
    progress_monitor.done()
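
Both examples call a chunk_it helper that is not shown on this page. A minimal sketch of what it presumably does, assuming it splits a list into a given number of roughly even contiguous chunks (the name and behavior are inferred from the call sites above, not taken from the project):

def chunk_it(items, num_chunks):
    # Hypothetical reconstruction: split items into at most num_chunks
    # roughly even, contiguous chunks, dropping any empty ones.
    chunk_size, remainder = divmod(len(items), num_chunks)
    chunks = []
    start = 0
    for i in range(num_chunks):
        end = start + chunk_size + (1 if i < remainder else 0)
        if start < end:
            chunks.append(items[start:end])
        start = end
    return chunks

With this behavior, len(inputs) in Example #1 is at most work_units, which is consistent with the total passed to progress_monitor.start.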
Example #2
from multiprocessing import Pool

from django.db import connection  # assumed, as in Example #1


def parse(document, pages, parse_refs=True,
        progress_monitor=NullProgressMonitor(),
        pool_size=DEFAULT_POOL_SIZE):
    """Parse a document's pages in a worker pool, then update and save
    the document's total word count."""
    progress_monitor.start('Parsing Pages', pool_size + 1)

    # Prepare input
    pages = [(page.local_url, page.url) for page in
            pages.values() if page.local_url is not None]
    pages_chunks = chunk_it(pages, pool_size)
    inputs = []
    for pages_chunk in pages_chunks:
        inputs.append((document.parser, document.pk, parse_refs, pages_chunk))

    # Close connection to allow the new processes to create their own.
    connection.close()

    # Split work
    progress_monitor.info('Sending {0} chunks to worker pool'
            .format(len(inputs)))
    pool = Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs, 1):
        progress_monitor.work(
            'Parsed 1/{0} of the pages'.format(pool_size), 1)

    # Word count: sum the per-page counts and persist on the document.
    document.word_count = sum(
        page.word_count for page in document.pages.all())
    document.save()
    progress_monitor.work('Counted Total Words', 1)

    pool.close()
    progress_monitor.done()
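
Both functions also accept a progress_monitor defaulting to NullProgressMonitor(), whose class is not shown on this page. From the calls above it needs start, info, work and done methods; a minimal no-op sketch consistent with that interface (an assumption, not the project's actual class):

class NullProgressMonitor(object):
    # No-op progress monitor: implements the start/info/work/done
    # interface used above and discards all progress information.
    def start(self, message, total_work):
        pass

    def info(self, message):
        pass

    def work(self, message, units):
        pass

    def done(self):
        pass

Note that NullProgressMonitor() appears as a default argument value, so a single instance is created at definition time and shared across calls; that is only safe while the monitor keeps no state.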