def parse_channel(channel, model, pool_size=DEFAULT_POOL_SIZE,
        progress_monitor=NullProgressMonitor(), parse_refs=True):
    manager = multiprocessing.Manager()
    lock = manager.RLock()
    work_units = pool_size * BUCKET_PER_WORKER

    # Prepare input: collect only the entries that have not been parsed
    # yet, and mark them as parsed.
    entries = []
    for entry in model.entries:
        if not entry.parsed:
            entries.append((entry.local_paths, entry.url))
            entry.parsed = True

    # Split the entries into one chunk per work unit.
    entries_chunks = chunk_it(entries, work_units)
    inputs = []
    for entry_chunk in entries_chunks:
        inputs.append((channel.parser, channel.pk, entry_chunk, parse_refs,
                       lock))

    # Close the connection to allow the new processes to create their own.
    connection.close()

    progress_monitor.start('Parsing Channel Entries', len(inputs))
    progress_monitor.info('Sending {0} chunks to worker pool'
            .format(len(inputs)))

    pool = multiprocessing.Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs,
            BUCKET_PER_WORKER):
        progress_monitor.work('Parsed a chunk', 1)

    pool.close()
    progress_monitor.done()
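
# ``chunk_it`` is a project helper imported elsewhere in this module.
# Judging from its use above, it splits a sequence into ``n`` roughly
# even buckets so each worker receives a comparable share of the work.
# The sketch below is illustrative only, not the project's actual
# implementation.
def chunk_it_sketch(seq, n):
    """Split seq into n contiguous chunks of near-equal size."""
    if n <= 0:
        return []
    avg = len(seq) / float(n)
    chunks = []
    last = 0.0
    while last < len(seq):
        chunks.append(seq[int(last):int(last + avg)])
        last += avg
    return chunks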
def parse(document, pages, parse_refs=True,
        progress_monitor=NullProgressMonitor(), pool_size=DEFAULT_POOL_SIZE):
    progress_monitor.start('Parsing Pages', pool_size + 1)

    # Prepare input: keep only the pages that have a local copy.
    pages = [(page.local_url, page.url) for page in
             pages.values() if page.local_url is not None]
    pages_chunks = chunk_it(pages, pool_size)
    inputs = []
    for pages_chunk in pages_chunks:
        inputs.append((document.parser, document.pk, parse_refs, pages_chunk))

    # Close the connection to allow the new processes to create their own.
    connection.close()

    # Split work among the pool workers.
    progress_monitor.info('Sending {0} chunks to worker pool'
            .format(len(inputs)))
    pool = Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs, 1):
        progress_monitor.work('Parsed 1/{0} of the pages'
                .format(pool_size), 1)

    # Word count: total the per-page counts once all pages are parsed.
    word_count = 0
    for page in document.pages.all():
        word_count += page.word_count
    document.word_count = word_count
    document.save()
    progress_monitor.work('Counted Total Words', 1)

    pool.close()
    progress_monitor.done()
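
# ``NullProgressMonitor`` comes from the project's progress-monitoring
# utilities. Based solely on the call sites above, it must accept
# start/info/work/done; a hypothetical no-op stand-in compatible with
# those calls would look like this:
class NullProgressMonitorSketch(object):
    """Discards all progress notifications (assumed interface)."""

    def start(self, title, total_units):
        pass

    def info(self, message):
        pass

    def work(self, message, units=1):
        pass

    def done(self):
        pass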
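
# ``sub_process_parse`` is the worker each pool process runs; its real
# implementation lives elsewhere. The sketch below is a guess based only
# on the input tuples built in ``parse`` above (parser name, document
# pk, parse_refs flag, chunk of (local_url, url) pairs); ``load_parser``
# and ``parse_page`` are hypothetical names, not the project's API.
def sub_process_parse_sketch(args):
    parser_name, document_pk, parse_refs, pages_chunk = args
    # Each subprocess opens its own database connection here, which is
    # why the parent closed its connection before starting the pool.
    parser = load_parser(parser_name, document_pk)  # hypothetical helper
    for local_url, url in pages_chunk:
        parser.parse_page(local_url, url, parse_refs)  # hypothetical API
    return len(pages_chunk)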