# Method of the surrounding batch-processing class; assumes the module-level
# imports used elsewhere in this file (from copy import copy, from functools
# import partial, from multiprocessing import Manager, Pool, from pathlib
# import Path, from tqdm import tqdm, plus RecordingsReader and util).
def _multi_channel_apply_disk_parallel(self, function, cleanup_function,
                                       output_path, from_time, to_time,
                                       channels, cast_dtype,
                                       pass_batch_info, pass_batch_results,
                                       processes, **kwargs):

    self.logger.debug('Starting parallel operation...')

    if pass_batch_results:
        raise NotImplementedError("pass_batch_results is not "
                                  "implemented on 'disk' mode")

    # need to convert to a list, otherwise it cannot be pickled
    data = list(self.multi_channel(from_time, to_time, channels,
                                   return_data=False))
    n_batches = self.indexer.n_batches(from_time, to_time, channels)

    self.logger.info('Data will be split into %s batches', n_batches)

    output_path = Path(output_path)

    # create local copies to avoid pickling problems when the reader
    # factory is sent to the worker processes
    _path_to_recordings = copy(self.path_to_recordings)
    _dtype = copy(self.dtype)
    _n_channels = copy(self.n_channels)
    _data_order = copy(self.data_order)
    _loader = copy(self.loader)
    _buffer_size = copy(self.buffer_size)

    reader = partial(RecordingsReader,
                     path_to_recordings=_path_to_recordings,
                     dtype=_dtype,
                     n_channels=_n_channels,
                     data_order=_data_order,
                     loader=_loader,
                     return_data_index=True)

    m = Manager()
    mapping = m.dict()
    next_to_write = m.Value('i', 0)

    def parallel_runner(element):
        i, _ = element

        res = util.batch_runner(element, function, reader, pass_batch_info,
                                cast_dtype, kwargs, cleanup_function,
                                _buffer_size, save_chunks=False,
                                output_path=output_path)

        if i == 0:
            mapping['dtype'] = str(res.dtype)

        # batches must land on disk in order: spin until it is this
        # batch's turn, then append to the single output file
        while True:
            if next_to_write.value == i:
                with open(str(output_path), 'wb' if i == 0 else 'ab') as f:
                    res.tofile(f)
                next_to_write.value += 1
                break

    # run jobs
    self.logger.debug('Creating processes pool...')

    p = Pool(processes)
    res = p.map_async(parallel_runner, enumerate(data))

    finished = 0

    if self.show_progress_bar:
        pbar = tqdm(total=n_batches)

        # poll the shared counter to drive the progress bar
        while True:
            if next_to_write.value > finished:
                pbar.update(next_to_write.value - finished)
                finished = next_to_write.value

            if next_to_write.value == n_batches:
                break

        pbar.close()
    else:
        res.get()

    # save metadata
    params = util.make_metadata(channels, self.n_channels,
                                mapping['dtype'], output_path)

    return output_path, params
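# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (not part of the class above) of the
# ordered-write pattern that parallel_runner uses: workers spin on a shared
# Manager().Value counter so batch i is appended to the single output file
# only after batches 0..i-1 have been written. Everything below (the
# squaring stand-in for `function`, the demo file name) is illustrative,
# not part of the original module.
import numpy as np
from functools import partial
from multiprocessing import Manager, Pool


def _write_in_order(element, next_to_write, path):
    i, batch = element
    res = np.asarray(batch, dtype='float32') ** 2  # stand-in for `function`
    while True:
        if next_to_write.value == i:
            # the first batch truncates ('wb'), later batches append ('ab')
            with open(path, 'wb' if i == 0 else 'ab') as f:
                res.tofile(f)
            next_to_write.value += 1
            break


if __name__ == '__main__':
    m = Manager()
    counter = m.Value('i', 0)
    runner = partial(_write_in_order, next_to_write=counter,
                     path='demo_ordered_write.bin')
    with Pool(2) as p:
        p.map(runner, enumerate([[1, 2], [3, 4], [5, 6]]))
    print(np.fromfile('demo_ordered_write.bin', dtype='float32'))
    # expected: [ 1.  4.  9. 16. 25. 36.]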
# Method of the scraper class; assumes the module-level imports used
# elsewhere in this file (csv, logging, os, time, and
# from multiprocessing import Manager, Process, Queue).
def writeEventsToCsv(self, urls, processedUrlsFName, batchSize=20):
    numUrls = len(urls)
    origNumUrls = numUrls
    urlsWithEvents = 0
    totalEvents = 0
    processedListings = 0
    numTimeouts = 0

    # skip urls that were already processed on a previous run
    try:
        with open(processedUrlsFName, 'r') as pus:
            pUrls = list(set(pus.read().split('\r\n')))
            logging.info(
                'Already processed {0} of {1} urls. Picking up where we'
                ' left off.'.format(len(pUrls), numUrls))
            urls = [url for url in urls if url not in pUrls]
            numUrls = len(urls)
    except IOError:
        pass

    with open(processedUrlsFName, 'a+') as pus:
        pUrls_writer = csv.writer(pus)
        with open(self.eventFile, 'a+') as f:
            writer = csv.writer(f)
            sttm = time.time()

            if self.eventMode == 'parallel':
                batches = [
                    urls[x:x + batchSize]
                    for x in range(0, len(urls), batchSize)]

                for b, batch in enumerate(batches):
                    logging.info('Starting batch {0} of {1}'.format(
                        b + 1, len(batches)))
                    manager = Manager()
                    batchQueue = Queue()
                    batchTimeoutList = manager.list()
                    batchProcessedUrls = manager.list()
                    batchEventQueue = manager.Queue()
                    batchEventsSaved = manager.Value('i', 0)
                    jobs = []

                    for i, url in enumerate(batch):
                        batchQueue.put(
                            [self.eventMode, url, batchEventQueue,
                             batchProcessedUrls, batchTimeoutList])

                    # one scraping process per url in the batch
                    for i in range(len(batch)):
                        proc = Process(
                            target=self.eventWorker, args=(batchQueue,))
                        proc.start()
                        jobs.append(proc)

                    # single writer process drains the event queue
                    writeProc = Process(
                        target=self.writeToCsvWorker,
                        args=(batchEventQueue, batchEventsSaved))
                    time.sleep(2)
                    writeProc.start()

                    for j, job in enumerate(jobs):
                        # 5 seconds per url for each process before timeout
                        job.join(max(60, 5 * len(batch)))
                        if job.is_alive():
                            job.terminate()
                            logging.info(
                                'Subprocess {0} of {1} timed out'.format(
                                    j + 1, min(24, len(batch))))

                    writeProc.join(max(60, 8 * len(batch)))

                    totalEvents += batchEventsSaved.value
                    processedListings += len(batch)

                    for url in set(list(batchProcessedUrls)):
                        pUrls_writer.writerow([url])

                    urlsWithEvents += len(set(list(batchProcessedUrls)))
                    numTimeouts += len(set(list(batchTimeoutList)))

                    durMins, minsLeft = self.timeElapsedLeft(
                        sttm, b + 1, len(batches))
                    logging.info(
                        'Saved {0} new events from {1} of {2} listings.'
                        '\nEstimated time to completion:'
                        ' ~{3} min.'.format(
                            batchEventsSaved.value,
                            len(batchProcessedUrls), len(batch), minsLeft))

                    # kill any chrome processes left behind by the workers
                    os.system(
                        "ps aux | grep chrome | awk ' { print $2 } ' |"
                        " xargs kill -9")

            elif self.eventMode == 'series':
                for i, url in enumerate(urls):
                    numEvents = 0
                    events = self.getEventsFromListingUrl(
                        self.eventMode, url, None, urls, [])

                    if events is None:
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, i + 1, numUrls)
                        logging.info(
                            'No sales events scraped from listing'
                            ' {0} of {1}. Check url: {2}. {3} min.'
                            ' elapsed. {4} min. remaining.'.format(
                                i + 1, numUrls, url, durMins, minsLeft))
                        continue

                    for event in events:
                        totalEvents += 1
                        numEvents += 1
                        writer.writerow(event)

                    urlsWithEvents += 1
                    pUrls_writer.writerow([url])
                    durMins, minsLeft = self.timeElapsedLeft(
                        sttm, i, numUrls)

                    if (i + 1) % 1 == 0:
                        logging.info(
                            'Scraped {0} sales events from listing {1}'
                            ' of {2}. Scraped {3} total sales events in'
                            ' {4} min.\nEstimated time to completion:'
                            ' ~{5} min.'.format(
                                numEvents, i + 1, numUrls, totalEvents,
                                durMins, minsLeft))

            else:
                raise ValueError(
                    'Must specify valid event scraping '
                    'mode: ["parallel", "series"]')

    if numUrls > 0:
        self.pctUrlsWithEvents = round(
            urlsWithEvents * 100.0 / origNumUrls, 1)
    else:
        self.pctUrlsWithEvents = -999

    logging.info('#' * 100)
    logging.info('#' * 100)
    logging.info(
        'Scraped events from {0} of {1} ({2}%) urls.'.format(
            urlsWithEvents, numUrls, self.pctUrlsWithEvents).center(
            90, ' ').center(100, '#').upper())
    logging.info(
        ('{0} of {1} urls timed out while scraping events.'.format(
            numTimeouts, numUrls).upper().center(90, ' ').center(
            100, '#')))
    logging.info(
        ('Saved {0} events to {1}'.format(
            totalEvents, self.eventFile).upper().center(
            90, ' ').center(100, '#')))
    logging.info('#' * 100)
    logging.info('#' * 100)
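# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (separate from the scraper class) of the
# queue fan-out / single-writer fan-in used in 'parallel' mode: worker
# processes pull work off a shared queue and push rows onto an event queue,
# and one writer process drains that queue into the CSV. fake_scrape,
# write_rows, demo_events.csv and the 'STOP' sentinel are illustrative
# stand-ins for eventWorker / writeToCsvWorker, not the original code.
import csv
from multiprocessing import Manager, Process, Queue
from queue import Empty


def fake_scrape(work_queue, event_queue):
    # stand-in for eventWorker: drain the work queue, emit one fake row each
    while True:
        try:
            url = work_queue.get(timeout=1)
        except Empty:
            break
        event_queue.put([url, 'sold', '2014-01-01'])


def write_rows(event_queue, out_path):
    # stand-in for writeToCsvWorker: write rows until the sentinel arrives
    with open(out_path, 'a+', newline='') as f:
        writer = csv.writer(f)
        while True:
            row = event_queue.get()
            if row == 'STOP':
                break
            writer.writerow(row)


if __name__ == '__main__':
    work_queue = Queue()
    manager = Manager()
    event_queue = manager.Queue()

    for url in ['/listing/1', '/listing/2', '/listing/3']:
        work_queue.put(url)

    workers = [Process(target=fake_scrape, args=(work_queue, event_queue))
               for _ in range(2)]
    for w in workers:
        w.start()

    writer_proc = Process(target=write_rows,
                          args=(event_queue, 'demo_events.csv'))
    writer_proc.start()

    for w in workers:
        w.join()
    event_queue.put('STOP')  # no more rows are coming
    writer_proc.join()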