Example #1
    def _multi_channel_apply_disk_parallel(self, function, cleanup_function,
                                           output_path, from_time, to_time,
                                           channels, cast_dtype,
                                           pass_batch_info, pass_batch_results,
                                           processes, **kwargs):

        self.logger.debug('Starting parallel operation...')

        if pass_batch_results:
            raise NotImplementedError("pass_batch_results is not "
                                      "implemented on 'disk' mode")

        # need to convert to a list, otherwise it cannot be pickled
        data = list(
            self.multi_channel(from_time, to_time, channels,
                               return_data=False))
        n_batches = self.indexer.n_batches(from_time, to_time, channels)

        self.logger.info('Data will be split into %s batches', n_batches)

        output_path = Path(output_path)

        # create local variables to avoid pickling problems
        _path_to_recordings = copy(self.path_to_recordings)
        _dtype = copy(self.dtype)
        _n_channels = copy(self.n_channels)
        _data_order = copy(self.data_order)
        _loader = copy(self.loader)
        _buffer_size = copy(self.buffer_size)

        reader = partial(RecordingsReader,
                         path_to_recordings=_path_to_recordings,
                         dtype=_dtype,
                         n_channels=_n_channels,
                         data_order=_data_order,
                         loader=_loader,
                         return_data_index=True)

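        # shared state visible to every worker: the result dtype and the index
        # of the next batch that is allowed to write to the output file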
        m = Manager()
        mapping = m.dict()
        next_to_write = m.Value('i', 0)

        def parallel_runner(element):
            i, _ = element

            res = util.batch_runner(element,
                                    function,
                                    reader,
                                    pass_batch_info,
                                    cast_dtype,
                                    kwargs,
                                    cleanup_function,
                                    _buffer_size,
                                    save_chunks=False,
                                    output_path=output_path)

            if i == 0:
                mapping['dtype'] = str(res.dtype)

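            # spin until it is this batch's turn, so results land in the file in order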
            while True:
                if next_to_write.value == i:
                    with open(str(output_path), 'wb' if i == 0 else 'ab') as f:
                        res.tofile(f)

                    next_to_write.value += 1
                    break

        # run jobs
        self.logger.debug('Creating processes pool...')

        p = Pool(processes)
        res = p.map_async(parallel_runner, enumerate(data))

        finished = 0

        if self.show_progress_bar:
            pbar = tqdm(total=n_batches)

            # poll the shared counter so the bar tracks batches already written
            while True:
                if next_to_write.value > finished:
                    pbar.update(next_to_write.value - finished)
                    finished = next_to_write.value

                if next_to_write.value == n_batches:
                    break

            pbar.close()
        else:
            res.get()

        # save metadata
        params = util.make_metadata(channels, self.n_channels,
                                    mapping['dtype'], output_path)

        return output_path, params
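
What makes this method work is the write coordination: workers compute batches in parallel, but the shared Manager Value forces the appends to output_path to happen strictly in batch order. Below is a minimal, self-contained sketch of just that pattern, using toy data and illustrative names (process_batch, out.bin) rather than the RecordingsReader pipeline itself.

import time
from functools import partial
from multiprocessing import Manager, Pool

import numpy as np


def process_batch(element, next_to_write, output_path):
    i, batch = element
    res = np.asarray(batch, dtype='float32') * 2  # stand-in for the real work

    # busy-wait until it is this batch's turn, then append to the single file
    while next_to_write.value != i:
        time.sleep(0.01)

    with open(output_path, 'wb' if i == 0 else 'ab') as f:
        res.tofile(f)

    next_to_write.value += 1


if __name__ == '__main__':
    batches = [[1, 2], [3, 4], [5, 6]]

    manager = Manager()
    next_to_write = manager.Value('i', 0)  # index of the next batch to write

    worker = partial(process_batch, next_to_write=next_to_write,
                     output_path='out.bin')

    with Pool(2) as pool:
        pool.map(worker, enumerate(batches))

    # contents are in batch order no matter which worker finished first
    print(np.fromfile('out.bin', dtype='float32'))
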
Example #2
    def writeEventsToCsv(self, urls, processedUrlsFName, batchSize=20):
        numUrls = len(urls)
        origNumUrls = numUrls
        urlsWithEvents = 0
        totalEvents = 0
        processedListings = 0
        numTimeouts = 0

        try:
            with open(processedUrlsFName, 'r') as pus:
                pUrls = set(pus.read().splitlines())
            logging.info(
                'Already processed {0} of {1} urls. Picking up where we'
                ' left off.'.format(len(pUrls), numUrls))
            urls = [url for url in urls if url not in pUrls]
            numUrls = len(urls)
        except IOError:
            pass

        with open(processedUrlsFName, 'a+') as pus:
            pUrls_writer = csv.writer(pus)
            with open(self.eventFile, 'a+') as f:
                writer = csv.writer(f)
                sttm = time.time()

                if self.eventMode == 'parallel':
                    batches = [
                        urls[x:x + batchSize]
                        for x in range(0, len(urls), batchSize)]
                    for b, batch in enumerate(batches):
                        logging.info('Starting batch {0} of {1}'.format(
                            b + 1, len(batches)))
                        manager = Manager()
                        batchQueue = Queue()
                        batchTimeoutList = manager.list()
                        batchProcessedUrls = manager.list()
                        batchEventQueue = manager.Queue()
                        batchEventsSaved = manager.Value('i', 0)
                        jobs = []
                        for i, url in enumerate(batch):
                            batchQueue.put(
                                [self.eventMode, url, batchEventQueue,
                                 batchProcessedUrls, batchTimeoutList])
                        for i in range(len(batch)):
                            proc = Process(
                                target=self.eventWorker, args=(batchQueue,))
                            proc.start()
                            jobs.append(proc)
                        writeProc = Process(
                            target=self.writeToCsvWorker, args=(
                                batchEventQueue, batchEventsSaved))
                        time.sleep(2)
                        writeProc.start()
                        for j, job in enumerate(jobs):
                            # allow 5 seconds per url (with a 60 s floor) before timing out
                            job.join(max(60, 5 * len(batch)))
                            if job.is_alive():
                                job.terminate()
                                logging.info(
                                    'Subprocess {0} of {1} timed out'.format(
                                        j + 1, len(jobs)))
                        writeProc.join(max(60, 8 * len(batch)))
                        totalEvents += batchEventsSaved.value
                        processedListings += len(batch)
                        for url in set(list(batchProcessedUrls)):
                            pUrls_writer.writerow([url])
                        urlsWithEvents += len(set(list(batchProcessedUrls)))
                        numTimeouts += len(set(list(batchTimeoutList)))
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, b + 1, len(batches))
                        logging.info(
                            'Saved {0} new events from {1} of {2} listings. '
                            '\nEstimated time to '
                            'completion: ~{3} min.'.format(
                                batchEventsSaved.value,
                                len(batchProcessedUrls), len(batch), minsLeft))
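                        # clear out any leftover Chrome processes before the next batch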
                        os.system(
                            "ps aux | grep chrome | awk ' { print $2 } ' |"
                            " xargs kill -9")

                elif self.eventMode == 'series':
                    for i, url in enumerate(urls):
                        numEvents = 0
                        events = self.getEventsFromListingUrl(
                            self.eventMode, url, None, urls, [])
                        if events is None:
                            durMins, minsLeft = self.timeElapsedLeft(
                                sttm, i + 1, numUrls)
                            logging.info(
                                'No sales events scraped from listing'
                                ' {0} of {1}. Check url: {2}. {3} min.'
                                ' elapsed. {4} min. remaining.'.format(
                                    i + 1, numUrls, url, durMins,
                                    minsLeft))
                            continue
                        for event in events:
                            totalEvents += 1
                            numEvents += 1
                            writer.writerow(event)
                        urlsWithEvents += 1
                        pUrls_writer.writerow([url])
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, i + 1, numUrls)
                        logging.info(
                            'Scraped {0} sales events from listing {1}'
                            ' of {2}. Scraped {3} total sales events in'
                            ' {4} min. Estimated time to completion:'
                            ' ~{5} min.'.format(
                                numEvents, i + 1, numUrls, totalEvents,
                                durMins, minsLeft))
                else:
                    raise ValueError(
                        'Must specify valid event scraping '
                        'mode: ["parallel", "series"]')
        if numUrls > 0:
            self.pctUrlsWithEvents = round(
                100.0 * urlsWithEvents / origNumUrls, 1)
        else:
            self.pctUrlsWithEvents = -999

        logging.info('#' * 100)
        logging.info('#' * 100)
        logging.info(
            'Scraped events from {0} of {1} ({2}%) urls.'.format(
                urlsWithEvents, numUrls, self.pctUrlsWithEvents).center(
                90, ' ').center(100, '#').upper())
        logging.info(
            ('{0} of {1} urls timed out while scraping events.'.format(
                numTimeouts, numUrls).upper().center(90, ' ').center(
                100, '#')))
        logging.info(
            ('Saved {0} events to {1}'.format(
                totalEvents, self.eventFile).upper().center(
                90, ' ').center(100, '#')))
        logging.info('#' * 100)
        logging.info('#' * 100)
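
The 'parallel' branch above boils down to a batch-of-processes skeleton: the urls for one batch go into a Queue, one worker Process per url drains it, results accumulate in Manager-backed containers, and anything still alive after the join timeout gets terminated. Here is a stripped-down, self-contained sketch of that skeleton with an illustrative scrape_worker and toy urls; it leaves out the Selenium scraping, CSV writing, and progress logging of the class above.

import queue
from multiprocessing import Manager, Process, Queue


def scrape_worker(task_queue, results):
    # pull urls until the queue is drained
    while True:
        try:
            url = task_queue.get(timeout=1)
        except queue.Empty:
            break
        results.append((url, len(url)))  # stand-in for the real scraping


if __name__ == '__main__':
    urls = ['http://a.example', 'http://b.example', 'http://c.example']
    batch_size = 2
    batches = [urls[x:x + batch_size] for x in range(0, len(urls), batch_size)]

    for batch in batches:
        manager = Manager()
        results = manager.list()  # shared across the worker processes
        task_queue = Queue()

        for url in batch:
            task_queue.put(url)

        jobs = [Process(target=scrape_worker, args=(task_queue, results))
                for _ in batch]
        for job in jobs:
            job.start()

        for job in jobs:
            job.join(max(60, 5 * len(batch)))  # bounded wait per process
            if job.is_alive():
                job.terminate()  # kill anything still hung before the next batch

        print(list(results))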