def spin_crawl_threads(state, classifiers, MAX_BIT_SIZE, MAX_DL_THREADS, image_path):
    """Launch the three crawler worker processes and block until all of them exit.

    Two manager-backed queues connect the workers: a location queue (at most
    16 entries) shared by ``generate_location_thread`` and
    ``download_image_thread``, and an image queue (at most 64 entries) shared
    by ``download_image_thread`` and ``classification_thread``.

    Args:
        state: Passed through to ``classification_thread`` together with a
            manager lock created here.
        classifiers: Passed through to ``classification_thread``.
        MAX_BIT_SIZE: Passed through to ``generate_location_thread``.
        MAX_DL_THREADS: Passed through to ``download_image_thread``.
        image_path: Passed through to ``classification_thread``.
    """
    print("Running threads...")

    mgr = Manager()
    locations = mgr.Queue(maxsize=16)
    images = mgr.Queue(maxsize=64)
    lock = mgr.Lock()

    locator = Process(
        target=generate_location_thread,
        args=(locations, MAX_BIT_SIZE),
        name="generate_location",
    )
    classifier = Process(
        target=classification_thread,
        args=(images, classifiers, image_path, state, lock),
        name="classification",
    )
    downloader = Process(
        target=download_image_thread,
        args=(locations, images, MAX_DL_THREADS),
        name="download_image",
    )

    # Start and join order is the same: downloader, classifier, locator.
    workers = (downloader, classifier, locator)
    for worker in workers:
        worker.start()

    def kill_threads():
        # Terminates every live child of this process at interpreter exit.
        for child in active_children():
            child.terminate()

    atexit.register(kill_threads)

    for worker in workers:
        worker.join()
def main():
    """Split the input file by month, compute one guess per part, and save them.

    The input CSV is split with ``month_spliter``; ``maker`` is then run for
    each of the 13 part indices in a pool of 6 processes, each with its own
    single-slot manager queue. Every queue is drained into the module-level
    ``GUESS_PART`` list, which is sorted, dumped to ``guess_par_t<i>.json``
    for each ``i`` in ``range(NB_MONTH)`` and finally handed to ``write_csv``.
    """
    file_to_attack = './data/example_files/S_hecht_submission_3.csv'
    n_parts = 13

    month_spliter(file_to_attack)

    manager = Manager()
    queue_list = [manager.Queue(1) for _ in range(n_parts)]
    with Pool(6) as p:
        print("HEY")
        # Pool.map() accepts a single iterable; its third positional argument
        # is ``chunksize``, so passing the queue list there fails. starmap()
        # unpacks each (index, queue) pair into the call instead.
        # NOTE(review): assumes maker(index, queue) puts its result on
        # `queue` - confirm against maker's definition.
        p.starmap(maker, zip(range(n_parts), queue_list))

    for queue in queue_list:
        GUESS_PART.append(queue.get())
    GUESS_PART.sort()

    for i in range(NB_MONTH):
        char = "guess_par_t" + str(i) + ".json"
        with open(char, "w") as jsdump:
            json.dump(GUESS_PART[i][1], jsdump, indent=4)

    # write_csv receives the payload (second element) of the first 13 sorted entries.
    write_csv(*[GUESS_PART[i][1] for i in range(n_parts)])
def predict(self, inputData, transientTime=0, update_processor=lambda x: x, verbose=0):
    """Run the per-pixel prediction jobs in a worker pool and collect the results.

    Args:
        inputData: Array with one leading time axis followed by
            ``self.n_inputDimensions`` spatial axes.
        transientTime: Number of leading time steps excluded from the output;
            also published to the workers via ``self.sharedNamespace``.
        update_processor: Accepted for interface compatibility; not used by
            this method.
        verbose: ``> 0`` shows a progress bar, ``> 1`` additionally prints the
            number of finished jobs.

    Returns:
        Array of shape ``(time - transientTime, *self.inputShape)`` filled with
        the predictions reported by the workers.

    Raises:
        ValueError: If ``inputData`` does not have ``n_inputDimensions + 1`` axes.
    """
    rank = len(inputData.shape) - 1

    if rank != self.n_inputDimensions:
        raise ValueError(
            "The `inputData` does not have a suitable shape. It has to have {0} spatial dimensions and 1 temporal dimension.".format(
                self.n_inputDimensions))

    manager = Manager()
    predictQueue = manager.Queue()

    # workaround as predict does not support batches atm
    # add dummy dimension to let embedInputData work properly (is optimized to work for batches)
    inputData = inputData.reshape(1, *inputData.shape)
    modifiedInputData = self._embedInputData(inputData)[0]
    inputData = inputData[0]

    self.transientTime = transientTime
    self.sharedNamespace.transientTime = transientTime

    predictionOutput = B.zeros(np.insert(self.inputShape, 0, inputData.shape[0] - transientTime))

    # One job per spatial position, shifted by the filter width.
    jobs = np.stack(np.meshgrid(*[np.arange(x) + self._filterWidth for x in inputData.shape[1:]]),
                    axis=rank).reshape(-1, rank).tolist()
    nJobs = len(jobs)

    self.resetState()

    iterator = PredictionArrayIterator(modifiedInputData, jobs, self._filterWidth, self._stride, self)

    def _processPoolWorkerResults():
        # Drain exactly one queue message per job and store its state and prediction.
        nJobsDone = 0

        bar = None
        if verbose > 0:
            bar = progressbar.ProgressBar(max_value=nJobs, redirect_stdout=True, poll_interval=0.0001)
            bar.update(0)

        while nJobsDone < nJobs:
            # result of predicting
            indices, prediction, state = predictQueue.get()
            stateID = self._uniqueIDFromIndices(indices)
            self._xs[stateID] = state

            # update the values
            predictionOutput[tuple([Ellipsis] + indices)] = prediction

            nJobsDone += 1
            if bar is not None:
                bar.update(nJobsDone)
                if verbose > 1:
                    print(nJobsDone)

        if bar is not None:
            bar.finish()

    pool = Pool(processes=self._nWorkers, initializer=SpatioTemporalESN._init_predictProcess,
                initargs=[predictQueue, self])
    try:
        pool.map_async(self._predictProcess, iterator, chunksize=200)
        _processPoolWorkerResults()
    except BaseException:
        # Do not leave workers running if result collection is interrupted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # close()/terminate() alone does not wait for the workers; join()
        # makes sure the worker processes are gone before returning.
        pool.join()

    return predictionOutput
def fit(self, inputData, outputData, transientTime=0, verbose=0):
    """Run the per-pixel fitting jobs in a worker pool and store their results.

    For every job the reservoir state is written to ``self._xs``. The output
    weights are either accumulated into ``self._WOut`` (each contribution
    divided by ``np.prod(self.inputShape)``) when ``self._averageOutputWeights``
    is set, or stored per pixel in ``self._WOuts`` otherwise.

    Args:
        inputData: Array with either ``n_inputDimensions + 1`` axes (a single
            time series) or ``n_inputDimensions + 2`` axes (a leading axis of
            several time series).
        outputData: Target data; reshaped with the same leading dummy axis as
            ``inputData`` when a single time series is given.
        transientTime: Number of leading time steps excluded from the fit
            length; published to the workers via ``self.sharedNamespace``.
        verbose: ``> 0`` shows a progress bar, ``> 1`` additionally prints the
            number of finished jobs.

    Raises:
        ValueError: If ``inputData`` has neither of the two accepted ranks.
    """
    import sys

    rank = len(inputData.shape) - 1

    if rank != self.n_inputDimensions and rank != self.n_inputDimensions + 1:
        raise ValueError(
            "The `inputData` does not have a suitable shape. It has to have {0} spatial dimensions and 1 temporal dimension.".format(
                self.n_inputDimensions))

    # reshape the input so that it has the shape (timeseries, time, input_dimension^n)
    if rank == self.n_inputDimensions:
        inputData = inputData.reshape(1, *inputData.shape)
        outputData = outputData.reshape(1, *outputData.shape)
    else:
        # modify rank again
        rank -= 1

    timeseriesCount = inputData.shape[0]
    partialLength = inputData.shape[1] - transientTime
    totalLength = timeseriesCount * partialLength

    manager = Manager()
    fitQueue = manager.Queue()

    modifiedInputData = self._embedInputData(inputData)

    self.sharedNamespace.transientTime = transientTime
    self.sharedNamespace.partialLength = partialLength
    self.sharedNamespace.totalLength = totalLength
    self.sharedNamespace.timeseriesCount = timeseriesCount

    # One job per spatial position, shifted by the filter width.
    jobs = np.stack(np.meshgrid(*[np.arange(x) + self._filterWidth for x in inputData.shape[2:]]),
                    axis=rank).reshape(-1, rank).tolist()
    nJobs = len(jobs)

    self.resetState()

    iterator = FittingArrayIterator(modifiedInputData, outputData, jobs, self._filterWidth, self._stride, self)

    def _processPoolWorkerResults():
        # Drain exactly one queue message per job and store its weights and state.
        nJobsDone = 0

        bar = None
        if verbose > 0:
            bar = progressbar.ProgressBar(max_value=nJobs, redirect_stdout=True, poll_interval=0.0001)
            bar.update(0)

        while nJobsDone < nJobs:
            # result of fitting
            indices, x, WOut = fitQueue.get()
            pixelID = self._uniqueIDFromIndices(indices)

            if WOut is None:
                print("WARNING: Fit process for pixel {0} did not succeed".format(indices), file=sys.stderr)

            # store WOut
            if self._averageOutputWeights:
                # Failed fits (WOut is None) contribute nothing to the average.
                if WOut is not None:
                    self._WOut += WOut / np.prod(self.inputShape)
            else:
                self._WOuts[pixelID] = WOut

            # store x
            self._xs[pixelID] = x

            nJobsDone += 1
            if bar is not None:
                bar.update(nJobsDone)
                if verbose > 1:
                    print(nJobsDone)

        if bar is not None:
            bar.finish()

    pool = Pool(processes=self._nWorkers, initializer=SpatioTemporalESN._init_fitProcess,
                initargs=[fitQueue, self])
    try:
        pool.map_async(self._fitProcess, iterator, chunksize=16)
        _processPoolWorkerResults()
    except BaseException:
        # Do not leave workers running if result collection is interrupted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # close()/terminate() alone does not wait for the workers; join()
        # makes sure the worker processes are gone before returning.
        pool.join()
def __init__(self, inputShape, n_reservoir, filterSize=1, stride=1, borderMode="mirror", nWorkers="auto",
             spectralRadius=1.0, noiseLevel=0.0, inputScaling=None, leakingRate=1.0, reservoirDensity=0.2,
             randomSeed=None, averageOutputWeights=True, out_activation=lambda x: x,
             out_inverse_activation=lambda x: x, weightGeneration='naive', bias=1.0, outputBias=1.0,
             outputInputScaling=1.0, inputDensity=1.0, solver='pinv', regressionParameters={},
             activation=B.tanh, activationDerivation=lambda x: 1.0 / B.cosh(x) ** 2):
    """Validate the spatial configuration, allocate buffers and set up worker bookkeeping.

    Args:
        inputShape: Shape of the input without the time axis; its length sets
            the number of spatial dimensions.
        n_reservoir: Reservoir size; used for buffer sizes and forwarded to the
            base class.
        filterSize: Odd patch size; ``floor(filterSize / 2)`` becomes the filter width.
        stride: Stride inside a patch; ``ceil(filterSize / stride) ** n_dims``
            becomes the number of inputs per pixel.
        borderMode: One of ``mirror``, ``padding``, ``edge`` or ``wrap``.
        nWorkers: Number of pool workers, or ``"auto"`` for
            ``max(cpu_count() - 1, 1)``.
        averageOutputWeights: If true a single ``_WOut`` matrix is allocated,
            otherwise one ``_WOuts`` entry per pixel.
        solver: Stored on the instance; must be ``"lsqr"`` when
            ``averageOutputWeights`` is true.
        regressionParameters: Mapping stored (as a copy) on the instance.
        All remaining arguments are forwarded unchanged to the base-class
        constructor.

    Raises:
        ValueError: If ``averageOutputWeights`` is true while ``solver`` is not
            ``"lsqr"`` (note that the default arguments trigger this), if
            ``borderMode`` is unknown, or if ``filterSize`` is even.
    """
    self._averageOutputWeights = averageOutputWeights
    if averageOutputWeights and solver != "lsqr":
        raise ValueError(
            "`averageOutputWeights` can only be set to `True` when `solver` is set to `lsqr` (Ridge Regression)")

    self._borderMode = borderMode
    if borderMode not in ("mirror", "padding", "edge", "wrap"):
        raise ValueError(
            "`borderMode` must be set to one of the following values: `mirror`, `padding`, `edge` or `wrap`.")

    # Copy so that the shared default dict (and the caller's dict) is never
    # aliased by several instances.
    self._regressionParameters = dict(regressionParameters)
    self._solver = solver

    n_inputDimensions = len(inputShape)

    if filterSize % 2 == 0:
        raise ValueError("filterSize has to be an odd number (1, 3, 5, ...).")
    self._filterSize = filterSize
    self._filterWidth = int(np.floor(filterSize / 2))
    self._stride = stride

    self._n_input = int(np.power(np.ceil(filterSize / stride), n_inputDimensions))

    self.n_inputDimensions = n_inputDimensions
    self.inputShape = inputShape

    # Width of one output-weight row: inputs + reservoir units + 1.
    n_features = self._n_input + n_reservoir + 1
    if self._averageOutputWeights:
        self._WOuts = None
        self._WOut = B.zeros((1, n_features))
    else:
        self._WOuts = B.empty((np.prod(inputShape), 1, n_features))
        self._WOut = None
    self._xs = B.empty((np.prod(inputShape), n_reservoir, 1))

    if nWorkers == "auto":
        # Leave one core free, but always use at least one worker.
        self._nWorkers = max(cpu_count() - 1, 1)
    else:
        self._nWorkers = nWorkers

    manager = Manager()
    self.sharedNamespace = manager.Namespace()
    # NOTE(review): the guard tests for `fitWorkerID` but then reads
    # `parallelWorkerIDs` - confirm which attribute was intended.
    if not hasattr(self, "fitWorkerID") or self.parallelWorkerIDs is None:
        self.parallelWorkerIDs = manager.Queue()
        for workerID in range(self._nWorkers):
            self.parallelWorkerIDs.put(workerID)

    super(SpatioTemporalESN, self).__init__(n_input=self._n_input, n_reservoir=n_reservoir, n_output=1,
                                            spectralRadius=spectralRadius,
                                            noiseLevel=noiseLevel, inputScaling=inputScaling,
                                            leakingRate=leakingRate, reservoirDensity=reservoirDensity,
                                            randomSeed=randomSeed, out_activation=out_activation,
                                            out_inverse_activation=out_inverse_activation,
                                            weightGeneration=weightGeneration, bias=bias, outputBias=outputBias,
                                            outputInputScaling=outputInputScaling,
                                            inputDensity=inputDensity, activation=activation,
                                            activationDerivation=activationDerivation)
def writeEventsToCsv(self, urls, processedUrlsFName, batchSize=20):
    """Scrape events for ``urls`` and record progress so a later run can resume.

    URLs already listed in ``processedUrlsFName`` are skipped. In ``'series'``
    mode every URL is scraped in this process and its events are appended to
    ``self.eventFile``. In ``'parallel'`` mode the URLs are handled in batches
    of ``batchSize``, one ``self.eventWorker`` process per URL plus one
    ``self.writeToCsvWorker`` process per batch. Each successfully handled URL
    is appended to ``processedUrlsFName``.

    Args:
        urls: Listing URLs to scrape.
        processedUrlsFName: Path of the CSV progress file (one URL per row).
        batchSize: Number of URLs per batch in ``'parallel'`` mode.

    Raises:
        ValueError: If ``self.eventMode`` is neither ``'parallel'`` nor ``'series'``.

    Side effects:
        Sets ``self.pctUrlsWithEvents`` (``-999`` when nothing was left to
        scrape) and, in ``'parallel'`` mode, force-kills every process whose
        ``ps`` line matches ``chrome`` after each batch.
    """
    origNumUrls = len(urls)
    numUrls = origNumUrls
    urlsWithEvents = 0
    totalEvents = 0
    numTimeouts = 0

    # Resume support. The progress file is written with csv.writer below, so
    # it is read back with csv.reader (handles quoting and any line ending);
    # a set keeps the membership test O(1).
    try:
        with open(processedUrlsFName, 'r') as pus:
            pUrls = set(row[0] for row in csv.reader(pus) if row)
    except IOError:
        # No progress file yet: nothing to skip.
        pass
    else:
        logging.info(
            'Already processed {0} of {1} urls. Picking up where we'
            ' left off.'.format(len(pUrls), numUrls))
        urls = [url for url in urls if url not in pUrls]
        numUrls = len(urls)

    with open(processedUrlsFName, 'a+') as pus:
        pUrls_writer = csv.writer(pus)
        with open(self.eventFile, 'a+') as f:
            writer = csv.writer(f)
            sttm = time.time()
            if self.eventMode == 'parallel':
                batches = [
                    urls[x:x + batchSize]
                    for x in range(0, len(urls), batchSize)]
                for b, batch in enumerate(batches):
                    logging.info('Starting batch {0} of {1}'.format(
                        b + 1, len(batches)))
                    manager = Manager()
                    batchQueue = Queue()
                    batchTimeoutList = manager.list()
                    batchProcessedUrls = manager.list()
                    batchEventQueue = manager.Queue()
                    batchEventsSaved = manager.Value('i', 0)

                    for url in batch:
                        batchQueue.put(
                            [self.eventMode, url, batchEventQueue,
                             batchProcessedUrls, batchTimeoutList])

                    # One worker process per url in the batch.
                    jobs = []
                    for _ in batch:
                        proc = Process(
                            target=self.eventWorker, args=(batchQueue,))
                        proc.start()
                        jobs.append(proc)

                    writeProc = Process(
                        target=self.writeToCsvWorker,
                        args=(batchEventQueue, batchEventsSaved))
                    time.sleep(2)
                    writeProc.start()

                    # 5 seconds per url for each process before timeout
                    # (never less than 60 seconds).
                    jobTimeout = max(60, 5 * len(batch))
                    for j, job in enumerate(jobs):
                        job.join(jobTimeout)
                        if job.is_alive():
                            job.terminate()
                            logging.info(
                                'Subprocess {0} of {1} timed out'.format(
                                    j + 1, min(24, len(batch))))
                    writeProc.join(max(60, 8 * len(batch)))

                    totalEvents += batchEventsSaved.value
                    uniqueProcessed = set(batchProcessedUrls)
                    for url in uniqueProcessed:
                        pUrls_writer.writerow([url])
                    urlsWithEvents += len(uniqueProcessed)
                    numTimeouts += len(set(batchTimeoutList))

                    durMins, minsLeft = self.timeElapsedLeft(
                        sttm, b + 1, len(batches))
                    logging.info(
                        'Saved {0} new events from {1} of {2} listings. '
                        '\nEstimated time to '
                        'completion: ~{3} min.'.format(
                            batchEventsSaved.value,
                            len(batchProcessedUrls),
                            len(batch), minsLeft))
                    # Best-effort cleanup of leftover browser processes.
                    # NOTE(review): this kills every process matching
                    # "chrome" on the machine, not only ones started here.
                    os.system(
                        "ps aux | grep chrome | awk ' { print $2 } ' |"
                        " xargs kill -9")
            elif self.eventMode == 'series':
                for i, url in enumerate(urls):
                    numEvents = 0
                    events = self.getEventsFromListingUrl(
                        self.eventMode, url, None, urls, [])
                    if events is None:
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, i + 1, numUrls)
                        logging.info(
                            'No sales events scraped from listing'
                            ' {0} of {1}. Check url: {2}. {3} min.'
                            'elapsed. {4} min. remaining.'.format(
                                i + 1, numUrls, url, durMins, minsLeft))
                        continue
                    for event in events:
                        totalEvents += 1
                        numEvents += 1
                        writer.writerow(event)
                    urlsWithEvents += 1
                    pUrls_writer.writerow([url])
                    # NOTE(review): passes `i` here but `i + 1` in the
                    # no-events branch above - confirm which is intended.
                    durMins, minsLeft = self.timeElapsedLeft(
                        sttm, i, numUrls)
                    logging.info(
                        'Scraped {0} sales events from listing {1}'
                        ' of {2}. Scraped {3} total sales events in'
                        ' {4} min. Estimated time to completion:'
                        ' ~{5} min.'.format(
                            numEvents, i + 1, numUrls, totalEvents,
                            durMins, minsLeft))
            else:
                raise ValueError(
                    'Must specify valid event scraping '
                    'mode: ["parallel", "series"]')

    if numUrls > 0:
        # float() forces true division; with two ints the quotient would be
        # truncated to 0 or 1 on Python 2 before the multiplication.
        self.pctUrlsWithEvents = round(
            float(urlsWithEvents) / origNumUrls * 100.0, 1)
    else:
        self.pctUrlsWithEvents = -999

    logging.info('#' * 100)
    logging.info('#' * 100)
    logging.info(
        'Scraped events from {0} of {1} ({2}%) urls.'.format(
            urlsWithEvents, numUrls, self.pctUrlsWithEvents).center(
                90, ' ').center(100, '#').upper())
    logging.info(
        ('{0} of {1} urls timed out while scraping events.'.format(
            numTimeouts, numUrls).upper().center(90, ' ').center(
                100, '#')))
    logging.info(
        ('Saved {0} events to {1}'.format(
            totalEvents, self.eventFile).upper().center(
                90, ' ').center(100, '#')))
    logging.info('#' * 100)
    logging.info('#' * 100)
def __init__(self, manager: mp.Manager):
    """Create the three manager-backed queues held by this object.

    Args:
        manager: Object whose ``Queue()`` factory is called once for each of
            ``q_in``, ``q_out`` and ``q_err`` (in that order).
    """
    for channel in ("q_in", "q_out", "q_err"):
        setattr(self, channel, manager.Queue())
def __init__(
    self,
    inputShape,
    n_reservoir,
    filterSize=1,
    stride=1,
    borderMode="mirror",
    nWorkers="auto",
    spectralRadius=1.0,
    noiseLevel=0.0,
    inputScaling=None,
    leakingRate=1.0,
    reservoirDensity=0.2,
    randomSeed=None,
    averageOutputWeights=True,
    out_activation=lambda x: x,
    out_inverse_activation=lambda x: x,
    weightGeneration="naive",
    bias=1.0,
    outputBias=1.0,
    outputInputScaling=1.0,
    inputDensity=1.0,
    solver="pinv",
    regressionParameters={},
    activation=B.tanh,
    activationDerivative=lambda x: 1.0 / B.cosh(x)**2,
    chunkSize=16,
):
    """ESN that predicts (steps of) a spatio-temporal time series based on a time series.

    Args:
        inputShape : Shape of the input w/o the time axis, e.g. (W, H) for a 2D input.
        n_reservoir : Number of units in the reservoir.
        filterSize : Size of patches used to predict a single output element. Has to be odd.
        stride : Stride between different patches.
        borderMode : How to handle border values. Choices: mirror, padding, edge, wrap.
        nWorkers : Number of worker processes executed in parallel to solve the problem,
            or "auto" for ``max(cpu_count() - 1, 1)``.
        spectralRadius : Spectral radius of the reservoir's connection/weight matrix.
        noiseLevel : Magnitude of noise that is added to the input while fitting to prevent overfitting.
        inputScaling : Scaling factor of the input.
        leakingRate : Convex combination factor between 0 and 1 that weights current and new state value.
        reservoirDensity : Percentage of non-zero weight connections in the reservoir.
        randomSeed : Seed for random processes, e.g. weight initialization.
        averageOutputWeights : Average output matrices after fitting across all pixels or use a distinct
            matrix per pixel. The former assumes homogeneity of the problem across all pixels.
        out_activation : Final activation function (i.e. activation function of the output).
        out_inverse_activation : Inverse of the final activation function.
        weightGeneration : Algorithm to generate weight matrices. Choices: naive, SORM, advanced, custom.
        bias : Size of the bias added for the internal update process.
        outputBias : Size of the bias added for the final linear regression of the output.
        outputInputScaling : Rescaling factor for the input of the ESN for the regression.
        inputDensity : Percentage of non-zero weights in the input-to-reservoir weight matrix.
        solver : Algorithm to find output matrix. Choices: pinv, lsqr.
        regressionParameters : Arguments to the solving algorithm. For LSQR this controls the
            L2 regularization. Stored as a copy.
        activation : (Non-linear) Activation function.
        activationDerivative : Derivative of the activation function.
        chunkSize : Internal parameter for the multi-processing. For long time series this should
            be reduced to avoid OOM errors/getting stuck and to reduce memory consumption.

    Raises:
        ValueError: If ``averageOutputWeights`` is true while ``solver`` is not ``"lsqr"``
            (note that the default arguments trigger this), if ``borderMode`` is unknown,
            or if ``filterSize`` is even.
    """
    self._averageOutputWeights = averageOutputWeights
    if averageOutputWeights and solver != "lsqr":
        raise ValueError(
            "`averageOutputWeights` can only be set to `True` when `solver` is set to `lsqr` (Ridge Regression)"
        )

    self._borderMode = borderMode
    if borderMode not in ("mirror", "padding", "edge", "wrap"):
        raise ValueError(
            "`borderMode` must be set to one of the following values: `mirror`, `padding`, `edge` or `wrap`."
        )

    # Copy so that the shared default dict (and the caller's dict) is never
    # aliased by several instances.
    self._regressionParameters = dict(regressionParameters)
    self._solver = solver

    n_inputDimensions = len(inputShape)

    if filterSize % 2 == 0:
        raise ValueError(
            "filterSize has to be an odd number (1, 3, 5, ...).")
    self._filterSize = filterSize
    self._filterWidth = int(np.floor(filterSize / 2))
    self._stride = stride

    self._n_input = int(
        np.power(np.ceil(filterSize / stride), n_inputDimensions))

    self.n_inputDimensions = n_inputDimensions
    self.inputShape = inputShape

    # Width of one output-weight row: inputs + reservoir units + 1.
    n_features = self._n_input + n_reservoir + 1
    if self._averageOutputWeights:
        self._WOuts = None
        self._WOut = B.zeros((1, n_features))
    else:
        self._WOuts = B.empty((np.prod(inputShape), 1, n_features))
        self._WOut = None
    self._xs = B.empty((np.prod(inputShape), n_reservoir, 1))

    if nWorkers == "auto":
        # Leave one core free, but always use at least one worker.
        self._nWorkers = max(cpu_count() - 1, 1)
    else:
        self._nWorkers = nWorkers

    manager = Manager()
    self.sharedNamespace = manager.Namespace()
    # NOTE(review): the guard tests for `fitWorkerID` but then reads
    # `parallelWorkerIDs` - confirm which attribute was intended.
    if not hasattr(self, "fitWorkerID") or self.parallelWorkerIDs is None:
        self.parallelWorkerIDs = manager.Queue()
        for workerID in range(self._nWorkers):
            self.parallelWorkerIDs.put(workerID)

    self._chunkSize = chunkSize

    super(SpatioTemporalESN, self).__init__(
        n_input=self._n_input,
        n_reservoir=n_reservoir,
        n_output=1,
        spectralRadius=spectralRadius,
        noiseLevel=noiseLevel,
        inputScaling=inputScaling,
        leakingRate=leakingRate,
        reservoirDensity=reservoirDensity,
        randomSeed=randomSeed,
        out_activation=out_activation,
        out_inverse_activation=out_inverse_activation,
        weightGeneration=weightGeneration,
        bias=bias,
        outputBias=outputBias,
        outputInputScaling=outputInputScaling,
        inputDensity=inputDensity,
        activation=activation,
        activationDerivative=activationDerivative,
    )