Example #1
def spin_crawl_threads(state, classifiers, MAX_BIT_SIZE, MAX_DL_THREADS, image_path):
    print("Running threads...")
    manager = Manager()

    location_q = manager.Queue(maxsize=16)
    image_q = manager.Queue(maxsize=64)
    state_lock = manager.Lock()

    generate_location = Process(target=generate_location_thread,
                                args=(location_q, MAX_BIT_SIZE),
                                name="generate_location")
    classification = Process(target=classification_thread,
                             args=(image_q, classifiers, image_path,
                                   state, state_lock), name="classification")
    download_image_t = Process(target=download_image_thread,
                               args=(location_q, image_q, MAX_DL_THREADS),
                               name="download_image")

    download_image_t.start()
    classification.start()
    generate_location.start()

    def kill_threads():
        for thread in active_children():
            thread.terminate()

    atexit.register(kill_threads)

    download_image_t.join()
    classification.join()
    generate_location.join()
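The three worker functions targeted by the `Process` objects are not part of the snippet. A minimal sketch of what they could look like, keeping the queue and lock wiring from `spin_crawl_threads` (the bodies and the dict-like `state` are assumptions, not the original code):

import random

# Hypothetical worker bodies (the real ones are not shown); they only illustrate
# how the bounded Manager queues and the shared lock fit together.
def generate_location_thread(location_q, max_bit_size):
    while True:
        # placeholder "location": a random integer of at most max_bit_size bits
        location_q.put(random.getrandbits(max_bit_size))   # blocks once 16 items are queued

def download_image_thread(location_q, image_q, max_dl_threads):
    while True:
        location = location_q.get()
        image_q.put(("image-for", location))                # placeholder download result

def classification_thread(image_q, classifiers, image_path, state, state_lock):
    while True:
        image = image_q.get()
        with state_lock:                                    # Manager lock guards the shared state
            state["processed"] = state.get("processed", 0) + 1   # assumes state is a Manager dict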
Example #2
def main():
    """
    main
    """
    file_to_attack = './data/example_files/S_hecht_submission_3.csv'
    method_order = 'param'
    nb_element = 1
    month_spliter(file_to_attack)
    manager = Manager()
    queue_list = [manager.Queue(1) for _ in range(13)]
    with Pool(6) as p:
        print("HEY")
        # Assumed fix: the original call passed queue_list where Pool.map expects an
        # integer chunksize; pair each month index with its queue instead
        # (assumes maker(month, queue)).
        p.starmap(maker, zip(range(13), queue_list))
    for queue in queue_list:
        GUESS_PART.append(queue.get())
    GUESS_PART.sort()
    for i in range(NB_MONTH):
        char = "guess_par_t" + str(i) + ".json"
        with open(char, "w") as jsdump:
            json.dump(GUESS_PART[i][1], jsdump, indent=4)
    write_csv(GUESS_PART[0][1], GUESS_PART[1][1], GUESS_PART[2][1],
              GUESS_PART[3][1], GUESS_PART[4][1], GUESS_PART[5][1],
              GUESS_PART[6][1], GUESS_PART[7][1], GUESS_PART[8][1],
              GUESS_PART[9][1], GUESS_PART[10][1], GUESS_PART[11][1],
              GUESS_PART[12][1])
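`main` relies on a `maker` worker whose definition is not included. A minimal sketch, assuming `maker(month, queue)` computes one month's guesses and puts a `(month, result)` pair on its queue, which is the shape the `queue.get()` / `sort()` / `json.dump(...[1]...)` code above expects:

def maker(month, queue):
    # Hypothetical worker: build this month's guess table, then hand it back
    # through the Manager queue so the parent process can collect and sort it.
    guesses = {"month": month}          # placeholder for the real guessing logic
    queue.put((month, guesses))         # (sort key, payload), as consumed by main()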
Example #3
    def predict(self, inputData, transientTime=0, update_processor=lambda x: x, verbose=0):
        rank = len(inputData.shape) - 1

        if rank != self.n_inputDimensions:
            raise ValueError(
                "The `inputData` does not have a suitable shape. It has to have {0} spatial dimensions and 1 temporal dimension.".format(
                    self.n_inputDimensions))

        manager = Manager()
        predictQueue = manager.Queue()

        # workaround as predict does not support batches atm
        # add dummy dimension to let embedInputData work properly (is optimized to work for batches)
        inputData = inputData.reshape(1, *inputData.shape)
        modifiedInputData = self._embedInputData(inputData)
        modifiedInputData = modifiedInputData[0]
        inputData = inputData[0]

        self.transientTime = transientTime
        self.sharedNamespace.transientTime = transientTime
        predictionOutput = B.zeros(np.insert(self.inputShape, 0, inputData.shape[0] - transientTime))

        jobs = np.stack(np.meshgrid(*[np.arange(x) + self._filterWidth for x in inputData.shape[1:]]),
                        axis=rank).reshape(-1, rank).tolist()
        nJobs = len(jobs)

        self.resetState()

        iterator = PredictionArrayIterator(modifiedInputData, jobs, self._filterWidth, self._stride, self)

        pool = Pool(processes=self._nWorkers, initializer=SpatioTemporalESN._init_predictProcess,
                    initargs=[predictQueue, self])
        pool.map_async(self._predictProcess, iterator, chunksize=200)

        def _processPoolWorkerResults():
            nJobsDone = 0

            if verbose > 0:
                bar = progressbar.ProgressBar(max_value=nJobs, redirect_stdout=True, poll_interval=0.0001)
                bar.update(0)

            while nJobsDone < nJobs:
                data = predictQueue.get()
                # result of predicting
                indices, prediction, state = data
                id = self._uniqueIDFromIndices(indices)
                self._xs[id] = state
                # update the values
                predictionOutput[tuple([Ellipsis] + indices)] = prediction

                nJobsDone += 1
                if verbose > 0:
                    bar.update(nJobsDone)
                    if verbose > 1:
                        print(nJobsDone)

            if verbose > 0:
                bar.finish()

        _processPoolWorkerResults()

        pool.close()

        return predictionOutput
Example #4
    def fit(self, inputData, outputData, transientTime=0, verbose=0):
        rank = len(inputData.shape) - 1

        if rank != self.n_inputDimensions and rank != self.n_inputDimensions + 1:
            raise ValueError(
                "The `inputData` does not have a suitable shape. It has to have {0} spatial dimensions and 1 temporal dimension.".format(
                    self.n_inputDimensions))

        # reshape the input so that it has the shape (timeseries, time, input_dimension^n)
        if rank == self.n_inputDimensions:
            inputData = inputData.reshape(1, *inputData.shape)
            outputData = outputData.reshape(1, *outputData.shape)
        else:
            # modify rank again
            rank -= 1

        partialLength = (inputData.shape[1] - transientTime)
        totalLength = inputData.shape[0] * partialLength
        timeseriesCount = inputData.shape[0]

        manager = Manager()
        fitQueue = manager.Queue()

        modifiedInputData = self._embedInputData(inputData)

        self.sharedNamespace.transientTime = transientTime

        self.sharedNamespace.partialLength = partialLength
        self.sharedNamespace.totalLength = totalLength
        self.sharedNamespace.timeseriesCount = timeseriesCount

        jobs = np.stack(np.meshgrid(*[np.arange(x) + self._filterWidth for x in inputData.shape[2:]]),
                        axis=rank).reshape(-1, rank).tolist()

        nJobs = len(jobs)

        self.resetState()

        iterator = FittingArrayIterator(modifiedInputData, outputData, jobs, self._filterWidth, self._stride, self)

        pool = Pool(processes=self._nWorkers, initializer=SpatioTemporalESN._init_fitProcess, initargs=[fitQueue, self])
        pool.map_async(self._fitProcess, iterator, chunksize=16)

        def _processPoolWorkerResults():
            nJobsDone = 0

            if verbose > 0:
                bar = progressbar.ProgressBar(max_value=nJobs, redirect_stdout=True, poll_interval=0.0001)
                bar.update(0)

            while nJobsDone < nJobs:
                data = fitQueue.get()

                # result of fitting
                indices, x, WOut = data
                id = self._uniqueIDFromIndices(indices)

                if WOut is None:
                    import sys
                    print("WARNING: Fit process for pixel {0} did not succeed".format(indices), file=sys.stderr)

                # store WOut
                if self._averageOutputWeights:
                    if WOut is not None:
                        self._WOut += WOut / np.prod(self.inputShape)
                else:
                    self._WOuts[id] = WOut

                # store x
                self._xs[id] = x

                nJobsDone += 1
                if verbose > 0:
                    bar.update(nJobsDone)
                    if verbose > 1:
                        print(nJobsDone)

            if verbose > 0:
                bar.finish()

        _processPoolWorkerResults()

        pool.close()
Example #5
    def __init__(self, inputShape, n_reservoir,
                 filterSize=1, stride=1, borderMode="mirror", nWorkers="auto",
                 spectralRadius=1.0, noiseLevel=0.0, inputScaling=None,
                 leakingRate=1.0, reservoirDensity=0.2, randomSeed=None, averageOutputWeights=True,
                 out_activation=lambda x: x, out_inverse_activation=lambda x: x,
                 weightGeneration='naive', bias=1.0, outputBias=1.0,
                 outputInputScaling=1.0, inputDensity=1.0, solver='pinv', regressionParameters={}, activation=B.tanh,
                 activationDerivation=lambda x: 1.0 / B.cosh(x) ** 2):

        self._averageOutputWeights = averageOutputWeights
        if averageOutputWeights and solver != "lsqr":
            raise ValueError(
                "`averageOutputWeights` can only be set to `True` when `solver` is set to `lsqr` (Ridge Regression)")

        self._borderMode = borderMode
        if borderMode not in ["mirror", "padding", "edge", "wrap"]:
            raise ValueError(
                "`borderMode` must be set to one of the following values: `mirror`, `padding`, `edge` or `wrap`.")

        self._regressionParameters = regressionParameters
        self._solver = solver

        n_inputDimensions = len(inputShape)

        if filterSize % 2 == 0:
            raise ValueError("filterSize has to be an odd number (1, 3, 5, ...).")
        self._filterSize = filterSize
        self._filterWidth = int(np.floor(filterSize / 2))
        self._stride = stride

        self._n_input = int(np.power(np.ceil(filterSize / stride), n_inputDimensions))

        self.n_inputDimensions = n_inputDimensions
        self.inputShape = inputShape

        if not self._averageOutputWeights:
            self._WOuts = B.empty((np.prod(inputShape), 1, self._n_input + n_reservoir + 1))
            self._WOut = None
        else:
            self._WOuts = None
            self._WOut = B.zeros((1, self._n_input + n_reservoir + 1))
        self._xs = B.empty((np.prod(inputShape), n_reservoir, 1))

        if nWorkers == "auto":
            self._nWorkers = np.max((cpu_count() - 1, 1))
        else:
            self._nWorkers = nWorkers

        manager = Manager()
        self.sharedNamespace = manager.Namespace()
        if hasattr(self, "fitWorkerID") == False or self.parallelWorkerIDs is None:
            self.parallelWorkerIDs = manager.Queue()
            for i in range(self._nWorkers):
                self.parallelWorkerIDs.put((i))

        super(SpatioTemporalESN, self).__init__(n_input=self._n_input, n_reservoir=n_reservoir, n_output=1,
                                                spectralRadius=spectralRadius,
                                                noiseLevel=noiseLevel, inputScaling=inputScaling,
                                                leakingRate=leakingRate, reservoirDensity=reservoirDensity,
                                                randomSeed=randomSeed, out_activation=out_activation,
                                                out_inverse_activation=out_inverse_activation,
                                                weightGeneration=weightGeneration, bias=bias, outputBias=outputBias,
                                                outputInputScaling=outputInputScaling,
                                                inputDensity=inputDensity, activation=activation,
                                                activationDerivation=activationDerivation)

        """
Example #6
    def writeEventsToCsv(self, urls, processedUrlsFName, batchSize=20):
        numUrls = len(urls)
        origNumUrls = numUrls
        urlsWithEvents = 0
        totalEvents = 0
        processedListings = 0
        numTimeouts = 0

        try:
            with open(processedUrlsFName, 'r') as pus:
                pUrls = list(set(pus.read().split('\r\n')))
            logging.info(
                'Already processed {0} of {1} urls. Picking up where we'
                ' left off.'.format(len(pUrls), numUrls))
            urls = [url for url in urls if url not in pUrls]
            numUrls = len(urls)
        except IOError:
            pass

        with open(processedUrlsFName, 'a+') as pus:
            pUrls_writer = csv.writer(pus)
            with open(self.eventFile, 'a+') as f:
                writer = csv.writer(f)
                sttm = time.time()

                if self.eventMode == 'parallel':
                    batches = [
                        urls[x:x + batchSize]
                        for x in xrange(0, len(urls), batchSize)]
                    for b, batch in enumerate(batches):
                        logging.info('Starting batch {0} of  {1}'.format(
                            b + 1, len(batches)))
                        manager = Manager()
                        batchQueue = Queue()
                        batchTimeoutList = manager.list()
                        batchProcessedUrls = manager.list()
                        batchEventQueue = manager.Queue()
                        batchEventsSaved = manager.Value('i', 0)
                        jobs = []
                        for i, url in enumerate(batch):
                            batchQueue.put(
                                [self.eventMode, url, batchEventQueue,
                                 batchProcessedUrls, batchTimeoutList])
                        for i in range(len(batch)):
                            proc = Process(
                                target=self.eventWorker, args=(batchQueue,))
                            proc.start()
                            jobs.append(proc)
                        writeProc = Process(
                            target=self.writeToCsvWorker, args=(
                                batchEventQueue, batchEventsSaved))
                        time.sleep(2)
                        writeProc.start()
                        for j, job in enumerate(jobs):
                            # 5 seconds per url for each process before timeout
                            job.join(max(60, 5 * len(batch)))
                            if job.is_alive():
                                job.terminate()
                                logging.info(
                                    'Subprocess {0} of {1} timed out'.format(
                                        j + 1, min(24, len(batch))))
                        writeProc.join(max(60, 8 * len(batch)))
                        totalEvents += batchEventsSaved.value
                        processedListings += len(batch)
                        for url in set(list(batchProcessedUrls)):
                            pUrls_writer.writerow([url])
                        urlsWithEvents += len(set(list(batchProcessedUrls)))
                        numTimeouts += len(set(list(batchTimeoutList)))
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, b + 1, len(batches))
                        logging.info(
                            'Saved {0} new events from {1} of {2} listings. '
                            '\nEstimated time to '
                            'completion: ~{3} min.'.format(
                                batchEventsSaved.value,
                                len(batchProcessedUrls), len(batch), minsLeft))
                        os.system(
                            "ps aux | grep chrome | awk ' { print $2 } ' |"
                            " xargs kill -9")

                elif self.eventMode == 'series':
                    for i, url in enumerate(urls):
                        numEvents = 0
                        events = self.getEventsFromListingUrl(
                            self.eventMode, url, None, urls, [])
                        if events is None:
                            durMins, minsLeft = self.timeElapsedLeft(
                                sttm, i + 1, numUrls)
                            logging.info(
                                'No sales events scraped from listing'
                                ' {0} of {1}. Check url: {2}. {3} min.'
                                'elapsed. {4} min. remaining.'.format(
                                    i + 1, numUrls, url, durMins,
                                    minsLeft))
                            continue
                        for event in events:
                            totalEvents += 1
                            numEvents += 1
                            writer.writerow(event)
                        urlsWithEvents += 1
                        pUrls_writer.writerow([url])
                        durMins, minsLeft = self.timeElapsedLeft(
                            sttm, i, numUrls)
                        if (i + 1) % 1 == 0:
                            logging.info(
                                'Scraped {0} sales events from listing {1}'
                                ' of {2}. Scraped {3} total sales events in'
                                ' {4} min. Estimated time to completion:'
                                ' ~{5} min.'.format(
                                    numEvents, i + 1, numUrls, totalEvents,
                                    durMins, minsLeft))
                else:
                    raise ValueError(
                        'Must specify valid event scraping '
                        'mode: ["parallel", "series"]')
        if numUrls > 0:
            # float() keeps the percentage correct under Python 2's integer division
            self.pctUrlsWithEvents = round(
                urlsWithEvents / float(origNumUrls) * 100.0, 1)
        else:
            self.pctUrlsWithEvents = -999

        logging.info('#' * 100)
        logging.info('#' * 100)
        logging.info(
            'Scraped events from {0} of {1} ({2}%) urls.'.format(
                urlsWithEvents, numUrls, self.pctUrlsWithEvents).center(
                90, ' ').center(100, '#').upper())
        logging.info(
            ('{0} of {1} urls timed out while scraping events.'.format(
                numTimeouts, numUrls).upper().center(90, ' ').center(
                100, '#')))
        logging.info(
            ('Saved {0} events to {1}'.format(
                totalEvents, self.eventFile).upper().center(
                90, ' ').center(100, '#')))
        logging.info('#' * 100)
        logging.info('#' * 100)
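Each batch above creates fresh Manager-backed objects (a list, a `Value` and a `Queue`) so child processes can report results without sharing memory directly. A stripped-down, self-contained sketch of that pattern (placeholder worker, not the scraper's code):

from multiprocessing import Manager, Process

def worker(done_urls, counter, lock, url):
    # Manager proxies can be passed to child processes and mutated there;
    # the parent sees the updates after join().
    done_urls.append(url)               # each proxied append is a single, serialized call
    with lock:                          # guard the read-modify-write on the Value proxy
        counter.value += 1

if __name__ == "__main__":
    manager = Manager()
    done_urls = manager.list()
    counter = manager.Value("i", 0)
    lock = manager.Lock()
    procs = [Process(target=worker, args=(done_urls, counter, lock, u))
             for u in ["url-1", "url-2", "url-3"]]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(list(done_urls), counter.value)   # all three urls (any order), 3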
Example #7
    def __init__(self, manager: mp.Manager):
        self.q_in = manager.Queue()
        self.q_out = manager.Queue()
        self.q_err = manager.Queue()
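One way such a helper could be used: build it in the parent with a single `Manager` and hand the proxies to worker processes. The `Channels` name, the sentinel protocol and the worker body below are illustrative assumptions, not part of the example:

import multiprocessing as mp

class Channels:
    """Input/output/error queues backed by one shared Manager, as in the example above."""
    def __init__(self, manager: mp.Manager):
        self.q_in = manager.Queue()
        self.q_out = manager.Queue()
        self.q_err = manager.Queue()

def worker(ch):
    while True:
        item = ch.q_in.get()
        if item is None:                # sentinel: shut the worker down
            break
        try:
            ch.q_out.put(item * 2)      # placeholder "work"
        except Exception as exc:        # report failures on the error queue
            ch.q_err.put(exc)

if __name__ == "__main__":
    manager = mp.Manager()
    ch = Channels(manager)
    proc = mp.Process(target=worker, args=(ch,))
    proc.start()
    for i in range(3):
        ch.q_in.put(i)
    ch.q_in.put(None)                   # tell the worker to stop
    proc.join()
    while not ch.q_out.empty():
        print(ch.q_out.get())           # 0, 2, 4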
Example #8
    def __init__(
        self,
        inputShape,
        n_reservoir,
        filterSize=1,
        stride=1,
        borderMode="mirror",
        nWorkers="auto",
        spectralRadius=1.0,
        noiseLevel=0.0,
        inputScaling=None,
        leakingRate=1.0,
        reservoirDensity=0.2,
        randomSeed=None,
        averageOutputWeights=True,
        out_activation=lambda x: x,
        out_inverse_activation=lambda x: x,
        weightGeneration="naive",
        bias=1.0,
        outputBias=1.0,
        outputInputScaling=1.0,
        inputDensity=1.0,
        solver="pinv",
        regressionParameters={},
        activation=B.tanh,
        activationDerivative=lambda x: 1.0 / B.cosh(x)**2,
        chunkSize=16,
    ):
        """ ESN that predicts (steps of) a spatio-temporal time series based on a time series.

            Args:
                inputShape : Shape of the input w/o the time axis, e.g. (W, H) for a 2D input.
                n_reservoir : Number of units in the reservoir.
                filterSize : Size of patches used to predict a single output element.
                stride : Stride between different patches.
                borderMode : How to handle border values. Choices: mirror, padding, edge, wrap.
                nWorkers : Number of CPU threads executed in parallel to solve the problem.
                spectralRadius : Spectral radius of the reservoir's connection/weight matrix.
                noiseLevel : Magnitude of noise that is added to the input while fitting to prevent overfitting.
                inputScaling : Scaling factor of the input.
                leakingRate : Convex combination factor between 0 and 1 that weights current and new state value.
                reservoirDensity : Percentage of non-zero weight connections in the reservoir.
                randomSeed : Seed for random processes, e.g. weight initialization.
                averageOutputWeights : Average output matrices after fitting across all pixels or use a distinct matrix
                                        per pixel. The former assumes homogeneity of the problem across all pixels.
                out_activation : Final activation function (i.e. activation function of the output).
                out_inverse_activation : Inverse of the final activation function
                weightGeneration : Algorithm to generate weight matrices. Choices: naive, SORM, advanced, custom
                bias : Size of the bias added for the internal update process.
                outputBias : Size of the bias added for the final linear regression of the output.
                outputInputScaling : Rescaling factor for the input of the ESN for the regression.
                inputDensity : Percentage of non-zero weights in the input-to-reservoir weight matrix.
                solver : Algorithm to find output matrix. Choices: pinv, lsqr.
                regressionParameters : Arguments to the solving algorithm. For LSQR this controls the L2 regularization.
                activation : (Non-linear) Activation function.
                activationDerivative : Derivative of the activation function.
                chunkSize : Internal parameter for the multi-threading. For long time series this should be reduced to
                            avoid OOM errors/getting stuck and to reduce memory consumption.
        """

        self._averageOutputWeights = averageOutputWeights
        if averageOutputWeights and solver != "lsqr":
            raise ValueError(
                "`averageOutputWeights` can only be set to `True` when `solver` is set to `lsqr` (Ridge Regression)"
            )

        self._borderMode = borderMode
        if borderMode not in ["mirror", "padding", "edge", "wrap"]:
            raise ValueError(
                "`borderMode` must be set to one of the following values: `mirror`, `padding`, `edge` or `wrap`."
            )

        self._regressionParameters = regressionParameters
        self._solver = solver

        n_inputDimensions = len(inputShape)

        if filterSize % 2 == 0:
            raise ValueError(
                "filterSize has to be an odd number (1, 3, 5, ...).")
        self._filterSize = filterSize
        self._filterWidth = int(np.floor(filterSize / 2))
        self._stride = stride

        self._n_input = int(
            np.power(np.ceil(filterSize / stride), n_inputDimensions))

        self.n_inputDimensions = n_inputDimensions
        self.inputShape = inputShape

        if not self._averageOutputWeights:
            self._WOuts = B.empty(
                (np.prod(inputShape), 1, self._n_input + n_reservoir + 1))
            self._WOut = None
        else:
            self._WOuts = None
            self._WOut = B.zeros((1, self._n_input + n_reservoir + 1))
        self._xs = B.empty((np.prod(inputShape), n_reservoir, 1))

        if nWorkers == "auto":
            self._nWorkers = np.max((cpu_count() - 1, 1))
        else:
            self._nWorkers = nWorkers

        manager = Manager()
        self.sharedNamespace = manager.Namespace()
        if not hasattr(self, "fitWorkerID") or self.parallelWorkerIDs is None:
            self.parallelWorkerIDs = manager.Queue()
            for i in range(self._nWorkers):
                self.parallelWorkerIDs.put(i)

        self._chunkSize = chunkSize

        super(SpatioTemporalESN, self).__init__(
            n_input=self._n_input,
            n_reservoir=n_reservoir,
            n_output=1,
            spectralRadius=spectralRadius,
            noiseLevel=noiseLevel,
            inputScaling=inputScaling,
            leakingRate=leakingRate,
            reservoirDensity=reservoirDensity,
            randomSeed=randomSeed,
            out_activation=out_activation,
            out_inverse_activation=out_inverse_activation,
            weightGeneration=weightGeneration,
            bias=bias,
            outputBias=outputBias,
            outputInputScaling=outputInputScaling,
            inputDensity=inputDensity,
            activation=activation,
            activationDerivative=activationDerivative,
        )
        """