Example 1
def sygusCompetition(checkpoints, tasks):
    from pathos.multiprocessing import Pool
    import datetime
    import pickle  # needed below when exporting the results

    # map from task to list of search times, one for each checkpoint.
    # search time will be None if it is not solved
    searchTimes = {t: [] for t in tasks}

    CPUs = int(8 / len(checkpoints))
    maxWorkers = int(numberOfCPUs() / CPUs)
    workers = Pool(maxWorkers)
    eprint(
        f"You gave me {len(checkpoints)} checkpoints to ensemble. Each checkpoint will get {CPUs} CPUs. Creating a pool of {maxWorkers} worker processes."
    )
    timeout = 3600

    promises = []
    for t in tasks:
        for checkpoint in checkpoints:
            promise = workers.apply_async(competeOnOneTask, (checkpoint, t), {
                "CPUs": CPUs,
                "timeout": timeout
            })
            promises.append(promise)
    eprint(f"Queued {len(promises)} jobs.")
    for promise in promises:
        dt, task = promise.get()
        if dt is not None:
            searchTimes[task].append(dt)

    searchTimes = {
        t: min(ts) if len(ts) > 0 else None
        for t, ts in searchTimes.items()
    }

    fn = "experimentOutputs/text_competition_%s.p" % (
        datetime.datetime.now().isoformat())
    with open(fn, "wb") as handle:
        pickle.dump(searchTimes, handle)
    eprint()

    hits = sum(t is not None for t in searchTimes.values())
    total = len(searchTimes)
    percentage = 100 * hits / total
    eprint("Hits %d/%d = %f\n" % (hits, total, percentage))
    eprint()
    eprint("Exported competition results to", fn)
Example 2
def backgroundHelmholtzEnumeration(tasks,
                                   g,
                                   timeout,
                                   _=None,
                                   special=None,
                                   evaluationTimeout=None):
    from pathos.multiprocessing import Pool
    requests = list({t.request for t in tasks})
    inputs = {
        r: list({
            tuplify(xs)
            for t in tasks if t.request == r for xs, y in t.examples
        })
        for r in requests
    }
    workers = Pool(len(requests))
    promises = [
        workers.apply_async(helmholtzEnumeration,
                            args=(g, r, inputs[r], float(timeout)),
                            kwds={
                                'special': special,
                                'evaluationTimeout': evaluationTimeout
                            }) for r in requests
    ]

    def get():
        results = [p.get() for p in promises]
        frontiers = []
        with timing("(Helmholtz enumeration) Decoded json into frontiers"):
            for request, result in zip(requests, results):
                response = json.loads(result.decode("utf-8"))
                for b, entry in enumerate(response):
                    frontiers.append(
                        Frontier([
                            FrontierEntry(program=Program.parse(p),
                                          logPrior=entry["ll"],
                                          logLikelihood=0.)
                            for p in entry["programs"]
                        ],
                                 task=Task(str(b), request, [])))
        eprint("Total number of Helmholtz frontiers:", len(frontiers))
        return frontiers

    return get
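The noteworthy part here is that the function returns a get closure rather than results, so enumeration keeps running in the background while the caller does other work. A minimal sketch of that deferred-collection pattern, with a dummy slow_square worker standing in for helmholtzEnumeration:

from multiprocessing import Pool

def slow_square(x):
    return x * x

def background_map(xs, processes=2):
    """Start the jobs now; return a callable that blocks for results later."""
    pool = Pool(processes)
    promises = [pool.apply_async(slow_square, (x,)) for x in xs]

    def get():
        results = [p.get() for p in promises]
        pool.close()
        pool.join()
        return results

    return get

if __name__ == "__main__":
    get = background_map([1, 2, 3, 4])
    # ... do other work here while the pool runs in the background ...
    print(get())  # [1, 4, 9, 16]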
Example 3
    def run(self, integrator):
        if self.n_cores == 1:  # run single thread
            self.out_img = integrator.run(self.rows_pool[0], self.cols_pool[0],
                                          self.camera.get_ray,
                                          self.world)[0].reshape(
                                              (self.height, self.width, 3))
        else:  # run with multiple worker processes
            pool = Pool(processes=self.n_cores)  # create a pool of worker processes
            results = [
                pool.apply_async(integrator.run,
                                 args=(self.rows_pool[core_idx],
                                       self.cols_pool[core_idx],
                                       self.camera.get_ray, self.world))
                for core_idx in range(self.n_cores)
            ]

            output = [p.get() for p in results]  # get results

            # map results to the resulting image
            for out in output:
                self.out_img[out[1], out[2], :] = out[0]
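A runnable sketch of the same split-and-reassemble idea, with a made-up render_rows in place of integrator.run; the row-only chunking and the (pixels, row_indices) return value are simplifications for illustration:

import numpy as np
from multiprocessing import Pool

def render_rows(rows, width):
    """Stand-in for integrator.run: return (pixels, row_indices)."""
    pixels = np.full((len(rows), width, 3), float(rows[0]), dtype=np.float32)
    return pixels, rows

if __name__ == "__main__":
    height, width, n_cores = 8, 4, 2
    out_img = np.zeros((height, width, 3), dtype=np.float32)
    row_chunks = np.array_split(np.arange(height), n_cores)
    with Pool(processes=n_cores) as pool:
        results = [pool.apply_async(render_rows, args=(rows, width))
                   for rows in row_chunks]
        for pixels, rows in (r.get() for r in results):
            out_img[rows, :, :] = pixels  # map each chunk back into the image
    print(out_img[:, 0, 0])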
Example 4
def estimate_param_scan(estimator,
                        X,
                        param_sets,
                        evaluate=None,
                        evaluate_args=None,
                        failfast=True,
                        return_estimators=False,
                        n_jobs=1,
                        progress_reporter=None,
                        show_progress=True,
                        return_exceptions=False):
    """ Runs multiple estimations using a list of parameter settings

    Parameters
    ----------
    estimator : Estimator object or class
        An estimator object that provides an estimate(X, **params) function.
        If only a class is provided here, the Estimator objects will be
        constructed with default parameter settings, and the parameter settings
        from param_sets for each estimation. If you want to specify other
        parameter settings for those parameters not specified in param_sets,
        construct an Estimator before and pass the object.

    param_sets : iterable over dictionaries
        An iterable that provides parameter settings. Each element defines a
        parameter set, for which an estimation will be run using these
        parameters in estimate(X, **params). All other parameter settings will
        be taken from the default settings in the estimator object.

    evaluate : str or list of str, optional
        The given methods or properties will be called on the estimated
        models, and their results will be returned instead of the full models.
        This may be useful for reducing memory overhead.

    evaluate_args: iterable of iterable, optional
        Arguments to be passed to the evaluated methods. Note that its length
        has to match the length of evaluate.

    failfast : bool
        If True, raise an exception when an estimation fails with an exception
        or when a requested method or property does not exist. If False, simply
        return None in these cases.

    return_estimators: bool
        If True, return a list of estimators in addition to the models.

    show_progress: bool
        If the given estimator supports the show_progress interface, this flag
        is set on it prior to running the estimations.

    return_exceptions: bool, default=False
        If failfast is False and this setting is True, return the exception
        raised at the corresponding grid element instead of None.

    Returns
    -------
    models : list of model objects or evaluated function values
        A list of estimated models in the same order as param_sets. If evaluate
        is given, each element will contain the results from these method
        evaluations.

    estimators (optional) : list of estimator objects. These are returned only
        if return_estimators=True

    Examples
    --------

    Estimate a maximum likelihood Markov model at lag times 1, 2, 3.

    >>> from pyemma.msm.estimators import MaximumLikelihoodMSM, BayesianMSM
    >>>
    >>> dtraj = [0,0,1,2,1,0,1,0,1,2,2,0,0,0,1,1,2,1,0,0,1,2,1,0,0,0,1,1,0,1,2]  # mini-trajectory
    >>> param_sets=param_grid({'lag': [1,2,3]})
    >>>
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate='timescales')
    [array([ 1.24113168,  0.77454377]), array([ 2.65266698,  1.42909842]), array([ 5.34810405,  1.14784446])]

    Now we also ask for samples of the timescales, first with the MaximumLikelihoodMSM.
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, failfast=False,
    ...     evaluate=['timescales', 'timescales_samples']) # doctest: +SKIP
    [[array([ 1.24113168,  0.77454377]), None], [array([ 2.48226337,  1.54908754]), None], [array([ 3.72339505,  2.32363131]), None]]

    We get Nones because the MaximumLikelihoodMSM estimator doesn't provide timescales_samples. Use for example
    a Bayesian estimator for that.

    Now we also want to get samples of the timescales using the BayesianMSM.
    >>> estimate_param_scan(BayesianMSM, dtraj, param_sets, show_progress=False,
    ...     evaluate=['timescales', 'sample_f'], evaluate_args=((), ('timescales', ))) # doctest: +SKIP
    [[array([ 1.24357685,  0.77609028]), [array([ 1.5963252 ,  0.73877883]), array([ 1.29915847,  0.49004912]), array([ 0.90058583,  0.73841786]), ... ]]

    """
    # make sure we have an estimator object
    estimator = get_estimator(estimator)
    if hasattr(estimator, 'show_progress'):
        estimator.show_progress = show_progress

    # if we want to return estimators, make clones. Otherwise just copy references.
    # For parallel processing we always need clones.
    # Also if the Estimator is its own Model, we have to clone.
    from pyemma._base.model import Model
    if (return_estimators or n_jobs > 1 or n_jobs is None
            or isinstance(estimator, Model)):
        estimators = [clone_estimator(estimator) for _ in param_sets]
    else:
        estimators = [estimator for _ in param_sets]

    # if we evaluate, make sure we have a list of functions to evaluate
    if _types.is_string(evaluate):
        evaluate = [evaluate]
    if _types.is_string(evaluate_args):
        evaluate_args = [evaluate_args]

    if evaluate is not None and evaluate_args is not None and len(
            evaluate) != len(evaluate_args):
        raise ValueError(
            "length mismatch: evaluate ({}) and evaluate_args ({})".format(
                len(evaluate), len(evaluate_args)))

    show_progress = progress_reporter is not None and show_progress
    if show_progress:
        progress_reporter._progress_register(len(estimators),
                                             stage=0,
                                             description="estimating %s" %
                                             str(estimator.__class__.__name__))

    if n_jobs > 1 and os.name == 'posix':
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug('estimating %s with n_jobs=%s',
                                       estimator, n_jobs)
        # iterate over parameter settings
        task_iter = ((estimator, param_set, X, evaluate, evaluate_args,
                      failfast, return_exceptions)
                     for estimator, param_set in zip(estimators, param_sets))

        from pathos.multiprocessing import Pool as Parallel
        pool = Parallel(processes=n_jobs)
        args = list(task_iter)
        if show_progress:
            from pyemma._base.model import SampledModel
            for a in args:
                if isinstance(a[0], SampledModel):
                    a[0].show_progress = False

            def callback(_):
                progress_reporter._progress_update(1, stage=0)
        else:
            callback = None

        import six
        if six.PY3:

            def error_callback(*args, **kw):
                if failfast:
                    raise Exception('something failed')

            with pool:
                res_async = [
                    pool.apply_async(_estimate_param_scan_worker,
                                     a,
                                     callback=callback,
                                     error_callback=error_callback)
                    for a in args
                ]
                res = [x.get() for x in res_async]
        else:
            try:
                res_async = [
                    pool.apply_async(_estimate_param_scan_worker,
                                     a,
                                     callback=callback) for a in args
                ]
                res = [x.get() for x in res_async]
            finally:
                pool.close()

    # if n_jobs=1 don't invoke the pool, but directly dispatch the iterator
    else:
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug(
                'estimating %s with n_jobs=1 because of the setting or '
                'because this is not a POSIX system', estimator)
        res = []
        if show_progress:
            from pyemma._base.model import SampledModel
            if isinstance(estimator, SampledModel):
                for e in estimators:
                    e.show_progress = False

        for estimator, param_set in zip(estimators, param_sets):
            res.append(
                _estimate_param_scan_worker(estimator, param_set, X, evaluate,
                                            evaluate_args, failfast,
                                            return_exceptions))
            if show_progress:
                progress_reporter._progress_update(1, stage=0)

    if show_progress:
        progress_reporter._progress_force_finish(0)

    # done
    if return_estimators:
        return res, estimators
    else:
        return res
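Stripped of the library plumbing, the parallel branch above amounts to one apply_async per parameter set plus a callback for progress and an error_callback for failures. A minimal sketch under that reading, with a made-up fit_one worker in place of _estimate_param_scan_worker:

from multiprocessing import Pool

def fit_one(param_set, data):
    """Stand-in worker: 'estimate' one model for one parameter set."""
    if param_set.get("lag", 1) <= 0:
        raise ValueError("lag must be positive")
    return {"lag": param_set["lag"], "score": sum(data) * param_set["lag"]}

if __name__ == "__main__":
    data = [0, 1, 1, 2]
    param_sets = [{"lag": 1}, {"lag": 2}, {"lag": 3}]
    done = []

    def callback(result):       # runs in the parent for each finished job
        done.append(result["lag"])

    def error_callback(exc):    # runs instead of callback when a job raises
        print("worker failed:", exc)

    with Pool(processes=2) as pool:
        res_async = [pool.apply_async(fit_one, (p, data),
                                      callback=callback,
                                      error_callback=error_callback)
                     for p in param_sets]
        models = [r.get() for r in res_async]
    print(models)
    print("finished lags:", done)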
Example 5
    def learnMulti(self, gn, top_n=5):
        """
		Learn using multiprocessing

		gn: initial grammar
		top_n: save the best top_n grammars
		"""

        # settings
        max_process = cpu_count()  # number of worker processes to run in parallel

        pool = Pool(max_process)
        P_LOCK = Manager().Lock()  # mutex lock for printing
        gidCounter = 0
        results = []
        gList = [gn]
        gnBest = [deepcopy(gn)]
        bestmdl_last = gn.mdl  # best MDL on the upper level nodes

        # uncomment this for debugging
        #history_pri_lik= set() # history of prior & likelihood values

        while len(gList) > 0:
            gn = gList.pop(0)  # grammar node to be expanded
            self.printMsg(
                1, '>> gList size: %d, bestMDL: %.3f (#%d)' %
                (len(gList), gnBest[0].mdl, gnBest[0].gid))

            # substitute if possible
            ntlist = self.getFirstDLThack(gn.dlt)
            if len(ntlist) > 0:
                self.printMsg(2,
                              '>> Possible substitutions on #%d:' % (gn.gid),
                              ntlist)
                while len(ntlist) > 0:
                    gidCounter += 1
                    argList = [gn, ntlist.pop(), gidCounter, P_LOCK]
                    results.append(
                        pool.apply_async(self.substituteMulti, [argList]))
            else:
                self.printMsg(
                    2, '>> No more SUBSTITUTE possible on #%d\n' % gn.gid)

            # merge if possible
            ntlist = self.mergeSet(gn.g)
            if len(ntlist) > 0:
                self.printMsg(2, '>> Possible merges on #%d:' % (gn.gid),
                              ntlist)
                while len(ntlist) > 0:
                    gidCounter += 1
                    argList = [gn, ntlist.pop(), gidCounter, P_LOCK]
                    results.append(pool.apply_async(self.mergeMulti,
                                                    [argList]))
            else:
                self.printMsg(2,
                              '>> No more MERGE possible on #%d.\n' % gn.gid)

            del gn
            delList = []
            bestmdl = bestmdl_last

            # search next level in the search tree
            while len(results) >= 1:
                time.sleep(0.001)  # avoid wasting resources while polling
                for r in range(len(results)):
                    if results[r].ready():
                        delList.append(r)
                        gn_new = results[r].get()

                        # uncomment for debugging
                        #if gn_new.lik < self.max_mdl:
                        #	history_pri_lik.add((gn_new.pri,gn_new.lik))

                        # save the best-N grammars
                        if len(gnBest) < top_n:
                            gnBest.append(gn_new)
                            gnBest = sorted(gnBest, key=lambda gn: gn.mdl)
                        elif gn_new.mdl < gnBest[-1].mdl:
                            del gnBest[-1]
                            gnBest.append(deepcopy(gn_new))
                            gnBest = sorted(gnBest, key=lambda gn: gn.mdl)

                        # save this level's best mdl
                        if gn_new.mdl < bestmdl:
                            bestmdl = gn_new.mdl

                        # beam search: compare with the best mdl on the upper level
                        if gn_new.mdl >= bestmdl_last:
                            gn_new.worse += 1
                        if gn_new.worse < BEAMSIZE:
                            gList.append(gn_new)
                        else:
                            del gn_new

                delList.sort()
                for d in range(len(delList)):
                    del results[delList.pop()]
            bestmdl_last = bestmdl

        pool.close()
        pool.join()

        return gnBest
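Rather than blocking on each promise in submission order, this example polls AsyncResult.ready() so finished jobs are harvested as soon as they complete. A compact sketch of that polling loop, with a dummy expand worker in place of substituteMulti/mergeMulti and a plain minimum in place of the MDL bookkeeping:

import time
from multiprocessing import Pool

def expand(candidate):
    """Stand-in for substituteMulti/mergeMulti: score one candidate."""
    time.sleep(0.01)
    return candidate * candidate

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        results = [pool.apply_async(expand, (c,)) for c in range(8)]
        best = float("inf")
        while results:
            time.sleep(0.001)          # avoid busy-waiting at full speed
            still_pending = []
            for r in results:
                if r.ready():          # harvest finished jobs as they arrive
                    best = min(best, r.get())
                else:
                    still_pending.append(r)
            results = still_pending
    print("best score:", best)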
Example 6
def generate_output(args):
    """ Main application Driver
        
        1. Partition filenames into smaller chunks/arrays of image filenames
        2. Generate worker processes
        3. Pass the chunks to the workers
        4. Each worker deduplicates its set of image files
        5. Merge the results from each worker to one python dictionary
        6. OPTIONAL -- Output the deduplicated image files to a directory 
    """
    

    # Partition the list of filenames
    num_chunks = args.num_jobs 
    
    # Create a pool of worker processes
    # Each worker will deduplicate a set of images
    filenames = [] 
    metadata = None
    end_str = ""
    if args.json_metadata is not None:
        metadata,filenames = process_json_file(args.json_metadata)
        end_str = "from metadata file: %s" % args.json_metadata
    else:
        # Find all image files in dump directory
        filenames = find_all_images(args.dump_dir)
        end_str = "from directory: %s" % args.dump_dir

    file_chunks = partition_filenames(filenames, num_chunks)
    print("Found {} images in directory: {}".format(len(filenames), end_str))

    """
    metadata_results = []
    file_chunk_list = list(file_chunks)
    num_proc = len(file_chunk_list)
    print >> sys.stderr, "Printing file chunks"
    print >> sys.stderr, file_chunks
    pool2 = Pool(processes = num_proc)
    with open(args.json_metadata) as json_metadata_file:
        metadata_results = [pool2.map(process_json_line,json_metadata_file, chunk) for index, chunk in enumerate(file_chunk_list)]
    #objs = [p.get() for p in results] 
    metadata =  merge_exact_duplicates(metadata_results)
    """
    
    pool = Pool(processes=num_chunks)

    # Pass the partitions to each worker process
    results = []
    final_dictionary = {}
    if not args.near_duplicates:
        if args.num_jobs == 1:
            # With a single worker, avoid the overhead of starting a separate process
            result = exact_deduplicate_images(filenames)
            dictionaries = [result]
        else:
            # Get the results from each worker
            results = [pool.apply_async(exact_deduplicate_images, args=(index,chunk,)) 
                    for index, chunk in enumerate(file_chunks)]
            dictionaries = [p.get() for p in results]

        # Merge the results into one dictionary
        final_dictionary = merge_exact_duplicates(dictionaries)
    else:
        if args.num_jobs == 1:
            # With a single worker, avoid the overhead of starting a separate process
            result = near_deduplicate_images(filenames, args.bit_distance, metadata = metadata)
            near_duplicate_objects = [result]
        else:
            # Get the results from each near duplicate worker
            if metadata is not None:
                results = [pool.apply_async(near_deduplicate_images, (chunk,args.bit_distance, ), dict(metadata=metadata)) for chunk in file_chunks] 
            else:
                results = [pool.apply_async(near_deduplicate_images, (chunk,args.bit_distance,))  for chunk in file_chunks] 
            # create an array of near duplicate objects
            near_duplicate_objects = [p.get() for p in results]

        # Merge the dictionaries together using the info from their corresponding indexes
        final_dictionary = merge_near_duplicates(near_duplicate_objects)

    print("Number of images prior to deduplication: {}".format(len(filenames)), file=sys.stderr)
    print("Number of images after deduplication: {}".format(len(final_dictionary)), file=sys.stderr)
    
    # Write the image locations to an output file
    
    
    if args.output_json is not None:
        # TODO
        # For now, just do this with exact duplicates
        # Dumping the simhash class to JSON doesn't work because the object isn't
        # JSON serializable 
        outfile_name = args.output_json
        print("Writing to image dictionary to file: {}".format(outfile_name))
        with open(outfile_name, 'w') as outfile:
            json.dump(final_dictionary, outfile, indent=4, skipkeys=True, default=str)

    # Copy the images to an output directory
    create_output_image_directory(args, final_dictionary)

    return len(final_dictionary), len(filenames) - len(final_dictionary)
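The driver follows a partition, apply_async, merge shape. A self-contained sketch of just that skeleton; partition, dedup_chunk, and merge below are toy stand-ins for partition_filenames, exact_deduplicate_images, and merge_exact_duplicates, and the lower-casing "dedup" rule is invented for illustration:

from multiprocessing import Pool

def dedup_chunk(index, chunk):
    """Toy deduplication: map each lower-cased name to one filename (index unused, kept to mirror the example)."""
    return {name.lower(): name for name in chunk}

def merge(dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged

def partition(items, n):
    return [items[i::n] for i in range(n)]

if __name__ == "__main__":
    filenames = ["A.jpg", "a.jpg", "B.jpg", "c.jpg", "C.JPG", "d.jpg"]
    num_jobs = 2
    if num_jobs == 1:                       # skip process start-up overhead
        dictionaries = [dedup_chunk(0, filenames)]
    else:
        with Pool(processes=num_jobs) as pool:
            results = [pool.apply_async(dedup_chunk, args=(i, chunk))
                       for i, chunk in enumerate(partition(filenames, num_jobs))]
            dictionaries = [r.get() for r in results]
    final = merge(dictionaries)
    print(len(filenames), "images before,", len(final), "after deduplication")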
Example 7
reg_loss = 0
nonzeros = 0
for batch in range(1, 2500000):
    if batch % 10 == 0:
        args_to_save = {}
        auxs_to_save = {}
        for k in net.arg_dict:
            args_to_save[k] = net.arg_dict[k].copyto(mx.cpu())
        for k in net.aux_dict:
            auxs_to_save[k] = net.aux_dict[k].copyto(mx.cpu())
        mx.nd.save('args_reg37ss.nd', args_to_save) # avoid device ordinal problem
        mx.nd.save('auxs_reg37ss.nd', auxs_to_save)
    if batch % 20 == 0:
        optimizer.lr /= 2
        anno[:] = anno_np
    data = [pool.apply_async(get_image, ()) for i in range(batch_size)]
#     get_data(batch_size, imgout, anno_np, reg_anno_np)
    for bb in range(batch_size):
        d = data[bb].get()
        imgout[0] = d[0]
        anno[0] = d[1]
        reg_anno[0] = d[2]
        net.forward(is_train=True)
        
        cls_grad = net.outputs[0] - anno
        cls_pred_np = net.outputs[0].asnumpy()
        cls_truth_np = anno.asnumpy()
#    print net.outputs[1].asnumpy()[0,0,50:60,50:60]
#    print reg_anno.asnumpy()[0,0,50:60,50:60]
        precision += np.mean(np.argmax(cls_pred_np, axis=1)==np.argmax(cls_truth_np, axis=1))
#         for i in range(batch_size):
    def __init__(self):
        grid_size = 16
        # HEURISTICS: radius = (1/3)*2^(ENCODING_SIZE)
        # where ENCODING_SIZE is bit size of every pattern element (8 bits for us)
        radius = 24
        # Calculate pattern size based on grid_size and size of a Nibble (4)
        pattern_size = pow(grid_size, 2) / 4
        # Set neural network data size
        RbfNetwork.PATTERN_SIZE = pattern_size
        # Set neural network default radius
        RbfNetwork.DEFAULT_RADIUS = radius
        # Set pattern size in RBF knowledge
        RbfKnowledge.PATTERN_SIZE = pattern_size

        # If there are no persistent memory-related files, create them
        if not os.path.isfile("persistent_memory/sight_snb.p"):
            self.erase_all_knowledge()

        # 3.2.1.1 TODO: use detected processor number, and equation logic 3.1.3.
        # Detect system and determine threads number to use
        detect_system = DetectSystem()
        # Init thread's pool, with the determined threads number
        pool = Pool(detect_system.thread_number(12))

        # SNB
        #self.snb = SensoryNeuralBlock("persistent_memory/sight_snb.p", "persistent_memory/hearing_snb.p")
        self.snb = pool.apply_async(lambda x: SensoryNeuralBlock("persistent_memory/sight_snb.p", "persistent_memory/hearing_snb.p"), [None]).get()
        # Relational Neural Block
        self.rnb = pool.apply_async(lambda x: RelNetwork.deserialize("persistent_memory/rnb.p"), [None]).get()
        # Analytical neuron
        self.analytical_n = pool.apply_async(lambda x: AnalyticalNeuron(), [None]).get()
        # Addition by memory network
        self.am_net = pool.apply_async(lambda x: CulturalNetwork.deserialize("persistent_memory/am_net.p"), [None]).get()
        # Geometric Neural Block
        self.gnb = pool.apply_async(lambda x: GeometricNeuralBlock.deserialize("persistent_memory/gnb.p"), [None]).get()
        # Syllables net
        self.syllables_net = pool.apply_async(lambda x: CulturalNetwork.deserialize("persistent_memory/syllables_net.p"), [None]).get()
        # Words net
        self.words_net = pool.apply_async(lambda x: CulturalNetwork.deserialize("persistent_memory/words_net.p"), [None]).get()
        # Sight-Syllables rel network
        self.ss_rnb = pool.apply_async(lambda x: RelNetwork.deserialize("persistent_memory/ss_rnb.p"), [None]).get()

        # ################### INTENTIONS MODULES ########################################################################
        self.episodic_memory = pool.apply_async(lambda x: EpisodicMemoriesBlock.deserialize("persistent_memory/episodic_memory.p"), [None]).get()
        self.decisions_block = pool.apply_async(lambda x: DecisionsBlock.deserialize("persistent_memory/decisions_block.p"), [None]).get()

        self.internal_state = pool.apply_async(lambda x: InternalState.deserialize("persistent_memory/internal_state.p"), [None]).get()
        self.desired_state = pool.apply_async(lambda x: InternalState.deserialize("persistent_memory/desired_state.p"), [None]).get()

        # Internal state "Ports" (Three components real valued vector)
        self._internal_state_in = None

        # Memory that stores short term bip inputs for making a decision
        self._intentions_short_term_memory = []
        self._output_memory = None
        # ###############################################################################################################

        # _bbcc_words
        self._learning_words = False
        self._learning_syllables = False
        self._enable_bbcc = False

        # Output "ports" (related to senses)
        self.s_knowledge_out = None
        self.h_knowledge_out = None

        # Input "ports" (senses)
        self.s_knowledge_in = None
        self.h_knowledge_in = None

        self._working_domain = "ADDITION"
        self.state = "MISS"
Example 9
def main():
    parser = OptionParser()
    parser.add_option('-n',
                      '--name',
                      dest='name',
                      type=str,
                      action='store',
                      help='Name of the movie file to get.')
    parser.add_option('-y',
                      '--year',
                      dest='year',
                      type=int,
                      action='store',
                      help='Year to look for the movie file to get.')
    parser.add_option(
        '--maxnum',
        dest='maxnum',
        type=int,
        action='store',
        default=10,
        help='Maximum number of torrents to look through. Default is 10.')
    parser.add_option(
        '--timeout',
        dest='timeout',
        type=int,
        action='store',
        default=60,
        help=
        'Timeout on when to quit searching for torrents (in seconds). Default is 60 seconds.'
    )
    #parser.add_option('--any', dest='do_any', action='store_true', default = False,
    #                  help = 'If chosen, make no filter on movie format.')
    parser.add_option('-f',
                      '--filename',
                      dest='filename',
                      action='store',
                      type=str,
                      help='If defined, put option into filename.')
    parser.add_option('--bypass',
                      dest='do_bypass',
                      action='store_true',
                      default=False,
                      help='If chosen, bypass YTS.AG.')
    parser.add_option('--nozooq',
                      dest='do_nozooq',
                      action='store_true',
                      default=False,
                      help='If chosen, bypass ZOOQLE.')
    #parser.add_option('--torrentz', dest='do_torrentz', action='store_true', default=False,
    #                  help = 'If chosen, also look through TORRENTZ to get magnet link.')
    parser.add_option('--info',
                      dest='do_info',
                      action='store_true',
                      default=False,
                      help='If chosen, run in info mode.')
    parser.add_option(
        '--add',
        dest='do_add',
        action='store_true',
        default=False,
        help=
        'If chosen, push the magnet link or torrent file into the deluge server.'
    )
    parser.add_option('--noverify',
                      dest='do_verify',
                      action='store_false',
                      default=True,
                      help='If chosen, do not verify SSL connections.')
    parser.add_option(
        '--timing',
        dest='do_timing',
        action='store_true',
        default=False,
        help=
        'If chosen, show timing information (how long to get movie torrents).')
    parser.add_option(
        '--doRaw',
        dest='do_raw',
        action='store_true',
        default=False,
        help='If chosen, do not use IMDB matching for Jackett torrents.')
    opts, args = parser.parse_args()
    assert (opts.timeout >= 10)
    assert (opts.name is not None)
    if opts.do_info: logging.basicConfig(level=logging.INFO)
    #
    num_both = 0
    if opts.filename is not None: num_both += 1
    if opts.do_add: num_both += 1
    assert (
        num_both != 2
    ), "error: at most one of --filename or --add may be set, not both."
    #
    time0 = time.time()
    tmdb_id = None
    if opts.year is not None:
        tmdb_id = plextmdb.get_movie_tmdbids(opts.name, year=opts.year)
    if not opts.do_bypass:
        try:
            get_movie_yts(opts.name,
                          verify=opts.do_verify,
                          raiseError=True,
                          to_torrent=opts.do_add)
            logging.info('search for YTS torrents took %0.3f seconds.' %
                         (time.time() - time0))
            return
        except ValueError:
            pass

    pool = Pool(processes=4)
    if not opts.do_nozooq:
        jobs = [
            pool.apply_async(get_items_zooqle, args=(opts.name, opts.maxnum))
        ]
    else:
        jobs = []
    #
    ## check for jackett
    if get_jackett_credentials() is None:
        jobs += list(
            map(
                lambda func: pool.apply_async(func,
                                              args=(opts.name, opts.maxnum)),
                (get_items_rarbg, get_items_tpb)))
        #if opts.do_torrentz:
        #    jobs.append( pool.apply_async( get_items_torrentz, args = ( opts.name, opts.maxnum ) ) )
    else:
        jobs.append(
            pool.apply_async(get_items_jackett,
                             args=(opts.name, tmdb_id, opts.maxnum,
                                   opts.do_verify, opts.do_raw)))
        jobs.append(
            pool.apply_async(get_items_eztv_io,
                             args=(opts.name, tmdb_id, opts.maxnum,
                                   opts.do_verify)))
    items_lists = []
    for job in jobs:
        try:
            items = job.get(opts.timeout)  # per-job timeout of opts.timeout seconds
            if items is None: continue
            items_lists.append(items)
        except:
            pass
    items = list(chain.from_iterable(items_lists))
    if opts.do_timing:
        print('search for %d torrents took %0.3f seconds.' %
              (len(items), time.time() - time0))
    if len(items) != 0:
        #
        ## sort from most seeders + leecher to least
        items_sorted = sorted(
            items,
            key=lambda tup: -tup['seeders'] - tup['leechers'])[:opts.maxnum]
        get_movie_torrent_items(items_sorted,
                                filename=opts.filename,
                                to_torrent=opts.do_add)
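The robustness trick in this example is calling job.get(timeout) inside try/except, so one slow or broken provider cannot hang or crash the whole search. A minimal sketch with a made-up search_provider worker:

from multiprocessing import Pool

def search_provider(name, maxnum):
    """Stand-in for a provider query; may be slow or raise."""
    if name == "flaky":
        raise RuntimeError("provider down")
    return ["%s-%d" % (name, i) for i in range(maxnum)]

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        jobs = [pool.apply_async(search_provider, args=(name, 3))
                for name in ("zooqle", "rarbg", "flaky")]
        items_lists = []
        for job in jobs:
            try:
                items = job.get(timeout=10)  # per-provider timeout in seconds
                if items is None:
                    continue
                items_lists.append(items)
            except Exception:
                pass  # skip providers that fail or time out
    items = [item for sub in items_lists for item in sub]
    print(items)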
Example 10
            *main_client.get_array_nd_int32('simulation-time'))
        print(f"DA Time:{current_time}")
        temp_intp = thetao_assimilator.time_interpolate(current_time)
        salt_intp = so_assimilator.time_interpolate(current_time)

        for rank in rank_list:
            rank.running = False
            main_client.put_scalar_int32(f'{rank.id_str}_sent-inc', 0)
        da_todo = [rank for rank in rank_list if not rank.running]

        async_list = []
        # Loop over ranks that have not yet been processed and add them to the async queue
        da_rank = 0
        while (da_todo):
            for rank in da_todo:
                stime = time()
                if (main_client.poll_key_and_check_scalar_int32(
                        f'{rank.id_str}_sent-prior', 1, 10, 1)):
                    rank.running = True
                    async_list.append(
                        pool.apply_async(rank.run_da,
                                         (temp_intp, salt_intp, da_rank)))
                    da_rank = (da_rank + 1) % NUM_THREADS
                    # rank.run_da( temp_intp, salt_intp )
                print(time() - stime)

            da_todo = [rank for rank in rank_list if not rank.running]
            print(f"Remaining number of ranks: {len(da_todo)}")
        pool.close()
        pool.join()
Example 11
 assert (os.path.basename(opts.filename).endswith('.srt'))
 logger = logging.getLogger()
 if opts.do_info: logger.setLevel(logging.INFO)
 keywords_set = set()  # start with an empty set, not an empty dict
 if opts.keywords is not None:
     keywords_set = set(
         map(
             lambda tok: tok.lower(),
             filter(lambda tok: len(tok.strip()) != 0,
                    opts.keywords.strip().split(','))))
 #
 ## now calculation with multiprocessing
 time0 = time.time()
 pool = Pool(processes=3)
 if not opts.do_bypass:
     jobs = [pool.apply_async(get_items_yts, args=(opts.name, opts.maxnum))]
 else:
     jobs = []
 jobs += list(
     map(
         lambda func: pool.apply_async(
             func, args=(opts.name, opts.maxnum, keywords_set)),
         (get_items_subscene, get_items_opensubtitles)))
 items_lists = []
 for job in jobs:
     try:
         items = job.get()
         if items is None: continue
         items_lists.append(items)
     except:
         pass
Example 12
def generate_output(args):
    """ Main application Driver
        
        1. Partition filenames into smaller chunks/arrays of image filenames
        2. Generate worker processes
        3. Pass the chunks to the workers
        4. Each worker deduplicates its set of image files
        5. Merge the results from each worker to one python dictionary
        6. OPTIONAL -- Output the deduplicated image files to a directory 
    """

    # Partition the list of filenames
    num_chunks = args.num_jobs

    # Create a pool of worker processes
    # Each worker will deduplicate a set of images
    filenames = []
    metadata = None
    end_str = ""
    if args.json_metadata is not None:
        metadata, filenames = process_json_file(args.json_metadata)
        end_str = "from metadata file: %s" % args.json_metadata
    else:
        # Find all image files in dump directory
        filenames = find_all_images(args.dump_dir)
        end_str = "from directory: %s" % args.dump_dir

    file_chunks = partition_filenames(filenames, num_chunks)
    print("Found {} images in directory: {}".format(len(filenames), end_str))
    """
    metadata_results = []
    file_chunk_list = list(file_chunks)
    num_proc = len(file_chunk_list)
    print >> sys.stderr, "Printing file chunks"
    print >> sys.stderr, file_chunks
    pool2 = Pool(processes = num_proc)
    with open(args.json_metadata) as json_metadata_file:
        metadata_results = [pool2.map(process_json_line,json_metadata_file, chunk) for index, chunk in enumerate(file_chunk_list)]
    #objs = [p.get() for p in results] 
    metadata =  merge_exact_duplicates(metadata_results)
    """

    pool = Pool(processes=num_chunks)

    # Pass the partitions to each worker process
    results = []
    final_dictionary = {}
    if not args.near_duplicates:
        if args.num_jobs == 1:
            # With a single worker, avoid the overhead of starting a separate process
            result = exact_deduplicate_images(filenames)
            dictionaries = [result]
        else:
            # Get the results from each worker
            results = [
                pool.apply_async(exact_deduplicate_images,
                                 args=(
                                     index,
                                     chunk,
                                 )) for index, chunk in enumerate(file_chunks)
            ]
            dictionaries = [p.get() for p in results]

        # Merge the results into one dictionary
        final_dictionary = merge_exact_duplicates(dictionaries)
    else:
        if args.num_jobs == 1:
            # With a single worker, avoid the overhead of starting a separate process
            result = near_deduplicate_images(filenames,
                                             args.bit_distance,
                                             metadata=metadata)
            near_duplicate_objects = [result]
        else:
            # Get the results from each near duplicate worker
            if metadata is not None:
                results = [
                    pool.apply_async(near_deduplicate_images, (
                        chunk,
                        args.bit_distance,
                    ), dict(metadata=metadata)) for chunk in file_chunks
                ]
            else:
                results = [
                    pool.apply_async(near_deduplicate_images, (
                        chunk,
                        args.bit_distance,
                    )) for chunk in file_chunks
                ]
            # create an array of near duplicate objects
            near_duplicate_objects = [p.get() for p in results]

        # Merge the dictionaries together using the info from their corresponding indexes
        final_dictionary = merge_near_duplicates(near_duplicate_objects)

    print("Number of images prior to deduplication: {}".format(len(filenames)),
          file=sys.stderr)
    print("Number of images after deduplication: {}".format(
        len(final_dictionary)),
          file=sys.stderr)

    # Write the image locations to an output file

    if args.output_json is not None:
        # TODO
        # For now, just do this with exact duplicates
        # Dumping the simhash class to JSON doesn't work because the object isn't
        # JSON serializable
        outfile_name = args.output_json
        print("Writing to image dictionary to file: {}".format(outfile_name))
        with open(outfile_name, 'w') as outfile:
            json.dump(final_dictionary,
                      outfile,
                      indent=4,
                      skipkeys=True,
                      default=str)

    # Copy the images to an output directory
    create_output_image_directory(args, final_dictionary)

    return len(final_dictionary), len(filenames) - len(final_dictionary)