Example #1
def trainerLearnScoreParallel(lrLearner, svmLearner, knnLearner):
    manager = Manager()

    lrScore, svmScore, knnScore = manager.Value('d', 0.0), manager.Value('d', 0.0), manager.Value('d', 0.0)

    temp = manager.Namespace()
    temp.learner = lrLearner
    lrLearner = temp

    temp = manager.Namespace()
    temp.learner = svmLearner
    svmLearner = temp

    temp = manager.Namespace()
    temp.learner = knnLearner
    knnLearner = temp

    lrP = Process(target=trainerLearnScore, args=(lrLearner, 'LogReg', finalFeatures, finalAnswers, testFeatures, testAnswers, lrScore))
    svmP = Process(target=trainerLearnScore, args=(svmLearner, 'SVM', finalFeatures, finalAnswers, testFeatures, testAnswers, svmScore))
    knnP = Process(target=trainerLearnScore, args=(knnLearner, 'kNN', finalFeatures, finalAnswers, testFeatures, testAnswers, knnScore))

    lrP.start()
    svmP.start()
    knnP.start()

    lrP.join()
    svmP.join()
    knnP.join()

    lrLearner = lrLearner.learner
    svmLearner = svmLearner.learner
    knnLearner = knnLearner.learner
    return (lrLearner, svmLearner, knnLearner, lrScore, svmScore, knnScore)
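Note: the trainerLearnScore worker started here is not part of the snippet, the feature/answer variables appear to be module-level globals (compare Example #16, which passes them in explicitly), and the returned lrScore/svmScore/knnScore are Value proxies whose results live in their .value attribute. A minimal sketch of such a worker, assuming scikit-learn style fit/score methods (an assumption, not taken from the original project):

def trainerLearnScore(ns, name, trainX, trainY, testX, testY, score):
    # Hypothetical worker: pull the estimator off the Namespace proxy, train it,
    # and write the trained copy and its score back so the parent sees them after
    # join(). Reassigning ns.learner is what propagates the change; mutating the
    # object in place inside the child would not.
    learner = ns.learner
    learner.fit(trainX, trainY)               # assumes a scikit-learn style API
    score.value = learner.score(testX, testY)
    ns.learner = learner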
Example #2
def make_context():
    manager = Manager()
    context = {
        'status': manager.Namespace(),
        'configLock': manager.Lock(),
        'streams': manager.Namespace(),
        'filecopy': manager.Namespace(),
        'logs': manager.Queue()
    }
    return context
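A short usage sketch for the context above (log_worker and the None shutdown sentinel are assumptions, not part of the original): because every value in the dict is a manager proxy, the whole context can be handed to child processes and the queue and lock still refer to the same shared objects.

def log_worker(context):
    # Hypothetical consumer: drain log records pushed by other processes.
    while True:
        record = context['logs'].get()
        if record is None:        # assumed shutdown sentinel
            break
        print(record)

context = make_context()
p = Process(target=log_worker, args=(context,))
p.start()
context['logs'].put('hello')
context['logs'].put(None)
p.join()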
Example #3
def _main(*, input_files, clean, preprocess_args=None, plot_args, output_path):
    if clean:
        shutil.rmtree(output_path, ignore_errors=True)
    ensure_directory(output_path)
    dfs = [pd.read_parquet(f.format(**os.environ)) for f in input_files]
    df = pd.concat(dfs)

    if preprocess_args:
        df = preprocess(df, **preprocess_args)

    print(df.head())
    for col in df.columns:
        if df[col].nunique() < 20:
            values = ', '.join(map(str, df[col].unique()))
            print(f'{col} has the values: {values}')

    plot_args = list(expand(df, plot_args))
    plot_args = [{**pa, 'output_path': output_path} for pa in plot_args]

    print(f"Make {len(plot_args)} plots.")

    mgr = Manager()
    ns = mgr.Namespace()
    ns.df = df
    pool = Pool(20)
    data_args = list(zip(plot_args, [ns] * len(plot_args)))
    pool.map(_plot, data_args)
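The _plot worker is not shown above. Each element of data_args is a (plot_args, namespace) tuple, so a hypothetical worker would unpack it and pull the DataFrame back off the Namespace proxy (make_plot below is assumed, not part of the original); note that every ns.df access transfers the whole frame from the manager process, so the Namespace mainly avoids embedding the DataFrame in each task argument.

def _plot(data_args):
    # Hypothetical worker: pool.map passes each (plot_args, ns) tuple as one argument.
    plot_args, ns = data_args
    df = ns.df                    # fetches a copy of the DataFrame from the manager process
    make_plot(df, **plot_args)    # assumed plotting helper, not shown in the original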
Example #4
def test_call_aws_cli_called(mock_aws):
    """
    Test that the aws cli is called since the object storage class is standard
    """
    scripts.aws_replicate.logger = MagicMock()
    subprocess.Popen = MagicMock()
    utils.get_aws_bucket_name = MagicMock()

    mock_aws.return_value = "tcga-open"

    scripts.aws_replicate.bucket_exists = MagicMock()
    scripts.aws_replicate.bucket_exists.return_value = True
    scripts.aws_replicate.object_exists = MagicMock()
    scripts.aws_replicate.object_exists.return_value = True
    scripts.aws_replicate.get_object_storage_class = MagicMock()
    boto3.session.Session = MagicMock()
    scripts.aws_replicate.get_object_storage_class.return_value = "STANDARD"

    manager = Manager()
    manager_ns = manager.Namespace()
    manager_ns.total_processed_files = 0
    manager_ns.total_copied_data = 0
    lock = manager.Lock()

    job_info = scripts.aws_replicate.JobInfo({},
                                             gen_mock_manifest_data()[0], 1, 1,
                                             "", {}, {}, manager_ns, "bucket")

    scripts.aws_replicate.exec_aws_copy(lock, False, job_info)
    assert subprocess.Popen.call_count == 1
Example #5
def multi_run(producer, consumer, pool_cnt=20, producer_cnt=10, consumer_cnt=10):
    global token
    global last_down_w
    global finished

    manager = Manager()
    q1 = manager.Queue(10000)
    ws = [i for i in range(1, int(no_of_images)+1, 100)]
    for i in ws: q1.put(i)
    last_down_w = ws[-1]

    q = manager.Queue(10000)
    p = Pool(pool_cnt)
    nm = manager.Namespace()
    nm.running = True
    for i in range(producer_cnt):
        pw = p.apply_async(producer, args=(q1, q, nm))
        print('new producer')
        sys.stdout.flush()
        time.sleep(0.1)
    for i in range(consumer_cnt):
        p.apply_async(consumer, args=(q1, q, nm))
        print('new consumer')
        sys.stdout.flush()
    p.close()
    p.join()
Example #6
    def _random(self, random_changes=None, random_upgrades=None, tries=100):
        manager = Manager()
        glob = manager.Namespace()
        # start = time.time()
        glob.best_grid = self.copy()
        glob.best_score = glob.best_grid.score()
        lock = Lock()

        def f(i, glob, lock):
            start_grid = self.copy()
            start_grid._random_upgrade(random_upgrades)
            # start_grid._random_change(random_changes)
            # start_grid.brute_upgrade()
            score = start_grid.score()
            with lock:
                if score > glob.best_score:
                    print(i, score)
                    glob.best_grid = start_grid.copy()
                    glob.best_score = score
            return

        ps = [Process(target=f, args=(i, glob, lock)) for i in range(tries)]
        [p.start() for p in ps]
        [p.join() for p in ps]

        # try:

        # except KeyboardInterrupt:
        #     pass
        self = glob.best_grid
        # print("%d : %d : %.2fs" % (i, score, (time.time()-start)))
        return self
Example #7
    def __init__(self) -> None:
        self.Xd_shared = Array('d', [0, 0, 0])
        self.dXd_shared = Array('d', [0, 0, 0])
        # self.dXd_shared = [0,0,0]
        self.force = Value('d', 0.0)
        mgr = Manager()
        self.namespace = mgr.Namespace()
        self.namespace.my_list = []
Example #8
def scrape_recommendations(client_id, client_secret):
    manager = Manager()
    namespace = manager.Namespace()
    namespace.credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=namespace.credentials_manager)
    genres = sp.recommendation_genre_seeds()["genres"]
    worker = partial(scraping_worker, namespace)
    with Pool(processes=len(genres)) as pool:
        pool.map(worker, genres)
Example #9
    def __init__(self):
        process_manager = Manager()
        self._shared_data = process_manager.Namespace()
        self._shared_data.alive = True

        self._queue = Queue()
        self._worker = Process(target=input_worker,
                               args=(self._shared_data, self._queue))
        self._worker.start()
Example #10
def main():
    processes = []
    manager = Manager()
    shared_namespace = manager.Namespace()
    shared_namespace.hashes = []
    for i in range(NB_POOLS):
        proc = Process(target=process_job, args=(i, shared_namespace))
        proc.start()
        processes.append(proc)
    for proc in processes:
        proc.join()
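A caveat for this example (and for ns.my_list in Example #7): Namespace attributes are transferred by value, so a worker that calls shared_namespace.hashes.append(...) only mutates a local copy. For the parent to see the result, the worker has to reassign the attribute, or a manager.list() should be used instead of a plain list. A minimal sketch of a process_job worker along those lines (compute_hash is assumed, not part of the original):

def process_job(i, ns):
    h = compute_hash(i)           # assumed helper, not shown in the original
    # Read-modify-reassign: ns.hashes returns a copy, so the updated list must be
    # written back. Without a manager.Lock() around these three lines, concurrent
    # workers can overwrite each other's updates.
    hashes = ns.hashes
    hashes.append(h)
    ns.hashes = hashes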
Example #11
def run_downloader(db_connection, logger):
    manager = Manager()
    shared_state = manager.Namespace()
    shared_state.default_date = get_default_date()
    shared_state.past_date = None

    job_serializer = Serializer(db_connection, job)

    # TODO: Check if today's job is already running.
    # If so, just go with missed jobs.
    logger.info('Starting the main workflow')
    p = start_main_job(shared_state)

    if environ.get('JUST_MAIN', False):
        p.join()
        return

    end_time = datetime.now()\
        .replace(hour=23, minute=30, second=0, microsecond=0)

    while True:
        sleep(5)

        p.join(timeout=0)
        if not p.is_alive():
            job_id = shared_state.job_id
            completed = shared_state.completed
            if job_id is not None and not completed:
                job_serializer.put(job_id, {
                    'status': JobStatus.FAILED,
                })

            if p.exitcode != 0:
                logger.error('Job exited unexpectedly',
                             f'Exit code: {p.exitcode}\nJob id: {job_id}')

            if datetime.now() >= end_time:
                break

            p = start_past_job(shared_state)
            if p is None:
                break

        elif datetime.now() >= end_time:
            job_id = shared_state.job_id
            completed = shared_state.completed
            if job_id is not None and not completed:
                # Time to end.
                p.terminate()
                job_serializer.put(job_id, {
                    'status': JobStatus.FAILED,
                })

    logger.info('All jobs finished.')
Example #12
def trainOnSet(hyperParams, examples, theta, histGrad, cores):
  fixWords = hyperParams.get('fixEmb', False)
  fixWeights = hyperParams.get('fixW', False)

#  print 'fixEmb:',fixWords, ',fixW:', fixWeights

  adagrad = hyperParams['ada']


  mgr = Manager()
  ns = mgr.Namespace()
  ns.lamb = hyperParams['lambda']
  batchsize = hyperParams['bSize']
  random.shuffle(examples) # randomly split the data into parts of batchsize
  avErrors = []
  for batch in range((len(examples)+batchsize-1)//batchsize):
    ns.theta = theta
    minibatch = examples[batch*batchsize:(batch+1)*batchsize]
    s = (len(minibatch)+cores-1)//cores
    trainPs = []
    q = Queue()

    if cores < 2:
      trainBatch(ns, minibatch, q, fixWords, fixWeights)  # don't start a subprocess
      trainPs.append('')  # placeholder so the result loop below still reads one item from the queue
    else:
      for j in range(cores):
        p = Process(name='minibatch'+str(batch)+'-'+str(j), target=trainBatch, args=(ns, minibatch[j*s:(j+1)*s], q, fixWords, fixWeights))
        trainPs.append(p)
        p.start()

    errors = []
    theta.regularize(hyperParams['alpha']/len(examples), hyperParams['lambda'])
    for j in range(len(trainPs)):
      (grad, error) = q.get()
      if grad is None: continue
      theta.add2Theta(grad,hyperParams['alpha'],histGrad)
      errors.append(error)

    # make sure all worker processes have finished
    if cores > 1:
      for p in trainPs: p.join()

    try:
      avError = sum(errors)/len(errors)
    except ZeroDivisionError:
      avError = 0
      print('batch size zero!')
    if batch % 25 == 0:
      print('\t\tBatch', batch, ', average error:', avError, ', theta norm:', theta.norm())
    avErrors.append(avError)
  return sum(avErrors)/len(avErrors)
Example #13
    def __init__(self):
        self.i3 = i3ipc.Connection()
        self._actived = False
        self._win_rect = (0, 0, 0, 0)

        process_manager = Manager()
        self._shared_data = process_manager.Namespace()
        self._shared_data.alive = True
        self._shared_data.actived = False
        self._lock = Lock()
        self._worker = Process(target=info_worker,
                               args=(self._lock, self._shared_data))
        self._worker.start()
Example #14
class ParallelContext:
    """Holds the objects needed to coordinate parallelism."""
    def __init__(self,
                 spawner: GeneSpawner,
                 evaluator: Evaluator,
                 n_proc: Optional[int] = None):
        self.manager = Manager()
        self.ns = self.manager.Namespace()
        self.ns.spawner = spawner
        self.ns.evaluator = evaluator
        if n_proc is None:
            self.pool = Pool()
        else:
            self.pool = Pool(n_proc)
Example #15
def main():
    final_result = []
    args = cmd_args_parser()
    case_id = int(args.case_num[3:])
    prefix = args.case_num[:3]
    case_type_filter = None
    if hasattr(args, "case_type_filter"):
        case_type_filter = args.case_type_filter
    lock = Lock()
    jobs = []
    mgr = Manager()
    ns = mgr.Namespace()
    ns.df = final_result

    start = case_id - args.batch
    end = case_id + args.batch

    total_num = end - start + 1

    if total_num > 20:
        batch_result = get_batch_pair(total_num, start, end)

        for i in range(len(batch_result)):
            p = multiprocessing.Process(target=query_website,
                                        args=(
                                            ns,
                                            batch_result[i],
                                            prefix,
                                            case_type_filter,
                                            lock,
                                            args.verbose,
                                        ))
            jobs.append(p)
            p.start()
        for job in jobs:
            job.join()

        final_result = ns.df

    else:
        for i in range(start, end):
            result = get_result(i, prefix, case_type_filter, args.verbose)
            if bool(result):
                final_result.append(result)

    json_type = json.dumps(final_result, indent=4)
    now = datetime.datetime.now()
    with open("data-%s.yml" % now.strftime("%Y-%m-%d"), "w") as outfile:
        yaml.dump(yaml.load(json_type), outfile, allow_unicode=True)
    print yaml.dump(yaml.load(json_type), allow_unicode=True, width=256)
Example #16
def findTrainerErrorParallel(lrLearner, svmLearner, knnLearner, finalFeatures, finalAnswers, testFeatures, testAnswers):
    manager = Manager()
    lrTrainingError, lrTestingError, lrIndices = manager.list(), manager.list(), manager.list()
    svmTrainingError, svmTestingError, svmIndices = manager.list(), manager.list(), manager.list()
    knnTrainingError, knnTestingError, knnIndices = manager.list(), manager.list(), manager.list()

    temp = manager.Namespace()
    temp.learner = lrLearner
    lrLearner = temp

    temp = manager.Namespace()
    temp.learner = svmLearner
    svmLearner = temp

    temp = manager.Namespace()
    temp.learner = knnLearner
    knnLearner = temp

    lrP = Process(target=findTrainerError, args=(lrLearner, 'LogReg', finalFeatures, finalAnswers, testFeatures, testAnswers, lrTrainingError, lrTestingError, lrIndices))
    svmP = Process(target=findTrainerError, args=(svmLearner, 'SVM', finalFeatures, finalAnswers, testFeatures, testAnswers, svmTrainingError, svmTestingError, svmIndices))
    knnP = Process(target=findTrainerError, args=(knnLearner, 'kNN', finalFeatures, finalAnswers, testFeatures, testAnswers, knnTrainingError, knnTestingError, knnIndices))

    lrP.start()
    svmP.start()
    knnP.start()

    lrP.join()
    svmP.join()
    knnP.join()

    lrLearner = lrLearner.learner
    svmLearner = svmLearner.learner
    knnLearner = knnLearner.learner
    lrTrainingError, lrTestingError, lrIndices = np.asarray(lrTrainingError), np.asarray(lrTestingError), np.asarray(lrIndices)
    svmTrainingError, svmTestingError, svmIndices = np.asarray(svmTrainingError), np.asarray(svmTestingError), np.asarray(svmIndices)
    knnTrainingError, knnTestingError, knnIndices = np.asarray(knnTrainingError), np.asarray(knnTestingError), np.asarray(knnIndices)
    return (lrLearner, svmLearner, knnLearner, lrTrainingError, lrTestingError, lrIndices, svmTrainingError, svmTestingError, svmIndices, knnTrainingError, knnTestingError, knnIndices)
Example #17
def main():
    final_result = []
    reminder_result = []
    args = cmdArgumentParser()
    case_numberic = int(args.case_num[3:])
    prefix = args.case_num[:3]
    lock = Lock()
    jobs = []
    mgr = Manager()
    ns = mgr.Namespace()
    ns.df = final_result

    start = case_numberic - args.batch
    end = case_numberic + args.batch

    total_num = end - start + 1
    rmnder = total_num % CPU_CORES

    if total_num > 20:
        batch_result = get_batch_pair(total_num, start, end)

        for i in range(len(batch_result)):
            p = multiprocessing.Process(target=query_website,
                                        args=(
                                            ns,
                                            batch_result[i],
                                            prefix,
                                            lock,
                                            args.verbose,
                                        ))
            jobs.append(p)
            p.start()
        for job in jobs:
            job.join()

        final_result = ns.df

        # for i in range(end - rmnder + 1,end):
        # 	reminder_result.append(get_result(i,prefix))
    else:
        for i in range(start, end):
            final_result.append(get_result(i, prefix, args.verbose))

    json_type = json.dumps(final_result, indent=4)
    now = datetime.datetime.now()
    with open('data-%s.yml' % now.strftime("%Y-%m-%d"), 'w') as outfile:
        yaml.dump(yaml.safe_load(json_type), outfile, allow_unicode=True)
    print(yaml.dump(yaml.safe_load(json_type), allow_unicode=True))
Example #18
def main():
    """Gets input hash and salt from command line,
       spawns worker threads, initializes queue,
       calls worker function"""

    # Check user input
    if len(sys.argv) != 2:
        sys.exit("Usage: python crack.py hash")

    # Extract hash and salt from user input
    user_hash = sys.argv[1]
    salt = user_hash[:2]

    # Input lists
    inputs = [HASH_INPUTS_1, HASH_INPUTS_2, HASH_INPUTS_3, HASH_INPUTS_4]

    # Manager to create shared Namespace
    mgr = Manager()
    namespace = mgr.Namespace()

    # Shared flag so worker processes can signal that the password was found
    namespace.found = False

    # Spawn a pool of child processes that run the worker function
    # worker_crack in parallel via pool.apply_async; the result returned
    # by each worker is handled in the main process by the callback
    # function print_n_terminate.
    start_time = time.time()
    pool = Pool(NUM_PHYS_CORES)
    for aninput in inputs:
        pool.apply_async(worker_crack,
                         args=(aninput, user_hash, salt, namespace),
                         callback=print_n_terminate)
    pool.close()
    pool.join()

    duration = time.time() - start_time
    print(f"Cracking password took {duration:.2f} seconds")

    # Success
    print('success - exiting')
    sys.exit(0)
Example #19
def main():
    # Create a multiprocessing manager to use as the token store
    global tokens, refresh_lock
    manager = Manager()
    tokens = manager.Namespace()
    refresh_lock = manager.Lock()

    # Authenticate in master process
    oauth2, tokens.access, tokens.refresh = authenticate(CooperativelyManagedOAuth2)

    # Create 2 worker processes and wait on them to finish
    workers = []
    for _ in range(2):
        worker_process = Process(target=worker)
        worker_process.start()
        workers.append(worker_process)
    for worker_process in workers:
        worker_process.join()
Example #20
def iter_latest_asynchonously(gen_func,
                              timeout=None,
                              empty_value=None,
                              use_forkserver=False,
                              uninitialized_wait=None):
    """
    Given a generator function, make an iterator that pulls the latest value yielded when running it asynchronously.
    If a value has never been set, or timeout is exceeded, yield empty_value instead.

    :param gen_func: A generator function (a function returning a generator);
    :return:
    """
    if use_forkserver:
        from multiprocessing import set_start_method  # Only Python 3.X
        set_start_method(
            'forkserver'
        )  # On macos this is necessary to start camera in separate thread

    m = Manager()
    namespace = m.Namespace()

    lock = Lock()

    with lock:
        namespace.time_and_data = (-float('inf'), Uninitialized)

    p = Process(target=_async_value_setter, args=(gen_func, namespace, lock))
    p.start()
    while True:
        with lock:
            lasttime, item = namespace.time_and_data
        if item is PoisonPill:  # The generator has terminated
            break
        elif item is Uninitialized:
            if uninitialized_wait is not None:
                time.sleep(uninitialized_wait)
                continue
            else:
                yield empty_value
        elif timeout is not None and (time.time(
        ) - lasttime) > timeout:  # Nothing written or nothing recent enough
            yield empty_value
        else:
            yield item
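A usage sketch for the iterator above (camera_frames, capture_frame and handle are assumptions, not part of the original): the consumer loop always sees the most recent frame, or empty_value when nothing recent enough has been produced.

def camera_frames():
    # Hypothetical generator that yields frames forever.
    while True:
        yield capture_frame()     # assumed capture function

for frame in iter_latest_asynchonously(camera_frames, timeout=1.0, empty_value=None):
    if frame is None:
        continue                  # nothing recent enough yet
    handle(frame)                 # assumed consumer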
Example #21
    def start(self):
        p = current_process()
        print('parent id: {}'.format(id(p)))
        manager = Manager()
        ns = manager.Namespace()
        ns.a = 1
        lst_proxy = manager.list()
        lst_proxy.append(1)
        dct_proxy = manager.dict()
        dct_proxy['b'] = 2
        print(ns.a)
        print(lst_proxy)
        print(dct_proxy)
        p = Process(target=self.modify, args=(ns, lst_proxy, dct_proxy))
        p.start()
        p.join()
        print(ns.a)
        print(lst_proxy)
        print(dct_proxy)
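The modify method targeted above is not included in the example. A plausible sketch (hypothetical, but consistent with the prints before and after the join): assigning to a Namespace attribute and mutating through the list and dict proxies all propagate back to the parent, which is what the second set of prints demonstrates.

    def modify(self, ns, lst_proxy, dct_proxy):
        # Hypothetical worker body: these writes go through manager proxies,
        # so the parent process observes them after join().
        ns.a = 2
        lst_proxy.append(3)
        dct_proxy['c'] = 4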
Example #22
    def __init__(self):
        process_manager = Manager()
        self._shared_data = process_manager.Namespace()
        self._shared_data.alive = True
        self._shared_data.DEFAULT_MOUSE = [
            uinput.BTN_LEFT,
            uinput.BTN_RIGHT,
            uinput.BTN_MIDDLE,
            uinput.REL_X,
            uinput.REL_Y,
            # TODO detect full screen size
            uinput.ABS_X + (0, 1920, 0, 0),
            uinput.ABS_Y + (0, 1080, 0, 0),
        ]

        self._queue = Queue()
        self._worker = Process(target=input_worker,
                               args=(self._shared_data, self._queue))
        self._worker.start()
Example #23
def main():
    """Gets input hash and salt from command line,
       spawns worker processes, initializes Sync_Manager,
       calls worker function"""

    # Check user input
    if len(sys.argv) != 2:
        sys.exit("Usage: python crack.py hash")

    # Extract hash and salt from user input
    user_hash = sys.argv[1]
    salt = user_hash[:2]

    # Input lists
    inputs = [HASH_INPUTS_1, HASH_INPUTS_2, HASH_INPUTS_3, HASH_INPUTS_4]

    # Manager to create shared Namespace
    mgr = Manager()
    namespace = mgr.Namespace()

    # Shared flag so worker processes can signal that the password was found
    namespace.found = False

    # Spawn processes calling the worker
    start_time = time.time()
    processes = []
    for i in range(len(inputs)):
        process = Process(target=worker_crack_password,
                          args=(inputs[i], user_hash, salt, namespace),
                          daemon=True)
        processes.append(process)
        process.start()

    for process in processes:
        process.join()

    duration = time.time() - start_time
    print(
        f"Cracking password took {duration:.2f} seconds using multiprocessing.Process"
    )

    # Success
    sys.exit(0)
Example #24
def run(thread_num, global_config, job_name, manifest_file, bucket=None):
    """
    start threads and log after they finish
    """

    tasks, _ = prepare_data(manifest_file, global_config)

    manager = Manager()
    manager_ns = manager.Namespace()
    manager_ns.total_processed_files = 0

    jobInfos = []
    for task in tasks:
        job = JobInfo(global_config, task, len(tasks), job_name, {},
                      manager_ns, bucket)
        jobInfos.append(job)

    # Make the Pool of workers
    pool = Pool(thread_num)

    results = []

    if job_name == "copying":
        results = pool.map(exec_google_copy, jobInfos)
    elif job_name == "indexing":
        results = pool.map(check_and_index_the_data, jobInfos)

    # close the pool and wait for the work to finish
    pool.close()
    pool.join()

    filename = global_config.get("log_file", "{}_log.json".format(job_name))

    timestr = time.strftime("%Y%m%d-%H%M%S")
    filename = timestr + "_" + filename

    if job_name == "copying":
        results = [{"data": results}]

    json_log = {}

    for result in results:
        json_log.update(result)
Example #25
class ParallelMgr():
    def __init__(self, **kwargs):
        num_pes = kwargs.get('num_pes', 1)

        # TODO: compare num_pes to multiprocessing.cpu_count()

        # manager for manager-to-worker communication & shared namespace
        self.mpMgr = Manager()
        self.namespace = self.mpMgr.Namespace()
        # need a copy of num_pes
        self.namespace.num_pes = num_pes

        # manager for manager-to-worker communication
        self.namespace.masterList = [self.mpMgr.list() for i in range(num_pes)]
        # self.masterList = [ self.commMgr.list() for i in range(self.num_pes) ]

        # separate lists for direct PE-to-PE communication
        self.namespace.msgLists = [self.mpMgr.list() for i in range(num_pes)]
        # self.msgLists = [ self.commMgr.list() for i in range(self.num_pes) ]

        print('ParallelMgr init')
        sys.stdout.flush()

    def runWorkers(self, workerModule):
        # self.peList = [ workerModule(i,self.namespace) for i in range(self.namespace.num_pes) ]
        self.peList = []
        for i in range(self.namespace.num_pes):
            commMgr = CommMgr(tid=i, namespace=self.namespace)
            pid = workerModule(i, commMgr)
            self.peList.append(pid)
        for pe in self.peList:
            print('starting worker ' + str(pe))
            pe.start()
        # TODO: check for errors

    def finalize(self):
        print('waiting for workers to finish')
        for pe in self.peList:
            print('worker ' + str(pe) + ' is_alive=' + str(pe.is_alive()))

        for pe in self.peList:
            pe.join()
Example #26
class Test(object):

    def __init__(self):
        self.manager = Manager()
        self.namespace = self.manager.Namespace()
        self.process_list = []
        self.namespace.d = self.manager.dict()

    def print_me(self, ns, i):
        print("\nBefore:{0}, {1}, {2} ".format(current_process().name, i, ns.d))
        temp = i * i
        ns.d[i] =  temp
        print("\nAfter:{0}, {1}, {2} ".format(current_process().name, i, ns.d))
        time.sleep(5)
        return

    def spawn_processes(self):
        for i in range(5):
            p = Process(target=self.print_me, args=(self.namespace, i,))
            self.process_list.append(p)
            p.start()

        for i in self.process_list:
            i.join()

        print(self.namespace.d)

    def spawn_processes22(self):
        for i in range(5):
            p = Process(target=self.print_me, args=(self.namespace, i,))
            self.process_list.append(p)
            p.start()
            print("exitcode is:{0}".format(p.exitcode))
        while self.process_list:
            for i in list(self.process_list):
                if not i.is_alive():
                    print("exitcode is:{0}".format(i.exitcode))
                    print("process:{0} is done, exit code:{1}".format(i.name, i.exitcode))
                    self.process_list.remove(i)
                    i.terminate()

        print(self.namespace.d)
Example #27
    def get_prices(self):
        mgr = Manager()
        ns = mgr.Namespace()
        ns.prices = pd.DataFrame()
        d = mgr.dict()
        filenames = os.listdir(self.prices_dir)
        count = len(filenames)
        jobs = []
        for filename in tqdm(filenames):
            job = Process(target=self.worker, args=(ns, filename))
            job.start()
            jobs.append(job)

        for job in jobs:
            job.join()

        prices = ns.prices

        #print(prices.head())
        self.prices = prices
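The worker method used above is not part of the snippet. A plausible sketch (assumed, reading CSV files with pandas): each worker loads one file and folds it into the shared frame; without a lock around the read-modify-reassign of ns.prices, two workers can overwrite each other's updates and silently drop files.

    def worker(self, ns, filename):
        # Hypothetical worker: load one price file and merge it into the shared frame.
        df = pd.read_csv(os.path.join(self.prices_dir, filename))   # file format assumed
        # Read-modify-reassign on a Namespace attribute; a manager lock around
        # these two lines would be needed to make the update atomic.
        prices = ns.prices
        ns.prices = pd.concat([prices, df])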
Example #28
    def upload_parts(self):
        args_list = []
        if OS_WINDOWS:
            self.ns = FakeNamespace()
        else:
            manager = Manager()
            self.ns = manager.Namespace()
            self.ns.completed = 0
        part_amount = int(math.ceil(self.file_size / float(self.part_size)))
        self.total_parts = part_amount
        self.pbar = ProgressBar(widgets=[Percentage(), Bar()],
                                maxval=self.total_parts).start()
        try:
            for i in range(part_amount):
                offset = i * self.part_size
                remaining_bytes = self.file_size - offset
                num_bytes = min(remaining_bytes, self.part_size)
                if not self.multiparts.uploaded(i + 1):
                    args_list.append([
                        self.file_path, offset, num_bytes, self.url,
                        self.upload_id, i + 1, self.headers, self.verify,
                        self.pbar, self.ns
                    ])
                else:
                    self.total_parts -= 1
            if self.total_parts == 0:
                return
            self.pbar.maxval = self.total_parts

            pool = Pool(processes=self.processes)
            pool.map_async(upload_multipart_wrapper, args_list).get(9999999)
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            log.error("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
            pool.join()
            raise Exception("Process canceled by user")
def test_call_streamming_method_called(mock_aws):
    """
    Test that the streaming method is called since the object storage class is Glacier
    """
    scripts.aws_replicate.logger = MagicMock()
    subprocess.Popen = MagicMock()
    scripts.aws_replicate.stream_object_from_gdc_api = MagicMock()
    mock_aws.return_value = "tcga-open"

    scripts.aws_replicate.bucket_exists = MagicMock()
    scripts.aws_replicate.bucket_exists.return_value = True
    scripts.aws_replicate.object_exists = MagicMock()
    scripts.aws_replicate.object_exists.return_value = False

    source_objects = {"11111111111111111/abc.bam": {"StorageClass": "GLACIER"}}
    copied_objects = {}
    manager = Manager()
    manager_ns = manager.Namespace()
    manager_ns.total_processed_files = 0
    manager_ns.total_copied_data = 0
    lock = manager.Lock()

    job_info = scripts.aws_replicate.JobInfo(
        {},
        gen_mock_manifest_data()[0],
        1,
        1,
        "",
        copied_objects,
        source_objects,
        manager_ns,
        "bucket",
    )
    scripts.aws_replicate.exec_aws_copy(lock, False, job_info)
    assert subprocess.Popen.call_count == 0
    assert scripts.aws_replicate.stream_object_from_gdc_api.call_count == 1
Example #30
class Pipeline:
    def __init__(self,
                 granule_loader: GranuleLoader,
                 slicer: TileSlicer,
                 data_store_factory,
                 metadata_store_factory,
                 tile_processors: List[TileProcessor],
                 max_concurrency: int):
        self._granule_loader = granule_loader
        self._tile_processors = tile_processors
        self._slicer = slicer
        self._data_store_factory = data_store_factory
        self._metadata_store_factory = metadata_store_factory
        self._max_concurrency = max_concurrency

        # Create a SyncManager so that we can communicate exceptions from the
        # worker processes back to the main process.
        self._manager = Manager()

    def __del__(self):
        self._manager.shutdown()

    @classmethod
    def from_string(cls, config_str: str, data_store_factory, metadata_store_factory, max_concurrency: int = 16):
        try:
            config = yaml.load(config_str, yaml.FullLoader)
            cls._validate_config(config)
            return cls._build_pipeline(config,
                                       data_store_factory,
                                       metadata_store_factory,
                                       processor_module_mappings,
                                       max_concurrency)

        except yaml.scanner.ScannerError:
            raise PipelineBuildingError("Cannot build pipeline because of a syntax error in the YAML.")

    # TODO: this method should validate the config against an actual schema definition
    @staticmethod
    def _validate_config(config: dict):
        if type(config) is not dict:
            raise PipelineBuildingError("Cannot build pipeline; the pipeline configuration that " +
                                        "was received is not valid YAML.")

    @classmethod
    def _build_pipeline(cls,
                        config: dict,
                        data_store_factory,
                        metadata_store_factory,
                        module_mappings: dict,
                        max_concurrency: int):
        try:
            granule_loader = GranuleLoader(**config['granule'])

            slicer_config = config['slicer']
            slicer = cls._parse_module(slicer_config, module_mappings)

            tile_processors = []
            for processor_config in config['processors']:
                module = cls._parse_module(processor_config, module_mappings)
                tile_processors.append(module)

            return cls(granule_loader,
                       slicer,
                       data_store_factory,
                       metadata_store_factory,
                       tile_processors,
                       max_concurrency)
        except PipelineBuildingError:
            raise
        except KeyError as e:
            raise PipelineBuildingError(f"Cannot build pipeline because {e} is missing from the YAML.")
        except Exception as e:
            logger.exception(e)
            raise PipelineBuildingError(f"Cannot build pipeline because of the following error: {e}")

    @classmethod
    def _parse_module(cls, module_config: dict, module_mappings: dict):
        module_name = module_config.pop('name')
        try:
            module_class = module_mappings[module_name]
            logger.debug("Loaded processor {}.".format(module_class))
            processor_module = module_class(**module_config)
        except KeyError:
            raise PipelineBuildingError(f"'{module_name}' is not a valid processor.")
        except Exception as e:
            raise PipelineBuildingError(f"Parsing module '{module_name}' failed because of the following error: {e}")

        return processor_module

    async def run(self):
        async with self._granule_loader as (dataset, granule_name):
            start = time.perf_counter()

            shared_memory = self._manager.Namespace()
            async with Pool(initializer=_init_worker,
                            initargs=(self._tile_processors,
                                      dataset,
                                      self._data_store_factory,
                                      self._metadata_store_factory,
                                      shared_memory),
                            maxtasksperchild=self._max_concurrency,
                            childconcurrency=self._max_concurrency) as pool:
                serialized_tiles = [nexusproto.NexusTile.SerializeToString(tile) for tile in
                                    self._slicer.generate_tiles(dataset, granule_name)]
                # aiomultiprocess is built on top of the stdlib multiprocessing library, which has the limitation that
                # a queue can't have more than 2**15-1 tasks. So, we have to batch it.
                for chunk in self._chunk_list(serialized_tiles, MAX_CHUNK_SIZE):
                    try:
                        await pool.map(_process_tile_in_worker, chunk)
                    except ProxyException:
                        pool.terminate()
                        # Give the shared memory manager some time to write the exception
                        # await asyncio.sleep(1)
                        raise pickle.loads(shared_memory.error)

        end = time.perf_counter()
        logger.info("Pipeline finished in {} seconds".format(end - start))

    @staticmethod
    def _chunk_list(items, chunk_size: int):
        return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]