Example #1
 def _itergroundings(self, simplify=False, unsatfailure=False):
     global global_bpll_grounding
     global_bpll_grounding = self
     if self.multicore:
         pool = Pool(maxtasksperchild=1)
         try:
             for gndresult in pool.imap(with_tracing(create_formula_groundings), self.formulas):
                 for fidx, stat in gndresult:
                     for (varidx, validx, val) in stat: 
                         self._varidx2fidx[varidx].add(fidx)
                         self._addstat(fidx, varidx, validx, val)
                     checkmem()
                 yield None
         except CtrlCException as e:
             pool.terminate()
             raise e
         pool.close()
         pool.join()
     else:
         for gndresult in imap(create_formula_groundings, self.formulas):
             for fidx, stat in gndresult:
                 for (varidx, validx, val) in stat: 
                     self._varidx2fidx[varidx].add(fidx)
                     self._addstat(fidx, varidx, validx, val)
             yield None
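The global assignment in this example relies on the Pool's workers being forked after global_bpll_grounding has been set, so every child inherits a copy of the grounder without it being pickled per task. A minimal sketch of that pattern, assuming a fork-based start method; the names _grounder, ground_one and iter_groundings are illustrative, not part of the original code:

from multiprocessing.pool import Pool

_grounder = None  # set in the parent before the Pool is created


def ground_one(formula):
    # Runs in a worker process; with a fork start method it sees the value
    # _grounder had at the moment the Pool was created.
    return (_grounder, formula)


def iter_groundings(grounder, formulas):
    global _grounder
    _grounder = grounder          # must happen before Pool() forks the workers
    pool = Pool()
    try:
        for result in pool.imap(ground_one, formulas):
            yield result
    finally:
        pool.close()
        pool.join()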
Example #2
def work(host, port, processes, threads, times):
    pool = Pool(processes,
                lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
    p = Process(target=progress)
    p.daemon = True

    start = time.time()

    try:
        for chunk in divide(times, processes):
            pool.apply_async(thread, (host, port, threads, chunk))

        p.start()

        pool.close()
        pool.join()
        p.terminate()
        p.join()

    except KeyboardInterrupt:
        pool.terminate()
        p.terminate()
        p.join()
        pool.join()

    return time.time() - start
def main(datadir, convert_dir, crop_size):
    try:
        os.mkdir(convert_dir)
    except OSError:
        pass

    filenames = data_util.get_image_files(datadir)

    print('Resizing images in {} to {}'.format(datadir, convert_dir))

    n = len(filenames)

    batch_size = 500
    batches = n // batch_size + 1
    p = Pool()

    args = []

    for f in filenames:
        args.append((convert_size, (datadir, convert_dir, f, crop_size)))

    for i in range(batches):
        print('batch {:>2} / {}'.format(i + 1, batches))
        p.map(convert, args[i * batch_size : (i + 1) * batch_size])

    p.close()
    p.join()
    print('Done')
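The work() function in Example #2 installs an initializer that makes every worker ignore SIGINT, so a Ctrl-C raises KeyboardInterrupt only in the parent, which can then terminate the pool. A stripped-down sketch of that pattern; ignore_sigint and noop are illustrative names, not part of the original code:

import signal
import time
from multiprocessing.pool import Pool


def ignore_sigint():
    # Workers inherit the default SIGINT handler; ignoring it here means
    # Ctrl-C is delivered to the parent process only.
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def noop(i):
    time.sleep(0.1)
    return i


if __name__ == '__main__':
    pool = Pool(4, initializer=ignore_sigint)
    try:
        result = pool.map_async(noop, range(20))
        result.get()           # blocks, but stays interruptible in the parent
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        pool.terminate()       # kill the workers immediately on Ctrl-C
        pool.join()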
Example #4
 def _itergroundings(self, simplify=True, unsatfailure=True):
     # generate all groundings
     if not self.formulas:
         return
     global global_fastConjGrounding
     global_fastConjGrounding = self
     batches = list(rndbatches(self.formulas, 20))
     batchsizes = [len(b) for b in batches]
     if self.verbose:
         bar = ProgressBar(width=100, steps=sum(batchsizes), color='green')
         i = 0
     if self.multicore:
         pool = Pool()
         try:
             for gfs in pool.imap(with_tracing(create_formula_groundings), batches):
                 if self.verbose:
                     bar.inc(batchsizes[i])
                     bar.label(str(cumsum(batchsizes, i + 1)))
                     i += 1
                 for gf in gfs: yield gf
         except Exception as e:
             logger.error('Error in child process. Terminating pool...')
             pool.close()
             raise e
         finally:
             pool.terminate()
             pool.join()
     else:
         for gfs in imap(create_formula_groundings, batches):
             if self.verbose:
                 bar.inc(batchsizes[i])
                 bar.label(str(cumsum(batchsizes, i + 1)))
                 i += 1
             for gf in gfs: yield gf
Example #5
    def start(self):
        """Starts a server that controls local workers.

        Calling this function starts a pool of `num_workers` workers used to run
        targets sent to the server. The server will run indefinitely unless shut
        down by the user.
        """
        try:
            serv = Listener((self.hostname, self.port))
            workers = Pool(
                processes=self.num_workers,
                initializer=Worker,
                initargs=(self.status, self.queue, self.waiting),
            )

            logging.info(
                "Started %s workers, listening on port %s",
                self.num_workers,
                serv.address[1],
            )
            self.wait_for_clients(serv)
        except OSError as e:
            if e.errno == 48:
                raise ServerError(
                    (
                        "Could not start workers listening on port {}. "
                        "The port may already be in use."
                    ).format(self.port)
                )
        except KeyboardInterrupt:
            logging.info("Shutting down...")
            workers.close()
            workers.join()
            self.manager.shutdown()
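Pool(initializer=Worker, initargs=...) above runs the Worker constructor once in every child process, a common way to give each worker its own long-lived state. A minimal, self-contained sketch of the same idea; _state, init_worker and handle are illustrative and not part of the original code:

from multiprocessing.pool import Pool

_state = None  # per-process state, filled in by the initializer


def init_worker(prefix):
    # Called exactly once in each worker process when the pool starts.
    global _state
    _state = {'prefix': prefix}


def handle(item):
    return '{}-{}'.format(_state['prefix'], item)


if __name__ == '__main__':
    pool = Pool(processes=4, initializer=init_worker, initargs=('job',))
    print(pool.map(handle, range(5)))
    pool.close()
    pool.join()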
Example #6
class Pool(object):
  '''
  
  '''
  def __init__(self, **pool_kwargs):
  
    try:
      kw = KwargsCheck(MPIPool, pool_kwargs)
      self._pool = MPIPool(**kw)
      self.MPI = True
    except (ImportError, ValueError):
      kw = KwargsCheck(MultiPool, pool_kwargs)
      self._pool = MultiPool(**kw)
      self.MPI = False
    
    if self.MPI:
      if not self._pool.is_master():
        self._pool.wait()
        sys.exit(0)
  
  def map(self, f, x, args = (), kwargs = {}): 
    '''
    
    '''
    if len(args) or len(kwargs):
      w = wrap(f, *args, **kwargs)  
      return self._pool.map(w, x)
    else:
      return self._pool.map(f, x)
  
  def close(self):
    self._pool.close()
    
Example #7
def extract_all_labels(filenames, out_filepath=DATA_FOLDER+'labels.p', chunk_size=2000):
    print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath)
    all_labels = []
    label_dict = {}

    filenames_chunks = util.chunks(filenames, chunk_size)

    for i, chunk in enumerate(filenames_chunks):
        pool = Pool(processes=util.CPU_COUNT)
        chunk_labels = pool.map(extract_labels, chunk)
        pool.close()

        for filepath, labels in zip(chunk, chunk_labels):
            if labels is not None:
                file_id = util.filename_without_extension(filepath)
                label_dict[file_id] = labels
                all_labels += labels

        print i+1, '/', len(filenames_chunks)

    #Write labels to file
    with open(out_filepath,'w') as f:
        pickle.dump(label_dict, f)

    print '\nLabels:'
    print len(set(all_labels))
    print Counter(all_labels)
Example #8
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21), enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem) for elem in linspace(0, no_days, poolsize+1)]

    if limit:
        limit_per_pool = (limit // poolsize)+1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                         'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
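imap_unordered passes a single argument to the worker function, so the example binds the fixed parameters (limit, lang) with functools.partial. A minimal sketch of that technique; fetch is a made-up stand-in for query_tweets_once:

from functools import partial
from multiprocessing.pool import Pool


def fetch(query, limit=None, lang=''):
    # stand-in worker that just echoes its arguments
    return (query, limit, lang)


if __name__ == '__main__':
    pool = Pool(4)
    bound = partial(fetch, limit=10, lang='en')
    try:
        for result in pool.imap_unordered(bound, ['a', 'b', 'c']):
            print(result)
    finally:
        pool.close()
        pool.join()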
Example #9
 def add_tree(self, iterations=-1, snapshot=False):
     """
     Multi-core, fully utilizes underlying CPU to create the trees
     of the forest and stores them into the forest's list of trees
     :param iterations: number of trees to make, -1 means use default setting
     :return: None
     """
     print("Adding trees:", iterations)
     if iterations == -1:
         iterations = self.default_tree_count
     #########################
     # MULTI THREADED
     ########################
     pool = Pool()  # creates multiple processes equal to cores in machine
     outputs = pool.map(make_tree, [(self.data_copy(), self.depthlimit, self.weak_learner)
                                    for _ in range(iterations)])
     pool.close()
     pool.join()
     self.trees.extend(outputs)  # get the trees created and store them
     #########################
     # SINGLE THREADED
     ########################
     #for i in range(iterations):
     #    tree = Tree(self.data, self.bagging, self.bag_ratio, self.depthlimit, self.weak_learner)
     #    self.trees.append(tree)  # get the trees created and store them
     if snapshot:
         self.sum_squares(len(self.trees))  # get error after each snapshot, if this command is run multiple times
def get_correlation_parallel(s1,s2):
    """
    params s1 - series 1
    params s2 - series 2 
    NOTE : series are number 1 to 25 when giving in arguments
    returns the correlation between series
    """
    start = time.time()
    offsets = [] #this will be the arguments to all the parallel jobs
    instances = (MAX_ROWS/BATCH_SIZE)
    mean,std = calculate_mean_std_parallel()
    stripped_mean,stripped_std = calculate_stripped_mean_std_parallel(mean,std)
    processes = Pool(processes=instances)
    for i in range(instances):
        offsets.append((s1,s2,mean,std,stripped_mean,stripped_std,i*BATCH_SIZE))
    results = processes.map(get_correlation,offsets)
    processes.close()
    processes.join()
    pearson_corr = 0
    total = 0
    for result in results:
        pearson_corr += result[0]*result[1]
        total += result[1]
    pearson_corr = 1.0*pearson_corr / total
    t_value = abs(pearson_corr*math.sqrt( 1.0*(total - 2) / ( 1 - (pearson_corr*pearson_corr))))
    p_value = t.sf(t_value,total-2)
    print "\n ######### CORRELATION BETWEEN SERIES ",s1," AND SERIES ",s2, " is ",pearson_corr , "t value is ", t_value ," and p value is ", p_value,  "######### \n" 
    end = time.time()
    print "EXECUTION TIME : ", end-start , " sec"
    return pearson_corr
Example #11
def parse(document, pages, parse_refs=True,
        progress_monitor=NullProgressMonitor(),
        pool_size=DEFAULT_POOL_SIZE):
    progress_monitor.start('Parsing Pages', pool_size + 1)

    # Prepare input
    pages = [(page.local_url, page.url) for page in
            pages.values() if page.local_url is not None]
    pages_chunks = chunk_it(pages, pool_size)
    inputs = []
    for pages_chunk in pages_chunks:
        inputs.append((document.parser, document.pk, parse_refs, pages_chunk))

    # Close connection to allow the new processes to create their own.
    connection.close()

    # Split work
    progress_monitor.info('Sending {0} chunks to worker pool'
            .format(len(inputs)))
    pool = Pool(pool_size)
    for result in pool.imap_unordered(sub_process_parse, inputs, 1):
        progress_monitor.work('Parsed 1/{0} of the pages'.\
                format(pool_size), 1)

    # Word Count
    word_count = 0
    for page in document.pages.all():
        word_count += page.word_count
    document.word_count = word_count
    document.save()
    progress_monitor.work('Counted Total Words', 1)

    pool.close()
    progress_monitor.done()
Example #12
def stat_volume(stime,etime):
    tgsinfo = read_tgs_info()

    # from multiprocessing.dummy import Pool as ThreadPool
    from multiprocessing.pool import Pool

    pool = Pool()
    volume = [pool.apply_async(stat_tgs_volume,args=(stime,etime,int(cid))) for cid in tgsinfo.keys()]
    pool.close()

    print 'waiting to join....'
    pool.join()

    print 'start to writing to file...'

    volume0 = []
    for i,elem in enumerate(volume):
        volume0.append((tgsinfo.keys()[i], elem.get()))
    volume0.sort(key=lambda x:x[1], reverse=True)

    total = 0
    with open(os.path.join(root_dir, "result", "volume.txt"),"w") as f:
        for i,elem in enumerate(volume0):
            # cid = tgsinfo.keys()[i]
            # vol = elem.get()
            total += elem[1]

            line = "%5s,%s: %d\n" % (elem[0], tgsinfo[elem[0]]['kkmc'], elem[1])
            f.write(line)

    print 'totally %d records.' % (total)
Example #13
class _MultiExecutor(_Executor):
    """Execute functions async in a process pool"""

    def __init__(self):
        super(_MultiExecutor, self).__init__()
        self._children = 0
        self.pool = Pool()

    def _collector(self, result):
        super(_MultiExecutor, self)._collector(result)
        self._children -= 1

    def execute(self, func, args):
        self._children += 1
        self.pool.apply_async(func, args, callback=self._collector)

    def wait_for_results(self):
        self.pool.close()
        # One would have hoped joining the pool would take care of this, but
        # apparently you need to first make sure that all your launched tasks
        # have returned their results properly before calling join, or you
        # risk a deadlock.
        while self._children > 0:
            time.sleep(0.001)
        self.pool.join()
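A common alternative to counting outstanding children by hand is to keep the AsyncResult handles returned by apply_async and wait on them explicitly before closing the pool; calling get() also re-raises any exception from the worker. A minimal sketch of that alternative (not the original author's code; square is illustrative):

from multiprocessing.pool import Pool


def square(x):
    return x * x


if __name__ == '__main__':
    pool = Pool()
    handles = [pool.apply_async(square, (i,)) for i in range(10)]
    results = [h.get() for h in handles]   # get() re-raises worker exceptions
    pool.close()
    pool.join()
    print(results)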
Example #14
def ingest(
        dataset,
        cls,
        skip_if_exists=True,
        multi_process=False,
        multi_threaded=False,
        cores=None):

    pool = None

    if multi_process:
        pool = Pool(cores or cpu_count())
        map_func = pool.imap_unordered
    elif multi_threaded:
        pool = ThreadPool(cores or cpu_count())
        map_func = pool.imap_unordered
    else:
        map_func = map

    cls_args = repeat(cls)
    skip_args = repeat(skip_if_exists)

    map_func(ingest_one, zip(dataset, cls_args, skip_args))

    if pool is not None:
        # if we're ingesting using multiple processes or threads, the processing
        # should be parallel, but this method should be synchronous from the
        # caller's perspective
        pool.close()
        pool.join()
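One caveat about the sequential fall-back above: the pool variants of imap_unordered submit their tasks eagerly, but under Python 3 the built-in map is lazy, so map_func(ingest_one, ...) would build an iterator and ingest nothing until it is consumed. Assuming Python 3 is the target, the usual fix is to drain the iterator; a tiny illustrative helper:

from collections import deque


def consume(iterator):
    # Drain a lazy iterator (such as Python 3's built-in map) so its work runs.
    deque(iterator, maxlen=0)


if __name__ == '__main__':
    consume(map(print, range(3)))   # prints 0, 1, 2 only because it is drained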
Example #15
def main():
	global pool
	pool = Pool(POOL_SIZE)
	
	
	nseeds = 100
	
#	print("== generating seeds...")
#	generate_seeds(nseeds)
	
	#print("running const density experiments...")
	#run_constant_density(0.1, range(100, 1000, 100), nseeds)
	
	#print("running const size experiments...")
	#run_constant_size(50, range(100, 1000, 100), nseeds)
	
	print("== running aggregate interval experiments (const density)...")
#	run_aggregate_interval_constant_density(0.1, range(100, 1000, 100), nseeds, [100, 500] + list(range(1000, 4000, 1000)))

	run_aggregate_interval_constant_density(0.1, range(100, 1000, 100), nseeds, [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.2, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.3, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.4, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])
	reset_pool()
	run_aggregate_interval_constant_density(0.5, range(100, 1000, 100), nseeds, [100, 500, 1000, 2000,4000, 5000, 6000, 7000, 8000, 9000, 10000, 50000])

	pool.close()
	pool.join()
Example #16
def main():
    print('Process (%s) start...' % os.getpid())
    p = Pool()
    for i in range(4):
        p.apply_async(long_time_task, args=(i,))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')
def load_images_uint(files):
    p = Pool()
    process = imread
    results = p.map(process, files)
    p.close()
    p.join()
    images = np.array(results)
    images = images.transpose(0, 3, 1, 2)
    return images
def get_data():
	f2 = open('app_links1.txt','r')

	nprocs = 500 # nprocs is the number of processes to run
	ParsePool = Pool(nprocs)
	#ParsePool.map(btl_test,url)
	ParsedURLS = ParsePool.map(deatilsExtract,f2)
	ParsePool.close()
	ParsePool.join()
Example #19
def get_word():
    domains=open('dic/newwords').readlines()
    try:
        pool=Pool(processes=2)
        pool.map(check_domain,domains)
        pool.close()
        pool.join()
    except Exception as e:
        print e
        pass
Example #20
def main():
    queue_logger = setup_redirection()
    queue_logger.write("ABCDEF\n")
    try:
        p = Pool(10)
        results = [p.apply_async(some_process_body) for i in xrange(20)]
        [result.get() for result in results]
        p.close()
    finally:
        queue_logger.stop()
def calculate_stripped_mean_std_parallel(mean,std):
    """
    params - mean
    params - std
    returns stripped up mean and std
    """
    stripped_mean = []
    stripped_squares = []
    stripped_std = []
    dirty_data = []
    outliers = []
    for i in range(0,NO_OF_SERIES):
        stripped_std.append(0)
        stripped_squares.append(0)
        stripped_mean.append(0)
        dirty_data.append(0)
        outliers.append(0)
    start = time.time()
    offsets = [] #this will be the arguments to all the parallel jobs
    instances = (MAX_ROWS/BATCH_SIZE)
    processes = Pool(processes=instances)
    for i in range(instances):
        offsets.append((mean,std,i*BATCH_SIZE))
    results = processes.map(calculate_stripped_mean_std,offsets)
    processes.close()
    processes.join()
    total = 0
    for result in results:
        for i in range(len(result[0])):
            count = result[2] - result[3][i] #actual - dirty data
            stripped_mean[i] += result[0][i]*count
            stripped_squares[i] += result[1][i]*count
            dirty_data[i] += result[3][i]
            outliers[i] += result[4][i]
        total += result[2]

    for i in range(len(mean)):
        stripped_mean[i] = 1.0*(stripped_mean[i])/(total - dirty_data[i])
        stripped_squares[i] = 1.0*(stripped_squares[i]) / (total - dirty_data[i])
        stripped_std[i] = math.sqrt(stripped_squares[i] - (stripped_mean[i]*stripped_mean[i]))

    end = time.time()

    print "######### STRIPPED MEAN ######### \n"
    print stripped_mean
    print "\n ######### STRIPPED STANDARD DEVIATION ######### \n"
    print stripped_std
    print "\n######### NAN ROWS COUNT #########\n"
    print dirty_data
    print "\n######### OUTLIERS ROWS COUNT #########\n"
    print outliers
    print "\n######### EXECUTION TIME #########\n"
    print (end-start)

    return stripped_mean,stripped_std
Example #22
class TcpController(object):
    def __init__(self,handlers):
        self.handlers=handlers
        self.workers=Pool(MAX_PROCESS_POOL_SIZE)

    def process(self,header,body):
        self.workers.apply_async(wrap,(self.handlers[header](),body,))

    def destroy(self):
        self.handlers=None
        self.workers.close()
Example #23
    def run(self):
        cases = self.get_test_case()
        # create a process pool
        pool = Pool(processes=len(cases))

        result.append(pool.map_async(self.init_driver, cases.values()))

        pool.close()
        pool.join()

        while not q.empty():
            comm.Template.set_middle(q.get())
Example #24
    def _get(self, args):
        draft_id = args[0]
        id = args[1] if len(args) > 1 else None

        q = self.db.query(Player)
        if id is not None:
            player = q.filter(Player.id == int(id)).first()
            team = self.db.query(Team).filter(and_(Team.is_owner == True,
                                                   Team.draft_id == draft_id)).first()

            available_players = self.db.query(Player).join(Player.core).filter(and_(PlayerCore.rank != None,
                                                                                    PlayerCore.target_price != None,
                                                                                    PlayerCore.points > 0,
                                                                                    Player.draft_id == draft_id,
                                                                                    Player.team_id == None,
                                                                                    Player.id != player.id)).order_by(PlayerCore.rank).all()

            min_price = 1
            max_price = min(player.core.target_price + 21, team.money)
            manager = Manager()
            max_starters_points = manager.dict()
            max_bench_points = manager.dict()
            pool = Pool(processes=8)
            starters, bench = get_starters_and_bench(self.db, team.id)
            max_starters_points[0] = optimizer.optimize_roster(starters, available_players, team.money - (constants.BENCH_SIZE - len(bench)))[1]
            for m in range(min_price, 10):
                pool.apply_async(wrap_optimizer, args=(starters, available_players, team.money - m - (constants.BENCH_SIZE - len(bench)) + 1, max_bench_points, m))

            full_starters = True
            for s in starters:
                if s is None:
                    full_starters = False
            if not full_starters:
                starters_clone = list(starters)
                bench_clone = list(bench)
                place_player(player, starters_clone, bench_clone)
                for m in range(min_price, max_price):
                    pool.apply_async(wrap_optimizer, args=(starters_clone, available_players, team.money - m - (constants.BENCH_SIZE - len(bench_clone)), max_starters_points, m))

            pool.close()
            pool.join()

            ret = player.to_dict(['core'])
            ret['max_starters_points'] = dict(max_starters_points)
            ret['max_bench_points'] = dict(max_bench_points)

            return ret
        else:
            players = q.join(PlayerCore).filter(and_(Player.draft_id == int(draft_id),
                                                     PlayerCore.rank != None,
                                                     PlayerCore.target_price != None)).all()
            return {'players': [p.to_dict(['core']) for p in players]}
def parallel_augment(images, normalize=None, test=False):
    if normalize is not None:
        mean, std = normalize
        images = images - mean[:, np.newaxis, np.newaxis] # assuming channel-wise normalization
        images = images / std[:, np.newaxis, np.newaxis]

    p = Pool()
    process = partial(augment, test=test)
    results = p.map(process, images)
    p.close()
    p.join()
    augmented_images = np.array(results, dtype=np.float32)
    return augmented_images
def calculate_mean_std_parallel():
    """
    call this function to compute the mean, standard deviation and NaNs for each seies
    the file name, no of jobs can be changed in the settings file 
    """
    start = time.time()
    offsets = []
    instances = (MAX_ROWS/BATCH_SIZE)
    processes = Pool(processes=instances)
    for i in range(instances):
        offsets.append(i*BATCH_SIZE)
    print offsets
    result = processes.map(calculate_mean_std,offsets)
    processes.close()
    processes.join()
    mean = []
    std = []
    squares = []
    dirty_data = []
    #initializing
    for i in range(0,NO_OF_SERIES):
        mean.append(0)
        std.append(0)
        squares.append(0)
        dirty_data.append(0)

    total = 0
    ### here we combine the results from different processes / threads
    for r in result:
        for i in range(len(r[0])): ### update for each time series
            count = (r[2] - r[3][i])  ### actual count - the count with missing value
            mean[i] += r[0][i]*count
            squares[i] += r[1][i]*count
            dirty_data[i] += r[3][i]
        total += r[2]

    for i in range(len(mean)):
        mean[i] = 1.0*(mean[i])/(total - dirty_data[i])
        squares[i] = 1.0*(squares[i]) / (total - dirty_data[i])
        std[i] = math.sqrt(squares[i] - (mean[i]*mean[i]))
    end = time.time()
    print "######### MEAN ######### \n"
    print mean
    print "\n ######### STANDARD DEVIATION ######### \n"
    print std
    print "\n######### NAN ROWS COUNT #########\n"
    print dirty_data
    print "\n######### EXECUTION TIME #########\n"
    print (end-start)

    return mean,std
Example #27
def extract_all_plaintext(filenames, out_folder=PLAINTEXT_FOLDER):
    print "EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format(len(filenames),out_folder)

    #Zip the filename input with the output folder
    tuple_input = zip(filenames, [out_folder]*len(filenames))

    pool = Pool(processes=util.CPU_COUNT)
    #pool = Pool(processes=1)
    num_tasks = len(filenames)
    for i, _ in enumerate(pool.imap_unordered(__extract_plaintext_as_tuple, tuple_input), 1):
        sys.stderr.write('\rdone {0:%}'.format(i/num_tasks))
    pool.close()

    print "\nDONE"
Example #28
def main():
	"""
		Build all the models. Spin off a new process for each participant
		because the ANN library is not multithreaded. Process is used instead
		of thread to leverage multiple cores.
	"""
	parser = ArgumentParser()
	parser.add_argument("inputFilename")
	parser.add_argument("outputDirectory")
	
	args = parser.parse_args()
	inputFilename = args.inputFilename
	outputDirectory = args.outputDirectory
	
	data = pickle.load( open(inputFilename, 'rb') )
	
	tasks = [ 'matb', 'rantask' ]
	participantIds = [ '001', '002', '003', '004', '005', '006', '007' ]
	
	# Cut off first row header for each data set
	for task in tasks:
		for participantId in participantIds:
			data[participantId][task] = data[participantId][task][1:] 
			
	splits = performSplit( data )
	
	# Record start time so that the elapsed time can be determined
	start_time = time.time()
	
	# Create a multicore processing pool with 7 processes ( 7 so that one core stays free
	# for system processes )
	pool = Pool( processes = 7 )
	
	# Build models for participants in a task
	for task in tasks:
		for participantId in participantIds:
			outputFilename = path.join( outputDirectory, 'testingOn-' + participantId + '-' + task + '.txt' )
			
			# Spin off a process for the building
			pool.apply_async( tuneANN, ( splits[participantId][task], outputFilename ) )
			
	# Close down the pool so that we can wait on all the processes
	pool.close()
	pool.join()
	
	# Calculate and print the elapsed time
	elapsed_time = time.time() - start_time
	print( "Elapsed time: " + str(elapsed_time) )
Example #29
def main(directory, convert_directory, test, crop_size, extension):

    try:
        os.mkdir(convert_directory)
    except OSError:
        pass

    filenames = [os.path.join(dp, f) for dp, dn, fn in os.walk(directory)
                 for f in fn if f.endswith('jpeg') or f.endswith('tiff')] 
    filenames = sorted(filenames)

    if test:
        names = data.get_names(filenames)
        y = data.get_labels(names)
        for f, level in zip(filenames, y):
            if level == 1:
                try:
                    img = convert(f, crop_size)
                    img.show()
                    Image.open(f).show()
                    real_raw_input = vars(__builtins__).get('raw_input',input)
                    real_raw_input('enter for next')
                except KeyboardInterrupt:
                    exit(0)

    print("Resizing images in {} to {}, this takes a while."
          "".format(directory, convert_directory))

    n = len(filenames)
    # process in batches, sometimes weird things happen with Pool on my machine
    batchsize = 500
    batches = n // batchsize + 1
    pool = Pool(N_PROC)

    args = []

    for f in filenames:
        args.append((convert, (directory, convert_directory, f, crop_size, 
                           extension)))

    for i in range(batches):
        print("batch {:>2} / {}".format(i + 1, batches))
        pool.map(process, args[i * batchsize: (i + 1) * batchsize])

    pool.close()

    print('done')
    def multi_proc5(self, batch):
        start_time = datetime.datetime.now()

        sql = "select count(id) from records"
        count_result = db_connection.execute(sql)

        for row in count_result:
            count = row[0]
            break

        sql = "select id from records"
        result = db_connection.execute(sql)

        record_ids = []
        for idx, row in enumerate(result):
            if (idx % int(count/4) == 0) or (idx == count - 1): #4 because that is how many workers we have
                if idx == 0:
                    some_records = []
                else:
                    record_ids.append(some_records)
                    some_records = []
            some_records.append(row[0])

        input_pool = Pool(4)
        #Add id messages to input queue
        input_pool.map(partial(add_batch_ids_to_queue, batch_size=int(batch)), record_ids)
        input_pool.close()
        input_pool.join()


        output_pool = Pool(4)
        #Read ids from input_queue, read message from DB and write it to output_queue
        worker_results = []
        for i in range(4):
            worker_results.append(output_pool.apply_async(read_id_from_queue, ()))

        output_pool.close()

        for r in worker_results:
            r.get() # This reports results, including errors, of workers

        output_pool.join() # This blocks until all the processes have finished

        end_time = datetime.datetime.now()
        time_taken = (end_time - start_time).total_seconds()

        return time_taken
Example #31
def capture(interface,database_output_file,redraw_frequency,arp_resolve,
        dns_resolve,sender_lists,target_lists,color_profile,
        output_columns,display_false,pcap_output_file,force_sender,
        *args,**kwargs):

    dbfile = database_output_file

    osigint = signal.signal(signal.SIGINT,signal.SIG_IGN)
    pool = Pool(3)
    signal.signal(signal.SIGINT, osigint)

    try:

        # ==============
        # START SNIFFING
        # ==============

        '''
        The sniffer is started in a distinct process because Scapy
        will block forever when scapy.all.sniff is called. This allows
        us to interrupt execution of the sniffer by terminating the
        process.

        TODO: It may be easier to use threading. Pool methods were fresh
        to me at the time of original development.
        '''


        ptable = None
        pcount = 0
        # Handle new database file. When verbose, alert user that a new
        # capture must occur prior to printing results.

        arp_resolution = ('disabled','enabled')[arp_resolve]
        dns_resolution = ('disabled','enabled')[dns_resolve]

        print('\x1b[2J\x1b[H\33[F')
        print(logo+'\n')
        print(f'Capture interface: {interface}')
        print(f'ARP resolution:    {arp_resolution}')
        print(f'DNS resolution:    {dns_resolution}')
        sess = create_db(dbfile)

        # ======================================
        # CREATE AN IP FOR THE CURRENT INTERFACE
        # ======================================


        iface_mac, iface_ips = get_interfaces()[interface]
        for ip in iface_ips:
            ip = get_or_create_ip(ip,
                sess,
                mac_address=iface_mac)

        if not Path(dbfile).exists():
            print('- Initializing capture\n- This may take time depending '\
                'on network traffic and filter configurations')
        else:

            print(f'Requests analyzed: {pcount}\n')
            ptable = get_output_table(
                sess,
                sender_lists=sender_lists,
                target_lists=target_lists,
                dns_resolve=dns_resolve,
                color_profile=color_profile,
                arp_resolve=arp_resolve,
                columns=output_columns,
                display_false=display_false,
                force_sender=force_sender)
            print(ptable)

        # Cache packets that will be written to output file
        pkts = []
        sniff_result = None
        arp_resolve_result, dns_resolve_result = None, None

        # Loop eternally
        while True:


            # Handle sniff results
            if sniff_result and sniff_result.ready():

                packets = sniff_result.get()
                sniff_result = None

                # Capture packets for the output file
                if pcap_output_file and packets: pkts += packets

                if packets: pcount += packets.__len__()

                # Clear the previous table from the screen using
                # ANSI escape sequences
                # https://stackoverflow.com/questions/5290994/remove-and-replace-printed-items/5291044#5291044
                if ptable:
                    lcount = ptable.split('\n').__len__()+2
                    stdout.write('\033[F\033[K'*lcount)

                ptable = get_output_table(
                    sess,
                    sender_lists=sender_lists,
                    target_lists=target_lists,
                    dns_resolve=dns_resolve,
                    color_profile=color_profile,
                    arp_resolve=arp_resolve,
                    columns=output_columns,
                    display_false=display_false,
                    force_sender=force_sender)

                print(f'Requests analyzed: {pcount}\n')
                print(ptable)

            # Do sniffing
            elif not sniff_result:


                sniff_result = pool.apply_async(
                    async_sniff,
                    (
                        interface,
                        redraw_frequency,
                        sender_lists,
                        target_lists,
                        database_output_file,
                    )
                )

            # ==================
            # DNS/ARP RESOLUTION
            # ==================

            # Do reverse resolution
            if dns_resolve:

                # Reset dns resolution results
                if not dns_resolve_result or dns_resolve_result.ready():

                    to_resolve = sess.query(IP) \
                            .filter(IP.reverse_dns_attempted != True) \
                            .count()

                    if to_resolve:

                        dns_resolve_result = pool.apply_async(
                            reverse_dns_resolve_ips,
                            (database_output_file,)
                        )

            # Do ARP resolution
            if arp_resolve:

                if not arp_resolve_result or arp_resolve_result.ready():

                    to_resolve = sess.query(IP) \
                            .filter(IP.arp_resolve_attempted != True) \
                            .count()

                    if to_resolve:

                        arp_resolve_result = pool.apply_async(
                            arp_resolve_ips,
                            (interface, database_output_file,)
                        )

            sleep(.2)


    except KeyboardInterrupt:

        print('\n- CTRL^C Caught...')
        sess.close()

    finally:

        # ===================
        # HANDLE OUTPUT FILES
        # ===================

        if pcap_output_file: wrpcap(pcap_output_file,pkts)

        # =====================
        # CLOSE CHILD PROCESSES
        # =====================

        try:

            pool.close()

            if sniff_result:
                print('- Waiting for the sniffer process...',end='')
                sniff_result.wait(5)
                print('done')

            if dns_resolve_result:
                print('- Waiting for the DNS resolver process...',end='')
                dns_resolve_result.wait(5)
                print('done')

            if arp_resolve_result:
                print('- Waiting for the ARP resolver process...',end='')
                arp_resolve_result.wait(5)
                print('done')

        except KeyboardInterrupt:

            pool.terminate()

        pool.join()
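The capture loop above multiplexes several background jobs by polling AsyncResult.ready() instead of blocking on get(), which keeps the main loop free to redraw the table. A minimal sketch of that polling pattern; slow_task is made up for illustration:

import time
from multiprocessing.pool import Pool


def slow_task(delay):
    time.sleep(delay)
    return delay


if __name__ == '__main__':
    pool = Pool(2)
    job = None
    finished = 0
    while finished < 3:
        if job is None:
            job = pool.apply_async(slow_task, (0.2,))
        elif job.ready():                # non-blocking check
            print('got', job.get())      # instant once ready() is True
            job = None
            finished += 1
        time.sleep(0.05)                 # room for other work between polls
    pool.close()
    pool.join()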
    def evaluate_csv_right(self):
        """
        Evaluate a CSV file
        """
        # in_file_name = 'test_400_right'     # test set of 400
        # in_file_name = 'test_1000_right'    # test set of 1000
        # in_file_name = 'random_1w_urls'    # test set of 10k
        # in_file = os.path.join(DATA_DIR, 'test_urls_files', in_file_name + ".csv")

        # in_file_name = "sanghu.zj_question_cut_sampled_jueying_url_5k_1229"  # 整页影印
        # in_file_name = "dump_write_pure.out"  # 纯手写
        # in_file_name = "7_train_ori.out"  # 整页query
        # in_file_name = "HW_TRAIN.out"
        # in_file_name = "biaozhu_fix.check"
        # in_file_name = "biaozhu_csv_out"
        # in_file_name = "random_1w_urls"  # 普通query
        # in_file_name = "zjw_url"  # 小图
        # in_file_name = "xiaotu_labeled_25w_165512"  # 小图
        in_file_name = "zjw_imgs_20210427_urls"  # 小图

        in_file = os.path.join(DATA_DIR, 'page_dataset_files',
                               in_file_name + ".txt")  # input file

        print('[Info] in_file: {}'.format(in_file))

        data_lines = read_file(in_file)
        print('[Info] total samples: {}'.format(len(data_lines)))
        if len(data_lines) == 0:
            print('[Info] invalid file path: {}'.format(in_file))
            return

        # test subset
        n = 10000
        if len(data_lines) > n:
            random.seed(47)
            # random.seed(89)
            random.shuffle(data_lines)  # shuffle for a random sample
            data_lines = data_lines[:n]

        print('[Info] sample count: {}'.format(len(data_lines)))

        # output file for this test run
        time_str = get_current_time_str()
        out_name = 'check_{}.{}.csv'.format(in_file_name, time_str)
        out_dir = os.path.join(DATA_DIR, "check_dir_20210329")
        mkdir_if_not_exist(out_dir)
        out_file = os.path.join(out_dir, out_name)

        # filtered output files
        # out_dir = os.path.join(DATA_DIR, "xiaotu_dir")
        # in_file_name = '{}_good.txt'.format(in_file_name)
        # mkdir_if_not_exist(out_dir)
        # out_file = os.path.join(out_dir, in_file_name)

        # write_dir = os.path.join(out_dir, 'write_dir_{}'.format(time_str))
        # mkdir_if_not_exist(write_dir)
        write_dir = None

        pool = Pool(processes=100)
        for idx, data_line in enumerate(data_lines):
            # option 1
            # if idx == 0:
            #     continue
            # url, r_angle = data_line.split(',')

            # option 2
            url, r_angle = data_line, 0

            # name = url.split('/')[-1].split('.')[0]
            # file_name_x = in_file_name.split('.')[0]
            # url = "https://sm-transfer.oss-cn-hangzhou.aliyuncs.com/zhengsheng.wcl/problems_rotation/" \
            #       "datasets/{}_x/{}.jpg".format(file_name_x, name)

            try:
                pool.apply_async(OnlineEvaluation.process_thread_right,
                                 (idx, url, r_angle, out_file, write_dir))
                # OnlineEvaluation.process_thread_right(idx, url, r_angle, out_file, write_dir)

                # filter images
                # pool.apply_async(OnlineEvaluation.process_save_img_url, (idx, url, r_angle, out_file, write_dir))
                # OnlineEvaluation.process_save_img_url(idx, url, r_angle, out_file, write_dir)
            except Exception as e:
                print('[Info] Error URL: {}'.format(url))
                continue
            # print('[Info] URL: {}'.format(url))
            # OnlineEvaluation.process_thread_right(idx, url, r_angle, out_file, write_dir)

        pool.close()
        pool.join()

        print('[Info] output written to: {}'.format(out_file))
Example #33
        dftr = pd.DataFrame({'id': ids, 'train': 'train'})
        tdftr = pd.DataFrame({'id': ids, 'train': 'test'})
        train, test = DataProcess.train_test_between_subject(
            gdata, pd.concat((dftr, tdftr)),
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
        DLogger.logger().debug("total points: " + str(get_total_pionts(train)))

        worker = GQL.get_instance(2, 10, {})
        train = DataProcess.merge_data(train)
        OptML.optimise(worker,
                       output_path,
                       train,
                       test,
                       global_iters=1000,
                       learning_rate=learning_rate)


if __name__ == '__main__':

    if len(sys.argv) == 2:
        n_proc = int(sys.argv[1])
    elif len(sys.argv) == 1:
        n_proc = 1
    else:
        raise Exception('invalid argument')

    p = Pool(n_proc)
    p.map(run_BD, range(len(configs)))
    p.close()  # no more tasks
    p.join()  # wrap up current tasks
Example #34
def main(args):
    """Do it all."""
    if not os.path.isdir(args.logs):
        raise Fail("Logs location '%s' is not a directory." % args.logs)

    builds = gather_builds(args)
    if args.verbose:
        print("Lined up %d builds." % len(builds))

    # The "configure" step is single-threaded.  We can run many at the same
    # time, even when we're also running a "build" step at the same time.
    # This means we may run a lot more processes than we have CPUs, but there's
    # no law against that.  There's also I/O time to be covered.
    configure_pool = Pool()

    # Builds which have failed the "configure" stage, with their errors.  This
    # queue must never stall, so that we can let results pile up here while the
    # work continues.
    configure_fails = Queue(len(builds))

    # Waiting list for the "build" stage.  It contains Build objects,
    # terminated by a final None to signify that there are no more builds to be
    # done.
    build_queue = JoinableQueue(10)

    # Builds that have failed the "build" stage.
    build_fails = Queue(len(builds))

    # Waiting list for the "test" stage.  It contains Build objects, terminated
    # by a final None.
    test_queue = JoinableQueue(10)

    # The "build" step tries to utilise all CPUs, and it may use a fair bit of
    # memory.  Run only one of these at a time, in a single worker process.
    build_worker = Process(
        target=service_builds, args=(build_queue, build_fails, test_queue))
    build_worker.start()

    # Builds that have failed the "test" stage.
    test_fails = Queue(len(builds))

    # Completed builds.  This must never stall.
    done_queue = JoinableQueue(len(builds))

    # The "test" step can not run concurrently (yet).  So, run tests serially
    # in a single worker process.  It takes its jobs directly from the "build"
    # worker.
    test_worker = Process(
        target=service_tests, args=(test_queue, test_fails, done_queue))
    test_worker.start()

    # Feed all builds into the "configure" pool.  Each build which passes this
    # stage goes into the "build" queue.
    for build in builds:
        configure_pool.apply_async(
            build.do_configure, callback=partial(enqueue, build_queue, build),
            error_callback=partial(enqueue_error, configure_fails, build))
    if args.verbose:
        print("All jobs are underway.")
    configure_pool.close()
    configure_pool.join()

    # TODO: Async reporting for faster feedback.
    configure_fail_count = report_failures(configure_fails, "CONFIGURE FAIL")
    if args.verbose:
        print("Configure stage done.")

    # Mark the end of the build queue for the build worker.
    build_queue.put(None)

    build_worker.join()
    # TODO: Async reporting for faster feedback.
    build_fail_count = report_failures(build_fails, "BUILD FAIL")
    if args.verbose:
        print("Build step done.")

    # Mark the end of the test queue for the test worker.
    test_queue.put(None)

    test_worker.join()
    # TODO: Async reporting for faster feedback.
    test_fail_count = report_failures(test_fails, "TEST FAIL")
    if args.verbose:
        print("Test step done.")

    # All done.  Clean up.
    for build in builds:
        build.clean_up()

    ok_count = count_entries(done_queue)
    if ok_count == len(builds):
        print("All tests OK.")
    else:
        print(
            "Failures during configure: %d - build: %d - test: %d.  OK: %d."
            % (
                configure_fail_count,
                build_fail_count,
                test_fail_count,
                ok_count,
            ))
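The enqueue and enqueue_error helpers used as callbacks above are not shown in this snippet; presumably they just push the build (or the build together with its error) onto the given queue. A plausible sketch under that assumption, relying only on the documented callback signatures of apply_async:

def enqueue(queue, build, result):
    # success callback: apply_async passes the target's return value as `result`
    queue.put(build)


def enqueue_error(queue, build, error):
    # error_callback: apply_async passes the exception raised in the worker
    queue.put((build, error))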
def evaluate_regions(folder_predicted: str,
                     folder_gt: str,
                     regions: dict,
                     processes=default_num_threads):
    region_names = list(regions.keys())
    files_in_pred = subfiles(folder_predicted, suffix='.nii.gz', join=False)
    files_in_gt = subfiles(folder_gt, suffix='.nii.gz', join=False)
    have_no_gt = [i for i in files_in_pred if i not in files_in_gt]
    assert len(
        have_no_gt
    ) == 0, "Some files in folder_predicted have not ground truth in folder_gt"
    have_no_pred = [i for i in files_in_gt if i not in files_in_pred]
    if len(have_no_pred) > 0:
        print(
            "WARNING! Some files in folder_gt were not predicted (not present in folder_predicted)!"
        )

    files_in_gt.sort()
    files_in_pred.sort()

    # run for all cases
    full_filenames_gt = [folder_gt + "/" + i for i in files_in_pred]
    full_filenames_pred = [folder_predicted + "/" + i for i in files_in_pred]

    p = Pool(processes)
    res = p.starmap(
        evaluate_case,
        zip(full_filenames_pred, full_filenames_gt,
            [list(regions.values())] * len(files_in_gt)))
    p.close()
    p.join()

    all_results = {r: [] for r in region_names}
    with open(folder_predicted + "/" + 'summary.csv', 'w') as f:
        f.write("casename")
        for r in region_names:
            f.write(",%s" % r)
        f.write("\n")
        for i in range(len(files_in_pred)):
            f.write(files_in_pred[i][:-7])
            result_here = res[i]
            for k, r in enumerate(region_names):
                dc = result_here[k]
                f.write(",%02.4f" % dc)
                all_results[r].append(dc)
            f.write("\n")

        f.write('mean')
        for r in region_names:
            f.write(",%02.4f" % np.nanmean(all_results[r]))
        f.write("\n")
        f.write('median')
        for r in region_names:
            f.write(",%02.4f" % np.nanmedian(all_results[r]))
        f.write("\n")

        f.write('mean (nan is 1)')
        for r in region_names:
            tmp = np.array(all_results[r])
            tmp[np.isnan(tmp)] = 1
            f.write(",%02.4f" % np.mean(tmp))
        f.write("\n")
        f.write('median (nan is 1)')
        for r in region_names:
            tmp = np.array(all_results[r])
            tmp[np.isnan(tmp)] = 1
            f.write(",%02.4f" % np.median(tmp))
        f.write("\n")
Example #36
def depth_first(root, n, tofind, best, top=False, master=None, lock=None):
    # if master is not None:
    #     # print(master, lock)
    #     with lock:
    #         if len(master.get()) < len(master.get()):
    #             best = master.get()
    #         # lock.release()
    #         # print("released")

    # If tofind is empty
    if not tofind:
        return root
    # If longer than the current best (assuming all unfound patterns can be
    # included with just one additional character each, which is the best case)
    elif (len(root) + len(tofind)) >= len(best):
        return None

    # Generate potential branches based on the current root
    potential = try_add(root, n, tofind, best)

    # Potential branches collected, explore each one
    new = best
    # If more than one branch and has not previously done so,
    # do multiprocessing
    if top and (len(potential) > 1):
        # print("Multi", potential)
        for p in potential:
            p.n = n
            p.best = best

        args = [pickle.dumps(p) for p in potential]

        # Start processes and get results
        # lock = Lock()
        # master = Master(lock, best)
        with Manager() as manager:
            master = manager.Value(str, best)
            lock = manager.RLock()
            func = partial(depth_wrapper, master=master, lock=lock)
            pool = Pool(processes=min(len(args), os.cpu_count()))
            async_result = pool.map_async(func, args)
            pool.close()
            pool.join()
            results = async_result.get()

        # Find shortest
        for r in results:
            if r is None:
                continue
            # Else, compare result to current best
            else:
                # logger.info(" # Branches:  {}".format(len(potential)))
                if len(r) < len(new):
                    new = r
                    # logger.info(" New Best:    {}\n".format(len(new)))
    else:
        for p in potential:
            r = depth_first(p.root, n, p.tofind, new, top=top, master=master, lock=lock)
            # If None, branch is discarded as not being a better solution
            if r is None:
                continue
            # Else, compare result to current best
            else:
                # print(" # Branches:  {}".format(len(potential)))
                # print(" master: {}".format(master))
                # print(" Seed Level:  {}".format(len(p.root) - n))
                # logger.info(" # Branches:  {}".format(len(potential)))
                # logger.info(" Seed Level:  {}".format(len(p.root) - n))

                if len(r) < len(new):
                    new = r
                    if master is not None:
                        # print(master, lock)
                        with lock:
                            if len(new) < len(master.get()):
                                master.set(new)
                            else:
                                new = master.get()
                            # lock.release()
                            # print("released")
                    # logger.info(" New Best:    {}\n".format(len(new)))
                    # print(" New Best:    {}\n".format(len(new)))

    # If a better solution was found, return it
    return new if len(new) < len(best) else None
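The Manager block above shares a best-so-far value and an RLock with the pool workers by binding the proxies into the mapped function via functools.partial; manager proxies, unlike plain objects, can be pickled and sent to child processes. A reduced sketch of that pattern; worker and the shared minimum are illustrative, not the original code:

from functools import partial
from multiprocessing import Manager, Pool


def worker(x, best, lock):
    # record the smallest value seen so far across all workers
    with lock:
        if x < best.value:
            best.value = x
    return x


if __name__ == '__main__':
    with Manager() as manager:
        best = manager.Value('i', 10**9)
        lock = manager.RLock()
        func = partial(worker, best=best, lock=lock)
        with Pool() as pool:
            pool.map(func, [5, 3, 8, 1])
        print(best.value)   # -> 1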
Example #37
class Sampler(object):
    """
    ABC population monte carlo sampler
    
    :param N: number of particles
    :param Y: observed data set
    :param postfn: model function (a callable), which creates a new dataset x for a given theta
    :param dist: distance function rho(X, Y) (a callable)
    :param threads: (optional) number of threads. If >1 and no pool is given <threads> multiprocesses will be started
    :param pool: (optional) a pool instance which has a <map> function 
    """
    
    particle_proposal_cls = ParticleProposal
    particle_proposal_kwargs = {}
    
    def __init__(self, N, Y, postfn, dist, threads=1, pool=None):
        self.N = N
        self.Y = Y
        self.postfn = postfn
        self.dist = dist
        self._random = np.random.mtrand.RandomState()

        if pool is not None:
            self.pool = pool
            self.mapFunc  = self.pool.map
            
        elif threads == 1:
            self.mapFunc = map
        else:
            self.pool = Pool(threads)
            self.mapFunc  = self.pool.map
            
    
    def sample(self, prior, eps_proposal, pool=None):
        """
        Launches the sampling process. Yields the intermediate results per iteration.
        
        :param prior: instance of a prior definition (or an other callable)  see :py:class:`sampler.GaussianPrior`
        :param eps_proposal: an instance of a threshold proposal (or an other callable) see :py:class:`sampler.ConstEps`
        :param pool: (optional) a PoolSpec instance,if not None the initial rejection sampling 
        will be skipped and the pool is used for the further sampling
        
        :yields pool: yields a namedtuple representing the values of one iteration
        """
        if pool is None:
            eps = eps_proposal.next()
            wrapper = _RejectionSamplingWrapper(self, eps, prior)
            
            res = list(self.mapFunc(wrapper, self._random.randint(0, np.iinfo(np.uint32).max, self.N)))
            thetas = np.array([theta for (theta, _, _) in res])
            dists = np.array([dist for (_, dist, _) in res])
            cnts = np.sum([cnt for (_, _, cnt) in res])
            ws = np.ones(self.N) / self.N
            
            pool = PoolSpec(0, eps, self.N/cnts, thetas, dists, ws)
            yield pool
        
        for t, eps in enumerate(eps_proposal, pool.t + 1):
            particleProposal = self.particle_proposal_cls(self, eps, pool, self.particle_proposal_kwargs)
            
            res = list(self.mapFunc(particleProposal, self._random.randint(0, np.iinfo(np.uint32).max, self.N)))
            thetas = np.array([theta for (theta, _, _) in res])
            dists = np.array([dist for (_, dist, _) in res]) 
            cnts = np.sum([cnt for (_, _, cnt) in res])
            
            sigma = 2 * weighted_cov(pool.thetas, pool.ws)
            wrapper = _WeightWrapper(prior, sigma, pool.ws, pool.thetas)
            
            wt = np.array(list(self.mapFunc(wrapper, thetas)))
            ws = wt/np.sum(wt)
            
            pool = PoolSpec(t, eps, self.N/cnts, thetas, dists, ws)
            yield pool
            
            
    def close(self):
        """
        Tries to close the pool (avoid hanging threads)
        """
        if hasattr(self, "pool") and self.pool is not None:
            try:
                self.pool.close()
            except: pass
Example #38
        cursor.rollback()
    cursor.close()


if __name__ == '__main__':
    # start worker processes, matching the number of logical cores
    connect_db = connect_db()
    filepath = r'D:\filename'
    table = 'table_name'

    t1 = time.time()
    pro_num = 10  # number of processes
    pool = Pool(processes=pro_num)
    job_result = []
    # walk the folder and read every file
    for file in os.listdir(filepath):
        filename = filepath + '\\' + file
        res = pool.apply_async(read_data, (filename, ))
        job_result.append(res)

    pool.close()  # close the process pool
    pool.join()

    # merge everything that was read
    get_result = pd.DataFrame()
    for tmp in job_result:
        get_result = get_result.append(tmp.get())
    t2 = time.time()

    insert_data(connect_db, get_result, table)
    print('It took a total of %0.2f seconds.' % (t2 - t1))
Example #39
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to Save Image, item %s' % item)

   
def main(offset):
    json=get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)
        
GROUP_START=1
GROUP_END=10

if __name__=='__main__':
    pool=Pool()
    groups=([x*20 for x in range(GROUP_START,GROUP_END+1)])
    pool.map(main,groups)
    pool.close()
    pool.join()

print('suc4')
Example #40
    df.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210621_prediction_CV{i}_Gamma_{round(best_gamma,4)}_C_{round(best_c)}_ACC_{test_accuracy}_F1_{test_f1}_AUC_{test_auc}.csv"
    )
    df2 = pd.DataFrame(y_test)
    df2.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210324_GT_CV{i}.csv"
    )
    print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n")
    logger.info(f">>>>>>> CV = {i}/10,Over Training >>>>>>>")
    return [test_accuracy, test_f1, test_auc]


if __name__ == '__main__':
    pool = Pool(int(os.getenv('N_PROC', os.cpu_count())))
    futures = [pool.apply_async(func=svm, args=[i]) for i in range(1, 11)]
    pool.close()  # close the pool so it accepts no new tasks
    average_acc_test, average_f1_test, average_auc_test = [], [], []
    for item in futures:
        result = item.get()
        average_acc_test.append(result[0])
        average_f1_test.append(result[1])
        average_auc_test.append(result[2])
    print(
        f"Vggish Classification Average Results: Acc.= {mean(average_acc_test)}, F1 = {mean(average_f1_test)}, AUC = {mean(average_auc_test)}"
    )
    print(
        f"average_acc_test = {average_acc_test},/n average_f1_test={average_f1_test},/n average_auc_test = {average_auc_test}"
    )
    logger.info(
        f"Vggish Classification Average Results: Acc.= {mean(average_acc_test)}, F1 = {mean(average_f1_test)}, AUC = {mean(average_auc_test)}"
    )
Example #41
    local_image_url = item.get('image')
    new_image_url = local_image_url.replace('list', 'large')
    r = requests.get('http:' + new_image_url)
    if r.status_code == 200:
        file_path = img_path + os.path.sep + '{0}.{1}'.format(
            md5(r.content).hexdigest(), 'jpg')
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(r.content)


def saveToMongo(item):
    if db[MONGO_TABLE].insert(item):
        print('Saved to MongoDB successfully', item)
    return False


def main(offset):
    json = getPage(offset)
    for item in getImage(json):
        saveImage(item)
        saveToMongo(item)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(2)]  # crawl two pages (offsets 0 and 20)
    pool.map(main, groups)
    pool.close()  # close the pool so it accepts no new tasks
    pool.join()  # the main process blocks until the child processes exit
Beispiel #42
0
import random
from multiprocessing.pool import Pool
from time import sleep, time

import os


def run(name):
    print("%s子进程开始,进程ID:%d" % (name, os.getpid()))
    start = time()
    sleep(random.choice([1, 2, 3, 4]))
    end = time()
    print("%s子进程结束,进程ID:%d。耗时0.2%f" % (name, os.getpid(), end - start))


if __name__ == "__main__":
    print("父进程开始")
    # 创建多个进程,表示可以同时执行的进程数量。默认大小是CPU的核心数
    p = Pool(8)
    for i in range(10):
        # 创建进程,放入进程池统一管理
        p.apply_async(run, args=(i, ))
    # 如果我们用的是进程池,在调用join()之前必须要先close(),并且在close()之后不能再继续往进程池添加新的进程
    p.close()
    # 进程池对象调用join,会等待进程吃中所有的子进程结束完毕再去结束父进程
    p.join()
    print("父进程结束。")
    def validate(self,
                 do_mirroring: bool = True,
                 use_sliding_window: bool = True,
                 step_size: float = 0.5,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 segmentation_export_kwargs: dict = None):

        current_mode = self.network.training
        self.network.eval()

        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        if segmentation_export_kwargs is None:
            if 'segmentation_export_params' in self.plans.keys():
                force_separate_z = self.plans['segmentation_export_params'][
                    'force_separate_z']
                interpolation_order = self.plans['segmentation_export_params'][
                    'interpolation_order']
                interpolation_order_z = self.plans[
                    'segmentation_export_params']['interpolation_order_z']
            else:
                force_separate_z = None
                interpolation_order = 1
                interpolation_order_z = 0
        else:
            force_separate_z = segmentation_export_kwargs['force_separate_z']
            interpolation_order = segmentation_export_kwargs[
                'interpolation_order']
            interpolation_order_z = segmentation_export_kwargs[
                'interpolation_order_z']

        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)

        if do_mirroring:
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(2)
        results = []

        transpose_backward = self.plans.get('transpose_backward')

        for k in self.dataset_val.keys():
            properties = load_pickle(self.dataset[k]['properties_file'])
            data = np.load(self.dataset[k]['data_file'])['data']

            # concat segmentation of previous step
            seg_from_prev_stage = np.load(
                join(self.folder_with_segs_from_prev_stage,
                     k + "_segFromPrevStage.npz"))['data'][None]

            print(data.shape)
            data[-1][data[-1] == -1] = 0
            data_for_net = np.concatenate(
                (data[:-1],
                 to_one_hot(seg_from_prev_stage[0], range(1,
                                                          self.num_classes))))

            softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
                data_for_net,
                do_mirroring,
                mirror_axes,
                use_sliding_window,
                step_size,
                use_gaussian,
                all_in_gpu=all_in_gpu)[1]

            if transpose_backward is not None:
                transpose_backward = self.plans.get('transpose_backward')
                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in transpose_backward])

            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]

            if save_softmax:
                softmax_fname = join(output_folder, fname + ".npz")
            else:
                softmax_fname = None
            """There is a problem with python process communication that prevents us from communicating obejcts 
            larger than 2 GB between processes (basically when the length of the pickle string that will be sent is 
            communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long 
            enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually 
            patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will 
            then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either 
            filename or np.ndarray and will handle this automatically"""
            if np.prod(softmax_pred.shape) > (2e9 / 4 *
                                              0.85):  # *0.85 just to be safe
                np.save(fname + ".npy", softmax_pred)
                softmax_pred = fname + ".npy"

            results.append(
                export_pool.starmap_async(
                    save_segmentation_nifti_from_softmax,
                    ((softmax_pred, join(output_folder, fname + ".nii.gz"),
                      properties, interpolation_order,
                      self.regions_class_order, None, None, softmax_fname,
                      None, force_separate_z, interpolation_order_z), )))

            pred_gt_tuples.append([
                join(output_folder, fname + ".nii.gz"),
                join(self.gt_niftis_folder, fname + ".nii.gz")
            ])

        _ = [i.get() for i in results]

        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(pred_gt_tuples,
                             labels=list(range(self.num_classes)),
                             json_output_file=join(output_folder,
                                                   "summary.json"),
                             json_name=job_name,
                             json_author="Fabian",
                             json_description="",
                             json_task=task)

        # in the old nnunet we would stop here. Now we add a postprocessing. This postprocessing can remove everything
        # except the largest connected component for each class. To see if this improves results, we do this for all
        # classes and then rerun the evaluation. Those classes for which this resulted in an improved dice score will
        # have this applied during inference as well
        self.print_to_log_file("determining postprocessing")
        determine_postprocessing(self.output_folder,
                                 self.gt_niftis_folder,
                                 validation_folder_name,
                                 final_subf_name=validation_folder_name +
                                 "_postprocessed",
                                 debug=debug)
        # after this the final predictions for the validation set can be found in validation_folder_name_base + "_postprocessed"
        # They are always in that folder, even if no postprocessing was applied!

        # determining postprocessing on a per-fold basis may be OK for this fold but what if another fold finds another
        # postprocessing to be better? In this case we need to consolidate. At the time the consolidation is going to be
        # done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a separate folder to
        # be used later
        gt_nifti_folder = join(self.output_folder_base, "gt_niftis")
        maybe_mkdir_p(gt_nifti_folder)
        for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
            success = False
            attempts = 0
            while not success and attempts < 10:
                try:
                    shutil.copy(f, gt_nifti_folder)
                    success = True
                except OSError:
                    attempts += 1
                    sleep(1)

        self.network.train(current_mode)
        export_pool.close()
        export_pool.join()
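The docstring above describes sidestepping the multiprocessing pickle-size limit by handing workers a file path instead of a huge array. A minimal sketch of that idea, with hypothetical names (process_softmax stands in for the real export function):

import numpy as np

def maybe_offload(array, npy_path, byte_limit=int(2e9 * 0.85)):
    """Return the array if it is small enough to pickle safely, else save it and return its path."""
    if array.nbytes > byte_limit:
        np.save(npy_path, array)      # npy_path should end in '.npy'
        return npy_path               # the worker will np.load() it (and may delete it afterwards)
    return array

def process_softmax(softmax_or_path):
    data = np.load(softmax_or_path) if isinstance(softmax_or_path, str) else softmax_or_path
    return data.shape                 # placeholder for the real export work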
Beispiel #44
0
def encodeHVLAD_(images, encoder, dmd_options, level):

    descrs = []
    pool = Pool(processes=8)
    encoder = encoder.tolist()
    centers = encoder['centers']
    features = [
        pool.apply_async(computeSDMD, args=(img, dmd_options, level))
        for img in images
    ]
    pool.close()
    pool.join()

    # print(centers)
    # vars = encoder['vars']
    # skews = encoder['skews']

    print('Entering encodeHVLAD_')
    for feature in features:
        feature = feature.get().T

        new_features = np.zeros((feature.shape[0], feature.shape[1]),
                                dtype=np.float32)
        new_features[:, :] = feature[:, :]
        predicted_labels = kmeans_quantize(data=new_features, centers=centers)
        n_cluster = centers.shape[0]
        [n_patch, n_feature] = new_features.shape

        Vm = np.zeros([n_cluster, n_feature], dtype=np.float32)
        Vc = np.zeros([n_cluster, n_feature], dtype=np.float32)
        Vs = np.zeros([n_cluster, n_feature], dtype=np.float32)
        for i in range(n_cluster):
            Ni = np.sum(predicted_labels == i)
            if Ni > 0:
                i_features = new_features[predicted_labels == i, :]
                mi = np.mean(i_features, axis=0)
                Vm[i] = Ni * (mi - centers[i])
                Vc[i] = (1 / Ni) * np.sum(
                    (i_features - mi)**2, axis=0) - (1 / Ni) * np.sum(
                        (i_features - centers[i])**2, axis=0)
                Vs[i] = ((1 / Ni) * (np.sum(
                    (i_features - mi)**3, axis=0))) / np.maximum(
                        ((1 / Ni) * np.sum(
                            (i_features - mi)**2, axis=0))**1.5, 1e-12) - (
                                (1 / Ni) * (np.sum(
                                    (i_features - centers[i])**3,
                                    axis=0))) / np.maximum(((1 / Ni) * np.sum(
                                        (i_features - centers[i])**2, axis=0))
                                                           **1.5, 1e-12)
        # power normalization, also called square-rooting normalization
        Vm = np.sign(Vm) * np.sqrt(np.abs(Vm))
        Vc = np.sign(Vc) * np.sqrt(np.abs(Vc))
        Vs = np.sign(Vs) * np.sqrt(np.abs(Vs))
        # # L2 normalization
        # Vm /= np.maximum(np.linalg.norm(Vm, axis=1)[:, None], 1e-12)
        # Vc /= np.maximum(np.linalg.norm(Vc, axis=1)[:, None], 1e-12)
        # Vs /= np.maximum(np.linalg.norm(Vs, axis=1)[:, None], 1e-12)
        V_all = np.vstack((Vm, Vc, Vs)).flatten()[None, :]  # was (Vm, Vc, Vc); Vs is computed above and is meant to be included
        descrs = V_all if len(descrs) == 0 else np.concatenate(
            (descrs, V_all), axis=0)
    return descrs.astype(np.float32)
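A compact restatement of the per-cluster residuals computed in the loop above (first-order Vm and second-order Vc only), as a hedged standalone helper:

import numpy as np

def vlad_residuals(features, centers, labels):
    """features: (n_patch, n_feature); centers: (n_cluster, n_feature); labels: cluster index per patch."""
    n_cluster, n_feature = centers.shape
    Vm = np.zeros((n_cluster, n_feature), dtype=np.float32)
    Vc = np.zeros((n_cluster, n_feature), dtype=np.float32)
    for i in range(n_cluster):
        sel = features[labels == i]
        if len(sel) == 0:
            continue
        mi = sel.mean(axis=0)
        Vm[i] = len(sel) * (mi - centers[i])                                             # first-order residual
        Vc[i] = ((sel - mi) ** 2).mean(axis=0) - ((sel - centers[i]) ** 2).mean(axis=0)  # second-order residual
    # signed square-root ("power") normalization, as in the code above
    return np.sign(Vm) * np.sqrt(np.abs(Vm)), np.sign(Vc) * np.sqrt(np.abs(Vc))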
Beispiel #45
0
class PyRAMmp():
    '''
    The PyRAMmp class sets up and runs a multiprocessing pool to enable
    parallel PyRAM model runs.
    '''
    def __init__(self, processes=None, maxtasksperchild=None):
        '''
        Initialise the pool and variable lists.
        processes and maxtasksperchild are passed to the pool.
        '''

        self.pool = Pool(processes=processes,
                         maxtasksperchild=maxtasksperchild)
        self.results = []  # Results from PyRAM.run()
        self._outputs = [
        ]  # New outputs from PyRAM.run() for transfer to self.results
        self._waiting = []  # Waiting runs
        self._num_waiting = 0  # Number of waiting runs
        self._num_active = 0  # Number of active runs
        self._sleep_time = 1e-2  # Minimum sleep time between adding runs to pool
        self._new = True  # Flag to indicate ready for new set of runs

    def submit_runs(self, runs):
        '''
        Submit new runs to the pool as resources become available
        runs is a list of PyRAM input tuples (args, kwargs)
        '''

        # Add to waiting list
        for run in runs:
            self._waiting.append(run)
        self._num_waiting = len(self._waiting)

        # Check how many active runs have finished
        for _ in range(len(self._outputs)):
            run = self._outputs.pop(0)
            self.results.append(run)
            self._num_active -= 1

        num_start = self.pool._processes - self._num_active
        num_start = min(num_start, self._num_waiting)

        # Start new runs if processes are free
        for _ in range(num_start):
            run = self._waiting.pop(0)
            self.pool.apply_async(run_pyram,
                                  args=(run, ),
                                  callback=self._get_output)
            self._num_active += 1

        if self._new:
            self._new = False
            self._wait()

    def _wait(self):
        '''
        Wait for all submitted runs to complete.
        '''

        while self._num_active > 0:
            self.submit_runs([])
            sleep(self._sleep_time)

        self._new = True

    def close(self):
        '''
        Close the pool and wait for all processes to finish.
        '''

        self.pool.close()
        self.pool.join()

    def _get_output(self, output):
        '''
        Get a PyRAM output.
        '''

        self._outputs.append(output)

    def __del__(self):

        self.close()
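A hedged usage sketch for the PyRAMmp class above; the (args, kwargs) tuples are placeholders, not real PyRAM inputs:

if __name__ == '__main__':
    pyram_mp = PyRAMmp(processes=4)
    runs = [((), {'example_parameter': v}) for v in (1, 2, 3)]   # placeholder (args, kwargs) pairs
    pyram_mp.submit_runs(runs)    # the first call blocks until the whole batch has completed
    print(len(pyram_mp.results), 'runs finished')
    pyram_mp.close()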
Beispiel #46
0
def authorate(arguments):
    """Main function which delegates to fabric tasks."""
    global engine
    engine = create_engine('sqlite:///' + arguments['--db'])
    create_db(engine)

    global VERBOSE
    VERBOSE = arguments['--verbose']
    multi_thread = not arguments['--one']

    if arguments['-C']:
        classify.classifiers_dir = arguments['-C']

    # Assume successful return value
    ret = 0
    if arguments['load']:

        # Load in words and word counts from file
        session = get_session(engine)
        if len(session.query(Word_Count).all()) == 0:
            subprocess.call('sqlite3 ' + arguments['--db'] +
                            ' < import_words.sql',
                            shell=True)

        prefix = arguments['--prefix']
        if os.path.exists(prefix):
            # Determine how many snippets to get per path.
            snippets_count = arguments['<snippets-per-path>']
            if not snippets_count:
                snippets_count = DEFAULT_SNIPPETS_COUNT

            pool = Pool(cpu_count() if multi_thread else 1)
            with open(arguments['<paths-file>'], 'r') as paths_file:
                paths = paths_file.readlines()
                for path in paths:
                    res = load_path(pool,
                                    path.rstrip(),
                                    prefix=prefix,
                                    multi_thread=multi_thread)
                    if not res:
                        ret = 3
            # Join the pool
            pool.close()
            pool.join()
        else:
            display_error(
                "The given prefix does not exist: {path}".format(path=prefix))
            ret = 2

    elif arguments['process']:
        # Cleanup the classifier dir
        classify.clean_classifier_dir()

        # Get and scale data from snippets
        session = get_session(engine)
        snippets = session.query(Book, Snippet).join(Snippet).all()
        data = [text_to_vector(snip.text, session) for _, snip in snippets]
        scaler = classify.create_and_save_scaler(data)
        scaled_data = scaler.transform(data)
        targets = [book.path_id for book, _ in snippets]

        # Train the classifiers
        for (Cls, kwargs) in classify.classifier_types:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                classifier = Cls(**kwargs)
                classifier.fit(scaled_data, targets)
            classify.save_classifier(classifier)

    elif arguments['classify']:
        snip_file = arguments['<snippet-file>']
        input_files = [snip_file if snip_file else '-']
        classify.classify_all(
            engine, " ".join([
                unicode(line.rstrip(), errors='ignore')
                for line in fileinput.input(input_files)
            ]))

    elif arguments['test']:
        session = get_session(engine)
        snippets = session.query(Book, Snippet).join(Snippet).all()
        if VERBOSE:
            print("Converting raw data to vectors. . .")
        data = [text_to_vector(snip.text, session) for _, snip in snippets]
        targets = [book.path_id for book, _ in snippets]
        classify.test_all(engine, data, targets)

    else:
        display_error("No subcommand given.")
        ret = 1
    return ret
Beispiel #47
0
def ensemble(training_output_folder1, training_output_folder2, output_folder, task, validation_folder, folds, allow_ensembling: bool = True):
    print("\nEnsembling folders\n", training_output_folder1, "\n", training_output_folder2)

    output_folder_base = output_folder
    output_folder = join(output_folder_base, "ensembled_raw")

    # only_keep_largest_connected_component is the same for all stages
    dataset_directory = join(preprocessing_output_dir, task)
    plans = load_pickle(join(training_output_folder1, "plans.pkl"))  # we need this only for the labels

    files1 = []
    files2 = []
    property_files = []
    out_files = []
    gt_segmentations = []

    folder_with_gt_segs = join(dataset_directory, "gt_segmentations")
    # in the correct shape and we need the original geometry to restore the niftis

    for f in folds:
        validation_folder_net1 = join(training_output_folder1, "fold_%d" % f, validation_folder)
        validation_folder_net2 = join(training_output_folder2, "fold_%d" % f, validation_folder)

        if not isdir(validation_folder_net1):
            raise AssertionError("Validation directory missing: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net1)
        if not isdir(validation_folder_net2):
            raise AssertionError("Validation directory missing: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net2)

        # we need to ensure the validation was successful. We can verify this via the presence of the summary.json file
        if not isfile(join(validation_folder_net1, 'summary.json')):
            raise AssertionError("Validation directory incomplete: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net1)
        if not isfile(join(validation_folder_net2, 'summary.json')):
            raise AssertionError("Validation directory missing: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net2)

        patient_identifiers1_npz = [i[:-4] for i in subfiles(validation_folder_net1, False, None, 'npz', True)]
        patient_identifiers2_npz = [i[:-4] for i in subfiles(validation_folder_net2, False, None, 'npz', True)]

        # we don't do postprocessing anymore so there should not be any of that noPostProcess
        patient_identifiers1_nii = [i[:-7] for i in subfiles(validation_folder_net1, False, None, suffix='nii.gz', sort=True) if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')]
        patient_identifiers2_nii = [i[:-7] for i in subfiles(validation_folder_net2, False, None, suffix='nii.gz', sort=True) if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')]

        if not all([i in patient_identifiers1_npz for i in patient_identifiers1_nii]):
            raise AssertionError("Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag." % (validation_folder_net1))
        if not all([i in patient_identifiers2_npz for i in patient_identifiers2_nii]):
            raise AssertionError("Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag." % (validation_folder_net2))

        patient_identifiers1_npz.sort()
        patient_identifiers2_npz.sort()

        assert all([i == j for i, j in zip(patient_identifiers1_npz, patient_identifiers2_npz)]), "npz filenames do not match. This should not happen."

        os.makedirs(output_folder, exist_ok=True)

        for p in patient_identifiers1_npz:
            files1.append(join(validation_folder_net1, p + '.npz'))
            files2.append(join(validation_folder_net2, p + '.npz'))
            property_files.append(join(validation_folder_net1, p) + ".pkl")
            out_files.append(join(output_folder, p + ".nii.gz"))
            gt_segmentations.append(join(folder_with_gt_segs, p + ".nii.gz"))

    p = Pool(default_num_threads)
    p.map(merge, zip(files1, files2, property_files, out_files))
    p.close()
    p.join()

    if not isfile(join(output_folder, "summary.json")) and len(out_files) > 0:
        aggregate_scores(tuple(zip(out_files, gt_segmentations)), labels=plans['all_classes'],
                     json_output_file=join(output_folder, "summary.json"), json_task=task,
                     json_name=task + "__" + os.path.basename(output_folder_base), num_threads=default_num_threads)

    if allow_ensembling and not isfile(join(output_folder_base, "postprocessing.json")):
        # now lets also look at postprocessing. We cannot just take what we determined in cross-validation and apply it
        # here because things may have changed and may also be too inconsistent between the two networks
        determine_postprocessing(output_folder_base, folder_with_gt_segs, "ensembled_raw", "temp",
                                 "ensembled_postprocessed", default_num_threads, dice_threshold=0)

        out_dir_all_json = join(network_training_output_dir, "summary_jsons")
        json_out = load_json(join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        json_out["experiment_name"] = os.path.basename(output_folder_base)
        save_json(json_out, join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        os.makedirs(out_dir_all_json, exist_ok=True)
        shutil.copy(join(output_folder_base, "ensembled_postprocessed", "summary.json"),
                    join(out_dir_all_json, "%s__%s.json" % (task, os.path.basename(output_folder_base))))
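In the ensemble above, p.map(merge, zip(...)) hands each worker a single 4-tuple, so merge has to unpack it itself; starmap would unpack automatically. A toy illustration with a stand-in merge (not nnU-Net's):

from multiprocessing import Pool

def merge_tuple(args):                         # map style: one tuple argument
    file1, file2, props, out = args
    return out

def merge_unpacked(file1, file2, props, out):  # starmap style: separate arguments
    return out

if __name__ == '__main__':
    items = list(zip(['a.npz'], ['b.npz'], ['a.pkl'], ['out.nii.gz']))
    with Pool(2) as p:
        print(p.map(merge_tuple, items))
        print(p.starmap(merge_unpacked, items))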
# If the pool is not yet full, a new process is created to run the request; otherwise the request waits until a process in the pool finishes, and only then is a new process created

import os
import time
from multiprocessing.pool import Pool
from random import random


def task(task_name):
    print("开始我的新任务啦....", task_name, os.getpid())
    starttime = time.time()
    time.sleep(random() * 3)
    endtime = time.time()
    #print("我的任务--{}--完成啦...耗时{},进程{}".format(task_name, endtime-starttime, os.getpid()))
    return "我的任务--{}--完成啦...耗时{},进程{}".format(task_name, endtime - starttime, os.getpid())


def callback_func(n):
    print(n)


if __name__ == "__main__":
    # process pool
    pool = Pool(5)
    tasks = ["listen to music", "eat", "play games", "watch the kids", "cook", "run", "study", "fight",
             "listen to music", "eat", "play games", "watch the kids", "cook", "run", "study", "fight"]
    for t in tasks:
        pool.apply_async(task, args=(t,), callback=callback_func)  # asynchronous, non-blocking
    pool.close()  # no more tasks will be added to the pool
    pool.join()  # block the main process until all pool tasks have finished
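A hedged extension of the callback pattern above: apply_async also accepts an error_callback, so worker exceptions are reported instead of silently swallowed.

import os
import time
from multiprocessing import Pool
from random import random

def task(name):
    time.sleep(random())
    if name == "fail":
        raise ValueError("demo failure")
    return "task %s done in pid %d" % (name, os.getpid())

if __name__ == "__main__":
    pool = Pool(4)
    for name in ["a", "b", "fail", "c"]:
        pool.apply_async(task, args=(name,),
                         callback=print,                               # runs in the parent on success
                         error_callback=lambda e: print("error:", e))  # runs in the parent on failure
    pool.close()
    pool.join()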
        os.mkdir(item.get('title'))
    try:
        response = requests.get(item.get('image'))  # 请求图片的网址链接
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(item.get('title'),md5(response.content).hexdigest(),'jpg')
            # {0} is the folder name, {1} the file name (the md5 of the image content), {2} the jpg extension
            if not os.path.exists(file_path):
                with open(file_path,'wb') as f:
                    f.write(response.content)
            else:
                print("Already downloaded",file_path)
    except requests.ConnectionError:
        print('Failed to Save Image')

from multiprocessing.pool import Pool  # Python process pool

def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)

GROUP_START = 0
GROUP_END = 20

if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END+1)])
    pool.map(main,groups)
    pool.close()  # close the pool; no more tasks can be added to it
    pool.join()  # wait for every process in the pool to finish; must be called after close()
Beispiel #50
0
def encodeHVLAD(images, encoder, dmd_options):
    l_descrs, m_descrs, s_descrs, all_descrs = [], [], [], []

    pool = Pool(processes=4)
    l_features = [pool.apply_async(computeSDMD, args=(img, dmd_options, 0)) for img in images]  # image features at level 0 of the Gaussian pyramid
    m_features = [pool.apply_async(computeSDMD, args=(img, dmd_options, 1)) for img in images]  # image features at level 1 of the Gaussian pyramid
    s_features = [pool.apply_async(computeSDMD, args=(img, dmd_options, 2)) for img in images]  # image features at level 2 of the Gaussian pyramid
    pool.close()
    pool.join()

    centers = encoder['centers']
    # vars = encoder['vars']
    # skews = encoder['skews']
    # The loops below work as follows. Step 1: each iteration takes the SDMD features of one image from one pyramid
    # level (computed above via computeSDMD, shape (80, 8649)), transposes them, and names the result features /
    # new_features.
    # Step 2: every feature row is assigned to a cluster center, giving predicted_labels (1, 8649).
    # For the features assigned to each center, Vm is their mean minus the center, scaled by the number of features
    # assigned to that center; Vc is their variance around their own mean minus the variance computed with the cluster
    # center used as the mean, accumulated per coordinate and divided by the feature count. Each center yields a
    # (1, 80) row, so looping over the 128 centers gives (128, 80).
    # V_all stacks Vm and Vc and flattens them into one row (1, 128*80*2).
    # After 40 images this becomes (40, 128*80*2), which is the encoding of one pyramid level. The next two loops are
    # identical, only on higher pyramid levels; the VLAD encodings of levels 0, 1 and 2 (l, m, s) are finally averaged
    # to obtain descrs (a de-duplicated helper for this is sketched after the function).
    print('Entering HVLAD')
    for features in l_features:
        features = features.get().T  # the image features are (80, 8649); after transposing, (8649, 80)

        new_features = np.zeros((features.shape[0], features.shape[1]), dtype=np.float32)  # zero matrix (8649, 80)
        new_features[:, :] = features[:, :]  # copy the feature values into new_features

        predicted_labels = kmeans_quantize(data=new_features, centers=centers)  # assign every feature row to its nearest cluster center
        n_cluster = centers.shape[0]  # centers is (128, 80)
        [n_patch, n_feature] = features.shape
        # the above indexes the image features against the cluster centers
        Vm = np.zeros([n_cluster, n_feature], dtype=np.float32)  # Vm (128, 80)
        Vc = np.zeros([n_cluster, n_feature], dtype=np.float32)  # Vc (128, 80)
        # Vs = np.zeros([n_cluster, n_feature], dtype=np.float32)
        for i in range(n_cluster):
            Ni = np.sum(predicted_labels == i)
            if Ni > 0:
                i_features = features[predicted_labels == i, :]  # pick the rows assigned to center i, (Ni, 80)
                mi = np.mean(i_features, axis=0)  # mi (1, 80)
                Vm[i] = Ni * (mi - centers[i])  # mean of the assigned features minus the center, scaled by the number of features assigned to it
                Vc[i] = (1 / Ni) * np.sum((i_features - mi) ** 2, axis=0) - (1 / Ni) * np.sum(
                    (i_features - centers[i]) ** 2, axis=0)  # the first sum is the variance around the cluster mean, the second the variance with the cluster center used as the mean
                # i.e., the difference between variances computed with two different means
                # Vs[i] = ((1 / Ni) * (np.sum((i_features - mi) ** 3, axis=0))) / np.maximum(
                #     ((1 / Ni) * np.sum((i_features - mi) ** 2, axis=0)) ** 1.5, 1e-12) - (
                #                 (1 / Ni) * (np.sum((i_features - centers[i]) ** 3, axis=0))) / np.maximum(
                #     ((1 / Ni) * np.sum((i_features - centers[i]) ** 2, axis=0)) ** 1.5, 1e-12)
                #
        # power normalization, also called square-rooting normalization
        Vm = np.sign(Vm) * np.sqrt(np.abs(Vm))
        Vc = np.sign(Vc) * np.sqrt(np.abs(Vc))
        # Vs = np.sign(Vs) * np.sqrt(np.abs(Vs))
        # # L2 normalization
        # Vm /= np.maximum(np.linalg.norm(Vm, axis=1)[:, None], 1e-12)
        # Vc /= np.maximum(np.linalg.norm(Vc, axis=1)[:, None], 1e-12)
        # Vs /= np.maximum(np.linalg.norm(Vs, axis=1)[:, None], 1e-12)
        # V_all = np.vstack((Vm, Vc, Vs)).flatten()[None, :]
        V_all = np.vstack((Vm, Vc)).flatten()[None, :]  # stack Vm and Vc into (256, 80), then flatten to a single row
        l_descrs = V_all if len(l_descrs) == 0 else np.concatenate((l_descrs, V_all), axis=0)

    for features in m_features:
        features = features.get().T

        new_features = np.zeros((features.shape[0], features.shape[1]), dtype=np.float32)
        new_features[:, :] = features[:, :]
        predicted_labels = kmeans_quantize(data=new_features, centers=centers)
        n_cluster = centers.shape[0]
        [n_patch, n_feature] = features.shape

        Vm = np.zeros([n_cluster, n_feature], dtype=np.float32)
        Vc = np.zeros([n_cluster, n_feature], dtype=np.float32)
        # Vs = np.zeros([n_cluster, n_feature], dtype=np.float32)
        for i in range(n_cluster):
            Ni = np.sum(predicted_labels == i)
            if Ni > 0:
                i_features = features[predicted_labels == i, :]
                mi = np.mean(i_features, axis=0)
                Vm[i] = Ni * (mi - centers[i])
                Vc[i] = (1 / Ni) * np.sum((i_features - mi) ** 2, axis=0) - (1 / Ni) * np.sum(
                    (i_features - centers[i]) ** 2, axis=0)
                # Vs[i] = ((1 / Ni) * (np.sum((i_features - mi) ** 3, axis=0))) / np.maximum(
                #     ((1 / Ni) * np.sum((i_features - mi) ** 2, axis=0)) ** 1.5, 1e-12) - (
                #                 (1 / Ni) * (np.sum((i_features - centers[i]) ** 3, axis=0))) / np.maximum(
                #     ((1 / Ni) * np.sum((i_features - centers[i]) ** 2, axis=0)) ** 1.5, 1e-12)
        # power normalization, also called square-rooting normalization
        Vm = np.sign(Vm) * np.sqrt(np.abs(Vm))
        Vc = np.sign(Vc) * np.sqrt(np.abs(Vc))
        # Vs = np.sign(Vs) * np.sqrt(np.abs(Vs))
        # # L2 normalization
        # Vm /= np.maximum(np.linalg.norm(Vm, axis=1)[:, None], 1e-12)
        # Vc /= np.maximum(np.linalg.norm(Vc, axis=1)[:, None], 1e-12)
        # Vs /= np.maximum(np.linalg.norm(Vs, axis=1)[:, None], 1e-12)
        # V_all = np.vstack((Vm, Vc, Vs)).flatten()[None, :]
        V_all = np.vstack((Vm, Vc)).flatten()[None, :]
        m_descrs = V_all if len(m_descrs) == 0 else np.concatenate((m_descrs, V_all), axis=0)

    for features in s_features:
        features = features.get().T

        new_features = np.zeros((features.shape[0], features.shape[1]), dtype=np.float32)
        new_features[:, :] = features[:, :]
        predicted_labels = kmeans_quantize(data=new_features, centers=centers)
        n_cluster = centers.shape[0]
        [n_patch, n_feature] = features.shape

        Vm = np.zeros([n_cluster, n_feature], dtype=np.float32)
        Vc = np.zeros([n_cluster, n_feature], dtype=np.float32)
        # Vs = np.zeros([n_cluster, n_feature], dtype=np.float32)
        for i in range(n_cluster):
            Ni = np.sum(predicted_labels == i)
            if Ni > 0:
                i_features = features[predicted_labels == i, :]
                mi = np.mean(i_features, axis=0)
                Vm[i] = Ni * (mi - centers[i])
                Vc[i] = (1 / Ni) * np.sum((i_features - mi) ** 2, axis=0) - (1 / Ni) * np.sum(
                    (i_features - centers[i]) ** 2, axis=0)
                # Vs[i] = ((1 / Ni) * (np.sum((i_features - mi) ** 3, axis=0))) / np.maximum(
                #     ((1 / Ni) * np.sum((i_features - mi) ** 2, axis=0)) ** 1.5, 1e-12) - (
                #                 (1 / Ni) * (np.sum((i_features - centers[i]) ** 3, axis=0))) / np.maximum(
                #     ((1 / Ni) * np.sum((i_features - centers[i]) ** 2, axis=0)) ** 1.5, 1e-12)
        # power normalization, also called square-rooting normalization
        Vm = np.sign(Vm) * np.sqrt(np.abs(Vm))
        Vc = np.sign(Vc) * np.sqrt(np.abs(Vc))
        # Vs = np.sign(Vs) * np.sqrt(np.abs(Vs))
        # # L2 normalization
        # Vm /= np.maximum(np.linalg.norm(Vm, axis=1)[:, None], 1e-12)
        # Vc /= np.maximum(np.linalg.norm(Vc, axis=1)[:, None], 1e-12)
        # Vs /= np.maximum(np.linalg.norm(Vs, axis=1)[:, None], 1e-12)
        # V_all = np.vstack((Vm, Vc, Vs)).flatten()[None, :]
        V_all = np.vstack((Vm, Vc)).flatten()[None, :]
        s_descrs = V_all if len(s_descrs) == 0 else np.concatenate((s_descrs, V_all), axis=0)

    descrs = (l_descrs + m_descrs + s_descrs)/3
    return descrs.astype(np.float32)
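The three per-level loops above are identical except for the pyramid level; a hedged sketch of a shared helper (compute_features stands in for computeSDMD with a simplified signature, encode_one for the residual/normalization code above; both must be module-level functions):

import numpy as np
from multiprocessing import Pool

def encode_level(images, centers, level, compute_features, encode_one, processes=4):
    """Compute features for one pyramid level in parallel, then encode every image."""
    with Pool(processes) as pool:
        jobs = [pool.apply_async(compute_features, (img, level)) for img in images]
        feats = [j.get().T for j in jobs]            # each (n_patch, n_feature)
    return np.stack([encode_one(f, centers) for f in feats])

# descrs = (encode_level(images, centers, 0, compute_features, encode_one)
#           + encode_level(images, centers, 1, compute_features, encode_one)
#           + encode_level(images, centers, 2, compute_features, encode_one)) / 3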
def mp_plantation_preparation(gadm_index_shp, planted_index_shp):

    os.chdir(cn.docker_base_dir)

    # ## Not actually using this but leaving it here in case I want to add this functionality eventually. This
    # # was to allow users to run plantations for a select (contiguous) area rather than for the whole planet.
    # # List of bounding box coordinates
    # bound_list = args.bounding_box
    # # Checks if bounding box coordinates are in multiples of 10 (10 degree tiles). If they're not, the script stops.
    # for bound in bound_list:
    #     if bound%10:
    #         uu.exception_log(bound, 'not a multiple of 10. Please make bounding box coordinates are multiples of 10.')

    # Checks the validity of the two arguments. If either one is invalid, the script ends.
    if (gadm_index_path not in cn.gadm_plant_1x1_index_dir
            or planted_index_path not in cn.gadm_plant_1x1_index_dir):
        uu.exception_log(
            'Invalid inputs. Please provide None or s3 shapefile locations for both arguments.'
        )

    # List of all possible 10x10 Hansen tiles except for those at very extreme latitudes (not just WHRC biomass tiles)
    total_tile_list = uu.tile_list_s3(cn.pixel_area_dir)
    uu.print_log("Number of possible 10x10 tiles to evaluate:",
                 len(total_tile_list))

    # Removes the latitude bands that don't have any planted forests in them according to Liz Goldman.
    # i.e., Liz Goldman said by Slack on 1/2/19 that the northernmost planted forest is 69.5146 and the southernmost is -46.938968.
    # This creates a more focused list of 10x10 tiles to iterate through (removes ones that definitely don't have planted forest).
    # NOTE: If the planted forest gdb is updated, the list of latitudes to exclude below may need to be changed to not exclude certain latitude bands.
    planted_lat_tile_list = [
        tile for tile in total_tile_list if '90N' not in tile
    ]
    planted_lat_tile_list = [
        tile for tile in planted_lat_tile_list if '80N' not in tile
    ]
    planted_lat_tile_list = [
        tile for tile in planted_lat_tile_list if '50S' not in tile
    ]
    planted_lat_tile_list = [
        tile for tile in planted_lat_tile_list if '60S' not in tile
    ]
    planted_lat_tile_list = [
        tile for tile in planted_lat_tile_list if '70S' not in tile
    ]
    planted_lat_tile_list = [
        tile for tile in planted_lat_tile_list if '80S' not in tile
    ]
    # planted_lat_tile_list = ['10N_080W']

    uu.print_log(planted_lat_tile_list)
    uu.print_log(
        "Number of 10x10 tiles to evaluate after extreme latitudes have been removed:",
        len(planted_lat_tile_list))

    # If a planted forest extent 1x1 tile index shapefile isn't supplied
    if 'None' in args.planted_tile_index:

        ### Entry point 1:
        # If no shapefile of 1x1 tiles for countries with planted forests is supplied, 1x1 tiles of country extents will be created.
        # This runs the process from the very beginning and will take a few days.
        if 'None' in args.gadm_tile_index:

            uu.print_log(
                "No GADM 1x1 tile index shapefile provided. Creating 1x1 planted forest country tiles from scratch..."
            )

            # Downloads and unzips the GADM shapefile, which will be used to create 1x1 tiles of land areas
            uu.s3_file_download(cn.gadm_path, cn.docker_base_dir)
            cmd = ['unzip', cn.gadm_zip]
            # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging
            process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
            with process.stdout:
                uu.log_subprocess_output(process.stdout)

            # Creates a new GADM shapefile with just the countries that have planted forests in them.
            # This limits creation of 1x1 rasters of land area on the countries that have planted forests rather than on all countries.
            # NOTE: If the planted forest gdb is updated and has new countries added to it, the planted forest country list
            # in constants_and_names.py must be updated, too.
            uu.print_log(
                "Creating shapefile of countries with planted forests...")
            os.system(
                '''ogr2ogr -sql "SELECT * FROM gadm_3_6_adm2_final WHERE iso IN ({0})" {1} gadm_3_6_adm2_final.shp'''
                .format(str(cn.plantation_countries)[1:-1], cn.gadm_iso))

            # Creates 1x1 degree tiles of countries that have planted forests in them.
            # I think this can handle using 50 processors because it's not trying to upload files to s3 and the tiles are small.
            # This takes several days to run because it iterates through at least 250 10x10 tiles.
            # For multiprocessor use.
            processes = 50
            uu.print_log('Rasterize GADM 1x1 max processors=', processes)
            pool = Pool(processes)
            pool.map(plantation_preparation.rasterize_gadm_1x1,
                     planted_lat_tile_list)
            pool.close()
            pool.join()

            # # Creates 1x1 degree tiles of countries that have planted forests in them.
            # # For single processor use.
            # for tile in planted_lat_tile_list:
            #
            #     plantation_preparation.rasterize_gadm_1x1(tile)

            # Creates a shapefile of the boundaries of the 1x1 GADM tiles in countries with planted forests
            os.system('''gdaltindex {0}_{1}.shp GADM_*.tif'''.format(
                cn.pattern_gadm_1x1_index, uu.date_time_today))
            cmd = [
                'aws', 's3', 'cp', cn.docker_base_dir,
                cn.gadm_plant_1x1_index_dir, '--exclude', '*', '--include',
                '{}*'.format(cn.pattern_gadm_1x1_index), '--recursive'
            ]

            # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging
            process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
            with process.stdout:
                uu.log_subprocess_output(process.stdout)

            # # Saves the 1x1 country extent tiles to s3
            # # Only use if the entire process can't run in one go on the spot machine
            # cmd = ['aws', 's3', 'cp', cn.docker_base_dir, 's3://gfw2-data/climate/carbon_model/temp_spotmachine_output/', '--exclude', '*', '--include', 'GADM_*.tif', '--recursive']

            # # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging
            # process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
            # with process.stdout:
            #     uu.log_subprocess_output(process.stdout)

            # Delete the aux.xml files
            os.system('''rm GADM*.tif.*''')

            # List of all 1x1 degree country extent tiles created
            gadm_list_1x1 = uu.tile_list_spot_machine(".", "GADM_")
            uu.print_log(
                "List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:",
                gadm_list_1x1)
            uu.print_log(len(gadm_list_1x1))

        ### Entry point 2:
        # If a shapefile of the boundaries of 1x1 degree tiles of countries with planted forests is supplied,
        # a list of the 1x1 tiles is created from the shapefile.
        # This avoids creating the 1x1 country extent tiles all over again because the relevant tile extent are supplied
        # in the shapefile.
        elif cn.gadm_plant_1x1_index_dir in args.gadm_tile_index:

            uu.print_log(
                "Country extent 1x1 tile index shapefile supplied. Using that to create 1x1 planted forest tiles..."
            )

            uu.print_log('{}/'.format(gadm_index_path))

            # Copies the shapefile of 1x1 tiles of extent of countries with planted forests
            cmd = [
                'aws', 's3', 'cp', '{}/'.format(gadm_index_path),
                cn.docker_base_dir, '--recursive', '--exclude', '*',
                '--include', '{}*'.format(gadm_index_shp)
            ]

            # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging
            process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
            with process.stdout:
                uu.log_subprocess_output(process.stdout)

            # Gets the attribute table of the country extent 1x1 tile shapefile
            gadm = glob.glob('{}*.dbf'.format(cn.pattern_gadm_1x1_index))[0]

            # Converts the attribute table to a dataframe
            dbf = Dbf5(gadm)
            df = dbf.to_dataframe()

            # Converts the column of the dataframe with the names of the tiles (which contain their coordinates) to a list
            gadm_list_1x1 = df['location'].tolist()
            gadm_list_1x1 = [str(y) for y in gadm_list_1x1]
            uu.print_log(
                "List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:",
                gadm_list_1x1)
            uu.print_log("There are", len(gadm_list_1x1),
                         "1x1 country extent tiles to iterate through.")

        # In case some other arguments are provided
        else:
            uu.exception_log(
                'Invalid GADM tile index shapefile provided. Please provide a valid shapefile.'
            )

        # Creates 1x1 degree tiles of plantation growth wherever there are plantations.
        # Because this is iterating through all 1x1 tiles in countries with planted forests, it first checks
        # whether each 1x1 tile intersects planted forests before creating a 1x1 planted forest tile for that
        # 1x1 country extent tile.
        # 55 processors seems to use about 350 GB of memory, which seems fine. But there was some error about "PQconnectdb failed-- sorry, too many clients already".
        # So, moved the number of processors down to 48.
        # For multiprocessor use
        processes = 48
        uu.print_log('Create 1x1 plantation from 1x1 gadm max processors=',
                     processes)
        pool = Pool(processes)
        pool.map(plantation_preparation.create_1x1_plantation_from_1x1_gadm,
                 gadm_list_1x1)
        pool.close()
        pool.join()

        # # Creates 1x1 degree tiles of plantation growth wherever there are plantations
        # # For single processor use
        # for tile in gadm_list_1x1:
        #
        #     plantation_preparation.create_1x1_plantation(tile)

        # Creates a shapefile in which each feature is the extent of a plantation extent tile.
        # This index shapefile can be used the next time this process is run if starting with Entry Point 3.
        os.system('''gdaltindex {0}_{1}.shp plant_gain_*.tif'''.format(
            cn.pattern_plant_1x1_index, uu.date_time_today))
        cmd = [
            'aws', 's3', 'cp', cn.docker_base_dir, cn.gadm_plant_1x1_index_dir,
            '--exclude', '*', '--include',
            '{}*'.format(cn.pattern_plant_1x1_index), '--recursive'
        ]

        # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging
        process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
        with process.stdout:
            uu.log_subprocess_output(process.stdout)

    ### Entry point 3
    # If a shapefile of the extents of 1x1 planted forest tiles is provided.
    # This is the part that actually creates the sequestration rate and forest type tiles.

    if cn.pattern_plant_1x1_index in args.planted_tile_index:

        uu.print_log(
            "Planted forest 1x1 tile index shapefile supplied. Using that to create 1x1 planted forest growth rate and forest type tiles..."
        )

        # Copies the shapefile of 1x1 tiles of extent of planted forests
        cmd = [
            'aws', 's3', 'cp', '{}/'.format(planted_index_path),
            cn.docker_base_dir, '--recursive', '--exclude', '*', '--include',
            '{}*'.format(planted_index_shp), '--recursive'
        ]

        # Solution for adding subprocess output to log is from https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging
        process = Popen(cmd, stdout=PIPE, stderr=STDOUT)
        with process.stdout:
            uu.log_subprocess_output(process.stdout)

        # Gets the attribute table of the planted forest extent 1x1 tile shapefile
        gadm = glob.glob('{}*.dbf'.format(cn.pattern_plant_1x1_index))[0]

        # Converts the attribute table to a dataframe
        dbf = Dbf5(gadm)
        df = dbf.to_dataframe()

        # Converts the column of the dataframe with the names of the tiles (which contain their coordinates) to a list
        planted_list_1x1 = df['location'].tolist()
        planted_list_1x1 = [str(y) for y in planted_list_1x1]
        uu.print_log(
            "List of 1x1 degree tiles in countries that have planted forests, with defining coordinate in the northwest corner:",
            planted_list_1x1)
        uu.print_log("There are", len(planted_list_1x1),
                     "1x1 planted forest extent tiles to iterate through.")

        # Creates 1x1 degree tiles of plantation growth and type wherever there are plantations.
        # Because this is iterating through only 1x1 tiles that are known to have planted forests (from a previous run
        # of this script), it does not need to check whether there are planted forests in this tile. It goes directly
        # to intersecting the planted forest table with the 1x1 tile.

        # For single processor use
        #for tile in planted_list_1x1:
        #    plantation_preparation.create_1x1_plantation_growth_from_1x1_planted(tile)

        # For multiprocessor use
        # processes=40 uses about 360 GB of memory. Works on r4.16xlarge with space to spare
        # processes=52 uses about 465 GB of memory (quite stably), so this is basically the max.
        num_of_processes = 52
        pool = Pool(num_of_processes)
        pool.map(
            plantation_preparation.
            create_1x1_plantation_growth_from_1x1_planted, planted_list_1x1)
        pool.close()
        pool.join()

        # This works with 50 processors on an r4.16xlarge machine. Uses about 430 GB out of 480 GB.
        processes = 50
        uu.print_log('Create 1x1 plantation type max processors=', processes)
        pool = Pool(processes)
        pool.map(
            plantation_preparation.create_1x1_plantation_type_from_1x1_planted,
            planted_list_1x1)
        pool.close()
        pool.join()

        # This rasterizes the plantation removal factor standard deviations
        # processes=50 peaks at about 450 GB
        num_of_processes = 50
        pool = Pool(num_of_processes)
        pool.map(
            plantation_preparation.
            create_1x1_plantation_stdev_from_1x1_planted, planted_list_1x1)
        pool.close()
        pool.join()

    ### All script entry points meet here: creation of 10x10 degree planted forest gain rate and type tiles
    ### from 1x1 degree planted forest gain rate and type tiles

    # Name of the vrt of 1x1 planted forest gain rate tiles
    plant_gain_1x1_vrt = 'plant_gain_1x1.vrt'

    # Creates a mosaic of all the 1x1 plantation gain rate tiles
    uu.print_log("Creating vrt of 1x1 plantation gain rate tiles")
    os.system('gdalbuildvrt {} plant_gain_*.tif'.format(plant_gain_1x1_vrt))

    # Creates 10x10 degree tiles of plantation gain rate by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # For multiprocessor use
    processes = 20
    uu.print_log('Create 10x10 plantation gain rate max processors=',
                 processes)
    pool = Pool(processes)
    pool.map(
        partial(plantation_preparation.create_10x10_plantation_gain,
                plant_gain_1x1_vrt=plant_gain_1x1_vrt), planted_lat_tile_list)
    pool.close()
    pool.join()

    # Creates 10x10 degree tiles of plantation gain rate by iterating over the set of pixel area tiles supplied
    #at the start of the script that are in latitudes with planted forests.
    # For single processor use
    #for tile in planted_lat_tile_list:
    #     plantation_preparation.create_10x10_plantation_gain(tile, plant_gain_1x1_vrt)

    # Name of the vrt of 1x1 planted forest type tiles
    plant_type_1x1_vrt = 'plant_type_1x1.vrt'

    # Creates a mosaic of all the 1x1 plantation type tiles
    uu.print_log("Creating vrt of 1x1 plantation type tiles")
    os.system('gdalbuildvrt {} plant_type_*.tif'.format(plant_type_1x1_vrt))

    # Creates 10x10 degree tiles of plantation type by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # For multiprocessor use
    num_of_processes = 26
    pool = Pool(num_of_processes)
    uu.print_log('Create 10x10 plantation type max processors=', num_of_processes)
    pool.map(
        partial(plantation_preparation.create_10x10_plantation_type,
                plant_type_1x1_vrt=plant_type_1x1_vrt), planted_lat_tile_list)
    pool.close()
    pool.join()

    # # Creates 10x10 degree tiles of plantation type by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # # For single processor use
    # for tile in planted_lat_tile_list:
    #
    #     plantation_preparation.create_10x10_plantation_type(tile, plant_type_1x1_vrt)

    # Name of the vrt of 1x1 planted forest gain rate standard deviation tiles
    plant_stdev_1x1_vrt = 'plant_stdev_1x1.vrt'

    # Creates a mosaic of all the 1x1 plantation gain rate standard deviation tiles
    uu.print_log(
        "Creating vrt of 1x1 plantation gain rate standard deviation tiles")
    os.system('gdalbuildvrt {} plant_stdev_*.tif'.format(plant_stdev_1x1_vrt))

    # Creates 10x10 degree tiles of plantation gain rate standard deviation by iterating over the set of pixel area tiles supplied
    # at the start of the script that are in latitudes with planted forests.
    # For multiprocessor use
    num_of_processes = 26
    pool = Pool(num_of_processes)
    pool.map(
        partial(plantation_preparation.create_10x10_plantation_gain_stdev,
                plant_stdev_1x1_vrt=plant_stdev_1x1_vrt),
        planted_lat_tile_list)
    pool.close()
    pool.join()
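Several of the calls above bind the fixed keyword argument with functools.partial so that pool.map only has to supply the varying tile name. A minimal sketch of the pattern with a stand-in worker (process_tile is hypothetical):

from functools import partial
from multiprocessing import Pool

def process_tile(tile_id, vrt_name):
    return "%s rasterized from %s" % (tile_id, vrt_name)

if __name__ == '__main__':
    tiles = ['00N_000E', '00N_010E']
    with Pool(2) as pool:
        results = pool.map(partial(process_tile, vrt_name='plant_gain_1x1.vrt'), tiles)
    print(results)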
Beispiel #52
0
    def _read_obs(self, stns_ids=None):

        # Saw extreme decreased performance due to garbage collection when
        # pandas ran checks for a chained assignment. Turn off this check
        # temporarily.
        opt_val = pd.get_option('mode.chained_assignment')
        pd.set_option('mode.chained_assignment', None)

        try:

            if stns_ids is None:
                stns_obs = self.stns
            else:
                stns_obs = self.stns.loc[stns_ids]
            
            nstns = len(stns_obs.station_id)
            nprocs = self.nprocs if nstns >= self.nprocs else nstns
            
            if self.has_start_end_dates:
                start_end = (self.start_date, self.end_date)
            else:
                start_end = None
            
            if nprocs > 1:
                
                # http://stackoverflow.com/questions/24171725/
                # scikit-learn-multicore-attributeerror-stdin-instance-
                # has-no-attribute-close
                if not hasattr(sys.stdin, 'close'):
                    def dummy_close():
                        pass
                    sys.stdin.close = dummy_close
                
                iter_stns = [(None, a_id, self.elems, start_end)
                             for a_id in stns_obs.station_id]
                
                pool = Pool(processes=nprocs)                
                obs = pool.map(_parse_ghcnd_dly_star_remote, iter_stns)
                
                pool.close()
                pool.join()
            
            else:
            
                obs = []
    
                for a_id in stns_obs.station_id:
                    
                    abuf = open_remote_file('https://www1.ncdc.noaa.gov/'
                                            'pub/data/ghcn/daily/all/%s.dly' % a_id)
                                       
                    obs_stn = _parse_ghcnd_dly(abuf, a_id, self.elems, start_end)
                    obs.append(obs_stn)

            df_obs = pd.concat(obs, ignore_index=True)

        finally:

            pd.set_option('mode.chained_assignment', opt_val)

        df_obs = df_obs.set_index(['station_id', 'elem', 'time'])
        df_obs = df_obs.sort_index(level=0, sort_remaining=True)  # sortlevel() has been removed from pandas; sort_index is the replacement

        return df_obs
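The try/finally above saves and restores 'mode.chained_assignment' by hand; pandas' option_context does the same restore automatically. A minimal sketch:

import pandas as pd

def concat_without_chained_checks(frames):
    # the option is reset to its previous value when the block exits
    with pd.option_context('mode.chained_assignment', None):
        return pd.concat(frames, ignore_index=True)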
def train(dataset,
          learn_rate=1e-4,
          prior_type='uniform',
          pretrained_ae_ckpt_path=None,
          pretrained_aae_ckpt_path=None):

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    logger.info("using prior: {}".format(prior_type))

    pool_ = Pool(4)

    if dataset == 'MNIST':
        data = MNIST()
        data_name = "MNIST"
        w_init = "kaiming_uniform"
        encoder_dims = [500, 500, 1000, 10]
        discriminator_dims = [1000, 1]
        stack_ae = True
        update_interval = 100
        update_aae_mu_interval = 10000
        aae_finetune_iteration = 30000
        initialize_iteration = 50000
        finetune_iteration = 100000
        finetune_epoch = 200
        aae_finetune_epoch = 40
        batch_size = 256
        aae_ae_enhance = 1
    elif dataset == "StackOverflow":
        data = StackOverflow()
        data_name = dataset
        encoder_dims = [500, 500, 2000, 20]
        discriminator_dims = [1000, 1]
        w_init = "glorot_uniform"
        stack_ae = False
        update_interval = 500
        aae_finetune_iteration = 5000
        update_aae_mu_interval = 5000
        finetune_epoch = 15
        aae_finetune_epoch = None
        batch_size = 64
        aae_ae_enhance = 1
        finetune_iteration = finetune_epoch * (data.train_y.shape[0] /
                                               batch_size)
    else:
        assert False, "Undefined dataset."
    logger.info("running on data set: {}".format(dataset))

    dec_aae_model = DEC_AAE(
        params={
            "encoder_dims": encoder_dims,
            "n_clusters": data.num_classes,
            "input_dim": data.feature_dim,
            "alpha": 1.0,
            "discriminator_dims": discriminator_dims,
            "learn_rate": learn_rate,
            "w_init": w_init
        })
    if dataset == 'MNIST':
        # learning_rate = tf.train.exponential_decay(learning_rate=0.1,
        #                                            global_step=tf.train.get_or_create_global_step(),
        #                                            decay_steps=20000,
        #                                            decay_rate=0.1,
        #                                            staircase=True)
        # dec_aae_model.dec.ae.optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9).\
        #     minimize(dec_aae_model.dec.ae.loss)
        dec_aae_model.dec.ae.optimizer = tf.train.AdamOptimizer(0.0001). \
                minimize(dec_aae_model.dec.ae.loss)
    elif dataset == "StackOverflow":
        dec_aae_model.dec.ae.optimizer = tf.train.AdamOptimizer(0.001, beta1=0.9, beta2=0.999, epsilon=1e-8).\
            minimize(dec_aae_model.dec.ae.loss)

    ae_saver = tf.train.Saver(var_list=dec_aae_model.ae_vars, max_to_keep=None)
    aae_saver = tf.train.Saver(var_list=dec_aae_model.d_vars +
                               dec_aae_model.ae_vars,
                               max_to_keep=None)
    dec_saver = tf.train.Saver(var_list=dec_aae_model.dec_vars,
                               max_to_keep=None)
    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=None)
    # phase 1: ae parameter initialization
    log_interval = 500
    if pretrained_ae_ckpt_path is None:
        logger.info("pre training auto encoder")
        sae = StackedAutoEncoder(encoder_dims=encoder_dims,
                                 input_dim=data.feature_dim)
        ae_ckpt_path = os.path.join('ae_ckpt',
                                    'model{}.ckpt'.format(data_name))

        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            if stack_ae:
                # initialize sae
                next_ = data.gen_next_batch(batch_size=batch_size,
                                            is_train_set=True,
                                            iteration=initialize_iteration)
                cur_ae_data = data.train_x
                for i, sub_ae in enumerate(sae.layerwise_autoencoders):
                    # train sub_ae
                    for iter_, (batch_x, _, _) in enumerate(next_):
                        _, loss = sess.run([sub_ae.optimizer, sub_ae.loss],
                                           feed_dict={
                                               sub_ae.input_: batch_x,
                                               sub_ae.keep_prob: 0.8
                                           })
                        if iter_ % log_interval == 0:
                            logger.info("[SAE-{}] iter: {}\tloss: {}".format(
                                i, iter_, loss))

                    # assign pretrained sub_ae's weight
                    encoder_w_assign_op, encoder_b_assign_op = dec_aae_model.dec.ae.layers[
                        i].get_assign_ops(sub_ae.layers[0])
                    decoder_w_assign_op, decoder_b_assign_op = dec_aae_model.dec.ae.layers[
                        (i + 1) * -1].get_assign_ops(sub_ae.layers[1])
                    _ = sess.run([
                        encoder_w_assign_op, encoder_b_assign_op,
                        decoder_w_assign_op, decoder_b_assign_op
                    ])

                    # get next sub_ae's input
                    cur_ae_data = sess.run(sub_ae.encoder,
                                           feed_dict={
                                               sub_ae.input_: cur_ae_data,
                                               sub_ae.keep_prob: 1.0
                                           })
                    embedding = Dataset(train_x=cur_ae_data,
                                        train_y=cur_ae_data)
                    next_ = embedding.gen_next_batch(
                        batch_size=batch_size,
                        is_train_set=True,
                        iteration=initialize_iteration)

            # finetune AE
            for iter_, (batch_x, _, _) in enumerate(
                    data.gen_next_batch(
                        batch_size=batch_size,
                        is_train_set=True,
                        # iteration=finetune_iteration,
                        epoch=finetune_epoch)):
                _, loss = sess.run(
                    [
                        dec_aae_model.dec.ae.optimizer,
                        dec_aae_model.dec.ae.loss
                    ],
                    feed_dict={
                        dec_aae_model.dec.ae.input_: batch_x,
                        dec_aae_model.dec.ae.keep_prob: 1.0
                    })
                if iter_ % log_interval == 0:
                    logger.info("[AE-finetune] iter: {}\tloss: {}".format(
                        iter_, loss))
                if iter_ % (10 * log_interval) == 0:
                    xmlr_x = data.train_x[:10000, :]
                    xmlr_id = data.train_y[:10000]
                    z = sess.run(dec_aae_model.z,
                                 feed_dict={
                                     dec_aae_model.input_: xmlr_x,
                                     dec_aae_model.keep_prob: 1.0
                                 })
                    pool_.apply_async(
                        pu.save_scattered_image,
                        (z, xmlr_id,
                         "./results/z_ae_map_{}.jpg".format(iter_)))
                    # pu.save_scattered_image(z, xmlr_id, "./results/z_ae_map_{}.jpg".format(iter_))
            ae_saver.save(sess, ae_ckpt_path)
        pool_.close()  # close the pool: no more tasks can be submitted
        pool_.join()  # wait for all pool workers to finish; must be called after close()
        exit()

    else:
        ae_ckpt_path = pretrained_ae_ckpt_path

    # exit()
    # phase 2: aae parameter initialization
    if pretrained_aae_ckpt_path is None:
        logger.info("pre training adversarial auto encoder")
        aae_ckpt_path = os.path.join('aae_ckpt',
                                     'model{}.ckpt'.format(data_name))
        # aae_ckpt_path = os.path.join('aae_ckpt', 'model.ckpt-100000')
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            ae_saver.restore(sess, ae_ckpt_path)
            # aae_saver.restore(sess, aae_ckpt_path)
            z = sess.run(dec_aae_model.z,
                         feed_dict={
                             dec_aae_model.input_: data.train_x,
                             dec_aae_model.keep_prob: 1.0
                         })
            assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(z)
            _ = sess.run(assign_mu_op)
            mu = sess.run(dec_aae_model.dec.mu)
            total_y = data.train_y
            total_pred = sess.run(dec_aae_model.dec.pred,
                                  feed_dict={
                                      dec_aae_model.input_:
                                      data.train_x,
                                      dec_aae_model.batch_size:
                                      data.train_x.shape[0],
                                      dec_aae_model.keep_prob:
                                      1.0
                                  })
            logger.info("[Total DEC] epoch: {}\tacc: {}".format(
                -1, dec_aae_model.dec.cluster_acc(total_y, total_pred)))

            for iter_, (batch_x, batch_y, batch_idxs) in enumerate(
                    data.gen_next_batch(
                        batch_size=batch_size,
                        is_train_set=True,
                        # iteration=aae_finetune_iteration,
                        epoch=aae_finetune_epoch,
                    )):
                # if iter_ % update_aae_mu_interval == 0 and iter_ != 0:
                #     z = sess.run(dec_aae_model.z,
                #                  feed_dict={dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0})
                #     assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(z)
                #     _ = sess.run(assign_mu_op)
                #     mu = sess.run(dec_aae_model.dec.mu)

                z_sample, z_id_one_hot, z_id_ = \
                    prior.get_sample(prior_type, batch_size, dec_aae_model.z_dim, n_labels=data.num_classes, mu=mu)
                train_dec_feed = {
                    dec_aae_model.input_: batch_x,
                    dec_aae_model.batch_size: batch_x.shape[0],
                    dec_aae_model.keep_prob: 1,
                    dec_aae_model.z_sample: z_sample,
                }

                # if iter_ < 100:
                #     # discriminator loss
                #     _, d_loss = sess.run(
                #         (dec_aae_model.train_op_d, dec_aae_model.D_loss), feed_dict=train_dec_feed)
                #     logger.info("[ADVER] epoch %d:  d_loss %03.2f" % (
                #         iter_, d_loss))
                #     continue
                for _ in range(aae_ae_enhance):
                    # reconstruction loss
                    _, ae_loss = sess.run(
                        (dec_aae_model.train_op_ae, dec_aae_model.ae_loss),
                        feed_dict=train_dec_feed)
                    #
                # discriminator loss
                _, d_loss = sess.run(
                    (dec_aae_model.train_op_d, dec_aae_model.D_loss),
                    feed_dict=train_dec_feed)
                #
                # generator loss
                _, g_loss = sess.run(
                    (dec_aae_model.train_op_g, dec_aae_model.G_loss),
                    feed_dict=train_dec_feed)
                #
                tot_loss = ae_loss + d_loss + g_loss
                #
                if iter_ % 500 == 0:
                    # log the losses every 500 iterations
                    logger.info(
                        "[ADVER] epoch %d: L_tot %03.4f L_likelihood %03.4f d_loss %03.2f g_loss %03.4f"
                        % (iter_, tot_loss, ae_loss, d_loss, g_loss))
                if iter_ % 2500 == 0:
                    # periodically dump a latent-space scatter plot

                    xmlr_x = data.train_x[:10000, :]
                    xmlr_id = data.train_y[:10000]
                    z = sess.run(dec_aae_model.z,
                                 feed_dict={
                                     dec_aae_model.input_: data.train_x,
                                     dec_aae_model.keep_prob: 1.0
                                 })
                    # pu.save_scattered_image(z, xmlr_id, "./results/z_map_{}.jpg".format(iter_))

                    # pred_y = sess.run(dec_aae_model.dec.pred,
                    #              feed_dict={dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0,
                    #                         dec_aae_model.batch_size: data.train_x.shape[0]
                    #                         })
                    # logger.info("[Total DEC] iteration: {}\targ_acc: {}".
                    #             format(iter_, dec_aae_model.dec.cluster_acc(data.train_y, pred_y)))
                    #
                    # kmeans = KMeans(n_clusters=data.num_classes, n_init=20)
                    # pred_y = kmeans.fit_predict(z)
                    # logger.info("[Total DEC] iteration: {}\tkmeans_acc: {}".
                    #             format(iter_, dec_aae_model.dec.cluster_acc(data.train_y, pred_y)))
                    z = z[:10000]
                    pool_.apply_async(
                        pu.save_scattered_image,
                        (z, xmlr_id,
                         "./results/z_aae_map_{}.jpg".format(iter_)))

            aae_saver.save(sess, aae_ckpt_path)

        pool_.close()  # close the pool: no more tasks can be submitted
        pool_.join()  # wait for all pool workers to finish; must be called after close()
        exit()
    else:
        aae_ckpt_path = pretrained_aae_ckpt_path

    # phase 3: parameter optimization
    dec_ckpt_path = os.path.join('dec_ckpt', 'model{}.ckpt'.format(data_name))
    t_ckpt_path = os.path.join('adver_ckpt', 'model{}.ckpt'.format(data_name))
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        retrain = False
        dec_mode = True
        idec_mode = False
        adec_mode = False
        best_score = 0.
        if dec_mode or idec_mode:
            if retrain:
                logger.info("retraining the dec")
                saver.restore(sess, t_ckpt_path)
                bais = 100
            else:
                logger.info("training the dec")
                ae_saver.restore(sess, ae_ckpt_path)
                bais = 0
                # initialize mu
                z = sess.run(dec_aae_model.z,
                             feed_dict={
                                 dec_aae_model.input_: data.train_x,
                                 dec_aae_model.keep_prob: 1.0
                             })
                assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(
                    z)
                _ = sess.run(assign_mu_op)
                # xmlr_x = data.train_x[:10000, :]
                # xmlr_id = data.train_y[:10000]
                # z, xmlr_pred_id = sess.run([dec_aae_model.z, dec_aae_model.dec.pred],
                #                            feed_dict={dec_aae_model.input_: xmlr_x, dec_aae_model.keep_prob: 1.0,
                #                                       dec_aae_model.batch_size: xmlr_x.shape[0]})
                # pool_.apply_async(pu.save_scattered_image,
                #                   (z, xmlr_id, "./results/z_init_map_{}.jpg".format(0 + bais), xmlr_pred_id))
                # pool_.close()  # close the pool: no more tasks can be submitted
                # pool_.join()  # wait for all pool workers to finish; must be called after close()
                # exit()

                total_y = data.train_y
                total_pred = sess.run(dec_aae_model.dec.pred,
                                      feed_dict={
                                          dec_aae_model.input_:
                                          data.train_x,
                                          dec_aae_model.batch_size:
                                          data.train_x.shape[0],
                                          dec_aae_model.keep_prob:
                                          1.0
                                      })
                logger.info("[Total DEC] epoch: {}\tacc: {}".format(
                    -1, dec_aae_model.dec.cluster_acc(total_y, total_pred)))

                # print("sstart")
                # total_y = total_y[:10000]
                # z = z[:10000]
                # from sklearn.manifold import TSNE
                # z = TSNE(n_components=2, learning_rate=100).fit_transform(z)
                # kmeans = KMeans(n_clusters=data.num_classes, n_init=20)
                # pred_y = kmeans.fit_predict(z)
                # print(pu.cluster_acc(total_y, pred_y))
                # exit()

        else:
            if retrain:
                logger.info("retraining the adec")
                bais = 100
                saver.restore(sess, t_ckpt_path)
            else:
                logger.info("training the adec")
                aae_saver.restore(sess, aae_ckpt_path)
                # ae_saver.restore(sess, ae_ckpt_path)
                bais = 0
                # initialize mu
                z = sess.run(dec_aae_model.z,
                             feed_dict={
                                 dec_aae_model.input_: data.train_x,
                                 dec_aae_model.keep_prob: 1.0
                             })
                assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(
                    z)
                _ = sess.run(assign_mu_op)

                total_y = data.train_y
                total_pred = sess.run(dec_aae_model.dec.pred,
                                      feed_dict={
                                          dec_aae_model.input_:
                                          data.train_x,
                                          dec_aae_model.batch_size:
                                          data.train_x.shape[0],
                                          dec_aae_model.keep_prob:
                                          1.0
                                      })
                logger.info("[Total ADEC] epoch: {}\tacc: {}".format(
                    -1, dec_aae_model.dec.cluster_acc(total_y, total_pred)))
                pool_.apply_async(pu.save_scattered_image,
                                  (z[:10000, ], total_y[:10000],
                                   "./results/z_adec_map_{}.jpg".format(-1),
                                   total_pred[:10000]))

        mu = sess.run(dec_aae_model.dec.mu)
        p = None
        for cur_epoch in range(100):
            for iter_, (batch_x, batch_y, batch_idxs) in enumerate(
                    data.gen_next_batch(
                        batch_size=batch_size,
                        is_train_set=True,
                        epoch=1,
                        # iteration=50000
                    )):
                if cur_epoch % 10 == 0 and iter_ == 0:
                    q = sess.run(dec_aae_model.dec.q,
                                 feed_dict={
                                     dec_aae_model.input_:
                                     data.train_x,
                                     dec_aae_model.batch_size:
                                     data.train_x.shape[0],
                                     dec_aae_model.keep_prob:
                                     1.0
                                 })
                    p = dec_aae_model.dec.target_distribution(q)

                # if (iter_+1) % 10000 == 0:
                #     z = sess.run(dec_aae_model.z,
                #                  feed_dict={dec_aae_model.input_: data.train_x, dec_aae_model.keep_prob: 1.0})
                #     assign_mu_op = dec_aae_model.dec.get_assign_cluster_centers_op(z)
                #     _ = sess.run(assign_mu_op)
                #     mu = sess.run(dec_aae_model.dec.mu)

                batch_p = p[batch_idxs]
                train_dec_feed = {
                    dec_aae_model.input_: batch_x,
                    dec_aae_model.batch_size: batch_x.shape[0],
                    dec_aae_model.dec.p: batch_p,
                    dec_aae_model.keep_prob: 1.,
                }

                # ==========================adversarial part ============================
                z_sample, z_id_one_hot, z_id_ = \
                    prior.get_sample(prior_type, batch_size, dec_aae_model.z_dim, n_labels=data.num_classes, mu=mu)
                train_dec_feed.update({
                    dec_aae_model.z_sample: z_sample,
                })
                # ==========================adversarial part ============================

                if dec_mode:
                    # logger.info("DEC mode")
                    _, loss, pred = sess.run([
                        dec_aae_model.train_op_dec, dec_aae_model.dec_loss,
                        dec_aae_model.dec.pred
                    ],
                                             feed_dict=train_dec_feed)
                elif idec_mode:
                    # logger.info("IDEC mode")
                    _, loss, pred = sess.run([
                        dec_aae_model.train_op_idec, dec_aae_model.idec_loss,
                        dec_aae_model.dec.pred
                    ],
                                             feed_dict=train_dec_feed)
                elif adec_mode:
                    # logger.info("ADEC mode")
                    _, loss, pred = sess.run([
                        dec_aae_model.train_op_adec, dec_aae_model.adec_loss,
                        dec_aae_model.dec.pred
                    ],
                                             feed_dict=train_dec_feed)
                    ae_loss, g_loss, d_loss = \
                        sess.run([dec_aae_model.ae_loss, dec_aae_model.G_loss, dec_aae_model.D_loss],
                                 feed_dict=train_dec_feed)
                    tot_loss = ae_loss + g_loss + d_loss
                else:
                    raise ValueError("No such mode!")

                # if iter_ % 100 == 0:
                # logger.info cost every epoch
                # logger.info("[ADVER] epoch %d: L_tot %03.2f L_likelihood %03.2f d_loss %03.2f g_loss %03.2f" % (
                #     cur_epoch, tot_loss, ae_loss, d_loss, g_loss))
                # ==========================adversarial part ============================
                # logger.info("[DEC] epoch: {}\tloss: {}\tacc: {}".format(cur_epoch+bais, loss,
                #                                               dec_aae_model.dec.cluster_acc(batch_y, pred)))
                if iter_ % 2500 == 0:
                    total_y = data.train_y
                    total_pred = sess.run(dec_aae_model.dec.pred,
                                          feed_dict={
                                              dec_aae_model.input_:
                                              data.train_x,
                                              dec_aae_model.batch_size:
                                              data.train_x.shape[0],
                                              dec_aae_model.keep_prob:
                                              1.0
                                          })
                    now_score = pu.cluster_acc(total_y, total_pred)
                    now_nmi = pu.cluster_nmi(total_y, total_pred)
                    if adec_mode:
                        logger.info(
                            "[ADVER] epoch %d: L_tot %03.4f L_likelihood %03.4f d_loss %03.2f g_loss %03.4f"
                            % (cur_epoch, tot_loss, ae_loss, d_loss, g_loss))
                    logger.info(
                        "[Total DEC] iteration: {}\tloss: {}\tacc: {}\tnmi: {}"
                        .format(iter_, loss, now_score, now_nmi))
                    if now_score > best_score:
                        best_score = now_score
                        saver.save(sess, t_ckpt_path)
                if iter_ % 5000 == 0:
                    xmlr_x = data.train_x[:10000, :]
                    xmlr_id = data.train_y[:10000]
                    z, xmlr_pred_id = sess.run(
                        [dec_aae_model.z, dec_aae_model.dec.pred],
                        feed_dict={
                            dec_aae_model.input_: xmlr_x,
                            dec_aae_model.keep_prob: 1.0,
                            dec_aae_model.batch_size: xmlr_x.shape[0]
                        })
                    pool_.apply_async(
                        pu.save_scattered_image,
                        (z, xmlr_id,
                         "./results/z_adec_map_{}.jpg".format(iter_),
                         xmlr_pred_id))

            total_y = data.train_y
            total_pred = sess.run(dec_aae_model.dec.pred,
                                  feed_dict={
                                      dec_aae_model.input_:
                                      data.train_x,
                                      dec_aae_model.batch_size:
                                      data.train_x.shape[0],
                                      dec_aae_model.keep_prob:
                                      1.0
                                  })
            logger.info("[Total DEC] epoch: {}\tloss: {}\tacc: {}".format(
                cur_epoch + bais, loss,
                dec_aae_model.dec.cluster_acc(total_y, total_pred)))
            # dec_saver.save(sess, dec_ckpt_path)

    pool_.close()  # close the pool: no more tasks can be submitted
    pool_.join()  # wait for all pool workers to finish; must be called after close()
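The training routine above keeps a small side pool (pool_ = Pool(4)) only to push scatter-plot dumps off the main process via apply_async, so the TensorFlow session is never blocked on plotting I/O. Below is a minimal, self-contained sketch of that pattern; save_scatter and the file names are placeholders, not the pu.save_scattered_image helper used above.

import os
from multiprocessing.pool import Pool


def save_scatter(step, points, path):
    # placeholder for an expensive plotting/serialization step
    with open(path, 'w') as f:
        f.write('step %d: %d points\n' % (step, len(points)))


if __name__ == '__main__':
    os.makedirs('results', exist_ok=True)
    pool_ = Pool(2)                     # small pool just for background I/O
    for step in range(0, 1000, 250):    # stand-in for the training loop
        points = list(range(step))      # stand-in for the latent codes z
        # fire and forget: training continues while a worker writes the file
        pool_.apply_async(save_scatter, (step, points, 'results/z_%d.txt' % step))
    pool_.close()                       # no more tasks can be submitted
    pool_.join()                        # wait for the pending dumps to finish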
Beispiel #54
0
def peak__partition(v,
                    s1,
                    s2,
                    find_maxima=True,
                    partition_op=None,
                    multiprocessing_process_num=0):
    """
    partition the volume then detect peaks for each partition
    note that this will result in redundant peaks!!
    Clean up must be done afterwards!!
    """
    import aitom.image.vol.partition as IVP

    if multiprocessing_process_num > 0:
        pool = Pool(processes=min(multiprocessing_process_num,
                                  multiprocessing.cpu_count()))
    else:
        pool = None

    if partition_op is None:
        # in this case, just generate a single partition
        siz_max = max(v.shape)
        partition_op = {
            'nonoverlap_width': siz_max * 2,
            'overlap_width': siz_max * 2
        }

    b = IVP.gen_bases(v.shape,
                      nonoverlap_width=partition_op['nonoverlap_width'],
                      overlap_width=partition_op['overlap_width'])
    print('partition num', b.shape)

    ps = []

    if pool is not None:
        pool_re = []
        for i0 in range(b.shape[0]):
            for i1 in range(b.shape[1]):
                for i2 in range(b.shape[2]):
                    bp = N.squeeze(b[i0, i1, i2, :, :])
                    pool_re.append(
                        pool.apply_async(
                            func=peak__partition__single_job,
                            kwds={
                                'v':
                                v[bp[0, 0]:bp[0, 1], bp[1, 0]:bp[1, 1],
                                  bp[2, 0]:bp[2, 1]],
                                's1':
                                s1,
                                's2':
                                s2,
                                'base':
                                bp,
                                'find_maxima':
                                find_maxima,
                                'partition_id': (i0, i1, i2),
                                'save_vg':
                                (partition_op['save_vg']
                                 if 'save_vg' in partition_op else False)
                            }))

        for pool_re_t in pool_re:
            ppsj = pool_re_t.get(9999999)
            ps.extend(ppsj['ps'])
            print('\r', ppsj['partition_id'], '                     ')
            sys.stdout.flush()

        pool.close()
        pool.join()
        del pool

    else:

        for i0 in range(b.shape[0]):
            for i1 in range(b.shape[1]):
                for i2 in range(b.shape[2]):
                    bp = N.squeeze(b[i0, i1, i2, :, :])
                    ppsj = peak__partition__single_job(
                        v=v[bp[0, 0]:bp[0, 1], bp[1, 0]:bp[1, 1],
                            bp[2, 0]:bp[2, 1]],
                        s1=s1,
                        s2=s2,
                        base=bp,
                        find_maxima=find_maxima,
                        partition_id=(i0, i1, i2),
                        save_vg=(partition_op['save_vg']
                                 if 'save_vg' in partition_op else False))
                    ps.extend(ppsj['ps'])
                    print('\r', ppsj['partition_id'], '                     ')
                    sys.stdout.flush()

    # order peaks in ps according to values
    if find_maxima:
        ps = sorted(ps, key=lambda _: (-_['val']))
    else:
        ps = sorted(ps, key=lambda _: _['val'])

    return ps
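As the docstring warns, peaks detected in overlapping partitions are redundant and have to be cleaned up afterwards. One possible clean-up, sketched below, keeps a single entry per location; it assumes every peak dict carries a hashable coordinate under a 'loc' key, which is an assumption and not taken from the code above.

def dedup_peaks(ps):
    # ps is already sorted by value in peak__partition, so keeping the first
    # occurrence of each assumed 'loc' keeps the best-scoring duplicate
    seen = set()
    out = []
    for p in ps:
        loc = tuple(p['loc'])
        if loc in seen:
            continue
        seen.add(loc)
        out.append(p)
    return out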
Beispiel #55
0
import time
import os
from multiprocessing.pool import Pool


def action1(a, b=50):
    for i in range(b):
        print(a, os.getpid(), ' ', i)  # os.getpid(): the pid is, simply put, each process's "ID card"
        time.sleep(0.1)


if __name__ == '__main__':  # this guard is still required, otherwise an exception may occur

    ci = Pool(3)  # create a process pool that holds 3 worker processes
    ci.apply_async(action1, args=('process 1', ))  # start the first child task...
    ci.apply_async(action1, args=('process 2', 50))  # note how different this is from starting a plain Process
    ci.apply_async(action1, args=('process 3', 60))  # remember this basic Pool usage pattern
    # note: the program is now running 4 processes: the three child processes above plus the main process

    ci.close()  # close the pool (tasks already started keep running)
    ci.join()  # wait for all child processes in the pool to finish
    print('For example, this final line of output is printed by the main process itself')
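One thing this toy example hides: apply_async returns an AsyncResult, and an exception raised inside a worker stays invisible until .get() is called on that result. A small sketch of collecting return values (and surfacing worker errors) from the same kind of pool:

from multiprocessing.pool import Pool


def square(x):
    return x * x


if __name__ == '__main__':
    with Pool(3) as pool:
        async_results = [pool.apply_async(square, (i, )) for i in range(5)]
        # .get() blocks until the task finishes and re-raises worker exceptions
        values = [r.get() for r in async_results]
    print(values)  # [0, 1, 4, 9, 16]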
Beispiel #56
0
def _execute_sub_tasks(task_id, params, sig_content, verbosity, runmode,
                       sigmode, monitor_interval, resource_monitor_interval):
    '''If this is a master task, execute as individual tasks'''
    m = ProcessMonitor(
        task_id,
        monitor_interval=monitor_interval,
        resource_monitor_interval=resource_monitor_interval,
        max_walltime=params.sos_dict['_runtime'].get('max_walltime', None),
        max_mem=params.sos_dict['_runtime'].get('max_mem', None),
        max_procs=params.sos_dict['_runtime'].get('max_procs', None),
        sos_dict=params.sos_dict)
    m.start()

    env.logger.info(f'{task_id} ``started``')

    master_out = os.path.join(os.path.expanduser('~'), '.sos', 'tasks',
                              task_id + '.out')
    master_err = os.path.join(os.path.expanduser('~'), '.sos', 'tasks',
                              task_id + '.err')
    # if this is a master task, calling each sub task
    with open(master_out, 'wb') as out, open(master_err, 'wb') as err:

        def copy_out_and_err(result):
            tid = result['task']
            out.write(
                f'{tid}: {"completed" if result["ret_code"] == 0 else "failed"}\n'
                .encode())
            if 'output' in result:
                out.write(f'output: {result["output"]}\n'.encode())
            sub_out = os.path.join(os.path.expanduser('~'), '.sos', 'tasks',
                                   tid + '.out')
            if os.path.isfile(sub_out):
                with open(sub_out, 'rb') as sout:
                    out.write(sout.read())
                try:
                    os.remove(sub_out)
                except Exception as e:
                    env.logger.warning(f'Failed to remove {sub_out}: {e}')

            sub_err = os.path.join(os.path.expanduser('~'), '.sos', 'tasks',
                                   tid + '.err')
            if 'exception' in result:
                err.write(str(result['exception']).encode())
            err.write(
                f'{tid}: {"completed" if result["ret_code"] == 0 else "failed"}\n'
                .encode())
            if os.path.isfile(sub_err):
                with open(sub_err, 'rb') as serr:
                    err.write(serr.read())
                try:
                    os.remove(sub_err)
                except Exception as e:
                    env.logger.warning(f'Failed to remove {sub_err}: {e}')

            # remove other files as well
            try:
                remove_task_files(tid, ['.out', '.err'])
            except Exception as e:
                env.logger.debug(f'Failed to remove files {tid}: {e}')

        if params.num_workers > 1:
            from multiprocessing.pool import Pool
            p = Pool(params.num_workers)
            results = []
            for t in params.task_stack:
                results.append(
                    p.apply_async(_execute_task,
                                  ((*t, {
                                      t[0]: sig_content.get(t[0], {})
                                  }), verbosity, runmode, sigmode, None, None),
                                  callback=copy_out_and_err))
            for idx, r in enumerate(results):
                results[idx] = r.get()
            p.close()
            p.join()
            # we wait for all results to be ready to return or raise
            # but we only raise exception for one of the subtasks
            # for res in results:
            #     if 'exception' in res:
            #         failed = [x.get("task", "")
            #                   for x in results if "exception" in x]
            #         env.logger.error(
            #             f'{task_id} ``failed`` due to failure of subtask{"s" if len(failed) > 1 else ""} {", ".join(failed)}')
            #         return {'ret_code': 1, 'exception': res['exception'], 'task': task_id}
        else:
            results = []
            for tid, tdef in params.task_stack:
                # no monitor process for subtasks
                res = _execute_task((tid, tdef, {
                    tid: sig_content.get(tid, {})
                }),
                                    verbosity=verbosity,
                                    runmode=runmode,
                                    sigmode=sigmode,
                                    monitor_interval=None,
                                    resource_monitor_interval=None)
                try:
                    copy_out_and_err(res)
                except Exception as e:
                    env.logger.warning(
                        f'Failed to copy result of subtask {tid}: {e}')
                results.append(res)
            # for res in results:
            #     if 'exception' in res:
            #         failed = [x.get("task", "")
            #                   for x in results if "exception" in x]
            #         env.logger.error(
            #             f'{task_id} ``failed`` due to failure of subtask{"s" if len(failed) > 1 else ""} {", ".join(failed)}')
            #         return {'ret_code': 1, 'exception': res['exception'], 'task': task_id}
    #
    # now we collect result
    all_res = {
        'ret_code': 0,
        'output': None,
        'subtasks': {},
        'shared': {},
        'skipped': 0,
        'signature': {}
    }
    for tid, x in zip(params.task_stack, results):
        all_res['subtasks'][tid[0]] = x

        if 'exception' in x:
            all_res['exception'] = x['exception']
            all_res['ret_code'] += 1
            continue
        all_res['ret_code'] += x['ret_code']
        if all_res['output'] is None:
            all_res['output'] = x['output']
        else:
            try:
                all_res['output'].extend(x['output'], keep_groups=True)
            except Exception as e:
                env.logger.warning(
                    f"Failed to extend output {all_res['output']} with {x['output']}"
                )
        all_res['shared'].update(x['shared'])
        # does not care if one or all subtasks are executed or skipped.
        all_res['skipped'] += x.get('skipped', 0)
        if 'signature' in x:
            all_res['signature'].update(x['signature'])

    if all_res['ret_code'] != 0:
        if all_res['ret_code'] == len(results):
            env.logger.info(
                f'All {len(results)} tasks in {task_id} ``failed``')
        else:
            env.logger.info(
                f'{all_res["ret_code"]} of {len(results)} tasks in {task_id} ``failed``'
            )
        # if some failed, some skipped, not skipped
        if 'skipped' in all_res:
            all_res.pop('skipped')
    elif all_res['skipped']:
        if all_res['skipped'] == len(results):
            env.logger.info(
                f'All {len(results)} tasks in {task_id} ``ignored`` or skipped'
            )
        else:
            # if only partial skip, we still save signature and result etc
            env.logger.info(
                f'{all_res["skipped"]} of {len(results)} tasks in {task_id} ``ignored`` or skipped'
            )
            all_res.pop('skipped')
    else:
        env.logger.info(f'All {len(results)} tasks in {task_id} ``completed``')
    return all_res
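The master-task branch above relies on apply_async(..., callback=copy_out_and_err): the callback runs in the parent process as each subtask result arrives, so the combined .out/.err files grow incrementally instead of being written only after join(). Below is a stripped-down sketch of that callback pattern; run_subtask and on_done are illustrative names, not part of the code above.

from multiprocessing.pool import Pool


def run_subtask(tid):
    # stand-in for _execute_task: return a small result dict
    return {'task': tid, 'ret_code': 0}


results = []


def on_done(result):
    # called in the parent process, once per finished subtask
    results.append(result)
    print('%s finished with code %d' % (result['task'], result['ret_code']))


if __name__ == '__main__':
    p = Pool(4)
    async_res = [p.apply_async(run_subtask, (t, ), callback=on_done)
                 for t in ('t1', 't2', 't3')]
    p.close()
    p.join()
    # as in the code above, the return values can also be collected explicitly
    final = [r.get() for r in async_res]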
Beispiel #57
0
    # Parameters
    process_num = 24
    image_size = (512, 512)
    url = 'http://v18.proteinatlas.org/images/'
    csv_path =  "../input/HPAv18RBGY_wodpl.csv"
    save_dir = "./external_data"

    # Create the directory to save the images in case it doesn't exist
    try:
        os.makedirs(save_dir)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        pass

    print('Parent process %s.' % os.getpid())
    img_list = pd.read_csv(csv_path)['Id']
    list_len = len(img_list)
    
    p = Pool(process_num)
    for i in range(process_num):
        start = int(i * list_len / process_num)
        end = int((i + 1) * list_len / process_num)
        process_images = img_list[start:end]
        p.apply_async(
            download, args=(str(i), process_images, url, save_dir, image_size)
        )
    print('Waiting for all subprocesses done...')
    p.close()  # no new tasks can be added to the pool after close()
    p.join()   # join() waits for all child processes to finish
    print('All subprocesses done.')
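The downloader above hands each worker one contiguous slice of the image list and submits each slice as a single apply_async call, so dispatch overhead is one task per process rather than one per image. A minimal sketch of that slicing scheme; download_chunk is a placeholder worker, not the download function used above.

from multiprocessing.pool import Pool


def download_chunk(worker_id, items):
    # placeholder: each worker processes its own contiguous slice
    return worker_id, len(items)


if __name__ == '__main__':
    img_list = ['img_%d' % i for i in range(103)]  # dummy ids
    process_num = 4
    list_len = len(img_list)

    p = Pool(process_num)
    results = []
    for i in range(process_num):
        start = int(i * list_len / process_num)
        end = int((i + 1) * list_len / process_num)
        results.append(p.apply_async(download_chunk, (i, img_list[start:end])))
    p.close()
    p.join()
    print([r.get() for r in results])  # e.g. [(0, 25), (1, 26), (2, 26), (3, 26)]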
def pos_type_classify(bamfile,
                      chrom,
                      start,
                      end,
                      is_single,
                      read_length,
                      temp_dir,
                      extension=None,
                      center=True,
                      maxsize=None,
                      process=20,
                      minmapq=0,
                      is_multmapfilter=False):
    print bamfile, chrom, start, end, is_single, read_length, temp_dir, extension, center
    if is_single:
        total_reads_type6_left = [
        ]  # 6. in left place of del and second read is on the breakpoint
        total_reads_type6_right = [
        ]  # 6. in right place of del and first read is on the breakpoint
        total_reads_type7 = []  # 7. reads within the del
        # temp_prefix = "%s/classify_%s" % (temp_dir, sub_num)
        if extension:
            rel_start = start - extension
            rel_end = end + extension
        else:
            rel_start = start
            rel_end = end
        if center:
            reads_type6_left, reads_type6_right, reads_type7, filtered_reads_num = posType_sub_single(
                bamfile, chrom, rel_start, rel_end, start, end, minmapq,
                is_multmapfilter)
        else:
            rel_start_left = rel_start
            rel_end_left = start + maxsize
            rel_start_right = end - maxsize
            rel_end_right = rel_end
            reads_type6_left_1, reads_type6_right_1, reads_type7_1, filtered_reads_num_1 = posType_sub_single(
                bamfile, chrom, rel_start_left, rel_end_left, start, end,
                minmapq, is_multmapfilter)
            reads_type6_left_2, reads_type6_right_2, reads_type7_2, filtered_reads_num_2 = posType_sub_single(
                bamfile, chrom, rel_start_right, rel_end_right, start, end,
                minmapq, is_multmapfilter)
            reads_type6_left = reads_type6_left_1 + reads_type6_left_2
            reads_type6_right = reads_type6_right_1 + reads_type6_right_2
            reads_type7 = reads_type7_1 + reads_type7_2
            filtered_reads_num = filtered_reads_num_1 + filtered_reads_num_2

        total_reads_type6_left.extend(reads_type6_left)
        total_reads_type6_right.extend(reads_type6_right)
        total_reads_type7.extend(reads_type7)
        total_filtered_reads = filtered_reads_num
        print total_reads_type6_left, total_reads_type6_right, total_reads_type7, total_filtered_reads
        return total_reads_type6_left, total_reads_type6_right, total_reads_type7, total_filtered_reads
    else:
        total_reads_type1_left = [
        ]  # 1. in left place of del and second read is on the breakpoint
        total_reads_type1_right = [
        ]  # 1. in right place of del and first read is on the breakpoint
        total_reads_type2_left = [
        ]  # 2. in left place of del and first read is on the breakpoint
        total_reads_type2_right = [
        ]  # 2. in right place of del and second read is on the breakpoint
        total_reads_type3_left = [
        ]  # 3. in left place of del and first read and right read is crossover breakpoint with no intersection
        total_reads_type3_right = [
        ]  # 3. in right place of del and first read and right read is crossover breakpoint with no intersection
        total_reads_type4 = []  # 4. reads within the del
        total_reads_type5_left = [
        ]  # 5. in left place of del and first read and right read are all has intersection
        total_reads_type5_right = [
        ]  # 3. in right place of del and first read and right read are all has intersection
        total_filtered_reads = 0

        length = end - start + 1
        sub_num = length / read_length

        # when start = end, translocation of chromosome
        if start == end:
            rel_start = start - maxsize
            rel_end = end + maxsize
            print rel_start, rel_end
            # temp_prefix = "%s/classify_%s" % (temp_dir, "whole")
            (reads_type1_left, reads_type1_right, reads_type2_left, reads_type2_right, reads_type3_left,
             reads_type3_right, reads_type4, reads_type5_left, reads_type5_right, filtered_reads_num) \
                = posType_sub_paired(bamfile, chrom, rel_start, rel_end, start, end, read_length, minmapq,
                                     is_multmapfilter, extension=extension)
            total_reads_type1_left.extend(reads_type1_left)
            total_reads_type1_right.extend(reads_type1_right)
            total_reads_type2_left.extend(reads_type2_left)
            total_reads_type2_right.extend(reads_type2_right)
            total_reads_type3_left.extend(reads_type3_left)
            total_reads_type3_right.extend(reads_type3_right)
            total_reads_type4.extend(reads_type4)
            total_reads_type5_left.extend(reads_type5_left)
            total_reads_type5_right.extend(reads_type5_right)
            total_filtered_reads = filtered_reads_num

        # end - start < read_length and there is no need to extend its scope
        elif sub_num == 0 and not extension:
            # temp_prefix = "%s/classify_%s" % (temp_dir, sub_num)
            (reads_type1_left, reads_type1_right, reads_type2_left, reads_type2_right, reads_type3_left,
             reads_type3_right, reads_type4, reads_type5_left, reads_type5_right, filtered_reads_num) \
                = posType_sub_paired(bamfile, chrom, start, end, start, end, read_length, minmapq, is_multmapfilter,
                                     extension=extension)
            total_reads_type1_left.extend(reads_type1_left)
            total_reads_type1_right.extend(reads_type1_right)
            total_reads_type2_left.extend(reads_type2_left)
            total_reads_type2_right.extend(reads_type2_right)
            total_reads_type3_left.extend(reads_type3_left)
            total_reads_type3_right.extend(reads_type3_right)
            total_reads_type4.extend(reads_type4)
            total_reads_type5_left.extend(reads_type5_left)
            total_reads_type5_right.extend(reads_type5_right)
            total_filtered_reads = filtered_reads_num

        # there should be more than one process to calculate.
        else:
            run_pool = Pool(process)
            result_list = []
            # extension the range to cover whole reads
            if extension:
                rel_start = start - extension
                rel_end = end + extension
                length = rel_end - rel_start + 1
                sub_num = length / read_length
            else:
                rel_start = start
                rel_end = end
            # if center should be consider or center is no need to consider, but the center size is too less
            if center or (not center and maxsize is not None
                          and length < maxsize * 2):
                for i in range(sub_num):
                    sub_start = i * read_length + rel_start
                    if i == sub_num - 1:
                        sub_end = rel_end
                    else:
                        sub_end = sub_start + 1
                    print "Sub Process: %s" % i, sub_start, sub_end
                    result_list.append(
                        run_pool.apply_async(
                            posType_sub_paired,
                            args=(bamfile, chrom, sub_start, sub_end, start,
                                  end, read_length, minmapq, is_multmapfilter,
                                  extension)))
                run_pool.close()
                run_pool.join()
            # if center is no need to consider
            else:
                rel_start_left = rel_start
                rel_end_left = start + maxsize
                rel_start_right = end - maxsize
                rel_end_right = rel_end
                # print rel_start_left, rel_end_left, rel_start_right, rel_end_right
                length = rel_end_left - rel_start_left + 1
                sub_num = length / read_length
                for i in range(sub_num):
                    sub_start = i * read_length + rel_start_left
                    if i == sub_num - 1:
                        sub_end = rel_end_left
                    else:
                        sub_end = sub_start + 1
                    print "Sub Process: %s" % i, sub_start, sub_end
                    # temp_prefix = "%s/classify_%s" % (temp_dir, i)
                    result_list.append(
                        run_pool.apply_async(
                            posType_sub_paired,
                            args=(bamfile, chrom, sub_start, sub_end, start,
                                  end, read_length, minmapq, is_multmapfilter,
                                  extension)))
                length = rel_end_right - rel_start_right + 1
                sub_num = length / read_length
                for i in range(sub_num):
                    sub_start = i * read_length + rel_start_right
                    if i == sub_num - 1:
                        sub_end = rel_end_right
                    else:
                        sub_end = sub_start + 1
                    print "Sub Process: %s" % i, sub_start, sub_end
                    # temp_prefix = "%s/classify_%s" % (temp_dir, i)
                    result_list.append(
                        run_pool.apply_async(
                            posType_sub_paired,
                            args=(bamfile, chrom, sub_start, sub_end, start,
                                  end, read_length, minmapq, is_multmapfilter,
                                  extension)))
                run_pool.close()
                run_pool.join()

            for res in result_list:
                reads_type1_left, reads_type1_right, reads_type2_left, reads_type2_right, reads_type3_left, reads_type3_right, reads_type4, reads_type5_left, reads_type5_right, filtered_reads_num = res.get(
                )
                total_reads_type1_left.extend(reads_type1_left)
                total_reads_type1_right.extend(reads_type1_right)
                total_reads_type2_left.extend(reads_type2_left)
                total_reads_type2_right.extend(reads_type2_right)
                total_reads_type3_left.extend(reads_type3_left)
                total_reads_type3_right.extend(reads_type3_right)
                total_reads_type4.extend(reads_type4)
                total_reads_type5_left.extend(reads_type5_left)
                total_reads_type5_right.extend(reads_type5_right)
                total_filtered_reads += filtered_reads_num

        print "type1_left: %s; type1_right: %s, type2_left: %s; type2_right: %s, type3_left: %s; " \
              "type3_right: %s, type4: %s; type5_left: %s; type5_right: %s" % (
                  len(total_reads_type1_left), len(total_reads_type1_right), len(total_reads_type2_left),
                  len(total_reads_type2_right), len(total_reads_type3_left), len(total_reads_type3_right),
                  len(total_reads_type4), len(total_reads_type5_left), len(total_reads_type5_right))
        print "total_filtered_reads: %s" % total_filtered_reads
        return total_reads_type1_left, total_reads_type1_right, total_reads_type2_left, total_reads_type2_right, total_reads_type3_left, total_reads_type3_right, total_reads_type4, total_reads_type5_left, total_reads_type5_right, total_filtered_reads
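For longer regions, the function above fans the work out as one window per apply_async call and then merges the per-window read lists via res.get(). Below is a compact sketch of that split-and-merge pattern; classify_window is a placeholder for posType_sub_paired, and the window bounds are simplified relative to the code above.

from multiprocessing.pool import Pool


def classify_window(sub_start, sub_end):
    # placeholder: return the reads classified inside [sub_start, sub_end)
    return ['read@%d' % pos for pos in range(sub_start, sub_end, 50)]


if __name__ == '__main__':
    start, end, read_length = 1000, 2000, 100
    sub_num = (end - start + 1) // read_length

    run_pool = Pool(4)
    result_list = []
    for i in range(sub_num):
        sub_start = start + i * read_length
        sub_end = end if i == sub_num - 1 else sub_start + read_length
        result_list.append(run_pool.apply_async(classify_window,
                                                args=(sub_start, sub_end)))
    run_pool.close()
    run_pool.join()

    total_reads = []
    for res in result_list:
        total_reads.extend(res.get())  # merge the per-window lists
    print(len(total_reads))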
Beispiel #59
0
def aggregate_scores(test_ref_pairs,
                     evaluator=NiftiEvaluator,
                     labels=None,
                     nanmean=True,
                     json_output_file=None,
                     json_name="",
                     json_description="",
                     json_author="Fabian",
                     json_task="",
                     num_threads=2,
                     **metric_kwargs):
    """
    test = predicted image
    :param test_ref_pairs:
    :param evaluator:
    :param labels: must be a dict of int-> str or a list of int
    :param nanmean:
    :param json_output_file:
    :param json_name:
    :param json_description:
    :param json_author:
    :param json_task:
    :param metric_kwargs:
    :return:
    """

    if type(evaluator) == type:
        evaluator = evaluator()

    if labels is not None:
        evaluator.set_labels(labels)

    all_scores = OrderedDict()
    all_scores["all"] = []
    all_scores["mean"] = OrderedDict()

    test = [i[0] for i in test_ref_pairs]
    ref = [i[1] for i in test_ref_pairs]
    p = Pool(num_threads)
    all_res = p.map(
        run_evaluation,
        zip(test, ref, [evaluator] * len(ref), [metric_kwargs] * len(ref)))
    p.close()
    p.join()

    for i in range(len(all_res)):
        all_scores["all"].append(all_res[i])

        # append score list for mean
        for label, score_dict in all_res[i].items():
            if label in ("test", "reference"):
                continue
            if label not in all_scores["mean"]:
                all_scores["mean"][label] = OrderedDict()
            for score, value in score_dict.items():
                if score not in all_scores["mean"][label]:
                    all_scores["mean"][label][score] = []
                all_scores["mean"][label][score].append(value)

    for label in all_scores["mean"]:
        for score in all_scores["mean"][label]:
            if nanmean:
                all_scores["mean"][label][score] = float(
                    np.nanmean(all_scores["mean"][label][score]))
            else:
                all_scores["mean"][label][score] = float(
                    np.mean(all_scores["mean"][label][score]))

    # save to file if desired
    # we create a hopefully unique id by hashing the entire output dictionary
    if json_output_file is not None:
        json_dict = OrderedDict()
        json_dict["name"] = json_name
        json_dict["description"] = json_description
        timestamp = datetime.today()
        json_dict["timestamp"] = str(timestamp)
        json_dict["task"] = json_task
        json_dict["author"] = json_author
        json_dict["results"] = all_scores
        json_dict["id"] = hashlib.md5(
            json.dumps(json_dict).encode("utf-8")).hexdigest()[:12]
        save_json(json_dict, json_output_file)

    return all_scores
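aggregate_scores ships several arguments per case by zipping them into tuples and letting run_evaluation unpack the tuple itself, because Pool.map passes exactly one argument per call. On Python 3 the same thing can also be written with Pool.starmap, which unpacks for you; a minimal sketch, with evaluate_case as a stand-in for run_evaluation:

from multiprocessing.pool import Pool


def evaluate_case(args):
    test, ref, weight = args  # Pool.map delivers one tuple per call
    return weight * abs(test - ref)


def evaluate_case_star(test, ref, weight):
    return weight * abs(test - ref)


if __name__ == '__main__':
    tests, refs = [1.0, 2.0, 3.0], [1.5, 1.0, 3.0]
    weights = [1.0] * len(refs)

    with Pool(2) as p:
        a = p.map(evaluate_case, zip(tests, refs, weights))
        b = p.starmap(evaluate_case_star, zip(tests, refs, weights))
    print(a == b)  # True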
Beispiel #60
0
    def run(self,
            dimension,
            stage_idx,
            prev_stage_value=0,
            num_population=100,
            num_generations=100,
            elite_ratio=0.05,
            parents_ratio=0.15,
            ratio_decay=1,
            num_finetune=1,
            best_sol_1st=None):

        num_generations = num_generations
        num_population = num_population
        num_elite = int(num_population * elite_ratio)
        pool = Pool(min(num_population + num_elite, cpu_count()))
        best_reward_list = []
        best_reward = [-float("Inf") for _ in range(len(self.fitness))]
        best_sol = None
        population = [
            self.create_genome_fixedSL() for _ in range(num_population)
        ] if ((stage_idx == 0) or (best_sol_1st is None)) else [
            best_sol_1st for _ in range(num_population)
        ]
        fitness = np.ones((num_population, len(self.fitness)), float)
        num_parents = num_population
        for g in range(num_generations):
            finetine_iter = 1 if g < num_generations // 2 else num_finetune
            for f in range(finetine_iter):
                is_finetune = f > 0
                gen_best = -float("Inf")
                gen_best_idx = 0
                count_non_valid = 0
                if num_parents < 1:  # restart
                    population = [
                        self.create_genome_fixedSL()
                        for _ in range(num_population)
                    ] if ((stage_idx == 0) or (best_sol_1st is None)) else [
                        best_sol_1st for _ in range(num_population)
                    ]
                    fitness = np.ones((num_population, len(self.fitness)),
                                      float)
                    print("Reinitialize population")
                    num_parents = num_population
                population, fitness, parents = self.select_parents(
                    population,
                    fitness,
                    num_parents,
                    num_population,
                    stage_idx,
                    first_stage_value=prev_stage_value)
                elite = copy.deepcopy(parents[:num_elite])
                elite_fitness = copy.deepcopy(fitness[:(len(elite))])
                if is_finetune:
                    self.mutate_tile(population,
                                     num_mu_loc=3,
                                     range_alpha=0.1,
                                     alpha=0.52,
                                     is_finetune=True)
                else:
                    self.crossover_tile(parents, population, alpha=0.57)
                    self.mutate_tile(population,
                                     num_mu_loc=3,
                                     range_alpha=0.53,
                                     alpha=0.52,
                                     is_finetune=False)
                    self.swap_order(population, alpha=0.47)
                    self.born_cluster(population, alpha=0.57)
                    self.kill_cluster(population, alpha=0.27)

                population = elite + population
                fitness = np.concatenate((elite_fitness, fitness))
                reward_list = pool.map(self.thread_fun, population)
                for i in range(len(population)):
                    reward = reward_list[i]
                    if reward is None or any(np.array(reward) >= 0):
                        reward = [
                            float("-Inf") for _ in range(len(best_reward))
                        ]
                        count_non_valid += 1
                    elif stage_idx > 0:
                        if any([
                                reward[kk] < prev_stage_value[kk]
                                for kk in range(len(prev_stage_value))
                        ]):
                            reward = [
                                float("-Inf") for _ in range(len(best_reward))
                            ]
                            count_non_valid += 1
                    judging_reward = reward[stage_idx]
                    fitness[i] = reward
                    if gen_best < judging_reward:
                        gen_best = judging_reward
                        gen_best_idx = i
                judging_best_reward = best_reward[stage_idx]
                if judging_best_reward < gen_best:
                    best_reward = copy.deepcopy(fitness[gen_best_idx])
                    best_sol = copy.deepcopy(population[gen_best_idx])

                num_parents = int(num_population * parents_ratio)
                num_parents = min(num_parents,
                                  len(population) - count_non_valid)
                parents_ratio *= ratio_decay
                best_reward_list.append(best_reward)
                chkpt = {
                    "best_reward": best_reward,
                    "best_reward_list": best_reward_list,
                    "best_sol": best_sol,
                    "num_population": num_population,
                    "num_generations": num_generations,
                    "fitness_use": self.fitness,
                    "num_pe": self.num_pe,
                    "l1_size": self.l1_size,
                    "l2_size": self.l2_size,
                    "NocBW": self.NocBW,
                    "dimension": dimension
                }
                if self.log_level == 2:
                    print(
                        "[Stage {}]Gen {}: Gen reward: {:3e}, 1st stage Reward: {}, Best reward: {}, Non_valid: {}"
                        .format(stage_idx + 1, (g + 1), gen_best,
                                np.abs(prev_stage_value), np.abs(best_reward),
                                count_non_valid))
                elif self.log_level == 1:
                    if stage_idx == 0:
                        print("[Stage {}]Gen {}: Best reward: {}".format(
                            stage_idx + 1, (g + 1),
                            np.abs(best_reward)[0]))
                    else:
                        print(
                            "[Stage {}]Gen {}:  1st stage Reward: {}, Best reward: {}"
                            .format(stage_idx + 1, (g + 1),
                                    np.abs(prev_stage_value),
                                    np.abs(best_reward)))
        pool.close()
        return chkpt
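The search loop above creates its Pool once, before the generation loop, and then calls pool.map(self.thread_fun, population) in every generation; reusing one pool avoids paying worker start-up cost on each of the many map calls. A minimal sketch of that reuse pattern, with fitness_fn standing in for thread_fun:

import random
from multiprocessing.pool import Pool


def fitness_fn(genome):
    # placeholder fitness: higher is better
    return -sum((g - 0.5) ** 2 for g in genome)


if __name__ == '__main__':
    population = [[random.random() for _ in range(8)] for _ in range(32)]

    pool = Pool(4)  # created once, outside the generation loop
    for generation in range(10):
        scores = pool.map(fitness_fn, population)     # pool reused every generation
        ranked = sorted(zip(scores, population), key=lambda s: -s[0])
        population = [g for _, g in ranked[:16]] * 2  # crude selection step
    pool.close()
    pool.join()
    print(max(scores))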