def _stop_requested(stopiter):
    """Return the current stop flag as a bool.

    Accepts either a plain truthy/falsy value or an object exposing ``get()``
    (the producer in this file calls ``stopiter.set(True)``, so it is
    presumably a ``dask.distributed.Variable`` -- confirm). Comparing such an
    object to ``False`` with ``==`` never succeeds, hence the explicit read.
    """
    getter = getattr(stopiter, 'get', None)
    if callable(getter):
        return bool(getter())
    return bool(stopiter)


def calculate_small_parsimony(inq, outq, stopiter, treefile, matfile,
                              bootstrap_replicates, row_index, iolock,
                              verbose=False):
    """Worker loop: run small parsimony on codons pulled from ``inq``.

    Each work item is a ``(codon, positions)`` pair where ``positions`` holds
    one ``(column, char)`` tuple per codon position; ``char`` is ``None`` for
    columns that must be read from the alignment and a fixed character for
    columns that do not vary. For every bootstrap replicate the leaves of the
    tree are initialised, ``process_node_smallpars_1`` / ``_2`` are run from
    the seed node, and one result tuple
    ``(codon, eventdict, AAeventindex, AAeventypes)`` is put on ``outq``.

    Relies on module-level names not defined in this function:
    ``allowed_symbols``, ``bootstrap``, ``process_node_smallpars_1``,
    ``process_node_smallpars_2``, ``Queue``, ``h5py``, ``dendropy``, ``np``.

    :param inq: queue of work items; if ``None`` the queue named ``'inq'`` is used.
    :param outq: queue for results; if ``None`` the queue named ``'outq'`` is used.
    :param stopiter: stop flag (bool or object with ``get()``); the loop ends
        once it is true and ``inq`` is empty.
    :param treefile: path to a newick tree.
    :param matfile: path to an HDF5 file containing the dataset ``'MSA2array'``.
    :param bootstrap_replicates: number of replicates computed per codon.
    :param row_index: mapping from taxon label to row of the alignment array.
    :param iolock: lock held while printing diagnostics.
    :param verbose: print a message for leaves missing from ``row_index``.
    """
    # Honour the queues handed in by the caller; only fall back to the
    # well-known names when none were supplied.
    if inq is None:
        inq = Queue('inq')
    if outq is None:
        outq = Queue('outq')

    # The worker only reads the alignment, so open it read-only.
    with h5py.File(matfile, 'r') as hf:
        align_array = hf['MSA2array']
        n_genomes = align_array.shape[0]
        missing = 0
        sys.setrecursionlimit(10 ** 8)
        t = dendropy.Tree.get(path=treefile, schema='newick')

        # init the blank tree
        for i, n in enumerate(t.nodes()):
            n.matrow = i
            n.symbols = None
            n.scores = None
            n.event = None
            n.char = None
            n.eventype = None
            n.AAevent = 0

        # Leaves carry per-position dicts; resolve each leaf's alignment row
        # once instead of re-deriving the label for every position.
        leaf_rows = []
        for l in t.leaf_nodes():
            l.event = {}
            l.scores = {}
            l.symbols = {}
            l.char = {}
            l.calc = {}
            label = str(l.taxon).replace("'", '')
            row = row_index[label] if label in row_index else None
            leaf_rows.append((l, row))

        while not _stop_requested(stopiter) or inq.qsize() > 0:
            codon, positions = inq.get()
            for _ in range(bootstrap_replicates):
                # select portion of random genomes to take out
                if bootstrap_replicates > 1:
                    del_genomes = set(np.random.randint(
                        n_genomes, size=int(n_genomes * bootstrap)))
                else:
                    del_genomes = set()

                # assign leaf values for each codon position
                for pos, (column, fixed_char) in enumerate(positions):
                    for l, row in leaf_rows:
                        l.event[pos] = 0
                        l.scores[pos] = {c: 10 ** 10 for c in allowed_symbols}
                        if fixed_char is not None:
                            # column has no events: every leaf gets the
                            # character shipped with the work item
                            l.calc[pos] = False
                            if fixed_char.upper() in allowed_symbols:
                                l.symbols[pos] = {fixed_char}
                                l.scores[pos][fixed_char] = 0
                            else:
                                # ambiguous leaf
                                l.symbols[pos] = allowed_symbols
                        else:
                            # setup for small_pars1
                            l.calc[pos] = True
                            if row is None:
                                missing += 1
                                l.symbols[pos] = allowed_symbols
                                if verbose:
                                    iolock.acquire()
                                    try:
                                        print('err ! \nalncol: ', l.taxon, column)
                                    finally:
                                        iolock.release()
                            elif row in del_genomes:
                                # genome masked for this replicate: treat the
                                # leaf as ambiguous (del_genomes holds row
                                # indices, so test the row, not the column)
                                l.symbols[pos] = allowed_symbols
                            else:
                                char = align_array[row, column]
                                if char.upper() in allowed_symbols:
                                    l.symbols[pos] = {char}
                                    l.scores[pos][char] = 0
                                else:
                                    # ambiguous leaf
                                    l.symbols[pos] = allowed_symbols
                        l.char[pos] = min(l.scores[pos], key=l.scores[pos].get)
                # done tree init

                # up
                process_node_smallpars_1(t.seed_node)
                # down
                process_node_smallpars_2(t.seed_node)

                # collect events
                nodes = t.nodes()
                eventdict = {}
                for pos in [0, 1, 2]:
                    changed = [n for n in nodes if n.event[pos] > 0]
                    eventdict[pos] = {
                        'type': [n.eventype[pos] for n in changed],
                        'index': [n.matrow for n in changed],
                    }
                aa_nodes = [n for n in nodes if n.AAevent]
                AAeventindex = [n.matrow for n in aa_nodes]
                AAeventypes = [n.AAevent for n in aa_nodes]

                # Queue.put takes a single value, so ship the result as one
                # tuple. NOTE(review): keyed by the codon start column --
                # confirm this is what the consumer of outq expects.
                outq.put((codon, eventdict, AAeventindex, AAeventypes))
class ClusterDaskDistributor(DistributorBaseClass):
    """Distributor backed by a dask cluster.

    Calculations are submitted through a dask-scheduler, so the work is
    spread over the workers of that cluster.

    :param str address: The `address` of dask-scheduler.
        eg. `tcp://127.0.0.1:8786`.
    """

    def __init__(self, address):
        """Create a distributor that talks to the given dask-scheduler.

        :param address: the ip address and port number of the dask-scheduler.
        :type address: str
        """
        self.address = address
        # (pid, future) pairs of tasks that have been submitted but whose
        # results have not yet been moved to the result queue.
        self.future_set = set()
        self._queue_lock = Lock()

    def get_client(self):
        """Connect a Client to the dask-scheduler at ``self.address``.

        Also records the worker count in `self.n_workers` and creates two
        queues: `self.process_queue` (ids of running tasks, bounded by the
        worker count) and `self.result_queue` (finished results).

        :return: the new client, the primary entry point for users of
            dask.distributed.
        :rtype: distributed.Client
        """
        from dask.distributed import Client, Queue

        client = Client(address=self.address)
        worker_info = client.scheduler_info()["workers"]
        self.n_workers = len(worker_info)
        self.process_queue = Queue(client=client, maxsize=self.n_workers)
        self.result_queue = Queue(client=client)
        return client

    def get_worker_count(self):
        """Return the number of workers recorded by :meth:`get_client`.

        :return: the worker count of current Client in dask-scheduler.
        :rtype: int
        """
        return self.n_workers

    def update_queues(self):
        """Move every finished task from the pending set to the result queue."""
        with self._queue_lock:
            completed = set()
            for entry in self.future_set:
                pid, future = entry
                if not future.done():
                    continue
                self.result_queue.put((pid, future.result()))
                # Free one slot in the bounded process queue.
                self.process_queue.get()
                completed.add(entry)
            # Drop finished entries only after iteration is over.
            self.future_set.difference_update(completed)

    def result_queue_empty(self):
        """Refresh the queues, then report whether no result is waiting.

        :return: if the result queue is empty.
        :rtype: bool
        """
        self.update_queues()
        return not self.result_queue.qsize()

    def result_queue_get(self):
        """Refresh the queues, then pop one (pid, result) pair if available.

        :return: first (pid, result) pair in result queue, or
            ``(None, None)`` when the queue is empty.
        :rtype: (str or int or None, a user-defined result or None)
        """
        self.update_queues()
        if not self.result_queue.qsize():
            return None, None
        task_id, outcome = self.result_queue.get()
        return task_id, outcome

    def process_queue_full(self):
        """Refresh the queues, then check whether every worker slot is taken.

        :return: True if the process queue holds `n_workers` entries,
            otherwise False.
        :rtype: bool
        """
        self.update_queues()
        running = self.process_queue.qsize()
        return running == self.n_workers

    def process_queue_empty(self):
        """Refresh the queues, then check whether no task is running.

        :return: True if the process queue is empty, otherwise False.
        :rtype: bool
        """
        self.update_queues()
        return not self.process_queue.qsize()

    def distribute(self, client, pid, func, kwargs):
        """Submit one calculation task to the cluster.

        The task is handed to `client` and tracked under `pid` until
        :meth:`update_queues` finds it finished.

        :param distributed.Client client: the target `client` to run this task.
        :param pid: unique `pid` describing this task.
        :type pid: str or int (defined by user).
        :param func: A serializable function or callable object (one with a
            `__call__` method) that performs the calculation.
        :type func: function or object.
        :param dict kwargs: Keyword parameters passed to `func`.
        """
        pending = (pid, client.submit(func, **kwargs))
        self.future_set.add(pending)
        self.process_queue.put(pid)

    def close(self, client):
        """Close the connection to the Dask Scheduler.

        :param distributed.Client client: the target `client` to close.
        """
        client.close()

    def join(self):
        """Wait for every task in the process queue to finish."""
        while True:
            if self.process_queue_empty():
                break
            time.sleep(0.1)
# Producer: walk every annotated codon, push it onto ``inq`` and lazily start
# the parsimony workers and the saver once enough work has been queued.

def _start_parsimony_workers():
    """Submit NCORE parsimony workers and return their futures."""
    # pure=False: the calls are identical, and with the default pure=True dask
    # would give them all the same key and run a single task.
    # NOTE(review): calculate_small_parsimony as visible in this file also
    # takes ``matfile`` and ``bootstrap_replicates``, which are not passed
    # here -- confirm the intended call.
    return [
        client.submit(calculate_small_parsimony, inq=inq, outq=outq,
                      stopiter=stopiter, treefile=treefile,
                      row_index=remote_index, iolock=lock, verbose=False,
                      pure=False)
        for _ in range(NCORE)
    ]


def _start_saver():
    """Submit the result collector and return its future."""
    print('starting saver')
    # NOTE(review): ``queue`` is not defined in the visible code; presumably
    # the output queue -- confirm.
    return client.submit(collect_futures, queue=queue, stopiter=stopiter,
                         runName=runName, nucleotides_only=False)


# Keep references to the futures: dask may release tasks whose futures have
# been garbage collected.
worker_futures = []
saver_future = None

for n in range(bootstrap_replicates):
    # select portion of random genomes to take out
    if bootstrap_replicates > 1:
        del_genomes = np.random.randint(
            align_array.shape[0], size=int(align_array.shape[0] * bootstrap))
    for annot_index, annot_row in annotation.iterrows():
        # indexing starts at 1 for blast
        # switch to sending the coordinates and masking for the matrix
        for j, codon in enumerate(range(annot_row.qstart - 1, annot_row.qend - 1, 3)):
            positions = []
            for col in [codon, codon + 1, codon + 2]:
                if col in informativesites:
                    positions.append((col, None))
                else:
                    # just add the alignment character if it doesnt change.
                    positions.append((col, align_array[0, col]))
            # submit codons
            inq.put((codon, positions))
            if not workers_started and inq.qsize() > start_worker_trigger:
                # start workers exactly once
                worker_futures = _start_parsimony_workers()
                workers_started = True
            if not saver_started and inq.qsize() > future_clean_trigger:
                saver_future = _start_saver()
                saver_started = True

# Small inputs may never cross the triggers; make sure the queued work is
# still consumed before signalling the end of iteration.
if not workers_started:
    worker_futures = _start_parsimony_workers()
    workers_started = True
if not saver_started:
    saver_future = _start_saver()
    saver_started = True

stopiter.set(True)
print('done iterating')