def calculate_small_parsimony(inq, outq, stopiter, treefile, matfile, bootstrap_replicates, row_index, iolock, verbose=False):
    """Worker loop that runs small parsimony on codons taken from a queue.

    The alignment matrix (dataset ``MSA2array`` of the HDF5 file ``matfile``)
    and the newick tree in ``treefile`` are opened once. The worker then keeps
    taking ``(codon, positions)`` items from the input queue. For every
    bootstrap replicate it initialises the per-leaf state of each position,
    runs ``process_node_smallpars_1`` and then ``process_node_smallpars_2``
    from the seed node, and puts the collected events on the output queue.

    Each entry of ``positions`` is indexed as ``(column, char)``: when ``char``
    is not ``None`` it is used for every leaf; when it is ``None`` the
    character is read per leaf from the alignment matrix at ``column``.

    :param inq: input queue. NOTE(review): rebound to ``Queue('inq')`` below,
        so the object passed in is not used.
    :param outq: output queue. NOTE(review): rebound to ``Queue('outq')``
        below, so the object passed in is not used.
    :param stopiter: stop flag tested in the loop condition.
    :param treefile: path of the newick tree read with dendropy.
    :param matfile: path of the HDF5 file holding ``MSA2array``.
    :param bootstrap_replicates: number of replicates computed per codon.
    :param row_index: mapping from taxon label (single quotes removed) to the
        row of that taxon in the alignment matrix.
    :param iolock: lock held while printing diagnostics.
    :param verbose: print a message for each leaf missing from ``row_index``.
    """
    # NOTE(review): kept from the original -- the queue arguments are replaced
    # by handles looked up by name (presumably dask.distributed.Queue; confirm
    # that the producer uses the same names).
    inq = Queue('inq')
    outq = Queue('outq')
    # setup the tree and matrix for each worker; the matrix is only read here,
    # so the file is opened read-only explicitly.
    with h5py.File(matfile, 'r') as hf:
        align_array = hf['MSA2array']
        missing = 0
        sys.setrecursionlimit(10 ** 8)
        t = dendropy.Tree.get(path=treefile, schema='newick')

        # init the blank tree
        for i, n in enumerate(t.nodes()):
            n.matrow = i
            n.symbols = None
            n.scores = None
            n.event = None
            n.char = None
            n.eventype = None
            n.AAevent = 0

        # The per-leaf dictionaries are created once and refilled for every
        # codon, so the leaf list and the taxon labels are computed once too.
        leaves = t.leaf_nodes()
        for leaf in leaves:
            leaf.event = {}
            leaf.scores = {}
            leaf.symbols = {}
            leaf.char = {}
            leaf.calc = {}
        labels = [str(leaf.taxon).replace("'", '') for leaf in leaves]

        # NOTE(review): the comparison is kept as in the original. If stopiter
        # is a shared flag object (the producer elsewhere calls
        # stopiter.set(True)), ``== False`` never reflects its value and only
        # the queue size keeps the loop alive -- confirm the shutdown protocol.
        while stopiter == False or inq.qsize() > 0:  # noqa: E712
            # Fix: the original unpacked into ``pos`` and then reused ``pos``
            # as a loop variable, destroying the list after one replicate.
            codon, positions = inq.get()

            # repeat here for bootstrap
            for _ in range(bootstrap_replicates):
                # select portion of random genomes to take out
                if bootstrap_replicates > 1:
                    # ``bootstrap`` is not defined in this function; it is
                    # expected to come from module scope.
                    del_genomes = set(np.random.randint(
                        align_array.shape[0],
                        size=int(align_array.shape[0] * bootstrap)))
                else:
                    del_genomes = set()

                for pos, col in enumerate(positions):
                    column, known_char = col[0], col[1]
                    for leaf, label in zip(leaves, labels):
                        leaf.event[pos] = 0
                        leaf.scores[pos] = {c: 10 ** 10 for c in allowed_symbols}

                        # Fix: the original tested ``type(col[1]) is not None``,
                        # which is always true.
                        if known_char is not None:
                            # column has no events: same character everywhere
                            leaf.calc[pos] = False
                            if known_char.upper() in allowed_symbols:
                                leaf.symbols[pos] = {known_char}
                                leaf.scores[pos][known_char] = 0
                            else:
                                # ambiguous leaf
                                leaf.symbols[pos] = allowed_symbols
                            # NOTE(review): as in the original, leaf.char[pos]
                            # is not assigned on this branch.
                            continue

                        # setup for small_pars1
                        leaf.calc[pos] = True
                        if label in row_index:
                            char = align_array[row_index[label], column]
                            if char.upper() in allowed_symbols:
                                leaf.symbols[pos] = {char}
                                leaf.scores[pos][char] = 0
                            elif column in del_genomes:
                                # NOTE(review): same outcome as the branch
                                # below, and it tests a column index against
                                # values drawn from the row range, so the
                                # bootstrap selection currently has no effect.
                                leaf.symbols[pos] = allowed_symbols
                            else:
                                # ambiguous leaf
                                leaf.symbols[pos] = allowed_symbols
                        else:
                            missing += 1
                            leaf.symbols[pos] = allowed_symbols
                            if verbose:
                                # Release the lock even if printing fails.
                                iolock.acquire()
                                try:
                                    # Fix: the original printed the undefined
                                    # name ``aln_column``.
                                    print('err ! alncol: ', leaf.taxon, column)
                                finally:
                                    iolock.release()
                        leaf.char[pos] = min(leaf.scores[pos], key=leaf.scores[pos].get)
                # done tree init
                # up
                process_node_smallpars_1(t.seed_node)
                # down
                process_node_smallpars_2(t.seed_node)
                # collect events
                nodes = t.nodes()
                eventdict = {}
                for pos in [0, 1, 2]:
                    hits = [n for n in nodes if n.event[pos] > 0]
                    eventdict[pos] = {
                        'type': [n.eventype[pos] for n in hits],
                        'index': [n.matrow for n in hits],
                    }
                aa_hits = [n for n in nodes if n.AAevent]
                AAeventindex = [n.matrow for n in aa_hits]
                AAeventypes = [n.AAevent for n in aa_hits]
                # Fix: put() takes a single item, so the result is sent as one
                # tuple instead of four positional arguments.
                # NOTE(review): the first element is the last position tuple
                # (``col``), as in the original; ``codon`` may have been meant.
                outq.put((col, eventdict, AAeventindex, AAeventypes))
# Example #2
# 0
class ClusterDaskDistributor(DistributorBaseClass):
    """Distributor that runs calculations on a dask cluster.

    Tasks are submitted through a client connected to a dask-scheduler, so
    the calculation is spread over the cluster.

    :param str address: The `address` of the dask-scheduler,
        e.g. `tcp://127.0.0.1:8786`.

    """

    def __init__(self, address):
        """Store the scheduler address and create the local bookkeeping.

        :param address: the ip address and port number of the dask-scheduler.
        :type address: str
        """
        self.address = address
        # (pid, future) pairs of submitted tasks that are not yet collected.
        self.future_set = set()
        # Held while update_queues() runs.
        self._queue_lock = Lock()

    def get_client(self):
        """Create a Client connected to the dask-scheduler at `self.address`.

        As a side effect this sets the worker count `self.n_workers` and
        creates two queues: `self.process_queue` (bounded by the worker
        count) for running tasks and `self.result_queue` for results.

        :return: the new client, the primary entry point for users of
             dask.distributed.
        :rtype: distributed.Client

        """
        from dask.distributed import Client, Queue
        client = Client(address=self.address)
        workers = client.scheduler_info()["workers"]
        self.n_workers = len(workers)
        self.process_queue = Queue(client=client, maxsize=self.n_workers)
        self.result_queue = Queue(client=client)
        return client

    def get_worker_count(self):
        """Return the worker count recorded by `get_client`.

        :return: the worker count of the current Client in dask-scheduler.
        :rtype: int

        """
        return self.n_workers

    def update_queues(self):
        """Move the results of finished futures into the result queue.

        For every finished future, its `(pid, result)` pair is put on the
        result queue and one entry is taken off the process queue. The
        finished futures are dropped from `future_set` afterwards.
        """
        with self._queue_lock:
            completed = set()
            for entry in self.future_set:
                pid, future = entry[0], entry[1]
                if not future.done():
                    continue
                self.result_queue.put((pid, future.result()))
                self.process_queue.get()
                completed.add(entry)
            # Removal happens after the scan so the set is not modified
            # while it is being iterated.
            self.future_set.difference_update(completed)

    def result_queue_empty(self):
        """Refresh the queues and report whether the result queue is empty.

        :return: True if the result queue is empty.
        :rtype: bool

        """
        self.update_queues()
        waiting = self.result_queue.qsize()
        return waiting == 0

    def result_queue_get(self):
        """Take one (pid, result) pair from the result queue, if any.

        :return: the first (pid, result) pair in the result queue, or
            (None, None) when the queue is empty.
        :rtype: (str or int or None, a user-defined result or None)

        """
        self.update_queues()
        if self.result_queue.qsize() == 0:
            return None, None
        pid, result = self.result_queue.get()
        return pid, result

    def process_queue_full(self):
        """Refresh the queues and report whether the process queue is full.

        :return: True if the process queue holds as many entries as there
            are workers, otherwise False.
        :rtype: bool

        """
        self.update_queues()
        running = self.process_queue.qsize()
        return running == self.n_workers

    def process_queue_empty(self):
        """Refresh the queues and report whether the process queue is empty.

        :return: True if the process queue is empty, otherwise False.
        :rtype: bool

        """
        self.update_queues()
        running = self.process_queue.qsize()
        return running == 0

    def distribute(self, client, pid, func, kwargs):
        """Submit a calculation task to the cluster.

        The task is handed to `client.submit` and tracked locally: the
        `(pid, future)` pair is added to `future_set` and `pid` is put on
        the process queue.

        :param distributed.Client client: the target `client` to run this task.
        :param pid: unique, user-defined `pid` describing this task.
        :type pid: str or int (defined by user).
        :param func: A serializable function or callable object (one with a
            `__call__` method) that performs the calculation.
        :type func: function or object.
        :param dict kwargs: Keyword parameters passed to `func`.

        """
        entry = (pid, client.submit(func, **kwargs))
        self.future_set.add(entry)
        self.process_queue.put(pid)

    def close(self, client):
        """Close the given client's connection.

        :param distributed.Client client: the target `client` to close.

        """
        client.close()

    def join(self):
        """Block until the process queue is empty, polling every 0.1 s."""
        while True:
            if self.process_queue_empty():
                return
            time.sleep(0.1)
    # Producer loop (fragment: the enclosing definition starts outside the
    # visible part of the file). For each bootstrap replicate it walks the
    # annotation rows, steps through each annotated range three columns at a
    # time and queues one (codon, positions) item per codon on `inq`.
    for n in range(bootstrap_replicates):
        #select portion of random genomes to take out
        # NOTE(review): del_genomes is only assigned when
        # bootstrap_replicates > 1 and is not read in the visible lines.
        if bootstrap_replicates >1:
            del_genomes = np.random.randint( align_array.shape[0], size= int(align_array.shape[0]*bootstrap) )


        for annot_index,annot_row in annotation.iterrows():
            #indexing starts at 1 for blast
            #####switch to sending the coordinates and masking for the matrix
            for j,codon in enumerate(range(annot_row.qstart-1, annot_row.qend-1 , 3 )):
                # One (column, char) pair per codon position: char is None for
                # columns listed in informativesites, otherwise the character
                # found in row 0 of the alignment for that column.
                positions = []
                for col in [codon, codon+1 , codon+2]:
                    if col in informativesites:
                        positions.append( (col, None) )
                    else:
                        #just add the alignment character if it doesnt change.
                        positions.append( (col, align_array[0,col] ) )
                #submit codons
                inq.put( (codon, positions)  )
                # NOTE(review): workers_started is not set to True in the
                # visible lines (unlike saver_started below); unless it is
                # updated elsewhere, NCORE more workers are submitted on every
                # iteration that passes the threshold.
                if workers_started == False and  inq.qsize() > start_worker_trigger:
                    #start workers
                    for workers in range(NCORE):
                        # NOTE(review): this call passes neither matfile nor
                        # bootstrap_replicates, which the
                        # calculate_small_parsimony definition in this file
                        # declares without defaults -- confirm.
                        client.submit(  calculate_small_parsimony , inq= inq ,outq = outq ,stopiter= stopiter ,  treefile=treefile  ,  row_index= remote_index , iolock = lock, verbose  = False  )

                # Start the result collector once, after enough items are queued.
                if saver_started == False and  inq.qsize() > future_clean_trigger:
                    print('starting saver')
                    client.submit(  collect_futures , queue= queue , stopiter=stopiter , runName= runName , nucleotides_only =False  )
                    saver_started = True
    # All replicates have been queued: raise the stop flag that was handed to
    # the submitted tasks.
    stopiter.set(True)
    print('done iterating')