def run_ssh_multi(cmd, instances, user, identity_file):
    """Run *cmd* over SSH on every instance in parallel.

    Spawns one worker process per instance, dispatches ``run_ssh`` for each,
    and blocks until every job has completed and the pool has been cleanly
    shut down.
    """
    pool = mp.Pool(len(instances))
    jobs = [
        pool.apply_async(run_ssh, args=(cmd, inst.public_ip_address, user, identity_file))
        for inst in instances
    ]
    monitor_mp_jobs(jobs)
    pool.close()
    pool.join()
def mutation_probabilities(collections, standard, outfile=None, query=None, seq_field='vdj_aa', chunksize=100):
    '''
    Calculates the probability of each mutation from the standard sequence,
    given one or more collections.

    Inputs
    ------
    collections: One or more pymongo collection objects, as an iterable.
    standard: The target amino acid sequence, as a string.
    outfile: Optional path. When given, a tab-separated
        '<mutation>\\t<probability>' table is also written to this file.
    query: Query parameters, as a dict. Will be passed directly to
        collection.find()
    seq_field: The MongoDB field to be used for comparison to the target.
        Default is 'vdj_aa'.
    chunksize: Number of sequences to be submitted for each alignment job.
        Default is 100.

    Returns
    -------
    A dictionary of normalized mutation probabilities, of the format:
        {'12A': 0.01, '15F': 0.12, ...}
    Mutation names are a concatenation of the mutation position (1-based
    indexing of the standard sequence) and the mutated residue.
    '''
    async_results = []
    p = Pool()
    try:
        for collection in collections:
            # NOTE(review): '\n' + collection and len(collection) require
            # `collection` to be a name string, but the docstring says
            # "collection objects" -- confirm what callers actually pass.
            print('\n' + collection)
            print('-' * len(collection))
            print('querying for sequences...')
            sequences = get_sequences(collection, query=query, seq_field=seq_field)
            print('performing alignments:')
            for chunk in chunker(sequences, chunksize):
                async_results.append(
                    p.apply_async(get_mutations, args=(chunk, standard)))
            monitor_mp_jobs(async_results)
        mutations = []
        for ar in async_results:
            mutations.extend(ar.get())
    finally:
        # fix: the pool was previously never closed, leaking worker
        # processes on every call
        p.close()
        p.join()
    print('\ncalculating mutation probabilities...')
    mcounts = Counter(mutations)
    total = sum(mcounts.values())
    norm_counts = {k: float(v) / total for k, v in mcounts.items()}
    prob_string = '\n'.join('{}\t{}'.format(k, v) for k, v in norm_counts.items())
    if outfile is not None:
        # fix: context manager guarantees the handle is flushed and closed
        # (the original open(...).write(...) leaked the file object)
        with open(outfile, 'w') as f:
            f.write(prob_string)
    return norm_counts
def configure(self):
    """Provision every node of the cluster after launch.

    Sequentially: builds the base image on all nodes, sets up passwordless
    SSH from the master to the workers, rewrites /etc/hosts everywhere,
    assembles the master's EBS storage (RAID or single device) and shares
    it over NFS, then conditionally starts Celery, uploads BaseSpace
    credentials, and configures Jupyter and MongoDB. Finishes by writing
    the cluster config info to the master.
    """
    # master first, then workers; several loops below rely on this combined list
    instances = [self.master_instance] + self.worker_instances
    instance_lookup = dict(self.master, **self.workers)
    # NOTE(review): instance_names is never used below -- dead local?
    instance_names = sorted(instance_lookup.keys())
    # # update Ab[x] tools
    # self.update_abx(instances)
    # build base image
    print('')
    if len(instances) == 1:
        # single-node cluster: configure directly, no worker pool needed
        print('Building base image...')
        configure_base_image(instances[0].public_ip_address, self.opts.user, self.opts.identity_file)
    else:
        # configure all nodes in parallel, one pool worker per node
        print('Building base image on all nodes...')
        p = mp.Pool(len(instances))
        async_results = []
        for instance in instances:
            async_results.append(p.apply_async(configure_base_image, args=(instance.public_ip_address, self.opts.user, self.opts.identity_file)))
        monitor_mp_jobs(async_results)
        p.close()
        p.join()
    # deploy SSH key to nodes for passwordless SSH
    print('')
    print("Generating cluster's SSH key on master...")
    # only generates a key if the master doesn't already have one
    key_setup = """ [ -f ~/.ssh/id_rsa ] || (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa && cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys)"""
    self.run(self.master_instance, key_setup)
    # tar the master's ~/.ssh dir to stdout so it can be piped to each worker
    get_ssh_tar = 'tar c - .ssh'
    dot_ssh_tar, _ = self.run(self.master_instance, get_ssh_tar)
    if self.worker_instances:
        print("Transferring SSH key to workers:")
        put_ssh_tar = 'tar x'
        for i, worker in enumerate(self.worker_instances):
            progbar.progress_bar(i, len(self.worker_instances))
            # the captured tarball is fed to the worker's `tar x` via stdin
            self.run(worker, put_ssh_tar, stdin=dot_ssh_tar)
        progbar.progress_bar(len(self.worker_instances), len(self.worker_instances))
        print('')
    # modify /etc/hosts on all nodes
    print('Updating /etc/hosts on all nodes...')
    hosts = ['{} {}'.format(self.get_ip(i), n) for n, i in instance_lookup.items()]
    host_string = '\n'.join(hosts)
    # appended (not replaced) via a root shell on each node
    host_cmd = """sudo -- sh -c 'echo "{}" >> /etc/hosts'""".format(host_string)
    for instance in instances:
        self.run(instance, host_cmd)
    # build and share an EBS volumne on the master node
    # device names /dev/xvdaa, /dev/xvdab, ... one per requested EBS volume
    devices = ['/dev/xvda' + string.ascii_lowercase[i] for i in range(self.opts.master_ebs_vol_num)]
    if len(devices) > 1:
        volume = self.build_ebs_raid_volume(devices)
    elif len(devices) == 1:
        volume = self.format_single_ebs_device(devices[0])
    # NOTE(review): if master_ebs_vol_num == 0, `volume` is unbound here and
    # share_nfs_volume() would raise NameError when workers exist -- confirm
    # that opts guarantees at least one EBS volume.
    if len(self.worker_instances) > 0:
        self.share_nfs_volume(volume)
    # start Celery workers on all nodes
    if self.opts.celery and len(self.worker_instances) > 0:
        self.start_redis_server(self.master_instance)
        self.start_celery_workers(self.worker_instances)
        self.start_flower()
    # upload BaseSpace credentials file
    if self.opts.basespace_credentials:
        print('')
        print('Uploading BaseSpace credentials file...')
        cred_file = os.path.expanduser('~/.abstar/basespace_credentials')
        remote_path = '/home/{}/.abstar/basespace_credentials'.format(self.opts.user)
        if os.path.exists(cred_file):
            self.put(self.master_name, cred_file, remote_path)
        else:
            print('ERROR: Local credentials file was not found. No credentials were uploaded.')
    # configure and start a Jupyter Notebook server
    if self.opts.jupyter:
        self.setup_jupyter_notebook()
    # configure and start a MongoDB server
    if self.opts.mongodb:
        self.setup_mongodb()
    else:
        # mongod may be running from the base image; stop it when not wanted
        self.stop_mongod()
    # write config information to master
    self.write_config_info()
    print('')