def checkpoint_network(self):
    """Snapshot the network-interface counters of the central Mongo node.

    Queries Marathon for all running tasks, finds the host running the
    u'/fmoncentralpipecentral/mongoccentral/mongocentralcentral' task,
    captures the output of ``/sbin/ifconfig`` on that host and writes it
    to a timestamped file under ``.netcheck/``.  Used to check how much
    traffic the central node receives with a centralised approach.
    """
    netcheck_directory = ".netcheck"
    curl_node = list(self.masters)[0]
    # Ask Marathon (through one of the masters) for every running task.
    p = general_util.SshProcess(
        'curl "http://leader.mesos/service/marathon-user/v2/tasks"',
        host=curl_node).run()
    d = json.loads(p.stdout)
    mongo_tasks = filter(
        lambda task: task['appId'] ==
        u'/fmoncentralpipecentral/mongoccentral/mongocentralcentral',
        d.get('tasks'))
    mongo_host = mongo_tasks[0]['host']
    p = general_util.SshProcess('/sbin/ifconfig',
                                host=mongo_host,
                                shell=True,
                                pty=True).run()
    now = strftime("%d_%b_%Y_%H:%M")
    if not exists(netcheck_directory):
        makedirs(netcheck_directory)
    # Fix: build the checkpoint path from netcheck_directory instead of a
    # second hard-coded '.netcheck' literal, so the two references to the
    # directory cannot drift apart.
    with open(netcheck_directory + '/net_checkpoint' + now, 'w') as f:
        f.write(p.stdout)
def save_results(self): logs_from_images = [ 'ches/kafka', 'alvarobrandon/spark-worker', 'alvarobrandon/spark-master', 'uhopper/hadoop-datanode:2.8.1', 'uhopper/hadoop-namenode:2.8.1', 'zookeeper', 'mesosphere/marathon-lb:v1.11.1', 'alvarobrandon/spark-bench', 'alvarobrandon/ycsb', 'cassandra' ] # Extract here from the marathon API all the Mesos TaskIDs for the different applications for agent in self.private_agents: for image in logs_from_images: p = general_util.SshProcess( 'sudo docker ps -f "ancestor={0}" -q -a'.format(image), host=agent).run() image_dir = image.replace('/', '_') for containerid in p.stdout.split('\r\n')[:-1]: if image == 'alvarobrandon/spark-worker': print(containerid, image_dir) p = general_util.SshProcess( 'mkdir /home/vagrant/{0}_{1}_logs'.format( image_dir, containerid), host=agent).run() p = general_util.SshProcess( 'sudo docker logs {1} >> /home/vagrant/{0}_{1}_logs/stdout_{0}_{1}.out 2>&1' .format(image_dir, containerid), host=agent).run() if image == 'ches/kafka': # if image_dir is kafka then copy some extra logs p = general_util.SshProcess( 'sudo docker cp {1}:/kafka/logs /home/vagrant/{0}_{1}_logs/' .format(image_dir, containerid), host=agent).run() if image == 'alvarobrandon/spark-worker': # if image_dir is spark copy the extra logs p = general_util.SshProcess( 'sudo docker cp {1}:/spark/work/ /home/vagrant/{0}_{1}_logs/' .format(image_dir, containerid), host=agent).run() RcaVagrantExperiment.save_results(self) # clean the jars first since we don't want them general_util.Remote( hosts=self.private_agents, cmd= "sudo rm -f /home/vagrant/*/work/*/*/spark-bench-2.1.1_0.3.0-RELEASE.jar" ).run() general_util.Remote(hosts=self.private_agents, cmd="sudo rm /home/vagrant/*.scrap.gz").run() general_util.Get(hosts=self.private_agents, remote_files=["/home/vagrant/"], local_location=self.results_directory).run()
def start_kafka_queue(self):
    """Deploy a Kafka broker group through Marathon.

    Renders fmone-resources/kafka.json with one broker per node of the
    first region, uploads the rendered JSON to a master node, and POSTs
    it to the Marathon groups endpoint.
    """
    curl_node = list(self.masters)[0]
    # Idiomatic len() instead of calling __len__() directly.
    nbrokers = len(list(self.regions[0]))
    general_util.replace_infile("fmone-resources/kafka.json",
                                "fmone-resources/exec.json",
                                {"@nbrokers@": str(nbrokers)})
    general_util.Put(hosts=curl_node,
                     local_files=["fmone-resources/exec.json"],
                     remote_location="/home/vagrant/exec.json").run()
    p = general_util.SshProcess(
        'curl -X POST "http://leader.mesos/service/marathon-user/v2/groups" -H "content-type: application/json" -d@/home/vagrant/exec.json',
        host=curl_node).run()
def start_dummy_containers(self):
    """Launch dummy containers through Marathon to generate load.

    Renders fmone-resources/dummy.json with five instances per private
    agent, uploads the rendered JSON to a master node, and POSTs it to
    the Marathon apps endpoint.
    """
    curl_node = list(self.masters)[0]
    # Idiomatic len() instead of calling __len__() directly;
    # 5 dummy instances per private agent.
    ninstances = len(list(self.private_agents)) * 5
    general_util.replace_infile("fmone-resources/dummy.json",
                                "fmone-resources/exec.json",
                                {"@ninstances@": str(ninstances)})
    general_util.Put(hosts=curl_node,
                     local_files=["fmone-resources/exec.json"],
                     remote_location="/home/vagrant/exec.json").run()
    p = general_util.SshProcess(
        'curl -X POST "http://leader.mesos/service/marathon-user/v2/apps" -H "content-type: application/json" -d@/home/vagrant/exec.json',
        host=curl_node).run()
def check_elasticity(self, nslaves, force_pull, region): curl_node = list(self.masters)[0] general_util.replace_infile("fmone-resources/basic.json", "fmone-resources/exec.json", { "@nslaves@": nslaves, "@region@": region }) general_util.Put(hosts=curl_node, local_files=["fmone-resources/exec.json"], remote_location="/home/vagrant/exec.json").run() p = general_util.SshProcess( 'curl -X POST "http://leader.mesos/service/marathon-user/v2/apps" -H "content-type: application/json" -d@/home/vagrant/exec.json', host=curl_node).run() print p.stdout print p.stderr print "Sleeping for a while" sleep(60) p = general_util.SshProcess( 'curl "http://leader.mesos/service/marathon-user/v2/tasks"', host=curl_node).run() d = json.loads(p.stdout) # use the basic.json from fmone-resources fmone_tasks = filter(lambda task: task['appId'] == u'/fmone/fmones', d.get('tasks')) start_end = [(task.get('stagedAt'), task.get('startedAt')) for task in fmone_tasks] time_differences = map( lambda pair: mktime(strptime(pair[1][:-5], '%Y-%m-%dT%H:%M:%S')) - mktime(strptime(pair[0][:-5], '%Y-%m-%dT%H:%M:%S')), start_end) print "The mean time to start {0} nslaves instances with pulled {1} is: {2} and its variance {3}"\ .format(nslaves, force_pull, mean(time_differences), std(time_differences)) p = general_util.SshProcess( 'curl -X DELETE "http://leader.mesos/service/marathon-user/v2/groups/fmone" -H "content-type: application/json"', host=curl_node).run() sleep(20) return (nslaves, force_pull, mean(time_differences), std(time_differences))
def clean_marathon_groups(self):
    """Delete every application group deployed in Marathon."""
    master_node = list(self.masters)[0]
    general_util.SshProcess(
        'curl -X DELETE "http://leader.mesos/service/marathon-user/v2/groups/" -H "content-type: application/json"',
        host=master_node).run()
def check_resilience(
        self):  # Would be possible to add the region here as a parameter?
    """Measure recovery times after three escalating failure scenarios.

    1. Kill all containers on the host running one Fmone agent task and
       measure how long a single agent takes to come back.
    2. Kill all containers on the host running the regional Mongo task
       and measure how long the whole Fmone pipeline takes to recover.
    3. Kill all containers on every private agent and measure recovery
       from a general failure.

    :return: list with one recovery-time measurement per scenario.
    """
    results = []  ## here we are going to include all of the results
    curl_node = list(self.masters)[0]
    p = general_util.SshProcess(
        'curl "http://leader.mesos/service/marathon-user/v2/tasks"',
        host=curl_node).run()
    d = json.loads(p.stdout)
    # Locate the host running one of the Fmone agent tasks.
    fmone_tasks = filter(
        lambda task: task['appId'] ==
        u'/fmonmongorpipe2/fmondocker2/fmoneagentdockerregion2',
        d.get('tasks'))
    kill_host = fmone_tasks[0].get('host')
    # Scenario 1: wipe every container on that host (rm can fail if a
    # container disappears mid-command, hence nolog_exit_code).
    general_util.Remote('sudo docker rm -f $(sudo docker ps -a -q)',
                        hosts=kill_host,
                        process_args={
                            "nolog_exit_code": True
                        }).run()
    time1 = time()
    sleep(20)  ## We leave some time till the fmone agent runs again
    p = general_util.SshProcess(
        'curl "http://leader.mesos/service/marathon-user/v2/tasks"',
        host=curl_node).run()
    d = json.loads(p.stdout)
    # All tasks rescheduled on the host we just wiped.
    killed_host = filter(lambda task: (task['host'] == kill_host),
                         d.get('tasks'))
    start_end = [(task.get('stagedAt'), task.get('startedAt'))
                 for task in killed_host]
    # startedAt minus the local kill timestamp; [:-5] strips the
    # timestamp's unparseable suffix.  NOTE(review): the 7200s offset
    # presumably corrects a two-hour local-vs-UTC timezone gap between
    # Marathon timestamps and time() — confirm against the cluster tz.
    time_differences = map(
        lambda pair: (mktime(strptime(pair[1][:-5], '%Y-%m-%dT%H:%M:%S'))) -
        (time1 - 7200), start_end)
    print "The mean time to recover for a Fmone agent is: {0} and its variance {1}"\
        .format(mean(time_differences), std(time_differences))
    results.append(mean(time_differences))
    # Scenario 2: kill the host running the regional Mongo task.
    mongo_tasks = filter(
        lambda task: task['appId'] ==
        u'/fmonmongorpipe2/mongor2/mongoregion2',
        d.get('tasks'))
    kill_host = mongo_tasks[0].get('host')
    general_util.Remote('sudo docker rm -f $(sudo docker ps -a -q)',
                        hosts=kill_host,
                        process_args={
                            "nolog_exit_code": True
                        }).run()
    time1 = time()
    sleep(
        60
    )  ## we leave some time until all the fmone agents are up and running again
    p = general_util.SshProcess(
        'curl "http://leader.mesos/service/marathon-user/v2/tasks"',
        host=curl_node).run()
    d = json.loads(p.stdout)
    fmone_tasks = filter(
        lambda task: task['appId'] ==
        u'/fmonmongorpipe2/fmondocker2/fmoneagentdockerregion2',
        d.get('tasks'))
    # Pipeline recovery time = latest agent start minus the kill instant;
    # the datetime64 value is converted to epoch seconds via uint64 / 1e9.
    df = pd.DataFrame(fmone_tasks)
    df['startedAt'] = pd.to_datetime(df['startedAt'])
    last_started = (df.sort_values(
        'startedAt',
        ascending=False).head(1)['startedAt'].values[0].astype('uint64') /
                    1e9)
    print "The mean time to recover a Fmone pipeline is: {0}".format(
        last_started - time1)
    results.append(last_started - time1)
    # Scenario 3: general failure — wipe every container on every agent.
    general_util.Remote('sudo docker rm -f $(sudo docker ps -a -q)',
                        hosts=self.private_agents,
                        process_args={
                            "nolog_exit_code": True
                        }).run()
    time1 = time()
    sleep(260)
    p = general_util.SshProcess(
        'curl "http://leader.mesos/service/marathon-user/v2/tasks"',
        host=curl_node).run()
    d = json.loads(p.stdout)
    fmone_tasks = filter(
        lambda task: task['appId'] ==
        u'/fmonmongorpipe2/fmondocker2/fmoneagentdockerregion2',
        d.get('tasks'))
    df = pd.DataFrame(fmone_tasks)
    df['startedAt'] = pd.to_datetime(df['startedAt'])
    last_started = (df.sort_values(
        'startedAt',
        ascending=False).head(1)['startedAt'].values[0].astype('uint64') /
                    1e9)
    print "The mean time to recover from a general failure is: {0}".format(
        last_started - time1)
    results.append(last_started - time1)
    return results