def read_app_started_csv(path):
    df = pd.read_csv(path)
    jobs = []
    for index, row in df.iterrows():
        job = Job()
        job.name = row['queue']
        job.run_time = row['elapsedTime'] * 0.001  # elapsedTime is in ms; convert to seconds
        job.memory_seconds = row['memorySeconds']
        jobs.append(job)
    return jobs
def read_app_csv(path):
    df = pd.read_csv(path)
    jobs = []
    for index, row in df.iterrows():
        job = Job()
        job.name = row['queue']
        job.run_time = row['elapsedTime'] * 0.001  # ms -> seconds
        job.memory_seconds = row['allocatedMB'] * 300  # one sample every five minutes (300 s)
        jobs.append(job)
    return jobs
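# The readers above assume a Job record defined elsewhere in this project.
# A minimal stand-in sketch matching only the attributes they set (the real
# class may carry more state):
class Job:
    """Plain record for one application (assumed shape, for illustration)."""

    def __init__(self):
        self.name = None            # queue the application ran in
        self.wait_time = 0          # seconds spent queued
        self.run_time = 0.0         # elapsed run time in seconds
        self.memory_seconds = 0.0   # memory usage integrated over time (MB*s)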
def scrape_jobs(self):
    try:
        jobs = self.browser.execute_script(
            """return (function(){
                var jobs = [];
                var els = document.getElementById('experience-section')
                    .getElementsByTagName('ul')[0].getElementsByTagName('li');
                for (var i = 0; i < els.length; i++){
                    if (els[i].className != 'pv-entity__position-group-role-item-fading-timeline'){
                        if (els[i].getElementsByClassName('pv-entity__position-group-role-item-fading-timeline').length > 0){
                            // grouped role entries are skipped
                        } else {
                            try { position = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByTagName('h3')[0].innerText; } catch(err) { position = ''; }
                            try { company_name = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__secondary-title')[0].innerText; } catch(err) { company_name = ''; }
                            try { date_ranges = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__date-range')[0].getElementsByTagName('span')[1].innerText; } catch(err) { date_ranges = ''; }
                            try { exp = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByTagName('h4')[1].getElementsByTagName('span')[1].innerText; } catch(err) { exp = ''; }
                            try { job_location = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__location')[0].getElementsByTagName('span')[1].innerText; } catch(err) { job_location = ''; }
                            try { company_url = els[i].getElementsByTagName('a')[0].href; } catch(err) { company_url = ''; }
                            jobs.push([position, company_name, company_url, date_ranges, exp, job_location]);
                        }
                    }
                }
                return jobs;
            })();""")
    except WebDriverException:
        jobs = []
    parsed_jobs = []
    for job in jobs:
        try:
            company_industry, company_employees = self.scrape_company_details(job[2])
            parsed_jobs.append(
                Job(
                    position=job[0],
                    company=Company(
                        name=job[1],
                        industry=company_industry,
                        employees=company_employees,
                    ),
                    location=Location(job[5]),
                    exp=job[4],
                    date_range=job[3],
                ))
        except Exception:
            pass
    return parsed_jobs
def do_POST(self):
    """Save a file following an HTTP POST request."""
    # TODO: check if the sender is master
    try:
        file_length = int(self.headers['Content-Length'])
        job_id = self.headers['job_id']
        start_with = self.headers['start_with']
        log.info('received job {}'.format(job_id))
    except Exception as e:
        log.error('invalid request received: {}'.format(e))
        return
    file_path = '{}/{}'.format(self.file_dir, job_id)
    with open(file_path, 'wb') as output_file:
        # stream the request body to disk in BUFFER_SIZE chunks
        while file_length > 0:
            read_length = BUFFER_SIZE if file_length > BUFFER_SIZE else file_length
            output_file.write(self.rfile.read(read_length))
            file_length -= read_length
    self.send_response(201, 'Created')
    self.end_headers()
    self.job_queue.put(Job(start_with, file_path))
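# A hedged client-side sketch of the request this handler expects: raw file
# bytes in the body plus 'job_id' and 'start_with' headers. The worker URL,
# function name, and error handling below are placeholders, not this
# project's code.
import requests

def send_job_to_worker(worker_url, job_id, start_with, path):
    with open(path, 'rb') as f:
        body = f.read()
    resp = requests.post(
        worker_url,
        data=body,  # requests fills in Content-Length for a bytes body
        headers={'job_id': job_id, 'start_with': start_with},
    )
    resp.raise_for_status()  # expect 201 Created on success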
def read_app_csv(path):
    # Mock reader: generates a random batch of jobs instead of parsing a CSV.
    job_count = np.random.randint(10, 50)
    queue = ['spark', 'hive', 'ProgrammerAlliance']
    # queue = ['spark', 'hive']
    jobs = []
    for i in range(job_count):
        job = Job()
        job.name = queue[np.random.randint(0, 3)]
        job.wait_time = np.random.randint(0, 25)
        job.run_time = np.random.randint(10, 40)
        job.memory_seconds = 1024 * job.run_time * 0.05
        jobs.append(job)
    print('%d jobs finished during this interval' % job_count)
    return jobs
def read_app_stopped_csv(path):
    df = pd.read_csv(path)
    jobs = []
    for index, row in df.iterrows():
        job = Job()
        job.name = row['queue']
        # job.wait_time = np.random.randint(50)  # temporarily simulated with a random number
        job.run_time = row['elapsedTime'] * 0.001  # ms -> seconds
        job.memory_seconds = row['memorySeconds']
        # Cap the contribution of long runners: scale memory_seconds down to a
        # 150-second window so a single long job does not dominate the interval.
        if job.run_time > 150:
            job.memory_seconds = job.memory_seconds * 150 / job.run_time
        jobs.append(job)
    return jobs
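# A small usage sketch for the readers above (assuming the CSV columns they
# reference: 'queue', 'elapsedTime', 'memorySeconds'): total memory-seconds
# accumulated per queue during one interval.
def memory_seconds_per_queue(path):
    totals = {}
    for job in read_app_stopped_csv(path):
        totals[job.name] = totals.get(job.name, 0.0) + job.memory_seconds
    return totals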
def scrap_profile(self, profile_linkedin_url, profile_known_graduation_date):
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Scraping of the profile may fail due to a human check forced by LinkedIn
    try:
        # Delays (in seconds) between operations that must wait for the page to load
        loading_pause_time = 2
        loading_scroll_time = 1

        # Open the profile page
        self.browser.get(profile_linkedin_url)
        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                return ScrapingResult('ProfileUnavailable')
            else:
                raise HumanCheckException

        # Default the contact fields so the Profile built below never references
        # an unbound name when one of the lookups fails
        email = phone = birthday = connectedDate = ''

        # Scraping the Email Address from Contact Info
        # > click on the 'Contact info' link on the page
        self.browser.execute_script(
            "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
            "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
        )
        time.sleep(loading_pause_time)

        # > get the email from the 'Contact info' popup
        try:
            email = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ "
                "let el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-email')){ return el.children[2].children[0].innerText; } }} "
                "catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scraping the Phone from Contact Info
        try:
            phone = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ "
                "let el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-phone')){ return el.children[2].children[0].innerText; } }} "
                "catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scraping the Birthday from Contact Info
        try:
            birthday = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ "
                "let el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-birthday')){ return el.children[2].children[0].innerText; } }} "
                "catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scraping the Date Connected from Contact Info, then dismiss the popup
        try:
            connectedDate = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ "
                "let el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-connected')){ return el.children[2].children[0].innerText; } }} "
                "catch(e){return '';}})()"
            )
            self.browser.execute_script(
                "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
            )
        except Exception:
            pass

        # Load the entire page (LinkedIn loads content asynchronously as you scroll)
        window_height = self.browser.execute_script("return window.innerHeight")
        scrolls = 1
        while scrolls * window_height < self.browser.execute_script("return document.body.offsetHeight"):
            self.browser.execute_script(f"window.scrollTo(0, {window_height * scrolls});")
            time.sleep(loading_scroll_time)
            scrolls += 1

        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        # Get all the job positions
        try:
            job_positions = self.browser.find_element_by_id(
                'experience-section').find_elements_by_tag_name('li')
        except Exception:
            job_positions = []

        # Get all the educations
        try:
            educations = self.browser.find_element_by_id(
                'education-section').find_elements_by_tag_name('li')
        except Exception:
            educations = []

        # Parsing of the page html structure
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Scraping the Name (using soup)
        try:
            name_div = soup.find('div', {'class': 'flex-1 mr5'})
            name_loc = name_div.find_all('ul')
            headline = name_div.find_all('h2')
            headline = headline[0].get_text().strip()
            profile_name = name_loc[0].find('li').get_text().strip()
            locationNConnection = name_loc[1].find_all('li')
            location = locationNConnection[0].get_text().strip()
            try:
                connection = locationNConnection[1].find('a').find('span').get_text().strip()
            except Exception:
                connection = locationNConnection[1].find('span').get_text().strip()
        except Exception:
            return ScrapingResult('ERROR IN SCRAPING NAME')

        # Scraping the Description (using soup)
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        try:
            if (self.browser.execute_script(
                    "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
            )):
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
            else:
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
        except Exception:
            profile_desc = []

        # Parsing skills
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        try:
            skills = self.browser.execute_script(
                "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
            )
        except Exception:
            skills = []

        education_list = []

        # Parsing the educations
        if len(educations) > 0:
            x = 1
            for education in educations:
                try:
                    exp_section = soup.find('section', {'id': 'education-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # Scraping of one education entry - name, degree, major, year
                    try:
                        education_name = a_tags.find('h3').get_text().strip()
                    except Exception:
                        education_name = None
                    try:
                        education_degree_name = a_tags.find_all('p')[0].get_text().strip()
                    except Exception:
                        education_degree_name = None
                    try:
                        education_major = a_tags.find_all('p')[1].get_text().strip()
                    except Exception:
                        education_major = None
                    try:
                        education_year = a_tags.find_all('p')[2].get_text().strip()
                    except Exception:
                        education_year = None

                    education_list.append(
                        Education(education_name=education_name,
                                  degree_name=education_degree_name,
                                  major=education_major,
                                  year=education_year))
                except Exception:
                    pass

        # Pad to a fixed length of three education entries
        for x in range(3 - len(educations)):
            education_list.append(
                Education(education_name=None, degree_name=None, major=None, year=None))

        last_job = []

        # Parsing the job positions
        if len(job_positions) > 0:
            # Parse job positions to extract their date ranges
            job_positions_data_ranges = []
            x = 1
            for job_position in job_positions:
                try:
                    # Get the date range of the job position
                    date_range_element = job_position.find_element_by_class_name(
                        'pv-entity__date-range')
                    date_range_spans = date_range_element.find_elements_by_tag_name('span')
                    date_range = date_range_spans[1].text
                    job_positions_data_ranges.append(date_range)

                    # Scraping of the last (hopefully current) job
                    exp_section = soup.find('section', {'id': 'experience-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # Scraping of the last job - company_name, job_title
                    try:
                        last_job_company_name = a_tags.find_all('p')[1].get_text().strip()
                        last_job_title = a_tags.find('h3').get_text().strip()
                        spans = a_tags.find_all('span')
                    except Exception:
                        last_job_company_name = a_tags.find_all('span')[1].get_text().strip()
                        last_job_title = exp_section.find('ul').find('li').find_all('span')[2].get_text().strip()
                        spans = exp_section.find('ul').find('li').find_all('span')
                    last_job_company_name = last_job_company_name.replace(
                        'Full-time', '').replace('Part-time', '').strip()

                    # Scraping of the last job - location
                    last_job_location = Location()
                    next_span_is_location = False
                    for span in spans:
                        if next_span_is_location:
                            last_job_location.parse_string(span.get_text().strip())
                            break
                        if span.get_text().strip() == 'Location':
                            next_span_is_location = True

                    last_job.append(
                        Job(position=last_job_title,
                            company=Company(name=last_job_company_name),
                            location=last_job_location))
                except Exception:
                    last_job.append(
                        Job(position=None, company=Company(name=None), location=None))

            # Pad to a fixed length of four job entries
            for x in range(4 - len(job_positions)):
                last_job.append(
                    Job(position=None, company=Company(name=None), location=None))

            print("profile_name {} \n headline {} \n location {} \n connection {} \n "
                  "profile_desc {} \n email {} \n phone {} \n birthday {} \n "
                  "connectedDate {} \n skills {} \n last_job {} \n last_job {} \n "
                  "last_job {} \n last_job {} \n education {} \n".format(
                      profile_name, headline, location, connection, profile_desc,
                      email, phone, birthday, connectedDate, skills, last_job[0],
                      last_job[1], last_job[2], last_job[3], education_list[0]))

            return ScrapingResult(
                Profile(profile_name, headline, location, connection,
                        connectedDate, phone, birthday, profile_desc, email,
                        skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        education_list))
        else:
            return ScrapingResult(Profile(profile_name, email, skills))

    except HumanCheckException:
        # LinkedIn flagged the session: log back in and wait for a manual check
        if self.headless_option:
            raise CannotProceedScrapingException
        linkedin_logout(self.browser)
        linkedin_login(self.browser,
                       self.config.get('linkedin', 'username'),
                       self.config.get('linkedin', 'password'))
        while self.browser.current_url != 'https://www.linkedin.com/feed/':
            message_to_user('Please execute manual check', self.config)
            time.sleep(30)
        return self.scrap_profile(profile_linkedin_url, profile_known_graduation_date)
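# is_url_valid is defined elsewhere in this project; a minimal sketch of the
# check scrap_profile relies on, assuming profile links look like
# https://www.linkedin.com/in/<slug>/ (an assumption, not the project's rule):
import re

def is_url_valid(url):
    return bool(re.match(r'^https://www\.linkedin\.com/in/[^/]+/?$', str(url)))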
data2 = [(0, [(1, 5)]), (1, [(1, 7)]), (2, [(1, 6)])]

datas = []
for _ in range(5):
    job = (random.randint(1, 5), [])
    for _ in range(3):
        job[1].append((random.randint(1, 3), random.randint(1, 5)))
    datas.append(job)
data = datas

if __name__ == '__main__':
    # Task(machine_id, duration, order)
    jobs = [Job(id=i + 1, arrival_time=row[0]) for i, row in enumerate(data)]
    for i, row in enumerate(data):
        for col in row[1]:
            jobs[i].add_task(Task(machine_id=col[0], duration=col[1]))

    def simulate(jobs, rule):
        simulator = JobShopSimulator(env=simpy.Environment(), jobs=jobs, rule=rule)
        simulator.run(until=50)
        simulator.plot()

    simulate(jobs, 'FIFO')
    simulate(jobs, 'LIFO')
    simulate(jobs, 'SPT')
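# For reference, a self-contained sketch of the SPT (shortest processing time)
# dispatch rule the simulation compares against FIFO/LIFO: among the tasks
# waiting at a machine, always start the one with the smallest duration. This
# illustrates the rule only; JobShopSimulator's internals are not shown here.
import heapq

def spt_order(waiting_tasks):
    """waiting_tasks: (machine_id, duration) tuples queued at one machine."""
    heap = [(duration, machine_id) for machine_id, duration in waiting_tasks]
    heapq.heapify(heap)
    while heap:
        duration, machine_id = heapq.heappop(heap)
        yield machine_id, duration

# e.g. list(spt_order([(1, 5), (1, 2), (1, 4)])) -> [(1, 2), (1, 4), (1, 5)]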
def exec(
    self,
    input_path: str,
    output_path: str,
    aux_dir: str,
    additional_params: t.Optional[t.Dict[str, str]] = None,
    parallelize: bool = False,
    cluster_data_dir: t.Optional[str] = None,
    priority: int = 0,
    queue: str = "itaym",
    wait_until_complete: bool = False,
    get_completion_validator: bool = True,
) -> t.Union[float, str]:
    """
    :param input_path: path to alignment file
    :param output_path: path in which the program should write its output
    :param additional_params: additional parameters unique to the program
    :param parallelize: boolean indicating whether execution of the program should be parallelized in the cluster or not
    :param cluster_data_dir: cluster directory that is mounted to the container data directory. Must be provided when parallelize is True
    :param aux_dir: directory in which auxiliary files should be generated by the job submission process
    :param priority: priority of the jobs
    :param queue: queue to submit the jobs to
    :param wait_until_complete: indicator whether the main program should wait until completion of all jobs (recommended: True)
    :param get_completion_validator: boolean indicating whether a validator file should be generated upon job completion (recommended: True)
    :return: either the duration of the command in minutes, if no parallelization was selected, or the path to the touch file that is used for validation of job completion in case of parallelization
    """
    additional_args = dict()
    from .paml import Paml
    from .busted import Busted

    if type(self) in [Paml, Busted]:
        additional_args["input_tree_path"] = re.sub(r"\.fas[^.]*", "_tree.nwk", input_path)
    if type(self) is Paml:
        additional_args["control_file_path"] = re.sub(r"\.fas[^.]*", "_paml.ctl", input_path)

    command = self.set_command(
        input_path=input_path,
        output_path=output_path,
        additional_params=additional_params,
        parallelize=parallelize,
        cluster_data_dir=cluster_data_dir,
        **additional_args,
    )
    os.makedirs(aux_dir, exist_ok=True)
    if os.path.exists(output_path):
        logger.info(
            f"{self.name} output already exists at {output_path} and will not be generated again"
        )
        return

    if not parallelize:
        start_time = time()
        if type(self) is not Paml:
            # move to aux_dir, as rate4site generates extra files in the current working directory
            os.chdir(aux_dir)
        for cmd in command:
            if "cd " in cmd:
                os.chdir(cmd.replace("cd ", ""))
            else:
                # rate4site prints some logs to stderr, so the typical test
                # (raise an error if stderr is non-empty) is invalid here
                res = os.system(f"{cmd} > /dev/null 2>&1")
                if res != 0:
                    raise RuntimeError(f"command {cmd} failed to execute.")
        end_time = time()
        return (end_time - start_time) / 60
    else:
        commands = ([
            f"cd {aux_dir.replace(os.environ['container_data_dir'], cluster_data_dir)}",
            """timestamp() {
  date +"%T" # current time
}
timestamp""",
        ] + command + ["timestamp"])
        job = Job(
            name=self.name,
            sh_dir=aux_dir,
            output_dir=aux_dir,
            commands=commands,
            priority=priority,
            queue=queue,
        )
        completion_validator = job.submit(
            wait_until_complete=wait_until_complete,
            get_completion_validator=get_completion_validator,
        )
        return completion_validator
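# A hedged usage sketch for exec() above; the paths are placeholders and the
# concrete Program subclass (passed in as `program`) depends on this project's
# class hierarchy.
def run_serially(program) -> float:
    # With parallelize=False, the return value is the runtime in minutes.
    return program.exec(
        input_path="/data/alignment.fas",
        output_path="/data/program_output.txt",
        aux_dir="/data/job_aux/",
        parallelize=False,
    )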
def parsing_jobs(self, job_positions):
    job_positions_data_ranges = []
    # array of Jobs
    Jobs_array = []
    for job_position in job_positions:
        try:
            # Get the date range of the job position
            try:
                date_range_element = job_position.find_element_by_class_name(
                    'pv-entity__date-range')
                date_range_spans = date_range_element.find_elements_by_tag_name('span')
                date_range = date_range_spans[1].text
            except NoSuchElementException:
                date_range = "N/A"

            # Get the title
            try:
                title_range_element = job_position.find_element_by_tag_name('h3')
                title = title_range_element.text
            except NoSuchElementException:
                title = "N/A"

            # Get the company name
            try:
                companyname_range_element = job_position.find_element_by_class_name(
                    'pv-entity__secondary-title')
                companyname = companyname_range_element.text.replace(
                    'Full-time', '').replace('Part-time', '').strip()
            except NoSuchElementException:
                companyname = "N/A"

            # Get the company page link (company details are scraped with BeautifulSoup)
            try:
                company_url_link = job_position.find_element_by_tag_name(
                    'a').get_attribute('href')
            except NoSuchElementException:
                company_url_link = "N/A"

            # Get the company location
            try:
                companylocation_range_element = job_position.find_element_by_class_name(
                    'pv-entity__location')
                companylocation_spans = companylocation_range_element.find_elements_by_tag_name('span')
                companylocation = companylocation_spans[1].text
            except NoSuchElementException:
                companylocation = "N/A"

            job_positions_data_ranges.append(date_range)
            info_company = self.get_company_data(company_url_link)
            # Fall back to the values scraped from the position entry itself
            try:
                if info_company['companyname'] == "N/A":
                    info_company['companyname'] = companyname
                if info_company['location'].full_string == "N/A":
                    loc = Location()
                    loc.parse_string(companylocation)
                    info_company['location'] = loc
            except Exception:
                print("Oops!", sys.exc_info()[0], "occurred.")

            print(info_company['industry'])
            print(info_company['companyname'])
            print(info_company['location'])

            trabajo_oo = Job(
                position=title.strip(),
                company=Company(name=info_company['companyname'].strip(),
                                industry=info_company['industry'].strip()),
                location=info_company['location'],
                daterange=date_range.strip())
            Jobs_array.append(trabajo_oo)
        except Exception:
            print("Oops!, \n{}\n{}\n{}\noccurred.".format(
                sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]))
            print("Job unpacking error")

    return {
        'Jobs_array': Jobs_array,
        "job_positions_data_ranges": job_positions_data_ranges
    }
def main():
    start_with = sys.argv[1] if len(sys.argv) > 1 else ""
    workers = config.workers_url()
    start_time = time.time()
    log.info('master starts, start_with is "{}", workers are {}'.format(
        start_with, workers))

    with tempdir() as tmp_dir:
        all_metrics_file_name = 'all_metric_names.tmp'
        with open('{}/{}'.format(tmp_dir, all_metrics_file_name), 'w') as all_metrics_file:
            get_all_metrics_into_file(start_with, all_metrics_file)

        parts = split_file_into_parts(file_path=all_metrics_file.name,
                                      num_parts=len(workers) * WORKER_JOB_NUM,
                                      output_dir=tmp_dir)
        log.info('partition finishes, all jobs are: {}'.format(parts))
        jobs = [Job(start_with, part) for part in parts]

        # master preparation
        master = MasterServer(workers, jobs)

        # setup threads
        listening = Thread(target=master.server.serve_forever)
        listening.daemon = True
        sending = Thread(target=master.register_requests)
        sending.daemon = True
        checking = Thread(target=master.scan_requests)
        checking.daemon = True
        heartbeat = Thread(target=master.heartbeats)
        heartbeat.daemon = True

        listening.start()
        log.info('master server starts up, listening on port {}'.format(PORT_NO))
        sending.start()
        checking.start()
        heartbeat.start()

        # waiting for results from workers
        results = []
        while len(results) < len(jobs):
            try:
                result = master.results.get(timeout=MAX_JOB_INTERVAL)
            except Empty:
                log.error('master waited too long for result, shutting down')
                exit(1)
            results.append(result)

        # all work done, shut down the servers
        for worker in workers:
            try:
                log.info('sending shutdown to worker {}'.format(worker))
                requests_retry_session(RETRY_NUM).get(worker + '/shutdown')
            except Exception as e:
                log.error('unable to stop worker {}, error message is {}'.format(worker, e))
        master.server.shutdown()
        master.server.socket.close()
        log.info('master server shutdown, beginning aggregation')

        # start the reducing phase
        merged, to_expands, total_number = reducer.start(results, start_with)
        put_to_tsdb(start_with, merged, to_expands)
        log.info('one round master aggregation finished, to_expands are {}'.format(to_expands))
        expand(to_expands, tmp_dir,
               '{}/{}'.format(tmp_dir, all_metrics_file_name), total_number)

    log.info('finished! total running time is {}'.format(time.time() - start_time))
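# split_file_into_parts is project code not shown in this section; a minimal
# sketch of the contract main() relies on (assumption: a line-oriented split
# into num_parts chunk files whose paths are returned):
def split_file_into_parts(file_path, num_parts, output_dir):
    with open(file_path) as f:
        lines = f.readlines()
    chunk = -(-len(lines) // num_parts) or 1  # ceiling division, at least 1 line
    parts = []
    for i in range(0, len(lines), chunk):
        part_path = '{}/part_{}'.format(output_dir, i // chunk)
        with open(part_path, 'w') as out:
            out.writelines(lines[i:i + chunk])
        parts.append(part_path)
    return parts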
def exec_pipeline_on_simulations(input_path: click.Path):
    """Simulate multiple datasets and then submit a pipeline job for each one.

    For an example of the json parameter format, see data/test/simulation.json"""

    # process the input json file
    with open(input_path, "r") as input_file:
        simulation_params = json.load(input_file)
    os.makedirs(
        simulation_params["simulations_output_dir"],
        exist_ok=True,
    )

    # initialize the logger
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s module: %(module)s function: %(funcName)s line: %(lineno)d %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(f"{os.path.dirname(input_path)}/simulations.log"),
        ],
    )
    logger = logging.getLogger(__name__)
    logger.info("Json input has been successfully processed")

    logger.info(f"Processing simulation input from {input_path}")
    simulation_input = SimulationInput(**simulation_params)
    logger.info("Json input has been successfully parsed as simulation input")

    logger.info(f"Simulating data in {simulation_input.simulations_output_dir}")
    simulations_exist = False
    simulations_exec_complete = False
    repetitions_num = simulation_input.nrep
    if (os.path.exists(simulation_input.simulations_output_dir) and
            len(os.listdir(simulation_input.simulations_output_dir)) == repetitions_num):
        simulations_exist = True
        all_exist = True
        for path in os.listdir(simulation_input.simulations_output_dir):
            completion_validator = f"{simulation_input.simulations_output_dir}/{path}/job_aux/pipeline_on_simulated_data.touch"
            if not os.path.exists(completion_validator):
                all_exist = False
                break
        if all_exist:
            simulations_exec_complete = True

    if not simulations_exist:
        pipeline_input_json_paths = SimulationTools.simulate(simulation_input=simulation_input)
        simulations_dirs = [
            f"{os.path.dirname(json_path)}/" for json_path in pipeline_input_json_paths
        ]
        logger.info("Simulation is complete.")
    else:
        simulations_dirs = [
            f"{simulation_input.simulations_output_dir}/{path}/"
            for path in os.listdir(simulation_input.simulations_output_dir)
        ]

    if not simulations_exec_complete:
        logger.info("submitting pipeline jobs for the simulated data")
        completion_validators = []
        for simulations_dir in simulations_dirs:
            aux_dir = f"{simulations_dir}/job_aux/"
            json_path = f"{simulations_dir}/input.json"
            if not os.path.exists(f"{aux_dir}/pipeline_on_simulated_data.touch"):
                job = Job(
                    name="pipeline_on_simulated_data",
                    sh_dir=aux_dir,
                    output_dir=aux_dir,
                    commands=[
                        f"python /groups/itay_mayrose/halabikeren/down_sampling_analysis/src/main.py --input_path={json_path}"
                    ],
                    priority=simulation_params["priority"],
                    queue=simulation_params["queue"],
                )
                completion_validators.append(
                    job.submit(
                        wait_until_complete=False,
                        get_completion_validator=True,
                    ))
        logger.info("Job submission is complete")

        # wait for all jobs to complete
        for validator in completion_validators:
            while not os.path.exists(validator):
                sleep(60)

    # analyze the large-scale results
    paths = [
        path for path in os.listdir(simulation_input.simulations_output_dir)
        if "rep" in path
    ]
    overlap_dfs = []
    for path in paths:
        overlap_df_path = f"{simulation_input.simulations_output_dir}/{path}/pipeline_dir/samples/samples_overlap.csv"
        overlap_df = pd.read_csv(overlap_df_path)
        overlap_df["replicate"] = path
        overlap_df["compared_methods"] = overlap_df["method_1"].str.cat(
            overlap_df[["method_2"]], sep=",")
        overlap_dfs.append(overlap_df)
    full_overlap_df = pd.concat(overlap_dfs)
    plot_large_scale_samples_overlap(
        df=full_overlap_df,
        output_path=f"{simulation_input.simulations_output_dir}/samples_overlap.svg",
    )

    for program in simulation_input.programs:
        data = []
        for path in paths:
            df_path = f"{simulation_input.simulations_output_dir}/{path}/pipeline_dir/tables/{program}_summary.csv"
            try:
                rep_data = pd.read_csv(df_path)
                rep_data["replicate"] = path
                data.append(rep_data)
            except Exception as e:
                logger.error(f"Failed to load dataframe from {df_path} due to error {e}")
        full_df = pd.concat(data)
        full_df["full_bias"] = full_df["result"] - full_df["full_result"]
        full_df["simulated_bias"] = full_df["result"] - full_df["simulated"]
        full_df_grouped = (full_df.groupby(
            ["replicate", "sampling_fraction", "sampling_method"]).mean().reset_index())
        full_df_grouped.to_csv(
            f"{simulation_params['simulations_output_dir']}/{program}_aggregated_data.csv")

        # plot large-scale data
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=f"{simulation_input.simulations_output_dir}/{program}_absolute_error.svg",
            use_relative_error=False,
        )
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=f"{simulation_input.simulations_output_dir}/{program}_relative_error.svg",
            use_relative_error=True,
        )
        plot_large_scale_bias(
            df=full_df_grouped,
            output_path=f"{simulation_input.simulations_output_dir}/{program}_bias.svg",
        )