Exemple #1
0
def read_app_started_csv(path):
    """Load the rows of a started-applications CSV as ``Job`` objects.

    Every row produces one ``Job``: ``name`` comes from the ``queue`` column,
    ``run_time`` is ``elapsedTime`` multiplied by 0.001 (presumably
    milliseconds to seconds -- confirm against the data source) and
    ``memory_seconds`` is copied from ``memorySeconds``.

    :param path: CSV location, handed unchanged to ``pd.read_csv``
    :return: list of ``Job`` objects, in the order the rows appear
    """

    def build_job(record):
        # Map one CSV row onto a fresh Job instance.
        entry = Job()
        entry.name = record['queue']
        entry.run_time = record['elapsedTime'] * 0.001
        entry.memory_seconds = record['memorySeconds']
        return entry

    frame = pd.read_csv(path)
    return [build_job(record) for _, record in frame.iterrows()]
Exemple #2
0
def read_app_csv(path):
    """Load the rows of an application-sample CSV as ``Job`` objects.

    Every row produces one ``Job``: ``name`` comes from the ``queue`` column,
    ``run_time`` is ``elapsedTime`` multiplied by 0.001 and
    ``memory_seconds`` is ``allocatedMB`` multiplied by 300.

    :param path: CSV location, handed unchanged to ``pd.read_csv``
    :return: list of ``Job`` objects, in the order the rows appear
    """

    def build_job(record):
        # Map one CSV row onto a fresh Job instance.
        entry = Job()
        entry.name = record['queue']
        entry.run_time = record['elapsedTime'] * 0.001
        # five minute per sampling
        entry.memory_seconds = record['allocatedMB'] * 300
        return entry

    frame = pd.read_csv(path)
    return [build_job(record) for _, record in frame.iterrows()]
def read_app_started_csv(path):
    """Convert each row of the CSV at *path* into a ``Job``.

    ``queue`` becomes the job name, ``elapsedTime`` is multiplied by 0.001 to
    give ``run_time`` and ``memorySeconds`` is stored as ``memory_seconds``.

    :param path: CSV location, handed unchanged to ``pd.read_csv``
    :return: list of ``Job`` objects in row order
    """
    table = pd.read_csv(path)
    collected = []
    for _, record in table.iterrows():
        started = Job()
        started.name = record['queue']
        started.run_time = record['elapsedTime'] * 0.001
        started.memory_seconds = record['memorySeconds']
        collected.append(started)
    return collected
def read_app_csv(path):
    """Convert each row of the CSV at *path* into a ``Job``.

    ``queue`` becomes the job name, ``elapsedTime`` is multiplied by 0.001 to
    give ``run_time`` and ``allocatedMB`` is multiplied by 300 to give
    ``memory_seconds``.

    :param path: CSV location, handed unchanged to ``pd.read_csv``
    :return: list of ``Job`` objects in row order
    """
    table = pd.read_csv(path)
    collected = []
    for _, record in table.iterrows():
        sampled = Job()
        sampled.name = record['queue']
        sampled.run_time = record['elapsedTime'] * 0.001
        # five minute per sampling
        sampled.memory_seconds = record['allocatedMB'] * 300
        collected.append(sampled)
    return collected
Exemple #5
0
    def scrape_jobs(self):
        """Scrape the experience section of the currently loaded page.

        A JavaScript snippet run in the browser collects, for every
        experience entry, the list
        ``[position, company_name, company_url, date_ranges, exp, job_location]``.
        Each entry is then enriched through ``self.scrape_company_details``
        and converted into a ``Job``.

        :return: list of ``Job`` objects; empty when the script fails with a
            ``WebDriverException``. Entries that cannot be converted are
            skipped (best effort).
        """

        try:
            jobs = self.browser.execute_script(
                """return (
                function(){ 
                 var jobs = []; 
                 var els = document.getElementById('experience-section').getElementsByTagName('ul')[0].getElementsByTagName('li');
                 for (var i=0;i<els.length; i++){
                    if(els[i].className!='pv-entity__position-group-role-item-fading-timeline'){   
                    if(els[i].getElementsByClassName('pv-entity__position-group-role-item-fading-timeline').length>0){ 
                     } 
                    else { 
                        try {position = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByTagName('h3')[0].innerText;} 
                        catch(err) { position = ''; } 
                        try { company_name = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__secondary-title')[0].innerText;}
                        catch (err) { company_name = ''; }
                        try{date_ranges = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__date-range')[0].getElementsByTagName('span')[1].innerText;}
                        catch (err) {date_ranges = ''; }
                        try {exp=els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByTagName('h4')[1].getElementsByTagName('span')[1].innerText;}
                        catch(err) {exp='';}        
                        try{job_location = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__location')[0].getElementsByTagName('span')[1].innerText;}
                        catch (err) {job_location = ''; }
                        try{company_url =els[i].getElementsByTagName('a')[0].href;} 
                        catch (err) {company_url = ''; }
                        jobs.push([position, company_name, company_url, date_ranges, exp, job_location]);}}}
                        return jobs; })();""")
        except WebDriverException:
            jobs = []

        parsed_jobs = []

        for job in jobs:
            try:
                company_industry, company_employees = self.scrape_company_details(job[2])

                parsed_jobs.append(
                    Job(
                        position=job[0],
                        company=Company(
                            name=job[1],
                            industry=company_industry,
                            employees=company_employees,
                        ),
                        location=Location(job[5]),
                        exp=job[4],
                        date_range=job[3]
                    )
                )
            except Exception:
                # Best effort: one entry that cannot be enriched or built must
                # not abort the whole scrape. Catching Exception (not a bare
                # except) keeps KeyboardInterrupt/SystemExit propagating.
                continue
        return parsed_jobs
Exemple #6
0
    def do_POST(self):
        """Save a file following a HTTP POST request.

        Reads the ``Content-Length``, ``job_id`` and ``start_with`` headers,
        streams the request body into ``<self.file_dir>/<job_id>`` in chunks
        of at most ``BUFFER_SIZE`` bytes, answers ``201 Created`` and puts a
        ``Job(start_with, file_path)`` on ``self.job_queue``.

        If the headers cannot be read or parsed, the error is logged, a
        ``400`` response is sent and nothing is written or queued.
        """
        # TODO: check if the sender is master
        try:
            file_length = int(self.headers['Content-Length'])
            job_id = self.headers['job_id']
            start_with = self.headers['start_with']
            log.info('received job {}'.format(job_id))
        except Exception as e:
            # Exceptions have no `.message` attribute on Python 3; formatting
            # the exception itself works on every version.
            log.error('invalid request received: {}'.format(e))
            self.send_error(400, 'Invalid request')
            return

        # NOTE(review): job_id comes straight from a request header and is
        # used unsanitised as a file name -- a value containing path
        # separators could escape file_dir. Validate once the sender check
        # above is implemented.
        file_path = '{}/{}'.format(self.file_dir, job_id)
        with open(file_path, 'wb') as output_file:
            while file_length > 0:
                read_length = min(BUFFER_SIZE, file_length)
                output_file.write(self.rfile.read(read_length))
                file_length -= read_length

        self.send_response(201, 'Created')
        self.end_headers()
        self.job_queue.put(Job(start_with, file_path))
Exemple #7
0
def read_app_csv(path):
    """Produce a random batch of synthetic ``Job`` objects.

    The *path* argument is accepted but never read. The batch size is drawn
    from ``np.random.randint(10, 50)``; each job gets a random queue name,
    wait time and run time, and ``memory_seconds`` derived from the run time.
    The batch size is printed before returning.

    :param path: unused
    :return: list of randomly populated ``Job`` objects
    """
    queue_names = ['spark', 'hive', 'ProgrammerAlliance']
    # queue_names = ['spark', 'hive']
    total = np.random.randint(10, 50)

    def make_job():
        # The draw order (queue, wait time, run time) determines the random
        # stream consumed per job.
        entry = Job()
        entry.name = queue_names[np.random.randint(0, 3)]
        entry.wait_time = np.random.randint(0, 25)
        entry.run_time = np.random.randint(10, 40)
        entry.memory_seconds = 1024 * entry.run_time * 0.05
        return entry

    batch = [make_job() for _ in range(total)]
    print('%d jobs finished during this interval' % total)
    return batch
def read_app_stopped_csv(path):
    """Load the rows of a stopped-applications CSV as ``Job`` objects.

    ``queue`` becomes the job name, ``elapsedTime`` multiplied by 0.001
    becomes ``run_time`` and ``memorySeconds`` becomes ``memory_seconds``.
    When ``run_time`` exceeds 150, ``memory_seconds`` is rescaled by
    ``150 / run_time``.

    :param path: CSV location, handed unchanged to ``pd.read_csv``
    :return: list of ``Job`` objects in row order
    """

    def build_job(record):
        entry = Job()
        entry.name = record['queue']
        # wait_time was temporarily simulated with random numbers (disabled):
        # entry.wait_time = np.random.randint(50)
        entry.run_time = record['elapsedTime'] * 0.001
        entry.memory_seconds = record['memorySeconds']
        if entry.run_time > 150:
            # Scale the memory usage down to a 150-unit run-time window.
            entry.memory_seconds = entry.memory_seconds * 150 / entry.run_time
        return entry

    frame = pd.read_csv(path)
    return [build_job(record) for _, record in frame.iterrows()]
    def scrap_profile(self, profile_linkedin_url,
                      profile_known_graduation_date):
        """Scrape one LinkedIn profile page and wrap the outcome in a ScrapingResult.

        The page is opened in ``self.browser``; contact info, headline,
        description, skills, educations and job positions are collected via
        injected JavaScript, Selenium lookups and BeautifulSoup parsing.

        :param profile_linkedin_url: URL of the profile to scrape
        :param profile_known_graduation_date: passed through to
            ``JobHistorySummary`` together with the scraped date ranges
        :return: ``ScrapingResult('BadFormattedLink')`` for an invalid URL,
            ``ScrapingResult('ProfileUnavailable')`` when LinkedIn redirects
            to the "unavailable" page,
            ``ScrapingResult('ERROR IN SCRAPING NAME')`` when the name block
            cannot be parsed, otherwise a ``ScrapingResult`` wrapping a
            ``Profile``
        :raises CannotProceedScrapingException: when a human check is
            detected while ``self.headless_option`` is set
        """

        if not is_url_valid(profile_linkedin_url):
            return ScrapingResult('BadFormattedLink')

        def read_contact_field(marker):
            # Return the text of the 'Contact info' row whose class name
            # contains `marker` (e.g. 'ci-email'). Falls back to '' when the
            # script cannot be executed, so the value is always bound.
            try:
                return self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'" + marker + "')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except Exception:
                return ''

        # Scraping of the profile may fail due to human check forced by LinkedIn
        try:

            # Setting of the delay (seconds) between operations that need to be sure loading of page is ended
            loading_pause_time = 2
            loading_scroll_time = 1

            # Opening of the profile page
            self.browser.get(profile_linkedin_url)

            if str(self.browser.current_url).strip() != profile_linkedin_url.strip():
                if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                    return ScrapingResult('ProfileUnavailable')
                raise HumanCheckException

            # Scraping the contact details from the 'Contact info' popup

            # > click on 'Contact info' link on the page
            self.browser.execute_script(
                "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
                "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
            )
            time.sleep(loading_pause_time)

            # > read the individual rows of the popup
            email = read_contact_field('ci-email')
            phone = read_contact_field('ci-phone')
            birthday = read_contact_field('ci-birthday')
            connectedDate = read_contact_field('ci-connected')

            # > close the popup (best effort: it may not have opened)
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
                )
            except Exception:
                pass

            # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
            window_height = self.browser.execute_script(
                "return window.innerHeight")
            scrolls = 1
            while scrolls * window_height < self.browser.execute_script(
                    "return document.body.offsetHeight"):
                self.browser.execute_script(
                    f"window.scrollTo(0, {window_height * scrolls});")
                time.sleep(loading_scroll_time)
                scrolls += 1

            # Expand the collapsed part of the profile section, if present
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
                )
                time.sleep(loading_pause_time)
            except Exception:
                pass

            # Get all the job positions
            try:
                job_positions = self.browser.find_element_by_id(
                    'experience-section').find_elements_by_tag_name('li')
            except Exception:
                job_positions = []

            # Get all the educations
            try:
                educations = self.browser.find_element_by_id(
                    'education-section').find_elements_by_tag_name('li')
            except Exception:
                educations = []

            # Parsing of the page html structure
            soup = BeautifulSoup(self.browser.page_source, 'lxml')

            # Scraping the name, headline, location and connection count (using soup)
            try:
                name_div = soup.find('div', {'class': 'flex-1 mr5'})
                name_loc = name_div.find_all('ul')
                headline = name_div.find_all('h2')
                headline = headline[0].get_text().strip()
                profile_name = name_loc[0].find('li').get_text().strip()
                locationNConnection = name_loc[1].find_all('li')
                location = locationNConnection[0].get_text().strip()
                try:
                    connection = locationNConnection[1].find('a').find(
                        'span').get_text().strip()
                except Exception:
                    # The connection count is not always wrapped in a link
                    connection = locationNConnection[1].find(
                        'span').get_text().strip()
            except Exception:
                return ScrapingResult('ERROR IN SCRAPING NAME')

            # Scraping the description: expand it first, if it is clamped
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
                )
                time.sleep(loading_pause_time)
            except Exception:
                pass

            try:
                if (self.browser.execute_script(
                        "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
                )):
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

                else:
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

            except Exception:
                profile_desc = []

            # Parsing skills: expand the additional skills first, if present
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
                )
                time.sleep(loading_pause_time)
            except Exception:
                pass

            try:
                skills = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
                )
            except Exception:
                skills = []

            education_list = []
            # Parsing the educations
            if len(educations) > 0:
                # x is the index into the <ul> contents of the education
                # section; it only advances when the entry's link was found.
                x = 1
                for education in educations:
                    try:
                        exp_section = soup.find('section',
                                                {'id': 'education-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Each field is optional: fall back to None
                        try:
                            education_name = a_tags.find(
                                'h3').get_text().strip()
                        except Exception:
                            education_name = None

                        try:
                            education_degree_name = a_tags.find_all(
                                'p')[0].get_text().strip()
                        except Exception:
                            education_degree_name = None

                        try:
                            education_major = a_tags.find_all(
                                'p')[1].get_text().strip()
                        except Exception:
                            education_major = None

                        try:
                            education_year = a_tags.find_all(
                                'p')[2].get_text().strip()
                        except Exception:
                            education_year = None

                        education_list.append(
                            Education(education_name=education_name,
                                      degree_name=education_degree_name,
                                      major=education_major,
                                      year=education_year))

                    except Exception:
                        # Best effort: skip an education entry whose markup
                        # cannot be located
                        pass

            # Pad with empty entries when fewer than 3 educations were found
            for _ in range(3 - len(educations)):
                education_list.append(
                    Education(education_name=None,
                              degree_name=None,
                              major=None,
                              year=None))

            last_job = []
            # Parsing the job positions
            if len(job_positions) > 0:
                # Parse job positions to extract relative the data ranges
                job_positions_data_ranges = []
                # x is the index into the <ul> contents of the experience
                # section; it only advances when the entry's link was found.
                x = 1
                for job_position in job_positions:
                    # Get the date range of the job position
                    try:
                        date_range_element = job_position.find_element_by_class_name(
                            'pv-entity__date-range')
                        date_range_spans = date_range_element.find_elements_by_tag_name(
                            'span')
                        date_range = date_range_spans[1].text

                        job_positions_data_ranges.append(date_range)

                        # Locate the matching entry in the parsed html
                        exp_section = soup.find('section',
                                                {'id': 'experience-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Scraping of the Job - company_name, job_title
                        try:
                            last_job_company_name = a_tags.find_all(
                                'p')[1].get_text().strip()
                            last_job_title = a_tags.find(
                                'h3').get_text().strip()

                            spans = a_tags.find_all('span')
                        except Exception:
                            # Alternative markup: values live in <span>s
                            last_job_company_name = a_tags.find_all(
                                'span')[1].get_text().strip()
                            last_job_title = exp_section.find('ul').find(
                                'li').find_all('span')[2].get_text().strip()
                            spans = exp_section.find('ul').find('li').find_all(
                                'span')

                        last_job_company_name = last_job_company_name.replace(
                            'Full-time', '').replace('Part-time', '').strip()

                        # Scraping of the Job - location: the span following
                        # the one whose text is 'Location' holds the value
                        last_job_location = Location()
                        next_span_is_location = False
                        for span in spans:
                            if next_span_is_location:
                                last_job_location.parse_string(
                                    span.get_text().strip())
                                break
                            if span.get_text().strip() == 'Location':
                                next_span_is_location = True

                        last_job.append(
                            Job(
                                position=last_job_title,
                                company=Company(
                                    name=last_job_company_name,
                                ),
                                location=last_job_location))

                    except Exception:
                        # Keep one (empty) Job per position so the list
                        # length still matches the number of positions
                        last_job.append(
                            Job(
                                position=None,
                                company=Company(
                                    name=None,
                                ),
                                location=None))

                # Pad with empty entries when fewer than 4 positions were found
                for _ in range(4 - len(job_positions)):
                    last_job.append(
                        Job(
                            position=None,
                            company=Company(name=None),
                            location=None))

                # education_list can be empty when 3+ educations were found
                # but none could be parsed; do not let the debug print crash.
                first_education = education_list[0] if education_list else None

                print(
                    "profile_name {} \n headline {} \n location {} \n connection {} \n profile_desc {} \n email {} \n phone {} \n birthday {} \n connectedDate {} \n skills {} \n last_job {} \n last_job {} \n last_job {} \n last_job {} \n  education {} \n"
                    .format(profile_name, headline, location, connection,
                            profile_desc, email, phone, birthday,
                            connectedDate, skills, last_job[0], last_job[1],
                            last_job[2], last_job[3], first_education))

                return ScrapingResult(
                    Profile(
                        profile_name, headline, location, connection,
                        connectedDate, phone, birthday, profile_desc, email,
                        skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        education_list))

            else:
                return ScrapingResult(Profile(profile_name, email, skills))

        except HumanCheckException:

            # A manual check cannot be solved without a visible browser
            if self.headless_option:
                raise CannotProceedScrapingException

            linkedin_logout(self.browser)

            linkedin_login(self.browser,
                           self.config.get('linkedin', 'username'),
                           self.config.get('linkedin', 'password'))

            # Wait until the user has passed the check and reached the feed
            while self.browser.current_url != 'https://www.linkedin.com/feed/':
                message_to_user('Please execute manual check', self.config)
                time.sleep(30)

            # Retry the same profile once the session is valid again
            return self.scrap_profile(profile_linkedin_url,
                                      profile_known_graduation_date)
Exemple #10
0
# Fixed sample instance: (arrival_time, [(machine_id, duration), ...])
data2 = [(0, [(1, 5)]), (1, [(1, 7)]), (2, [(1, 6)])]

# Random instance: 5 jobs, each with an arrival time in 1..5 and three
# (machine_id in 1..3, duration in 1..5) tasks.
datas = []
for _ in range(5):
    job = (
        random.randint(1, 5),
        [(random.randint(1, 3), random.randint(1, 5)) for _ in range(3)],
    )
    datas.append(job)

data = datas

if __name__ == '__main__':
    # Task(machine_id, duration, order)
    jobs = [
        Job(id=number, arrival_time=arrival)
        for number, (arrival, _specs) in enumerate(data, start=1)
    ]

    # Attach the tasks once all jobs exist.
    for target, (_arrival, task_specs) in zip(jobs, data):
        for machine, length in task_specs:
            target.add_task(Task(machine_id=machine, duration=length))

    def simulate(jobs, rule):
        """Run the job shop for 50 time units under *rule* and plot it."""
        environment = simpy.Environment()
        simulator = JobShopSimulator(env=environment, jobs=jobs, rule=rule)
        simulator.run(until=50)
        simulator.plot()

    for rule_name in ('FIFO', 'LIFO', 'SPT'):
        simulate(jobs, rule_name)
Exemple #11
0
    def exec(
        self,
        input_path: str,
        output_path: str,
        aux_dir: str,
        additional_params: t.Optional[t.Dict[str, str]] = None,
        parallelize: bool = False,
        cluster_data_dir: t.Optional[str] = None,
        priority: int = 0,
        queue: str = "itaym",
        wait_until_complete: bool = False,
        get_completion_validator: bool = True,
    ) -> t.Optional[t.Union[float, str]]:
        """
        Run the program on the given input, either locally or as a cluster job.

        :param input_path: path to alignment file
        :param output_path: path in which the program should write its output
        :param aux_dir: directory in which auxiliary files should be generated by the job submission process
        :param additional_params: additional parameters unique to the program
        :param parallelize: boolean indicating whether execution of the program should be parallelized in the cluster or not
        :param cluster_data_dir: cluster directory that is mounted to the container data directory. Must be provided when parallelize is True
        :param priority: priority of the jobs
        :param queue: queue to submit the jobs to
        :param wait_until_complete: indicator whether the main program should wait until completion of all jobs (recommended: True)
        :param get_completion_validator: boolean indicating whether a validator file should be generated upon job completion (recommended: True)
        :return: None if the output already exists; otherwise either the duration of the command in minutes, if no parallelization was selected, or the value returned by the job submission (the path to the touch file that is used for validation of job completion) in case of parallelization
        :raises RuntimeError: if a command exits with a non-zero status during local execution
        """
        additional_args = dict()
        from .paml import Paml
        from .busted import Busted

        # Raw strings: "\." in a plain string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        if type(self) in (Paml, Busted):
            additional_args["input_tree_path"] = re.sub(
                r"\.fas[^.]*", "_tree.nwk", input_path)
        if type(self) is Paml:
            additional_args["control_file_path"] = re.sub(
                r"\.fas[^.]*", "_paml.ctl", input_path)
        command = self.set_command(
            input_path=input_path,
            output_path=output_path,
            additional_params=additional_params,
            parallelize=parallelize,
            cluster_data_dir=cluster_data_dir,
            **additional_args,
        )
        os.makedirs(aux_dir, exist_ok=True)

        if os.path.exists(output_path):
            logger.info(
                f"{self.name} output already exists at {output_path} and will not be generated again"
            )
            return None

        if not parallelize:
            start_time = time()
            # NOTE: the working directory of the process is changed here (and
            # by "cd" commands below) and is not restored afterwards.
            if type(self) is not Paml:
                os.chdir(
                    aux_dir
                )  # move to aux dir as rate4site generates extra files in current running directory
            for cmd in command:
                if "cd " in cmd:
                    os.chdir(cmd.replace("cd ", ""))
                else:
                    res = os.system(
                        f"{cmd} > /dev/null 2>&1"
                    )  # for some reason, rate4 prints some logs into the stderr,
                    # making the typical test (raise error if stderr > 0) invalid in this case
                    if res != 0:
                        raise RuntimeError(f"command {cmd} failed to execute.")
            end_time = time()
            return (end_time - start_time) / 60
        else:
            # Wrap the program commands with a cd into the cluster-side aux
            # directory and timestamps before and after.
            commands = ([
                f"cd {aux_dir.replace(os.environ['container_data_dir'], cluster_data_dir)}",
                """timestamp() {
                      date +"%T" # current time
                    }
                    timestamp""",
            ] + command + ["timestamp"])

            job = Job(
                name=self.name,
                sh_dir=aux_dir,
                output_dir=aux_dir,
                commands=commands,
                priority=priority,
                queue=queue,
            )
            completion_validator = job.submit(
                wait_until_complete=wait_until_complete,
                get_completion_validator=get_completion_validator,
            )
            return completion_validator
Exemple #12
0
    def parsing_jobs(self, job_positions):
        """Convert Selenium job-position elements into ``Job`` objects.

        For every element the date range, title, company name, company link
        and location are read (each falling back to ``"N/A"`` when the
        element is missing), company details are fetched through
        ``self.get_company_data`` and a ``Job`` is built. A position that
        still fails is reported on stdout and skipped.

        :param job_positions: iterable of Selenium elements, one per position
        :return: dict with ``'Jobs_array'`` (list of ``Job``) and
            ``'job_positions_data_ranges'`` (list of date-range strings)
        """
        job_positions_data_ranges = []
        # array of Jobs
        Jobs_array = []

        for job_position in job_positions:
            try:
                # get the date_range
                try:
                    date_range_element = job_position.find_element_by_class_name(
                        'pv-entity__date-range')
                    date_range_spans = date_range_element.find_elements_by_tag_name(
                        'span')
                    date_range = date_range_spans[1].text
                except (NoSuchElementException, IndexError):
                    # IndexError: the element exists but has fewer than two
                    # spans; treat it like a missing element instead of
                    # dropping the whole position.
                    date_range = "N/A"

                # get the title
                try:
                    title_range_element = job_position.find_element_by_tag_name(
                        'h3')
                    title = title_range_element.text
                except NoSuchElementException:
                    title = "N/A"

                # get the companyname
                try:
                    companyname_range_element = job_position.find_element_by_class_name(
                        'pv-entity__secondary-title')
                    companyname = companyname_range_element.text.replace(
                        'Full-time', '').replace('Part-time', '').strip()
                except NoSuchElementException:
                    companyname = "N/A"

                # get the link used to look up the company info
                try:
                    company_url_link = job_position.find_element_by_tag_name(
                        'a').get_attribute('href')
                except NoSuchElementException:
                    company_url_link = "N/A"

                # get the location shown on the position itself
                try:
                    companylocation_range_element = job_position.find_element_by_class_name(
                        'pv-entity__location')
                    companylocation_spans = companylocation_range_element.find_elements_by_tag_name(
                        'span')
                    companylocation = companylocation_spans[1].text
                except (NoSuchElementException, IndexError):
                    companylocation = "N/A"

                job_positions_data_ranges.append(date_range)
                info_company = self.get_company_data(company_url_link)
                try:
                    # Fill gaps in the company data from the position itself
                    if info_company['companyname'] == "N/A":
                        info_company['companyname'] = companyname
                    if info_company['location'].full_string == "N/A":
                        loc = Location()
                        loc.parse_string(companylocation)
                        info_company['location'] = loc
                except Exception:
                    print("Oops!", sys.exc_info()[0], "occured.")
                    print(info_company['industry'])
                    print(info_company['companyname'])
                    print(info_company['location'])

                trabajo_oo = Job(
                    position=title.strip(),
                    company=Company(name=info_company['companyname'].strip(),
                                    industry=info_company['industry'].strip()),
                    location=info_company['location'],
                    daterange=date_range.strip())
                Jobs_array.append(trabajo_oo)

            except Exception:
                # Best effort: report and skip this position. Catching
                # Exception (not a bare except) keeps KeyboardInterrupt and
                # SystemExit propagating.
                print("Oops!, \n{}\n{}\n{}\noccured.".format(
                    sys.exc_info()[0],
                    sys.exc_info()[1],
                    sys.exc_info()[2]))
                print("Job untacking error")

        return {
            'Jobs_array': Jobs_array,
            "job_positions_data_ranges": job_positions_data_ranges
        }
Exemple #13
0
def main():
    """Run one master round: partition metric names, dispatch, then reduce.

    The optional first command-line argument is used as ``start_with`` (empty
    string when absent). The function:

    1. writes all metric names for ``start_with`` into a file inside a
       temporary directory and splits that file into
       ``len(workers) * WORKER_JOB_NUM`` parts, one ``Job`` per part;
    2. starts the master server plus the request-registering, request-scanning
       and heartbeat loops, each on its own daemon thread;
    3. collects one result per job from ``master.results``, exiting the
       process with status 1 if no result arrives within
       ``MAX_JOB_INTERVAL``;
    4. asks every worker to shut down, stops the master server, and hands the
       results to the reducer, the TSDB writer and ``expand``.
    """
    start_with = sys.argv[1] if len(sys.argv) > 1 else ""
    workers = config.workers_url()
    start_time = time.time()
    log.info('master starts, start_with is "{}", workers are {}'.format(
        start_with, workers))
    with tempdir() as tmp_dir:
        all_metrics_file_name = 'all_metric_names.tmp'
        # Build the path once; it is needed for writing, splitting and expand.
        all_metrics_path = '{}/{}'.format(tmp_dir, all_metrics_file_name)
        with open(all_metrics_path, 'w') as all_metrics_file:
            get_all_metrics_into_file(start_with, all_metrics_file)
        parts = split_file_into_parts(file_path=all_metrics_path,
                                      num_parts=len(workers) * WORKER_JOB_NUM,
                                      output_dir=tmp_dir)
        log.info('partition finishes, all jobs are: {}'.format(parts))
        jobs = [Job(start_with, part) for part in parts]

        # master preparation
        master = MasterServer(workers, jobs)

        # setup threads; all are daemons so they never block process exit
        listening = Thread(target=master.server.serve_forever)
        listening.daemon = True
        sending = Thread(target=master.register_requests)
        sending.daemon = True
        checking = Thread(target=master.scan_requests)
        checking.daemon = True
        heartbeat = Thread(target=master.heartbeats)
        heartbeat.daemon = True
        listening.start()
        log.info(
            'master server starts up, listening on port {}'.format(PORT_NO))
        sending.start()
        checking.start()
        heartbeat.start()

        # waiting for results from workers
        results = []
        while len(results) < len(jobs):
            try:
                result = master.results.get(timeout=MAX_JOB_INTERVAL)
            except Empty:
                log.error('master waited too long for result, shutting down')
                # sys.exit instead of the site-provided exit() helper, which
                # is not guaranteed to exist (e.g. under ``python -S``).
                sys.exit(1)
            results.append(result)

        # all work done, shutdown servers
        for worker in workers:
            try:
                log.info('sending shutdown to worker {}'.format(worker))
                requests_retry_session(RETRY_NUM).get(worker + '/shutdown')
            except Exception as e:
                # Format the exception itself: ``e.message`` does not exist on
                # Python 3 exceptions and would raise inside this handler,
                # skipping the shutdown of the remaining workers.
                log.error(
                    'unable to stop worker {}, error message is {}'.format(
                        worker, e))
        master.server.shutdown()
        master.server.socket.close()
        log.info('master server shutdown, beginning aggregation')

        # start reducing phase
        merged, to_expands, total_number = reducer.start(results, start_with)
        put_to_tsdb(start_with, merged, to_expands)
        log.info(
            'one round master aggregation finished, to_expands are {}'.format(
                to_expands))
        expand(to_expands, tmp_dir, all_metrics_path, total_number)
        log.info('finished! total running time is {}'.format(time.time() -
                                                             start_time))
def exec_pipeline_on_simulations(input_path: click.Path):
    """Program to simulate multiple datasets and then submit pipeline jobs for each one.

    For example of the json format parameters, see data/test/simulation.json

    Workflow:

    1. Load the json parameters from ``input_path``, create the simulations
       output directory and configure logging (stdout plus a
       ``simulations.log`` file next to the input json).
    2. Reuse existing simulations when the output directory already holds
       exactly ``nrep`` entries; otherwise simulate the data.
    3. Submit one pipeline job per simulation directory that has no
       ``pipeline_on_simulated_data.touch`` marker yet, then poll once a
       minute until every submitted job's completion validator exists.
    4. Concatenate the per-replicate sample-overlap tables and plot them.
    5. For each program, aggregate the per-replicate summary tables, write
       ``<program>_aggregated_data.csv`` and plot error and bias figures.

    :param input_path: path to the json file holding the simulation
        parameters
    """

    # process input json file
    with open(input_path, "r") as input_file:
        simulation_params = json.load(input_file)
    os.makedirs(
        simulation_params["simulations_output_dir"],
        exist_ok=True,
    )

    # initialize the logger
    logging.basicConfig(
        level=logging.INFO,
        format=
        "%(asctime)s module: %(module)s function: %(funcName)s line: %(lineno)d %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(
                f"{os.path.dirname(input_path)}/simulations.log"),
        ],
    )
    logger = logging.getLogger(__name__)
    logger.info("Json input has been successfully processed")

    logger.info(f"Processing simulation input from {input_path}")
    simulation_input = SimulationInput(**simulation_params)
    logger.info("Json input has been successfully parsed as simulation input")

    output_dir = simulation_input.simulations_output_dir
    logger.info(f"Simulating data in {output_dir}")
    simulations_exist = False
    simulations_exec_complete = False
    repetitions_num = simulation_input.nrep
    # Compare the NUMBER of entries with nrep; comparing the listing itself
    # to an int is always False and forced a re-simulation on every run.
    if (os.path.exists(output_dir)
            and len(os.listdir(output_dir)) == repetitions_num):
        simulations_exist = True
        # The pipeline run is complete only if every entry has its marker.
        # simulation_params is a plain dict, so the directory must be read
        # from the parsed simulation_input object here.
        simulations_exec_complete = all(
            os.path.exists(
                f"{output_dir}/{path}/job_aux/pipeline_on_simulated_data.touch"
            ) for path in os.listdir(output_dir))

    if not simulations_exist:
        pipeline_input_json_paths = SimulationTools.simulate(
            simulation_input=simulation_input)
        simulations_dirs = [
            f"{os.path.dirname(json_path)}/"
            for json_path in pipeline_input_json_paths
        ]
        logger.info("Simulation is complete.")

    else:
        simulations_dirs = [
            f"{output_dir}/{path}/" for path in os.listdir(output_dir)
        ]

    if not simulations_exec_complete:
        logger.info("submitting pipeline jobs for the simulated data")
        completion_validators = []
        for simulations_dir in simulations_dirs:
            aux_dir = f"{simulations_dir}/job_aux/"
            json_path = f"{simulations_dir}/input.json"
            # skip replicates whose pipeline job has already finished
            if not os.path.exists(
                    f"{aux_dir}/pipeline_on_simulated_data.touch"):
                job = Job(
                    name="pipeline_on_simulated_data",
                    sh_dir=aux_dir,
                    output_dir=aux_dir,
                    commands=[
                        f"python /groups/itay_mayrose/halabikeren/down_sampling_analysis/src/main.py --input_path={json_path}"
                    ],
                    priority=simulation_params["priority"],
                    queue=simulation_params["queue"],
                )
                completion_validators.append(
                    job.submit(
                        wait_until_complete=False,
                        get_completion_validator=True,
                    ))
        logger.info("Job submission is complete")

        # wait for jobs to complete
        for validator in completion_validators:
            while not os.path.exists(validator):
                sleep(60)

    # analyze large scale results
    # Entries whose name contains "rep" are treated as replicate directories;
    # the listing is taken once, before any output file is written below.
    replicate_dirs = [
        path for path in os.listdir(output_dir) if "rep" in path
    ]
    overlap_dfs = []
    for path in replicate_dirs:
        overlap_df_path = f"{output_dir}/{path}/pipeline_dir/samples/samples_overlap.csv"
        overlap_df = pd.read_csv(overlap_df_path)
        overlap_df["replicate"] = path
        overlap_df["compared_methods"] = overlap_df["method_1"].str.cat(
            overlap_df[["method_2"]], sep=",")
        overlap_dfs.append(overlap_df)
    full_overlap_df = pd.concat(overlap_dfs)
    plot_large_scale_samples_overlap(
        df=full_overlap_df,
        output_path=f"{output_dir}/samples_overlap.svg",
    )

    for program in simulation_input.programs:
        data = []
        for path in replicate_dirs:
            df_path = f"{output_dir}/{path}/pipeline_dir/tables/{program}_summary.csv"
            try:
                rep_data = pd.read_csv(df_path)
                rep_data["replicate"] = path
                data.append(rep_data)
            except Exception as e:
                logger.error(
                    f"Failed to load dataframe from {df_path} due to error {e}"
                )
        if not data:
            # pd.concat raises ValueError on an empty list; load failures are
            # already tolerated above, so skip this program instead of
            # aborting the remaining ones.
            logger.error(
                f"No summary tables could be loaded for {program}, skipping its aggregation"
            )
            continue
        full_df = pd.concat(data)
        full_df["full_bias"] = full_df["result"] - full_df["full_result"]
        full_df["simulated_bias"] = full_df["result"] - full_df["simulated"]
        full_df_grouped = (full_df.groupby(
            ["replicate", "sampling_fraction",
             "sampling_method"]).mean().reset_index())
        full_df_grouped.to_csv(
            f"{simulation_params['simulations_output_dir']}/{program}_aggregated_data.csv"
        )

        # plot large scale data
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=f"{output_dir}/{program}_absolute_error.svg",
            use_relative_error=False,
        )
        plot_large_scale_error(
            df=full_df_grouped,
            output_path=f"{output_dir}/{program}_relative_error.svg",
            use_relative_error=True,
        )
        plot_large_scale_bias(
            df=full_df_grouped,
            output_path=f"{output_dir}/{program}_bias.svg",
        )