def save_job(content):
    job = Job()
    job.name = content['name']
    session.add(job)
    session.commit()
    # Save tasks & task queue
    tasks = content['tasks']
    for t in tasks:
        save_task(t, job.id)
    return 'ok'
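# `save_task` is called above but not shown. A minimal sketch of what it might
# look like, assuming a companion `Task` model with `name` and `job_id` columns
# (the field names are assumptions, not taken from the snippet above).
def save_task(content, job_id):
    task = Task()
    task.name = content.get('name')
    task.job_id = job_id  # link the task back to its parent job
    session.add(task)
    session.commit()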
def submit_job():
    session = Session()
    job = Job()
    job.entity_id = request.form['entity_id']
    job.person_id = request.form['person_id']
    job.type = request.form['type']
    session.add(job)
    session.commit()
    return redirect(url_for('item', id=request.form['entity_id']))
def save_job(page):
    for result in page.find_all('div', attrs={'class': 'result'}):
        job = result.find('a', attrs={'class': 'turnstileLink'})
        title = job.text
        url = site_url + job.get('href')
        company = result.find('span', attrs={'class': 'company'}).text
        location = result.find('span', attrs={'class': 'location'}).text
        summary = result.find('span', attrs={'class': 'summary'}).text
        Job.create(title=title, url=url, company=company,
                   location=location, summary=summary, applied=False)
    next_page(page)
def get(self):
    active = self.request.get('active', None)
    jobs = None
    if active == 'true':
        jobs = Job.query_whole_by_active(True)   # an iterator.
    elif active == 'false':
        jobs = Job.query_whole_by_active(False)  # an iterator.
    else:
        jobs = Job.query_whole()
    json_dict = {}
    json_dict['jobs'] = []
    for job in jobs:
        # we don't want to include date objects.
        json_dict['jobs'].append(job.to_dict(exclude=['add_date']))
    self.response.charset = 'utf-8'
    self.response.content_type = 'application/json'
    self.response.out.write(
        json.dumps(json_dict, ensure_ascii=False, indent=2,
                   sort_keys=True).encode('utf-8'))
def init(job):
    time.sleep(1)
    driver.get(job.url)
    if driver.find_elements_by_class_name("indeed-apply-button"):
        elem = driver.find_elements_by_class_name("indeed-apply-button")[0]
        elem.click()
        time.sleep(1)
        iframe = driver.find_element_by_xpath(
            "//*[@class='indeed-apply-bd']/iframe")
        driver.switch_to_frame(iframe)
        time.sleep(1)
        iframe = driver.find_element_by_xpath("/html/body/iframe")
        driver.switch_to_frame(iframe)
        complete_step_one(job)
        # driver.close()
    else:
        # No apply button on the page: mark this job as applied so it is
        # skipped on the next run.
        Job.update(applied=True).where(Job.id == job.id).execute()
def main():
    for job in Job.select().where(Job.applied == False):
        print(job.title)
        print(job.applied)
        print(job.url)
        try:
            init(job)
        except Exception:
            # Retry once if the first attempt fails (e.g. a transient WebDriver error).
            init(job)
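# The scraper, `init`, and `main` above all assume a peewee `Job` model.
# A minimal sketch consistent with the fields they actually use; the database
# backend, file name, and defaults are assumptions.
from peewee import Model, CharField, TextField, BooleanField, SqliteDatabase

db = SqliteDatabase('jobs.db')  # assumed backend

class Job(Model):
    title = CharField()
    url = CharField()
    company = CharField()
    location = CharField()
    summary = TextField()
    applied = BooleanField(default=False)

    class Meta:
        database = db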
def get(self, hash_id):
    jobs = Job.query_by_hash(str(hash_id))  # an iterator.
    json_dict = {}
    json_dict['jobs'] = []
    for job in jobs:
        # we don't want to include date objects.
        json_dict['jobs'].append(job.to_dict(exclude=['add_date']))
    self.response.charset = 'utf-8'
    self.response.content_type = 'application/json'
    self.response.out.write(
        json.dumps(json_dict, ensure_ascii=False, indent=2,
                   sort_keys=True).encode('utf-8'))
def scrape_jobs(workspace, job_dict, session, api, result):
    log.debug(f"Scraping job, id: {job_dict['job_id']}")
    settings = job_dict.get("settings", {})
    job = Job(
        job_id=job_dict["job_id"],
        created_time=to_time(job_dict["created_time"]),
        name=settings["name"],
        workspace_id=workspace.id,
        max_concurrent_runs=settings["max_concurrent_runs"],
        timeout_seconds=settings["timeout_seconds"],
        email_notifications=settings["email_notifications"],
        # TODO: determine how we should handle the diff between new/existing
        # clusters
        new_cluster=(settings.get(
            "new_cluster", {"cluster_id": settings.get("existing_cluster_id")})),
        schedule_quartz_cron_expression=(settings.get(
            "schedule", {}).get("quartz_cron_expression")),
        schedule_timezone_id=settings.get("schedule", {}).get("timezone_id"),
        task_type=get_task_type(settings),
        notebook_path=settings.get("notebook_task", {}).get("notebook_path"),
        notebook_revision_timestamp=(settings.get(
            "notebook_task", {}).get("revision_timestamp")),
    )
    if "creator_user_name" in job_dict:
        job.creator_user_name = job_dict.get("creator_user_name")
    session.merge(job)
    result.num_jobs += 1

    job_runs_response = api.jobs.list_runs(job_id=job_dict["job_id"], limit=120)
    job_runs = job_runs_response.get("runs", [])
    log.debug(f"Scraping job runs for job_id: {job_dict['job_id']}")
    for job_run in job_runs:
        scrape_job_run(workspace, job_run, session, result)
    log.debug(f"Finished job_run scraping for job_id: {job_dict['job_id']}. "
              f"Runs scraped: {len(job_runs)}")
def post_job():
    '''Users must provide their username and password in the payload in order
    to post a job. They also need to specify the payout amount, which cannot
    exceed the amount in their account.'''
    try:
        data = request.form
        username = data['username']
        password = data['password']
        user = User.query.filter((User.username == username) &
                                 (User.password == password)).first()
        if user is None:
            raise ValueError("Invalid username or password")
        account = Account.query.filter(Account.id == user.account_id).first()
        est_hours = float(data['est_hours'])
        payout = float(data['payout'])
        if account.balance < payout:
            raise ValueError("Account balance is less than payout")
        account.balance -= payout

        job_files = request.files['job_files']
        if job_files:
            # Reserve a unique file name inside JOB_DIR, then save the upload to it.
            handle, placeholder = tempfile.mkstemp(dir=app.config['JOB_DIR'])
            os.close(handle)
            os.remove(placeholder)
            job_files.save(placeholder)

            newJob = Job(id=int(hash(placeholder)), files=placeholder,
                         est_hours=est_hours, payout=payout)
            db.session.add(newJob)
            db.session.commit()

        response = jsonify({'Status': 'Success'})
        response.status_code = 200
        return response
    except (ValueError, KeyError) as ex:
        response = jsonify({'Error': str(ex)})
        response.status_code = 400
        return response
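# A hedged example of how a client might call the handler above, assuming it is
# routed at POST /jobs on a local dev server (the URL is an assumption; the
# form and file field names come from the handler itself).
import requests

resp = requests.post(
    'http://localhost:5000/jobs',  # assumed route
    data={'username': 'alice', 'password': 'secret',
          'est_hours': '3.5', 'payout': '40.0'},
    files={'job_files': open('work.zip', 'rb')},
)
print(resp.status_code, resp.json())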
def post(self, hash_id):
    jsonstring = self.request.body
    payload = None
    try:
        payload = json.loads(jsonstring)
        if payload['success'] == True:
            if not payload['response']:
                raise Exception('need response - a json object')
        elif payload['success'] == False:
            if not (payload['will_retry'] or payload['fail_reason']):
                raise Exception('need will_retry, fail_reason')
        else:
            raise Exception('success parameter is either boolean true or false')
    except Exception as ex:
        self.error(400)
        self.response.out.write('Your Data Error, ' + str(ex))
        return

    # step 1, get the job in the queue, but if not found, error 404
    try:
        jobs = list(Job.query_by_hash(str(hash_id)))  # materialise the iterator.
        if len(jobs):
            for each in jobs:
                # modify the job status according to the request
                # step 2, stuff the job status with new data here.
                if payload['success']:
                    each.success = True
                    each.will_retry = False
                    each.response = payload['response']
                else:
                    each.success = False
                    each.will_retry = payload['will_retry']
                    each.fail_reason = payload['fail_reason']
                    # add one to the failure times
                    each.fail_times = each.fail_times + 1
                each.put()  # store it into the database
                self.response.out.write(each.public_hash_id)
        else:
            self.error(404)
            self.response.out.write('Job Not Found')
    except Exception as ex:
        self.error(500)
        self.response.out.write('Database Query Error ' + str(ex))
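# The three handlers above assume an App Engine ndb `Job` model exposing
# `query_by_hash`, `query_whole`, and `query_whole_by_active`. A minimal sketch
# consistent with the fields the handlers touch; the property types and the
# `active` flag behind `query_whole_by_active` are assumptions.
from google.appengine.ext import ndb

class Job(ndb.Model):
    public_hash_id = ndb.StringProperty()
    active = ndb.BooleanProperty(default=True)   # assumed flag
    success = ndb.BooleanProperty(default=False)
    will_retry = ndb.BooleanProperty(default=True)
    response = ndb.JsonProperty()
    fail_reason = ndb.StringProperty()
    fail_times = ndb.IntegerProperty(default=0)
    add_date = ndb.DateTimeProperty(auto_now_add=True)

    @classmethod
    def query_by_hash(cls, hash_id):
        return cls.query(cls.public_hash_id == hash_id)

    @classmethod
    def query_whole(cls):
        return cls.query()

    @classmethod
    def query_whole_by_active(cls, active):
        return cls.query(cls.active == active)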
def scrape_jobs(workspace, job_dict, session, api, result):
    log.debug(f"Scraping job, id: {job_dict['job_id']}")
    settings = job_dict.get("settings", {})
    job = Job(
        job_id=job_dict["job_id"],
        created_time=to_time(job_dict["created_time"]),
        name=settings.get("name", "Untitled"),
        workspace_id=workspace.id,
        max_concurrent_runs=settings.get("max_concurrent_runs", 1),
        timeout_seconds=settings.get("timeout_seconds"),
        email_notifications=settings.get("email_notifications", []),
        new_cluster=settings.get("new_cluster"),
        existing_cluster_id=settings.get("existing_cluster_id"),
        task_type=get_task_type(settings),
        task_parameters=settings.get(get_task_type(settings).lower(), {})
    )
    if "creator_user_name" in job_dict:
        job.creator_user_name = job_dict.get("creator_user_name")
    if "schedule" in settings:
        schedule = settings.get("schedule", {})
        job.schedule_quartz_cron_expression = schedule.get("quartz_cron_expression")
        job.schedule_timezone_id = schedule.get("timezone_id")
    if job.task_type == 'notebook':
        task = settings.get("notebook_task", {})
        job.notebook_path = task.get("notebook_path")
        job.notebook_revision_timestamp = task.get("revision_timestamp")
    session.merge(job)
    result.num_jobs += 1

    job_runs = query_paginated(api.jobs.list_runs,
                               {'job_id': job_dict["job_id"]}, 'runs')
    log.debug(f"Scraping job runs for job_id: {job_dict['job_id']}")
    for job_run in job_runs:
        scrape_job_run(workspace, job_run, session, result)
    log.debug(f"Finished job_run scraping for job_id: {job_dict['job_id']}. "
              f"Runs scraped: {len(job_runs)}")
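# `query_paginated` is not shown above. A minimal sketch of what it might do,
# assuming the list call accepts `offset`/`limit` keyword arguments and returns
# a `has_more` flag alongside the named result list; the helper name and
# signature are taken from the call site only.
def query_paginated(list_call, params, result_key, page_size=100):
    items = []
    offset = 0
    while True:
        response = list_call(offset=offset, limit=page_size, **params)
        items.extend(response.get(result_key, []))
        if not response.get("has_more"):
            break
        offset += page_size
    return items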
def save_to_db(self):
    Job.clear()
    for i in self.queue:
        job = Job(i[0], i[1], i[2])
        session.add(job)
    session.commit()
def send_task_2_worker(task_id):
    """
    Scheduled-task entry point: splits a task into per-account jobs and sends
    each job to the most suitable queue.
    :param task_id: task id
    :return: True on success, False on failure
    """
    try:
        jobs = []
        time_it_beg = datetime.datetime.now()
        db_scoped_session = ScopedSession()
        task = db_scoped_session.query(
            Task.category, Task.configure, Task.limit_counts,
            Task.succeed_counts, Task.scheduler).filter(Task.id == task_id).first()
        if not task:
            logger.error(
                'send_task_2_worker can not find the task, id={}. '.format(task_id))
            return False
        category, task_configure, limit_counts, succeed_counts, sch_id = task
        sch_mode = db_scoped_session.query(
            Scheduler.mode).filter(Scheduler.id == sch_id).first()

        # For periodic tasks the number of jobs produced per round is strictly
        # limited; for one-off tasks every account the user specified is used.
        task_running_jobs = 0
        if sch_mode[0] in [1, 2]:
            if limit_counts:
                # If the task's success count already exceeds the requested count,
                # or success plus running jobs exceeds 120% of it, stop producing jobs.
                if succeed_counts >= int(limit_counts * 1.2):
                    logger.warning(
                        'send_task_2_worker ignore, task already finished, task id={}, '
                        'succeed jobs({}) >= limit counts({})*1.2'
                        .format(task_id, succeed_counts, limit_counts))
                    return True
                task_running_jobs = db_scoped_session.query(Job).filter(
                    and_(Job.task == task_id, Job.status == 'running')).count()
                if task_running_jobs + succeed_counts >= int(limit_counts * 1.2):
                    logger.warning(
                        'send_task_2_worker ignore, task will finish, task id={}, '
                        'succeed jobs({})+running jobs({}) >= limit counts({})*1.2'
                        .format(task_id, succeed_counts, task_running_jobs, limit_counts))
                    return True
                # If too many running jobs have piled up for one task, stop
                # producing new jobs for now.
                if task_running_jobs >= 10000:
                    logger.warning(
                        'task({}) jobs num={} has reached jobs limit 10000'
                        .format(task_id, task_running_jobs))
                    return True

        # Find the processing function that corresponds to the task's category.
        tcg = db_scoped_session.query(TaskCategory.processor).filter(
            TaskCategory.category == category).first()
        if not tcg:
            return False
        # Every task category maps to one processor.
        task_processor = tcg[0]
        if not task_processor:
            logger.error(
                'Task(id={}) have no processor, ignore processing.'.format(task_id))
            return False

        logger.info('---------send_task_2_worker task id={}. --------'.format(task_id))

        # Find all accounts attached to the task.
        res = db_scoped_session.query(TaskAccountGroup.account_id).filter(
            TaskAccountGroup.task_id == task_id).all()
        account_ids = [x[0] for x in res]
        accounts = db_scoped_session.query(
            Account.id, Account.status, Account.account, Account.password,
            Account.email, Account.email_pwd, Account.gender,
            Account.phone_number, Account.birthday, Account.national_id,
            Account.name, Account.active_area, Account.active_browser,
            Account.profile_path, Account.configure).filter(
                Account.id.in_(account_ids)).all()
        # agents = db_scoped_session.query(Agent.id, Agent.active_area).filter(Agent.status != -1).order_by(Agent.status).all()

        # A task can span several accounts; split it into one job per account.
        real_accounts_num = 0
        for acc in accounts:
            acc_id, status, account, password, email, email_pwd, gender, phone_number, \
                birthday, national_id, name, active_area, active_browser_id, \
                profile_path, account_configure = acc
            if status == 'invalid':
                logger.warning(
                    'account status in invalid. task id={}, account id={}'
                    .format(task_id, acc_id))
                continue

            area = db_scoped_session.query(Area).filter(Area.id == active_area).first()
            queue_name = 'default'
            area_id = None
            if area:
                area_id, queue_name = area.id, area.name
            else:
                logger.warning(
                    'There have no optimal agent for task, task id={}, account id={}, account area={}'
                    .format(task_id, acc_id, active_area))
            active_browser = db_scoped_session.query(FingerPrint.value).filter(
                FingerPrint.id == active_browser_id).first()

            if get_system_args()["force_display"] == 0:
                headless = get_environment() == 'pro'
            else:
                headless = False

            # Build the parameters required to execute the task.
            inputs = {
                'system': {
                    'headless': headless
                },
                'task': {
                    'task_id': task_id,
                    'configure': json.loads(task_configure) if task_configure else {},
                },
                'account': {
                    'account': account,
                    'password': password,
                    'status': status,
                    'email': email,
                    'email_pwd': email_pwd,
                    'gender': gender,
                    'phone_number': phone_number,
                    'birthday': birthday,
                    'national_id': national_id,
                    'name': name,
                    'active_area': active_area,
                    'active_browser': json.loads(active_browser[0]) if active_browser else {},
                    'profile_path': profile_path,
                    'configure': json.loads(account_configure) if account_configure else {}
                }
            }

            celery_task_name = "tasks.tasks.{}".format(task_processor)
            real_accounts_num += 1
            track = app.send_task(celery_task_name, args=(inputs, ),
                                  queue=queue_name, routing_key=queue_name)
            logger.info(
                '-----send sub task to worker, celery task name={}, area id={}, queue={}, '
                'task id={}, account id={}, track id={}'.format(
                    celery_task_name, area_id, queue_name, task_id, acc_id, track.id))

            job = Job()
            job.task = task_id
            job.account = acc_id
            job.area = area_id
            job.status = 'running'
            job.track_id = track.id
            job.start_time = datetime.datetime.now()
            jobs.append(job)

            if sch_mode[0] in [1, 2]:
                # If the jobs already running plus the jobs produced in this round
                # exceed the user's requested count, stop producing jobs and
                # re-check on the next scheduling cycle.
                total_running_jobs = task_running_jobs + real_accounts_num
                if (limit_counts and total_running_jobs >= int(limit_counts * 1.2)) \
                        or total_running_jobs >= 10000:
                    logger.warning(
                        'task({}) total running jobs num({}) is already more than limit counts({})*1.2'
                        .format(task_id, total_running_jobs, limit_counts))
                    break

        # Mark the task as running. The number of accounts actually usable by the
        # task can change between rounds as account statuses change.
        db_scoped_session.query(Task).filter(
            and_(Task.id == task_id, Task.status.in_(['new', 'pending'])))\
            .update({Task.status: "running",
                     Task.start_time: datetime.datetime.now(),
                     Task.real_accounts_num: real_accounts_num,
                     Task.last_update: datetime.datetime.now()},
                    synchronize_session=False)

        if jobs:
            db_scoped_session.add_all(jobs)
        db_scoped_session.commit()
        logger.info(
            '----send_task_2_worker send task {}, produce jobs={}, used {} seconds. '
            .format(task_id, real_accounts_num,
                    (datetime.datetime.now() - time_it_beg).seconds))
    except BaseException as e:
        logger.exception(
            'send_task_2_worker exception task id={}, e={}'.format(task_id, e))
        db_scoped_session.rollback()
    finally:
        ScopedSession.remove()
    return True
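# `send_task_2_worker` is described above as a scheduled entry point. A hedged
# sketch of how it might be wired to Celery beat, assuming the same Celery `app`
# used by `app.send_task`; the driver task, query filter, and 5-minute interval
# are illustrative assumptions only.
from celery.schedules import crontab

@app.task
def dispatch_pending_tasks():
    # Hypothetical driver: fan out every runnable task to the workers.
    session = ScopedSession()
    try:
        task_ids = [t[0] for t in session.query(Task.id).filter(
            Task.status.in_(['new', 'pending', 'running'])).all()]
    finally:
        ScopedSession.remove()
    for task_id in task_ids:
        send_task_2_worker(task_id)

app.conf.beat_schedule = {
    'dispatch-pending-tasks': {
        'task': dispatch_pending_tasks.name,
        'schedule': crontab(minute='*/5'),  # every 5 minutes (assumed)
    },
}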