def init_validator():
    """Ensure the validator thread is running.

    Checks whether the class-level validator thread is alive and, if not,
    creates and starts a fresh thread running ``__process_method``.

    NOTE(review): not guarded by a lock — two concurrent callers could each
    observe a dead thread and start two validators; confirm callers are
    single-threaded or add a lock.
    """
    if not ValidatorThreadHandler.__validator_thread.is_alive():
        # A finished Thread object cannot be restarted; build a new one.
        # (Dropped the redundant print() that duplicated this log line.)
        logger.info("Creating validator thread")
        ValidatorThreadHandler.__validator_thread = threading.Thread(
            target=ValidatorThreadHandler.__process_method)
        ValidatorThreadHandler.__validator_thread.start()
def _reprocess_image(queue: Queue) -> None:
    """Worker loop: drain *queue* of image records, converting each source
    file to a tiled pyramidal TIFF, uploading the results, and updating the
    catalog entry via GraphQL.

    Intended to run in several threads sharing one queue (see
    process_image_changes); each thread exits once the queue is empty.
    """
    global stats  # NOTE(review): `stats` is never referenced below — confirm it is needed.
    while not queue.empty():
        img_data = queue.get()
        # Output keeps the source basename but with a .tif extension.
        img_data["filePath"] = f"{os.path.splitext(img_data['filePath'])[0]}.tif"
        tif_filename = os.path.basename(img_data["filePath"])
        local_file = f"TEMP_{os.path.basename(img_data['id'])}"
        logger.info(f"Processing {img_data['id']}")
        if _download_source_file(img_data, local_file):
            image = _preprocess_image(img_data, local_file)
            if image:
                # Save as a tiled, pyramidal TIFF using the configured
                # compression, tile geometry and DPI.
                image.tiffsave(tif_filename, tile=True, pyramid=True,
                               compression=config.COMPRESSION_TYPE,
                               tile_width=config.PYTIF_TILE_WIDTH,
                               tile_height=config.PYTIF_TILE_HEIGHT,
                               xres=config.DPI_VALUE, yres=config.DPI_VALUE)  # noqa
                # Reload the generated TIFF to get its final dimensions.
                new_tiff = Image.tiffload(tif_filename)
                _upload_files(img_data, local_file, tif_filename)
                gql.update_item(img_data['id'], new_tiff.height, new_tiff.width)
                # Clean up both temporary files only after a successful upload.
                os.remove(tif_filename)
                os.remove(local_file)
                logger.info(f'Completed {local_file}')
        else:
            # Source could not be downloaded: drop the catalog item and
            # record a download error.
            gql.remove_missing_item(img_data['id'])
            Statistic.download_err(img_data)
        Statistic.attempted()
        queue.task_done()
def __rule_1_after(msg):
    """Decide, once the CRCL reply is back, whether the incident is spam.

    Returns True/False, or None when the required fields cannot be read
    from the message / processed-message store.
    """
    try:
        report_id = msg['body']['reportID']
        report_type = shared.processed_mgs[report_id]['inc'][
            'incidentType']
        precipitation = msg['body']['precipitation']
    except (KeyError, TypeError, ValueError, IndexError) as e:
        logger.info(
            "Cannot load reportID / report type/ precipitation from processed messages"
        )
        logger.debug(str(type(e)) + str(e))
        return
    logger.info("Validating 803 with type: " + str(report_type) +
                " and precipitation: " + str(round(precipitation, 2)) +
                " ID: " + str(report_id))
    # A precipitation-style report with (near-)zero measured precipitation
    # is classified as spam.
    rain_types = ('Precipitation', 'HeavyPrecipitation',
                  'Heavy Precipitation', 'Blizzard')
    return precipitation < .1 and report_type in rain_types
def tournament(url: str, name: str) -> int: s = fetch_tools.fetch(url, character_encoding='utf-8', retry=True) # Tournament details soup = BeautifulSoup(s, 'html.parser') cell = soup.find('div', {'id': 'EventReport'}).find_all('td')[1] name = cell.find('a').string.strip() day_s = cell.find('br').next.strip() if '-0001' in day_s: # Tournament has been incorrectly configured. return 0 dt, competition_series = get_dt_and_series(name, day_s) top_n = find_top_n(soup) if top_n == competition.Top.NONE: # Tournament is in progress. logger.info('Skipping an in-progress tournament.') return 0 db().begin('tournament') competition_id = competition.get_or_insert_competition(dt, dt, name, competition_series, url, top_n) ranks = rankings(soup) medals = medal_winners(s) final = finishes(medals, ranks) n = add_decks(dt, competition_id, final, s) db().commit('tournament') return n
def message_to_queue(message):
    """Parse a raw bus payload as JSON, enqueue it, and wake the validator.

    Invalid JSON is logged and dropped.
    """
    try:
        parsed = json.loads(message)
    except json.decoder.JSONDecodeError as e:
        logger.warning("message from bus is not a valid json: " + str(e))
        logger.debug(message)
        return
    message_queue.MessageQueue.put_message(parsed)
    logger.info("Message arrived from bus and inserted in queue")
    logger.debug(json.dumps(parsed))
    # Make sure the consumer thread is alive to process the new message.
    ValidatorThreadHandler.init_validator()
def validate_803(msg):
    """Find the stored incident referenced by a TOP803 reply and continue
    its validation using the weather information the reply carries."""
    logger.info("Message TOP803 is processed.")
    logger.debug("TOP803 message: " + str(msg))
    report_id = msg['body']['reportID']
    if report_id not in shared.processed_mgs:
        logger.warning(
            "Message TOP803 does not correspond to a stored report. ID: " +
            str(report_id))
        return
    Validator.__incident_spam(report_id, Validator.__rule_1_after(msg))
def process_image_changes(data: list):
    """Fan the supplied image records out to worker threads and block until
    every queued record has been processed, logging elapsed time at the end."""
    jobs = Queue()
    for img_data in data:
        jobs.put(img_data)
    logger.info(f"{jobs.qsize()} IMAGES TO PROCESS")
    start_time = time.time()
    # Workers drain the shared queue; each exits when it sees it empty.
    for _ in range(config.MAX_THREADS):
        threading.Thread(target=_reprocess_image, args=(jobs,)).start()
    jobs.join()
    elapsed_time = time.time() - start_time
    Statistic.summary()
    logger.info(f"ELAPSED TIME = {elapsed_time} seconds")
def __incident_spam(reportID, spam):
    """Report the spam verdict for *reportID* to KBS via a TOP801 message.

    A TOP801 is sent whether or not the report was judged spam, so KBS
    always knows the validation step ran.
    """
    verdict = '' if spam else 'NOT '
    logger.info("Message IS " + verdict + "SPAM! " +
                " Passed validation successfully. ID: " + str(reportID))
    msg = Validator.generate_TOP801(reportID, spam)
    if msg is None:
        logger.warning("TOP801 was not generated correctly")
        return
    Validator.bus_prod.send(topic=msg['header']['topicName'],
                            message=json.dumps(msg))
def send(self, topic, message):
    """Produce *message* on *topic* and flush; return True on success.

    Returns False when the underlying producer raises, so callers can
    detect delivery failure without handling the exception themselves.
    """
    logger.info("Sending: " + str(topic))
    logger.debug("Sending: " + str(topic) + ": " + str(message))
    # Produce and flush message to bus
    try:
        # Flushing per message makes failures surface here rather than later.
        self.producer.produce(topic, message.encode('utf-8'), 'key', -1,
                              self.on_delivery)
        self.producer.flush()
    except Exception as err:
        # Was print(); route the failure through the module logger like the
        # rest of the bus code does.
        logger.error('Sending data failed')
        logger.debug(str(type(err)) + str(err))
        return False
    return True
def image(c: str = '') -> wrappers.Response:
    """Serve the downloaded image for the '|'-separated card names in *c*.

    Unknown single names are redirected to Scryfall; otherwise an empty
    400 response is returned.
    """
    names = c.split('|')
    try:
        cards = oracle.load_cards(names)
        path = image_fetcher.download_image(cards)
        if path is None:
            raise InternalServerError(f'Failed to get image for {c}')
        # Send abspath to work around monolith root versus web root.
        return send_file(os.path.abspath(path))
    except TooFewItemsException as e:
        logger.info(f'Did not find an image for {c}: {e}')
        if len(names) != 1:
            return make_response('', 400)
        return redirect(
            f'https://api.scryfall.com/cards/named?exact={c}&format=image',
            code=303)
def main(start=0, end=None):
    """Replay the canned TOP030 messages from VAL_TOP030.json onto the bus.

    Messages with 1-based index < *start* are skipped; iteration stops after
    *end* messages have been seen (default: all).  Header and incident
    timestamps are refreshed to 'now' so the replayed traffic looks current,
    and a random delay of up to `max_delay` seconds separates messages.
    """
    import random
    from datetime import datetime
    from bus_communication import bus_producer
    with open("VAL_TOP030.json", 'r') as f:
        top030 = json.load(f)
    bp = bus_producer.BusProducer()
    max_delay = 1  # delay in the range [0, max_delay] from uniform distribution
    if end is None:
        end = len(top030)
    count = 0
    for m in top030:
        if count >= end:
            break
        count += 1
        if count < start:
            continue
        logger.info("sending message 30 to bus : " + str(count))
        # Best-effort timestamp refresh; the message may lack these fields.
        # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        try:
            m['header']['sentUTC'] = datetime.utcnow().isoformat().split(
                ".")[0] + 'Z'
        except Exception:
            pass
        try:
            if 'incidents' in m['body']:
                for inc in m['body']['incidents']:
                    inc['timestamp'] = datetime.utcnow().isoformat().split(
                        ".")[0] + 'Z'
        except Exception:
            pass
        bp.send(topic=m['header']['topicName'], message=json.dumps(m))
        time.sleep(random.random() * max_delay)
def validate(message):
    """Dispatch an incoming bus message to the TOP030 or TOP803 handler
    according to its header topic name; unknown topics are logged."""
    print("Thread id in validator: " + str(threading.get_ident()))
    try:
        inc_topic = message['header']['topicName']
    except (KeyError, TypeError, ValueError, IndexError) as e:
        logger.warning("could not read topicName from message. Do nothing")
        logger.debug(e)
        logger.debug(message)
        return
    logger.info("Message is now processed. TOPIC: " + str(inc_topic))
    handlers = {
        'TOP030_REPORT_REQUESTED': Validator.validate_TOP030,
        'TOP803_WEATHER_REPORT': Validator.validate_803,
    }
    handler = handlers.get(inc_topic)
    if handler is not None:
        handler(message)
    else:
        logger.warning(
            "Message read in validator is not TOP030 nor TOP803")
def comment(job_id):
    """Attach a user-submitted comment to the job identified by *job_id*.

    The request body is JSON carrying 'text', 'date', 'salary' and a 0-5
    'rating'; the rating is stored normalized to 0-1.
    """
    data = json.loads(flask.request.data)
    comment_text = data['text']
    comment_date = dateutil.parser.parse(data['date'])
    salary = data['salary'] or 0
    # Normalize the 0-5 star rating to 0-1; `or 0` maps 0.0 to int 0.
    rating = (float(data['rating']) / 5) or 0
    if job_id is not None and comment_text:
        job = Job.objects(id=job_id).first()
        if not job:
            return render_template('404.html')
        logger.info(COMPONENT, 'Adding comment for job: {}'.format(job_id))
        job.update(push__comments=Comment(
            comment=comment_text,
            date=comment_date,
            salary=salary,
            crawled=False,
            rating=AggregateRating(rating=rating, count=1)))
    return json.dumps({'success': True}), 200, {'ContentType': 'application/json'}
def listen(self, performed_action, topics=None):
    """Subscribe to *topics* (default: self.default_topics) and call
    *performed_action*(message_text) for each received message until
    self.listening is cleared; returns False if subscribing fails.
    """
    # Topics should be a list of topic names e.g. ['topic1', 'topic2']
    if topics is None:
        topics = self.default_topics
    self.listening = True
    # Subscribe to topics
    try:
        self.consumer.subscribe(topics)
    except Exception as e:
        logger.error("Error @ BusConsumer.listen()")
        logger.debug(str(type(e)) + str(e))
        return False
    logger.info("listener subscribed successfully to topics:" + str(topics))
    # Initiate a loop for continuous listening
    while self.listening:
        msg = self.consumer.poll(0)
        # If a message is received and it is not an error message
        if msg is not None and msg.error() is None:
            # Add incoming message to requests database
            try:
                message_text = msg.value().decode('utf-8')
            except (UnicodeDecodeError, AttributeError):
                # Was a bare except; only fall back to the raw payload when
                # it is not valid UTF-8 (or not a bytes object at all).
                message_text = msg.value()
            performed_action(message_text)
        # TODO: check if it works ok with the sleep .5
        time.sleep(0.5)
    # Unsubscribe and close consumer
    self.consumer.unsubscribe()
    self.consumer.close()
def import_comment(**kwargs):
    """Import comment from RateMyCoopJob.

    Keyword arguments:
    employer_name -- Employer name
    job_title -- Title of job
    comments: -- Array of comments
        comment -- Comment
        comment_date -- Date comment was submitted. Note: in non-standard form
            such as: 5 years ago, 3 weeks ago etc
        salary -- Job salary (hourly)
        rating -- Job rating out of 5 (1 - 5 stars on ratemycoopjob)
    """
    employer_name = kwargs['employer_name'].lower()
    job_title = kwargs['job_title'].lower()
    # If employer alias exists (ex. Research in motion -> Blackberry), use instead
    if employer_name in employer_alias.aliases:
        employer_name = employer_alias.aliases[employer_name].lower()
    # If employer does not exist
    # (quoted search_text => exact-phrase match on the employer name)
    if not Employer.objects.search_text(
            "\"{}\"".format(employer_name)).count() > 0:
        logger.info(
            COMPONENT,
            'Employer: {} does not exist, ignoring..'.format(employer_name))
        return
    logger.info(
        COMPONENT, 'Importing comments for job: {} from employer: {}'.format(
            job_title, employer_name))
    employer = Employer.objects.search_text(
        "\"{}\"".format(employer_name)).no_dereference().first()
    # Iterate through all comments
    for index, comment_obj in enumerate(kwargs['comments']):
        comment = comment_obj['comment']
        comment_date = _get_comment_date(comment_obj['comment_date'])
        salary = float(comment_obj['salary'])
        # Normalize the 0-5 star rating to 0-1.
        rating = float(comment_obj['rating']) / 5
        # If job does not exist add to employer
        if not employer.job_exists(job_title):
            # Deduplicate against the employer-level comment list.
            if employer.comment_exists(comment=comment,
                                       date=comment_date,
                                       salary=salary,
                                       rating=rating):
                logger.info(
                    COMPONENT,
                    'Comment: {} already exists for employer: {}, ignoring'.
                    format(index, employer_name))
            else:
                logger.info(
                    COMPONENT, 'Adding comment: {} to employer: {}'.format(
                        index, employer_name))
                new_comment = Comment(comment=comment,
                                      date=comment_date,
                                      salary=salary,
                                      crawled=True,
                                      rating=AggregateRating(rating=rating,
                                                             count=1))
                employer.update(push__comments=new_comment)
        # Job already exists
        else:
            job = Job.objects(id__in=[job.id for job in employer.jobs],
                              title=job_title).first()
            # Deduplicate against the job-level comment list.
            if job.comment_exists(comment=comment,
                                  date=comment_date,
                                  salary=salary,
                                  rating=rating):
                logger.info(
                    COMPONENT,
                    'Comment: {} already exists for job: {} for employer: {}, ignoring'
                    .format(index, job_title, employer_name))
            else:
                logger.info(
                    COMPONENT, 'Adding comment: {} for job: {} from {}'.format(
                        index, job_title, employer_name))
                new_comment = Comment(comment=comment,
                                      date=comment_date,
                                      salary=salary,
                                      crawled=True,
                                      rating=AggregateRating(rating=rating,
                                                             count=1))
                job.update(push__comments=new_comment)
def import_job(**kwargs):
    """Import job.

    Keyword arguments:
    employer_name -- Employer name
    job_title -- Title of job
    summary -- Job summary
    year -- Year the job was advertised
    term -- Term job was advertised [Fall, Winter, Spring]
    location -- Location job was advertised
    openings -- Number of job openings
    remaining -- Number of job openings remaining
    applicants -- Number of applicants job has (Optional)
    levels -- Levels job is intended for [Junior, Intermediate, Senior]
    programs -- Programs the job is specified for
    url -- URL of job
    date -- Date job was crawled (useful for knowing exactly # of applicants at what time)
    index -- Boolean to indicate whether to index or not (default False)

    Raises DataIntegrityError when an existing job would be re-advertised
    with an earlier year.
    """
    employer_name = kwargs['employer_name'].lower()
    job_title = kwargs['job_title'].lower()
    term = kwargs['term']

    # Map raw level / program strings onto known enums, logging the misses.
    levels = []
    for level in kwargs['levels']:
        uw_level = Term.get_level(level)
        if uw_level:
            levels.append(uw_level)
        else:
            logger.error(COMPONENT, 'Error processing level: {}'.format(level))

    programs = []
    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT,
                         'Error processing program: {}'.format(program))

    location = kwargs['location'].lower()
    openings = int(kwargs['openings'])
    remaining = int(kwargs['remaining']) if 'remaining' in kwargs else openings
    summary = kwargs['summary']
    filtered_summary = engine.filter_summary(summary)
    summary_keywords = engine.get_keywords(filtered_summary, programs)
    date = kwargs['date']
    year = date.year
    url = kwargs['url']

    applicants = 0
    try:
        if kwargs['applicants']:
            applicants = int(kwargs['applicants'])
    except Exception:
        pass

    # BUG FIX: the original tested `if index in kwargs:` (i.e. `False in
    # kwargs`, a key-membership test on the value False), which is never
    # true, so the caller-supplied 'index' flag was silently ignored.
    index = False
    if 'index' in kwargs:
        index = kwargs['index']

    logger.info(COMPONENT,
                'Importing job: {} from {}'.format(job_title, employer_name))

    # If employer does not exist, create it
    if not Employer.employer_exists(employer_name):
        logger.info(
            COMPONENT,
            'Employer: {} does not exist, creating..'.format(employer_name))
        employer = Employer(name=employer_name)
        logger.info(COMPONENT, 'Creating job: {}'.format(job_title))
        location = Location(name=location)
        applicant = Applicant(applicants=applicants, date=date)
        keywords = [
            Keyword(keyword=k['keyword'], types=k['types'])
            for k in summary_keywords
        ]
        # New job so number of remaining positions is same as openings
        job = Job(title=job_title, summary=filtered_summary, year=year,
                  term=term, location=[location], openings=openings,
                  remaining=remaining, applicants=[applicant], levels=levels,
                  programs=programs, url=url, keywords=keywords)
        job.save()
        job.reload()
        employer.jobs.append(job)
        employer.save()
        employer.reload()
        if index:
            elastic.index_employer_waterlooworks(employer)
            elastic.index_job_waterlooworks(employer, job)
    # Employer already exists
    else:
        employer = Employer.objects(
            name=employer_name).no_dereference().first()
        logger.info(COMPONENT,
                    'Employer: {} already exists'.format(employer_name))
        # If job does not exist, create it
        if not employer.job_exists(job_title):
            logger.info(COMPONENT, 'Creating job: {}'.format(job_title))
            location = Location(name=location)
            applicant = Applicant(applicants=applicants, date=date)
            keywords = [
                Keyword(keyword=k['keyword'], types=k['types'])
                for k in summary_keywords
            ]
            # New job so number of remaining positions is same as openings
            job = Job(title=job_title, summary=engine.filter_summary(summary),
                      year=year, term=term, location=[location],
                      openings=openings, remaining=remaining,
                      applicants=[applicant], levels=levels,
                      programs=programs, url=url, keywords=keywords)
            job.save()
            job.reload()
            employer.update(push__jobs=job)
            if index:
                elastic.update_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, job)
        # Job already exists
        else:
            logger.info(COMPONENT,
                        'Job: {} already exists'.format(job_title))
            job = Job.objects(id__in=[job.id for job in employer.jobs],
                              title=job_title).first()
            if not year >= job.year:
                raise DataIntegrityError(
                    'Job: {} by {} cannot be advertised before {}'.format(
                        job_title, employer_name, job.year))
            # Compare summaries with punctuation/whitespace stripped.
            filtered_summary_compare = re.sub(
                r'\W+', '', filtered_summary.lower().strip()).strip()
            job_summary_compare = re.sub(
                r'\W+', '', job.summary.lower().strip()).strip()
            # Job summary is not the same. In this case the employer most likely changed the job
            if not filtered_summary_compare == job_summary_compare:
                if openings >= 1:
                    logger.info(
                        COMPONENT,
                        'Job: {}: different summary detected, deprecating and creating new job..'
                        .format(job_title))
                    job.update(set__deprecated=True)
                    location = Location(name=location)
                    applicant = Applicant(applicants=applicants, date=date)
                    keywords = [
                        Keyword(keyword=k['keyword'], types=k['types'])
                        for k in summary_keywords
                    ]
                    # Assume new job so number of remaining positions is same as openings
                    new_job = Job(title=job_title, summary=filtered_summary,
                                  year=year, term=term, location=[location],
                                  openings=openings, remaining=remaining,
                                  applicants=[applicant], levels=levels,
                                  programs=programs, url=url,
                                  keywords=keywords)
                    new_job.save()
                    new_job.reload()
                    employer.update(push__jobs=new_job)
                    if index:
                        elastic.delete_employer_waterlooworks(employer)
                        elastic.delete_job_waterlooworks(employer, job)
                        elastic.index_employer_waterlooworks(employer)
                        elastic.index_job_waterlooworks(employer, new_job)
                else:
                    logger.info(
                        COMPONENT,
                        'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                        .format(job_title, openings))
            # Job is the same (same title and description)
            else:
                # If job is being advertised in new term
                if year != job.year or term != job.term:
                    logger.info(
                        COMPONENT,
                        'Job: {}: being advertised in new term, updating..'.
                        format(job_title))
                    # Add hire ratio for previous term
                    hire_ratio = float(job.openings -
                                       job.remaining) / job.openings
                    job.hire_rate.add_rating(hire_ratio)
                    location = Location(name=location)
                    applicant = Applicant(applicants=applicants, date=date)
                    hire_rate = AggregateRating(rating=job.hire_rate.rating,
                                                count=job.hire_rate.count)
                    job.update(set__year=year, set__term=term,
                               add_to_set__location=location,
                               set__openings=openings,
                               set__remaining=remaining,
                               push__applicants=applicant,
                               set__hire_rate=hire_rate,
                               set__levels=levels,
                               set__programs=programs,
                               set__url=url,
                               set__last_indexed=datetime.now())
                    if index:
                        elastic.update_job_waterlooworks(employer, job)
                # Job is being updated. We need to update location, openings, levels, remaining, hire_rate, applicants
                else:
                    logger.info(
                        COMPONENT,
                        'Job: {}: updating for current term'.format(job_title))
                    remaining = job.remaining
                    # Job posting has decreased, some positions filled up
                    if openings < remaining:
                        remaining = openings
                    location = Location(name=location)
                    applicant = Applicant(applicants=applicants, date=date)
                    job.update(add_to_set__location=location,
                               set__remaining=remaining,
                               set__levels=list(set(levels + job.levels)),
                               push__applicants=applicant,
                               set__programs=list(
                                   set(programs + job.programs)),
                               set__url=url,
                               set__last_indexed=datetime.now())
                    if index:
                        elastic.update_job_waterlooworks(employer, job)
def validate_TOP030(message):
    """Validate an incoming TOP030 report-request message.

    (Translated from the original Greek docstring:) Check whether a message
    with this incident id has been seen before (locally or in a csv); if so,
    do nothing.  Otherwise check that the message carries an incident type;
    if not, do nothing.  If it does, add it to the list of incidents being
    processed (have passed, or are currently passing, validation).  Then,
    if the incident type is precipitation-related, ask CRCL for the weather
    conditions at the incident's location (via TOP802); otherwise report it
    as not spam immediately.
    """
    report_id = None
    report_type = None
    inc_long = None
    inc_lat = None
    report_time = None
    report_spam = None  # NOTE(review): never assigned below — confirm it is needed.
    # print(message['body']['incidents'])
    logger.debug("Processed TOP030 message: " + str(message))
    header = message['header']
    try:
        inc_long = float(message['body']['position']['long'])
        inc_lat = float(message['body']['position']['lat'])
    except (KeyError, TypeError, ValueError, IndexError) as e:
        logger.info(
            "Incoming message does not have location, validation will stop."
        )
        logger.debug(str(type(e)) + str(e))
        logger.debug(message)
        return
    try:
        incidents = message['body']['incidents']
    except (KeyError, TypeError) as e:
        logger.info("No reports in TOP030, validation will stop.")
        logger.debug(str(type(e)) + str(e))
        logger.debug(str(message))
        return
    if len(incidents) == 0:
        logger.info("No incidents in TOP030.")
    for inc in incidents:
        try:
            report_id = inc['reportId']
            report_type = inc['incidentType']
            report_time = inc['timestamp']
        except (KeyError, TypeError, ValueError, IndexError) as e:
            # A malformed incident aborts the whole message, not just this item.
            logger.warning(
                "Incident does not have report ID / incident Type / timestamp"
            )
            logger.debug(str(type(e)) + str(e))
            return
        if report_id in shared.processed_mgs:
            logger.debug("Report already processed. ReportId: " +
                         str(report_id))
            continue
        # Remember the incident so duplicates are skipped next time.
        shared.processed_mgs[report_id] = {'inc': inc, 'header': header}
        # TODO: check if spam field is already there, and if is spam=True/False stop validation (not None)
        logger.info("Report is checked to determine if it is spam. ID:" +
                    str(report_id))
        if Validator.__rule_1_pre(report_type) is True:
            # Precipitation-related type: ask CRCL (TOP802) for local weather.
            logger.info("Asking CRCL for report with ID:" + str(report_id))
            t_802 = Validator.generate_TOP802(message, report_id, inc_long,
                                              inc_lat, report_time)
            Validator.bus_prod.send(topic=t_802['header']['topicName'],
                                    message=json.dumps(t_802))
        else:
            # Non-precipitation types are reported as not spam right away.
            Validator.__incident_spam(report_id, False)
def update_job(**kwargs):
    """Update job.

    Keyword arguments:
    id -- Job ID
    summary -- Job summary
    location -- Location job was advertised
    programs -- Programs the job is specified for
    levels -- Levels job is intended for [Junior, Intermediate, Senior]
    openings -- Number of job openings
    index -- Boolean to indicate whether to index or not (default False)
    """
    summary = kwargs['summary']
    location = kwargs['location'].lower()
    levels = kwargs['levels']

    # Map raw program strings onto known enums, logging the misses.
    programs = []
    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT,
                         'Error processing program: {}'.format(program))

    openings = 0
    try:
        if kwargs['openings']:
            openings = int(kwargs['openings']) or 0
    except Exception:
        pass

    # BUG FIX: the original tested `if index in kwargs:` (i.e. `False in
    # kwargs`, a key-membership test on the value False), which is never
    # true, so the caller-supplied 'index' flag was silently ignored.
    index = False
    if 'index' in kwargs:
        index = kwargs['index']

    job = Job.objects(id=kwargs['id']).first()

    remaining = job.openings
    # Job posting has decreased, some positions filled up
    if openings < job.openings:
        remaining = openings

    # Compare summaries with punctuation/whitespace stripped.
    filtered_summary = engine.filter_summary(summary)
    summary_keywords = engine.get_keywords(filtered_summary, programs)
    filtered_summary_compare = re.sub(
        r'\W+', '', filtered_summary.lower().strip()).strip()
    job_summary_compare = re.sub(r'\W+', '',
                                 job.summary.lower().strip()).strip()

    employer = Employer.objects(jobs=kwargs['id']).first()

    # Job summary is not the same. In this case the employer most likely changed the job
    if not filtered_summary_compare == job_summary_compare:
        if openings >= 1:
            logger.info(
                COMPONENT,
                'Job: {}: different summary detected, deprecating and creating new job..'
                .format(kwargs['id']))
            job.update(set__deprecated=True)
            location = Location(name=location)
            keywords = [
                Keyword(keyword=k['keyword'], types=k['types'])
                for k in summary_keywords
            ]
            # Assume new job so number of remaining positions is same as openings
            new_job = Job(title=job.title, summary=filtered_summary,
                          year=job.year, term=job.term, location=[location],
                          openings=openings, remaining=openings,
                          levels=levels, programs=programs, url=job.url,
                          keywords=keywords)
            new_job.save()
            employer.update(push__jobs=new_job)
            if index:
                elastic.delete_employer_waterlooworks(employer)
                elastic.delete_job_waterlooworks(employer, job)
                elastic.index_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, new_job)
        else:
            logger.info(
                COMPONENT,
                'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                .format(job.title, openings))
    else:
        logger.info(COMPONENT,
                    'Job: {}: updating for current term'.format(kwargs['id']))
        location = Location(name=location)
        job.update(add_to_set__location=location,
                   set__remaining=remaining,
                   set__levels=list(set(levels + job.levels)),
                   set__programs=list(set(programs + job.programs)),
                   set__last_indexed=datetime.now())
        if index:
            elastic.update_job_waterlooworks(employer, job)
def index_waterlooworks():
    """Rebuild the 'waterlooworks' Elasticsearch index from the database.

    Drops any existing index, recreates the employer/job mappings (jobs are
    child documents of employers), then bulk-indexes every employer and
    every non-deprecated job in batches of 1000 documents.
    """
    logger.info(COMPONENT, 'Indexing waterlooworks data')
    # ignore=[404]: deleting a not-yet-existing index must not abort the rebuild.
    elastic_instance.indices.delete(index='waterlooworks', ignore=[404])
    elastic_instance.indices.create('waterlooworks', body={
        "mappings": {
            "employers": {
                "properties": {
                    "employer_name": {"type": "string"},
                    "employer_jobs": {"type": "string"}
                }
            },
            "jobs": {
                # Job documents are parented to their employer document.
                "_parent": {
                    "type": "employers"
                },
                "properties": {
                    "job_title": {"type": "string"},
                    "job_year": {"type": "integer"},
                    "job_term": {"type": "string"},
                    "job_summary": {"type": "string"},
                    "job_locations": {"type": "string"},
                    "job_programs": {"type": "string"},
                    "job_levels": {"type": "string"}
                }
            }
        }
    })
    logger.info(COMPONENT, 'Indexing waterlooworks employers and jobs')
    employers = []
    jobs = []
    for employer in Employer.objects.only('name', 'jobs'):
        logger.info(COMPONENT, 'Indexing employer: {}'.format(employer.name))
        employer_document = {
            "_index": "waterlooworks",
            "_type": "employers",
            "_id": employer.name,
            "_source": {
                "employer_name": employer.name,
                "employer_jobs": [str(job.id) for job in employer.jobs]
            }
        }
        employers.append(employer_document)
        for job in employer.jobs:
            if not job.deprecated:
                logger.info(COMPONENT,
                            'Indexing job: {} for employer: {}'.format(
                                job.title, employer.name))
                job_document = {
                    "_index": "waterlooworks",
                    "_type": "jobs",
                    "_parent": employer.name,
                    "_id": str(job.id),
                    "_source": {
                        "employer_name": employer.name,
                        "job_title": job.title,
                        "job_year": job.year,
                        "job_term": job.term,
                        "job_summary": job.summary,
                        "job_keywords": [k.keyword for k in job.keywords],
                        "job_locations": [location.name for location in job.location],
                        "job_programs": job.programs,
                        "job_levels": job.levels
                    }
                }
                jobs.append(job_document)
                # Flush job documents in batches of 1000 to bound memory use.
                if len(jobs) == 1000:
                    helpers.bulk(elastic_instance, jobs)
                    jobs = []
        # Flush employer documents in batches of 1000 as well.
        if len(employers) == 1000:
            helpers.bulk(elastic_instance, employers)
            employers = []
    # Flush whatever is left over after the loops.
    if len(employers) > 0:
        helpers.bulk(elastic_instance, employers)
    if len(jobs) > 0:
        helpers.bulk(elastic_instance, jobs)
def import_comment(**kwargs):
    """Import comment from RateMyCoopJob.

    NOTE(review): this file contains two definitions of import_comment with
    identical logic — confirm which one is live and remove the other.

    Keyword arguments:
    employer_name -- Employer name
    job_title -- Title of job
    comments: -- Array of comments
        comment -- Comment
        comment_date -- Date comment was submitted. Note: in non-standard form
            such as: 5 years ago, 3 weeks ago etc
        salary -- Job salary (hourly)
        rating -- Job rating out of 5 (1 - 5 stars on ratemycoopjob)
    """
    employer_name = kwargs['employer_name'].lower()
    job_title = kwargs['job_title'].lower()
    # If employer alias exists (ex. Research in motion -> Blackberry), use instead
    if employer_name in employer_alias.aliases:
        employer_name = employer_alias.aliases[employer_name].lower()
    # If employer does not exist
    # (quoted search_text => exact-phrase match on the employer name)
    if not Employer.objects.search_text("\"{}\"".format(employer_name)).count() > 0:
        logger.info(COMPONENT, 'Employer: {} does not exist, ignoring..'.format(employer_name))
        return
    logger.info(COMPONENT, 'Importing comments for job: {} from employer: {}'.format(job_title, employer_name))
    employer = Employer.objects.search_text("\"{}\"".format(employer_name)).no_dereference().first()
    # Iterate through all comments
    for index, comment_obj in enumerate(kwargs['comments']):
        comment = comment_obj['comment']
        comment_date = _get_comment_date(comment_obj['comment_date'])
        salary = float(comment_obj['salary'])
        # Normalize the 0-5 star rating to 0-1.
        rating = float(comment_obj['rating']) / 5
        # If job does not exist add to employer
        if not employer.job_exists(job_title):
            # Deduplicate against the employer-level comment list.
            if employer.comment_exists(comment=comment, date=comment_date,
                                       salary=salary, rating=rating):
                logger.info(COMPONENT, 'Comment: {} already exists for employer: {}, ignoring'
                            .format(index, employer_name))
            else:
                logger.info(COMPONENT, 'Adding comment: {} to employer: {}'.format(index, employer_name))
                new_comment = Comment(comment=comment, date=comment_date, salary=salary,
                                      crawled=True,
                                      rating=AggregateRating(rating=rating, count=1))
                employer.update(push__comments=new_comment)
        # Job already exists
        else:
            job = Job.objects(id__in=[job.id for job in employer.jobs],
                              title=job_title).first()
            # Deduplicate against the job-level comment list.
            if job.comment_exists(comment=comment, date=comment_date,
                                  salary=salary, rating=rating):
                logger.info(COMPONENT, 'Comment: {} already exists for job: {} for employer: {}, ignoring'
                            .format(index, job_title, employer_name))
            else:
                logger.info(COMPONENT, 'Adding comment: {} for job: {} from {}'.format(index, job_title,
                                                                                       employer_name))
                new_comment = Comment(comment=comment, date=comment_date, salary=salary,
                                      crawled=True,
                                      rating=AggregateRating(rating=rating, count=1))
                job.update(push__comments=new_comment)
def import_job(**kwargs):
    """Import a job posting, creating or updating employer and job records.

    Keyword arguments:
        employer_name -- Employer name
        job_title -- Title of job
        summary -- Job summary
        term -- Term job was advertised [Fall, Winter, Spring]
        location -- Location job was advertised
        openings -- Number of job openings
        remaining -- Number of openings remaining (optional, defaults to openings)
        applicants -- Number of applicants job has (optional)
        levels -- Levels job is intended for [Junior, Intermediate, Senior]
        programs -- Programs the job is specified for
        url -- URL of job
        date -- Date job was crawled (year is derived from this)
        index -- Whether to index into elasticsearch (optional, default False)

    Raises:
        DataIntegrityError -- if the posting's year precedes the stored job's year.
    """
    employer_name = kwargs['employer_name'].lower()
    job_title = kwargs['job_title'].lower()
    term = kwargs['term']

    # Translate raw level strings into canonical levels, logging and
    # skipping anything unrecognized.
    levels = []
    for level in kwargs['levels']:
        uw_level = Term.get_level(level)
        if uw_level:
            levels.append(uw_level)
        else:
            logger.error(COMPONENT, 'Error processing level: {}'.format(level))

    # Same translation for programs.
    programs = []
    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT, 'Error processing program: {}'.format(program))

    location = kwargs['location'].lower()
    openings = int(kwargs['openings'])
    remaining = int(kwargs['remaining']) if 'remaining' in kwargs else openings

    summary = kwargs['summary']
    filtered_summary = engine.filter_summary(summary)
    summary_keywords = engine.get_keywords(filtered_summary, programs)

    date = kwargs['date']
    year = date.year
    url = kwargs['url']

    # 'applicants' is optional and may be missing or malformed.
    applicants = 0
    try:
        if kwargs['applicants']:
            applicants = int(kwargs['applicants'])
    except (KeyError, TypeError, ValueError):
        pass

    # BUG FIX: original code was `index = False; if index in kwargs:` which
    # tests membership of False in kwargs and never matches, so an explicit
    # index=... argument was silently ignored. Default stays False to match
    # the original's effective behavior.
    index = kwargs.get('index', False)

    logger.info(COMPONENT, 'Importing job: {} from {}'.format(job_title, employer_name))

    def _build_job(job_summary):
        """Create, save and reload a new Job document for this posting."""
        keywords = [Keyword(keyword=k['keyword'], types=k['types'])
                    for k in summary_keywords]
        # New job, so number of remaining positions defaults to openings
        # unless an explicit 'remaining' was supplied.
        job = Job(title=job_title, summary=job_summary, year=year, term=term,
                  location=[Location(name=location)], openings=openings,
                  remaining=remaining,
                  applicants=[Applicant(applicants=applicants, date=date)],
                  levels=levels, programs=programs, url=url, keywords=keywords)
        job.save()
        job.reload()
        return job

    # If employer does not exist, create it together with the job
    if not Employer.employer_exists(employer_name):
        logger.info(COMPONENT, 'Employer: {} does not exist, creating..'.format(employer_name))
        employer = Employer(name=employer_name)
        logger.info(COMPONENT, 'Creating job: {}'.format(job_title))
        job = _build_job(filtered_summary)
        employer.jobs.append(job)
        employer.save()
        employer.reload()
        if index:
            elastic.index_employer_waterlooworks(employer)
            elastic.index_job_waterlooworks(employer, job)
    # Employer already exists
    else:
        employer = Employer.objects(name=employer_name).no_dereference().first()
        logger.info(COMPONENT, 'Employer: {} already exists'.format(employer_name))

        # If job does not exist, create it
        if not employer.job_exists(job_title):
            logger.info(COMPONENT, 'Creating job: {}'.format(job_title))
            # FIX: the original recomputed engine.filter_summary(summary) here;
            # reuse filtered_summary for consistency with the branch above.
            job = _build_job(filtered_summary)
            employer.update(push__jobs=job)
            if index:
                elastic.update_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, job)
        # Job already exists
        else:
            logger.info(COMPONENT, 'Job: {} already exists'.format(job_title))
            job = Job.objects(id__in=[job.id for job in employer.jobs],
                              title=job_title).first()
            if year < job.year:
                raise DataIntegrityError('Job: {} by {} cannot be advertised before {}'
                                         .format(job_title, employer_name, job.year))

            # Compare summaries ignoring case, whitespace and punctuation.
            filtered_summary_compare = re.sub(r'\W+', '', filtered_summary.lower().strip()).strip()
            job_summary_compare = re.sub(r'\W+', '', job.summary.lower().strip()).strip()

            # Job summary is not the same. In this case the employer most
            # likely changed the job.
            if filtered_summary_compare != job_summary_compare:
                if openings >= 1:
                    logger.info(COMPONENT,
                                'Job: {}: different summary detected, deprecating and creating new job..'
                                .format(job_title))
                    job.update(set__deprecated=True)
                    new_job = _build_job(filtered_summary)
                    employer.update(push__jobs=new_job)
                    if index:
                        elastic.delete_employer_waterlooworks(employer)
                        elastic.delete_job_waterlooworks(employer, job)
                        elastic.index_employer_waterlooworks(employer)
                        elastic.index_job_waterlooworks(employer, new_job)
                else:
                    logger.info(COMPONENT,
                                'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                                .format(job_title, openings))
            # Job is the same (same title and description)
            else:
                # If job is being advertised in new term
                if year != job.year or term != job.term:
                    logger.info(COMPONENT,
                                'Job: {}: being advertised in new term, updating..'.format(job_title))
                    # Record the hire ratio for the previous term.
                    # FIX: guard against ZeroDivisionError when the stored
                    # job had 0 openings.
                    if job.openings:
                        hire_ratio = float(job.openings - job.remaining) / job.openings
                        job.hire_rate.add_rating(hire_ratio)
                    hire_rate = AggregateRating(rating=job.hire_rate.rating,
                                                count=job.hire_rate.count)
                    job.update(set__year=year, set__term=term,
                               add_to_set__location=Location(name=location),
                               set__openings=openings, set__remaining=remaining,
                               push__applicants=Applicant(applicants=applicants, date=date),
                               set__hire_rate=hire_rate, set__levels=levels,
                               set__programs=programs, set__url=url,
                               set__last_indexed=datetime.now())
                    if index:
                        elastic.update_job_waterlooworks(employer, job)
                # Same term: update location, remaining, levels, programs,
                # applicants in place.
                else:
                    logger.info(COMPONENT, 'Job: {}: updating for current term'.format(job_title))
                    remaining = job.remaining
                    # Job posting has decreased, some positions filled up
                    if openings < remaining:
                        remaining = openings
                    job.update(add_to_set__location=Location(name=location),
                               set__remaining=remaining,
                               set__levels=list(set(levels + job.levels)),
                               push__applicants=Applicant(applicants=applicants, date=date),
                               set__programs=list(set(programs + job.programs)),
                               set__url=url,
                               set__last_indexed=datetime.now())
                    if index:
                        elastic.update_job_waterlooworks(employer, job)
def update_job(**kwargs):
    """Update an existing job posting in place.

    Keyword arguments:
        id -- Job ID
        summary -- Job summary
        location -- Location job was advertised
        programs -- Programs the job is specified for
        levels -- Levels job is intended for [Junior, Intermediate, Senior]
        openings -- Number of job openings (optional)
        index -- Whether to index into elasticsearch (optional, default False)
    """
    summary = kwargs['summary']
    location = kwargs['location'].lower()
    # NOTE(review): unlike import_job, levels are taken verbatim here —
    # presumably the caller already passes canonical levels; confirm.
    levels = kwargs['levels']

    # Translate raw program strings into canonical programs, logging and
    # skipping anything unrecognized.
    programs = []
    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT, 'Error processing program: {}'.format(program))

    # 'openings' is optional and may be missing or malformed.
    openings = 0
    try:
        if kwargs['openings']:
            openings = int(kwargs['openings'])
    except (KeyError, TypeError, ValueError):
        pass

    # BUG FIX: original code was `index = False; if index in kwargs:` which
    # tests membership of False in kwargs and never matches, so an explicit
    # index=... argument was silently ignored. Default stays False to match
    # the original's effective behavior.
    index = kwargs.get('index', False)

    job = Job.objects(id=kwargs['id']).first()
    remaining = job.openings
    # Job posting has decreased, some positions filled up
    if openings < job.openings:
        remaining = openings

    filtered_summary = engine.filter_summary(summary)
    summary_keywords = engine.get_keywords(filtered_summary, programs)

    # Compare summaries ignoring case, whitespace and punctuation.
    filtered_summary_compare = re.sub(r'\W+', '', filtered_summary.lower().strip()).strip()
    job_summary_compare = re.sub(r'\W+', '', job.summary.lower().strip()).strip()

    employer = Employer.objects(jobs=kwargs['id']).first()

    # Job summary is not the same. In this case the employer most likely
    # changed the job.
    if filtered_summary_compare != job_summary_compare:
        if openings >= 1:
            logger.info(COMPONENT,
                        'Job: {}: different summary detected, deprecating and creating new job..'
                        .format(kwargs['id']))
            job.update(set__deprecated=True)
            keywords = [Keyword(keyword=k['keyword'], types=k['types'])
                        for k in summary_keywords]
            # Assume new job so number of remaining positions is same as openings
            new_job = Job(title=job.title, summary=filtered_summary, year=job.year,
                          term=job.term, location=[Location(name=location)],
                          openings=openings, remaining=openings, levels=levels,
                          programs=programs, url=job.url, keywords=keywords)
            new_job.save()
            employer.update(push__jobs=new_job)
            if index:
                elastic.delete_employer_waterlooworks(employer)
                elastic.delete_job_waterlooworks(employer, job)
                elastic.index_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, new_job)
        else:
            logger.info(COMPONENT,
                        'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                        .format(job.title, openings))
    # Same posting: merge location, remaining, levels and programs in place.
    else:
        logger.info(COMPONENT, 'Job: {}: updating for current term'.format(kwargs['id']))
        job.update(add_to_set__location=Location(name=location),
                   set__remaining=remaining,
                   set__levels=list(set(levels + job.levels)),
                   set__programs=list(set(programs + job.programs)),
                   set__last_indexed=datetime.now())
        if index:
            elastic.update_job_waterlooworks(employer, job)