def remove_blocked_records(cls):
    """Delete every stored job item whose contact appears in the block list."""
    removed = 0
    for item in cls.findall():
        # drop the record when its contact has been blocked
        if BlockedContact.is_contact_blocked(item.contact):
            item.remove()
            removed += 1
    logger.info('cleared %d job items with blocked contacts' % removed)
def remove_records_matches_rejection_pattern(cls):
    """Delete job items whose title or description matches a rejection pattern."""
    removed = 0
    for item in cls.findall():
        # short-circuit: the description is only checked when the title passes
        if RejectionPattern.should_be_rejected(item.job_title) \
                or RejectionPattern.should_be_rejected(item.job_desc):
            item.remove()
            removed += 1
    logger.info('cleared %d job items matching the rejection pattern' % removed)
def logs_socket(ws):
    """Stream the tail of the application log file to a websocket client.

    Blocks on the client's opening handshake message, then every 5 seconds
    ships the log lines written in the last 5 seconds, if any.
    """
    ws.receive()  # wait for the client's opening message; its content is unused
    while True:
        logger.info('server received from client: ' + datetime.datetime.now().isoformat())
        # awk keeps only LOG_FILE lines whose timestamp is newer than "now - 5s".
        # NOTE(review): shell string is built from config only; revisit if
        # Config.LOG_FILE could ever contain untrusted characters.
        cmd = 'awk -v Time="`date -d\'now-5 seconds\' \'+[%Y-%m-%d %H:%M:%S\'`" \'{if($0 > Time) print $0}\' ' + Config.LOG_FILE
        output = os.popen(cmd).readlines()
        # plain statement instead of a side-effecting conditional expression
        if output:
            ws.send('\n'.join(output))
        time.sleep(5)
def run_crawler(cls):
    """Run every registered spider sequentially and log the elapsed time."""
    start_time = time.time()
    logger.info('start running crawler..')
    spider_names = ['sgxin', 'shichengbbs', 'singxin', 'sggongzuo']
    # Spiders run one after another; crawling is I/O bound, so a process
    # pool could parallelise this if it ever becomes too slow.
    for spider_name in spider_names:
        # invoke the runner directly instead of calling __call__ explicitly
        CrawlerRunner()(spider_name)
    logger.info('done running crawler.. Time elapsed: %.3fs' % (time.time() - start_time))
def logs_socket(ws):
    """Push recent application log lines to a websocket client every 5 seconds.

    Waits for the client's initial handshake message, then loops forever
    sending any log lines written within the last 5 seconds.
    """
    ws.receive()  # handshake only; the received payload is never used
    while True:
        logger.info("server received from client: " + datetime.datetime.now().isoformat())
        # awk filters LOG_FILE down to lines timestamped after "now - 5s".
        # NOTE(review): command string comes from config only; verify
        # Config.LOG_FILE is never attacker-controlled.
        cmd = (
            "awk -v Time=\"`date -d'now-5 seconds' '+[%Y-%m-%d %H:%M:%S'`\" '{if($0 > Time) print $0}' "
            + Config.LOG_FILE
        )
        output = os.popen(cmd).readlines()
        # replaced the side-effecting conditional expression with a statement
        if output:
            ws.send("\n".join(output))
        time.sleep(5)
def should_load_details(self, job_item):
    """Decide whether a crawled job's detail page is worth fetching.

    Returns False (and logs the reason) when the job already exists, is
    older than the retention window, has a blocked contact, or matches a
    rejection pattern; otherwise returns True.
    """
    # (lazy predicate, pre-formatted log message) pairs, checked in order
    skip_rules = [
        (lambda: JobItem.is_exists(job_item),
         '[%s] skipping loading details as job already exists. job_title: %s'
         % (self.name, job_item.job_title)),
        (lambda: JobItem.is_older_required(job_item),
         '[%s] skipping loading details as job is older than %s days. job_title: %s'
         % (self.name, str(config.HOUSEKEEPING_RECORD_ORDLER_THAN), job_item.job_title)),
        (lambda: BlockedContact.is_contact_blocked(job_item.contact),
         '[%s] skipping loading details as job contact is blocked. contact: %s'
         % (self.name, job_item.contact)),
        (lambda: RejectionPattern.should_be_rejected(job_item.job_title),
         '[%s] skipping loading details as job matches rejection pattern. job_title: %s'
         % (self.name, job_item.job_title)),
    ]
    for applies, message in skip_rules:
        if applies():
            logger.info(message)
            return False
    return True
def remove(self):
    """Delete this record from its table, matching on all key properties.

    Raises DatabaseError (after rollback) when the delete fails; the
    connection is closed in every case.
    """
    conn = self.connect_db()
    try:
        cursor = conn.cursor()
        where_clause = ' AND '.join(['%s=?' % prop for prop in self.key_properties])
        key_values = tuple([getattr(self, prop) for prop in self.key_properties])
        cursor.execute('DELETE FROM ' + self.table_name + ' WHERE ' + where_clause,
                       key_values)
        conn.commit()
        logger.info('Removed: %s' % self)
    except Exception as e:
        logger.error(e)
        logger.info('Unable to remove: %s' % self)
        conn.rollback()
        raise DatabaseError(str(e))
    finally:
        conn.close()
def import_records_from_file(item_desc):
    """Replace all records of the given type with rows from an uploaded CSV.

    Looks up the model class by description, wipes its existing rows, then
    saves one record per CSV data line (header skipped) and redirects.
    """
    cls = desc_to_cls_mapping.get(item_desc)
    upload = request.files["file_to_upload"]
    redirect_url = request.form.get("redirect_url", url_for("index"))
    # remove the current records before importing the replacement set
    for existing in cls.findall():
        existing.remove()
    logger.info("Done removing all existing %s" % item_desc)
    imported = 0
    upload.readline()  # skip the CSV header row
    for raw_line in upload.readlines():
        cols = raw_line.rstrip("\r\n").rstrip("\n").decode("utf-8").split(",")
        cls(cols[0], cols[1]).save()
        imported += 1
    logger.info("Done importing %d %s from %s" % (imported, item_desc, upload.filename))
    return redirect(redirect_url)
def run_emailer(cls):
    """Email the day's extracted jobs as an attachment to all subscribers.

    Connects to the configured SMTP server over STARTTLS, builds a
    multipart message with an xlsx export attached, and sends it with the
    subscribed users on Cc. Errors are logged and swallowed; the SMTP
    connection is always closed.
    """
    from email.mime.base import MIMEBase
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email import Encoders
    import smtplib

    logger.info('start sending email to subscribers...')
    smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)
    try:
        smtp.set_debuglevel(4)  # verbose SMTP protocol tracing
        smtp.ehlo()
        smtp.starttls()
        smtp.ehlo()  # re-identify after switching to TLS
        smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)
        logger.info('established secure connection to smtp server...')

        toaddrs = [user.email for user in User.findall()
                   if user.subscription_status == 'subscribed']
        # log the recipient list instead of printing to stdout (debug leftover)
        logger.info('sending to subscribers: %s' % toaddrs)
        fromaddr = config.FROM_ADDR

        current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
        message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
        message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
            config.APP_NAME, current_date_string)

        msg = MIMEMultipart()
        msg['From'] = fromaddr
        msg['To'] = ''
        msg['Cc'] = ','.join(toaddrs)
        msg['Subject'] = message_subject
        msg.attach(MIMEText(message_text))

        part = MIMEBase('application', "octet-stream")
        file_format = 'xlsx'
        part.set_payload(JobItem.extract_records_as_bytes(file_format))
        logger.info('attached extracted files to the mail...waiting to be sent..')
        Encoders.encode_base64(part)
        part.add_header(
            'Content-Disposition',
            'attachment; filename="extracted_jobs_%s.%s"' % (current_date_string, file_format))
        msg.attach(part)

        smtp.sendmail(fromaddr, toaddrs, msg.as_string())
        logger.info('done sending email to subscribers...')
    except Exception as e:
        logger.error(e)
    finally:
        smtp.quit()
def import_records_from_file(item_desc):
    """Wipe and re-import all records of one type from an uploaded CSV file.

    The model class is resolved from item_desc; every existing record is
    removed, then each data line (after the header) becomes a new record.
    """
    cls = desc_to_cls_mapping.get(item_desc)
    uploaded = request.files['file_to_upload']
    redirect_url = request.form.get('redirect_url', url_for('index'))
    # clear out whatever is currently stored for this record type
    for old_record in cls.findall():
        old_record.remove()
    logger.info('Done removing all existing %s' % item_desc)
    uploaded.readline()  # discard the header line
    total = 0
    for csv_line in uploaded.readlines():
        fields = csv_line.rstrip('\r\n').rstrip('\n').decode('utf-8').split(',')
        cls(fields[0], fields[1]).save()
        total += 1
    logger.info('Done importing %d %s from %s' % (total, item_desc, uploaded.filename))
    return redirect(redirect_url)
def update(self):
    """Persist this record's current property values, keyed on its key columns.

    Raises DatabaseError (after rollback) when the update fails; the
    connection is closed in every case.
    """
    conn = self.connect_db()
    try:
        cursor = conn.cursor()
        set_clause = ', '.join(['%s=?' % prop for prop in self.property_names])
        where_clause = ' AND '.join(['%s=?' % prop for prop in self.key_properties])
        # parameter order mirrors the SQL: SET values first, then key values
        params = tuple([getattr(self, prop) for prop in self.property_names] +
                       [getattr(self, prop) for prop in self.key_properties])
        cursor.execute(' UPDATE ' + self.table_name + ' SET ' + set_clause +
                       ' WHERE ' + where_clause, params)
        conn.commit()
        logger.info('Updated: %s' % self)
    except Exception as e:
        logger.error(e)
        logger.info('Unable to update: %s' % self)
        conn.rollback()
        raise DatabaseError(str(e))
    finally:
        conn.close()
def migrate_db(cls):
    """Recreate the schema and seed release-specific data.

    Placeholder for per-release migration scripts -- update before every
    release.
    """
    cls.create_db()
    conn = cls.datasource.get_connection()
    try:
        logger.info('start migrating database')
        # seed the default admin account
        # NOTE(review): hard-coded credentials -- consider moving to config
        User('meng', 'meng123', '*****@*****.**', 'admin').save()
        logger.info('done migrating database')
    except Exception as e:
        logger.error('Unable to run migrate_db')
        logger.error(e)
    finally:
        conn.close()
def run_emailer(cls):
    """Send the daily job-extract email to every subscribed user.

    Opens a STARTTLS SMTP session, attaches the xlsx export of job items,
    Cc's all subscribers, and sends. Failures are logged and swallowed;
    the SMTP session is always closed.
    """
    from email.mime.base import MIMEBase
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email import Encoders
    import smtplib

    logger.info('start sending email to subscribers...')
    smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)
    try:
        smtp.set_debuglevel(4)  # verbose protocol tracing for troubleshooting
        smtp.ehlo()
        smtp.starttls()
        smtp.ehlo()  # EHLO must be repeated after TLS negotiation
        smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)
        logger.info('established secure connection to smtp server...')

        toaddrs = [user.email for user in User.findall()
                   if user.subscription_status == 'subscribed']
        # replaced the stray py2 `print` debug statement with proper logging
        logger.info('sending to subscribers: %s' % toaddrs)
        fromaddr = config.FROM_ADDR

        current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
        message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
        message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
            config.APP_NAME, current_date_string)

        msg = MIMEMultipart()
        msg['From'] = fromaddr
        msg['To'] = ''
        msg['Cc'] = ','.join(toaddrs)
        msg['Subject'] = message_subject
        msg.attach(MIMEText(message_text))

        part = MIMEBase('application', "octet-stream")
        file_format = 'xlsx'
        part.set_payload(JobItem.extract_records_as_bytes(file_format))
        logger.info('attached extracted files to the mail...waiting to be sent..')
        Encoders.encode_base64(part)
        part.add_header(
            'Content-Disposition',
            'attachment; filename="extracted_jobs_%s.%s"' % (current_date_string, file_format))
        msg.attach(part)

        smtp.sendmail(fromaddr, toaddrs, msg.as_string())
        logger.info('done sending email to subscribers...')
    except Exception as e:
        logger.error(e)
    finally:
        smtp.quit()
def run_heartbeater(cls):
    """Ping the app's heartbeat URL (e.g. to keep a hosted dyno awake)."""
    import requests
    logger.info('started heartbeating..')
    resp = requests.get(config.APP_HEARTBEAT_URL,
                        headers={'User-Agent': 'Zjobs Heartbeater'})
    logger.info('heartbeater received status_code %s', resp.status_code)
    # fixed typo in the completion message ('done hearting beating')
    logger.info('done heartbeating')
def save(self):
    """Insert this record, or delegate to update() when it already exists.

    A falsy record is silently ignored. Insert failures are rolled back
    and logged (not re-raised); the connection is always closed.
    """
    if not self:
        return
    if self.find(self) is not None:
        # a row with the same keys exists -> update it instead of inserting
        self.update()
        return
    conn = self.connect_db()
    try:
        cursor = conn.cursor()
        columns = ', '.join(self.property_names)
        placeholders = ', '.join(['?'] * len(self.property_names))
        values = tuple([getattr(self, name) for name in self.property_names])
        cursor.execute('INSERT INTO ' + self.table_name + '(' + columns + ') ' +
                       'VALUES (' + placeholders + ')', values)
        conn.commit()
        logger.info('Inserted item: %s' % self)
    except Exception as e:
        conn.rollback()
        logger.error('Unable to insert the item: %s' % self)
        logger.error(e)
    finally:
        conn.close()
def should_load_details(self, job_item):
    """Return True when a crawled job's detail page should be fetched.

    A job is skipped (with a logged reason) when it already exists, is too
    old, has a blocked contact, or matches a rejection pattern.
    """
    def _skip(message):
        # log the skip reason and tell the caller not to fetch details
        logger.info(message)
        return False

    if JobItem.is_exists(job_item):
        return _skip('[%s] skipping loading details as job already exists. job_title: %s'
                     % (self.name, job_item.job_title))
    if JobItem.is_older_required(job_item):
        return _skip('[%s] skipping loading details as job is older than %s days. job_title: %s'
                     % (self.name, str(config.HOUSEKEEPING_RECORD_ORDLER_THAN), job_item.job_title))
    if BlockedContact.is_contact_blocked(job_item.contact):
        return _skip('[%s] skipping loading details as job contact is blocked. contact: %s'
                     % (self.name, job_item.contact))
    if RejectionPattern.should_be_rejected(job_item.job_title):
        return _skip('[%s] skipping loading details as job matches rejection pattern. job_title: %s'
                     % (self.name, job_item.job_title))
    return True
def run_web(cls):
    """Launch the web front end under gunicorn from the app home directory."""
    logger.info('starting web..')
    command = 'cd ' + app_home_dir + ' && gunicorn -c app/gunicorn.conf.py web.jobboard:app --debug'
    os.system(command)
def _crawl(cls, spider_name=None):
    """Run a single scrapy spider by name; no-op when spider_name is falsy.

    Always returns None.
    """
    if spider_name:
        # close() waits for the child process to exit, so the completion
        # log below is no longer emitted before the spider has finished
        pipe = os.popen('cd %s && scrapy crawl %s' % (app_home_dir, spider_name))
        pipe.close()
        logger.info('Done running spider %s' % spider_name)
    return None
def run_web(cls):
    """Start the gunicorn server that serves the job board web app."""
    logger.info('starting web..')
    # change into the app home first so gunicorn picks up its config file
    shell_cmd = ' && '.join([
        'cd ' + app_home_dir,
        'gunicorn -c app/gunicorn.conf.py web.jobboard:app --debug',
    ])
    os.system(shell_cmd)
def run_housekeeper(cls):
    """Run all housekeeping tasks: purge old, blocked, and rejected records."""
    retention_days = config.HOUSEKEEPING_RECORD_ORDLER_THAN
    logger.info('start running housekeeper..')
    logger.info('start removing records older than %s days..' % retention_days)
    JobItem.remove_old_records(retention_days=retention_days)
    logger.info('done removing records older than %s days..' % retention_days)
    logger.info('start removing records posted by blocked contacts..')
    JobItem.remove_blocked_records()
    logger.info('done removing records posted by blocked contacts..')
    logger.info('start removing records should have been rejected..')
    JobItem.remove_records_matches_rejection_pattern()
    logger.info('done removing records should have been rejected..')
    logger.info('done running housekeeper..')
def create_db(cls):
    """Drop and recreate every table and unique index used by the app.

    Destructive: all existing data is lost. Failures are rolled back and
    logged; the connection is always closed.
    """
    conn = cls.datasource.get_connection()
    try:
        c = conn.cursor()

        # -- CRAWLED_JOBS: one row per crawled job posting ------------------
        c.execute('DROP TABLE IF EXISTS CRAWLED_JOBS')
        c.execute('DROP INDEX IF EXISTS job_title_idx')
        c.execute('''
            CREATE TABLE IF NOT EXISTS CRAWLED_JOBS(
                source text,
                crawled_date date,
                publish_date date,
                job_title text,
                job_desc text,
                job_details_link text,
                job_location text,
                job_country text,
                salary text,
                employer_name text,
                contact text
            )
        ''')
        c.execute('''
            CREATE UNIQUE INDEX job_title_idx ON CRAWLED_JOBS(job_title)
        ''')
        logger.info("created table and indexes for CRAWLED_JOBS")

        # -- JOB_REJECTION_RULES: patterns used to reject crawled jobs ------
        c.execute('DROP TABLE IF EXISTS JOB_REJECTION_RULES')
        c.execute('DROP INDEX IF EXISTS reject_pattern_idx')
        c.execute('''
            CREATE TABLE IF NOT EXISTS JOB_REJECTION_RULES(
                reject_pattern text,
                reject_reason text
            )
        ''')
        c.execute('''
            CREATE UNIQUE INDEX reject_pattern_idx ON JOB_REJECTION_RULES(reject_pattern)
        ''')
        logger.info("created table and indexes for JOB_REJECTION_RULES")

        # -- BLOCKED_CONTACTS: contacts whose postings are dropped ----------
        c.execute('DROP TABLE IF EXISTS BLOCKED_CONTACTS')
        c.execute('DROP INDEX IF EXISTS blocked_contacts_idx')
        c.execute('''
            CREATE TABLE IF NOT EXISTS BLOCKED_CONTACTS(
                contact text,
                block_reason text
            )
        ''')
        c.execute('''
            CREATE UNIQUE INDEX blocked_contacts_idx ON BLOCKED_CONTACTS(contact)
        ''')
        logger.info("created table and indexes for BLOCKED_CONTACTS")

        # -- USERS: application accounts and subscription state -------------
        c.execute('DROP TABLE IF EXISTS USERS')
        c.execute('DROP INDEX IF EXISTS users_idx')
        c.execute('''
            CREATE TABLE IF NOT EXISTS USERS(
                username text,
                password text,
                email text,
                subscription_status text,
                role text,
                last_login_date date,
                register_date date
            )
        ''')
        c.execute('''
            CREATE UNIQUE INDEX users_idx ON USERS(username)
        ''')
        logger.info("created table and indexes for USERS")

        # -- DOCS: uploaded documents stored as binary blobs ----------------
        c.execute('DROP TABLE IF EXISTS DOCS')
        c.execute('DROP INDEX IF EXISTS docs_idx')
        c.execute('''
            CREATE TABLE IF NOT EXISTS DOCS(
                filename text,
                content_type text,
                content bytea,
                uploaded_by text,
                uploaded_date date
            )
        ''')
        c.execute('''
            CREATE UNIQUE INDEX docs_idx ON DOCS(filename)
        ''')
        logger.info("created table and indexes for DOCS")

        conn.commit()
        logger.info('done create database')
    except Exception as e:
        logger.error('Unable to run create_db')
        logger.error(e)
        conn.rollback()
    finally:
        conn.close()
def run_housekeeper(cls):
    """Purge stale, blocked, and rejection-matching job records."""
    logger.info('start running housekeeper..')
    days = config.HOUSEKEEPING_RECORD_ORDLER_THAN
    # (description used in the start/done log lines, task to run)
    tasks = [
        ('removing records older than %s days..' % days,
         lambda: JobItem.remove_old_records(retention_days=days)),
        ('removing records posted by blocked contacts..',
         JobItem.remove_blocked_records),
        ('removing records should have been rejected..',
         JobItem.remove_records_matches_rejection_pattern),
    ]
    for description, task in tasks:
        logger.info('start ' + description)
        task()
        logger.info('done ' + description)
    logger.info('done running housekeeper..')