Example #1
0
    def should_load_details(self, job_item):
        """Tell whether the details of ``job_item`` should be loaded.

        The checks below run in order and stop at the first one that hits;
        every hit is logged at INFO level, prefixed with ``self.name``.

        :param job_item: item exposing ``job_title`` and ``contact``.
        :return: ``False`` when ``JobItem.is_exists``,
            ``JobItem.is_older_required``,
            ``BlockedContact.is_contact_blocked`` (on the contact) or
            ``RejectionPattern.should_be_rejected`` (on the title) reports
            the item; ``True`` otherwise.
        """
        # The values are handed to the logger as arguments instead of being
        # %-formatted up front, so a message is only built when the record
        # is actually emitted.
        if JobItem.is_exists(job_item):
            logger.info(
                '[%s] skipping loading details as job already exists. job_title: %s',
                self.name, job_item.job_title)
            return False
        if JobItem.is_older_required(job_item):
            logger.info(
                '[%s] skipping loading details as job is older than %s days. job_title: %s',
                self.name, config.HOUSEKEEPING_RECORD_ORDLER_THAN,
                job_item.job_title)
            return False

        if BlockedContact.is_contact_blocked(job_item.contact):
            logger.info(
                '[%s] skipping loading details as job contact is blocked. contact: %s',
                self.name, job_item.contact)
            return False

        if RejectionPattern.should_be_rejected(job_item.job_title):
            logger.info(
                '[%s] skipping loading details as job matches rejection pattern. job_title: %s',
                self.name, job_item.job_title)
            return False

        return True
Example #2
0
    def test_find_all(self):
        self.job_item.save()
        another_job_item = JobItem()
        another_job_item.job_title = "Another Test Job"
        another_job_item.save()

        records = JobItem.findall()
        print "Job Items", records
        self.assertEqual(2, len(records))
Example #3
0
    def test_find_with_pagination(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title = "job_item_%d" % i
            job_item.save()

        records = JobItem.find_with_pagination(page_request={"page_no": 2, "size": 10})

        print "Job Items", records
        self.assertEqual(10, len(records))
Example #4
0
 def setUp(self):
     # Reset the database, then build (without saving) the JobItem fixture
     # that the individual tests persist themselves.
     self.clean_db()

     self.job_item = fixture = JobItem()
     fixture.job_title = "Test Job"
     fixture.employer_name = "Test Job Employer"
     # crawled_date and publish_date are currently not populated:
     # fixture.crawled_date = datetime.datetime.now()
     # fixture.publish_date = datetime.datetime.strptime('2014-10-31', '%Y-%m-%d')
     fixture.job_country = "Singapore"
     fixture.job_desc = "This is a test job"
     fixture.contact = "88888888"

     # NOTE(review): this attribute is set on the test case itself, not on
     # the job item -- confirm that is intended.
     self.source = 'unit_test'
Example #5
0
    def process_item(self, item, spider):
        """Drop items that ``JobItem.is_older_required`` reports.

        :param item: the item being processed.
        :param spider: the spider that produced the item (not used here).
        :return: ``item`` unchanged when it is not reported.
        :raises DropItem: when the item is reported; the message carries the
            value of ``config.HOUSEKEEPING_RECORD_ORDLER_THAN``.
        """
        if JobItem.is_older_required(item):
            # Message wording fixed: it used to read "order than".
            raise DropItem('Job is published older than %s days. Removing...' %
                           str(config.HOUSEKEEPING_RECORD_ORDLER_THAN))

        return item
Example #6
0
    def process_item(self, item, spider):
        """Let ``item`` through unless ``JobItem.is_older_required`` flags it.

        :param item: the item being processed.
        :param spider: the spider that produced the item (not used here).
        :return: the same ``item`` when it is not flagged.
        :raises DropItem: when the item is flagged; the message includes
            ``config.HOUSEKEEPING_RECORD_ORDLER_THAN``.
        """
        if not JobItem.is_older_required(item):
            return item

        # Message wording fixed: it used to read "order than".
        raise DropItem(
            'Job is published older than %s days. Removing...' % str(config.HOUSEKEEPING_RECORD_ORDLER_THAN))
Example #7
0
    def should_load_details(self, job_item):
        """Return whether the details of ``job_item`` should be loaded.

        Four checks run in order: ``JobItem.is_exists``,
        ``JobItem.is_older_required``, ``BlockedContact.is_contact_blocked``
        (on ``job_item.contact``) and ``RejectionPattern.should_be_rejected``
        (on ``job_item.job_title``). The first check that hits logs the
        reason at INFO level and makes the method return ``False``.

        :param job_item: item exposing ``job_title`` and ``contact``.
        :return: ``True`` only when none of the checks hit.
        """
        # Logger arguments are passed separately (lazy %-formatting) so the
        # message string is only built when the record is emitted.
        if JobItem.is_exists(job_item):
            logger.info('[%s] skipping loading details as job already exists. job_title: %s',
                        self.name, job_item.job_title)
            return False
        if JobItem.is_older_required(job_item):
            logger.info('[%s] skipping loading details as job is older than %s days. job_title: %s',
                        self.name, config.HOUSEKEEPING_RECORD_ORDLER_THAN, job_item.job_title)
            return False

        if BlockedContact.is_contact_blocked(job_item.contact):
            logger.info('[%s] skipping loading details as job contact is blocked. contact: %s',
                        self.name, job_item.contact)
            return False

        if RejectionPattern.should_be_rejected(job_item.job_title):
            logger.info('[%s] skipping loading details as job matches rejection pattern. job_title: %s',
                        self.name, job_item.job_title)
            return False

        return True
Example #8
0
    def run_emailer(cls):
        """Email the extracted job records to all subscribed users.

        Connects to the SMTP server from ``config`` (EHLO, STARTTLS, login),
        builds a multipart message with a short text body plus the output of
        ``JobItem.extract_records_as_bytes('xlsx')`` as an attachment, and
        sends it to every user whose ``subscription_status`` is
        ``'subscribed'``.

        Errors raised after the connection is opened are logged with their
        traceback and not re-raised; the connection is always shut down.
        """
        from email.mime.base import MIMEBase
        from email.mime.multipart import MIMEMultipart
        from email.mime.text import MIMEText
        from email import Encoders
        import smtplib

        logger.info('start sending email to subscribers...')
        smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)

        try:
            smtp.set_debuglevel(4)
            smtp.ehlo()
            smtp.starttls()
            smtp.ehlo()
            smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)

            logger.info('established secure connection to smtp server...')

            toaddrs = [
                user.email for user in User.findall()
                if user.subscription_status == 'subscribed'
            ]
            # Goes through the logger rather than a bare print to stdout.
            logger.debug('subscriber addresses: %s', toaddrs)
            if not toaddrs:
                # Nobody to deliver to: skip building the attachment and
                # calling sendmail() with an empty recipient list.
                logger.info('no subscribed users found, nothing to send...')
                return
            fromaddr = config.FROM_ADDR

            current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
            message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
            message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
                config.APP_NAME, current_date_string)

            msg = MIMEMultipart()
            msg['From'] = fromaddr
            msg['To'] = ''
            # NOTE(review): every subscriber address ends up in the visible
            # Cc header -- confirm this exposure is intended.
            msg['Cc'] = ','.join(toaddrs)
            msg['Subject'] = message_subject
            msg.attach(MIMEText(message_text))

            part = MIMEBase('application', "octet-stream")
            file_format = 'xlsx'
            part.set_payload(JobItem.extract_records_as_bytes(file_format))
            logger.info(
                'attached extracted files to the mail...waiting to be sent..')
            Encoders.encode_base64(part)
            part.add_header(
                'Content-Disposition',
                'attachment; filename="extracted_jobs_%s.%s"' %
                (current_date_string, file_format))
            msg.attach(part)

            smtp.sendmail(fromaddr, toaddrs, msg.as_string())
            logger.info('done sending email to subscribers...')
        except Exception:
            # logger.exception keeps the traceback, which logger.error(e)
            # dropped.
            logger.exception('failed sending email to subscribers')
        finally:
            try:
                smtp.quit()
            except smtplib.SMTPException:
                # The connection may already be gone (e.g. after a failed
                # handshake); make sure the socket is released and do not
                # let the cleanup error escape.
                smtp.close()
Example #9
0
 def setUp(self):
     # Wipe the database and prepare an unsaved JobItem fixture; the tests
     # save it themselves when they need it persisted.
     self.clean_db()
     self.job_item = JobItem()

     # crawled_date and publish_date are currently not populated:
     # self.job_item.crawled_date = datetime.datetime.now()
     # self.job_item.publish_date = datetime.datetime.strptime('2014-10-31', '%Y-%m-%d')
     field_values = (
         ("job_title", "Test Job"),
         ("employer_name", "Test Job Employer"),
         ("job_country", "Singapore"),
         ("job_desc", "This is a test job"),
         ("contact", "88888888"),
     )
     for field_name, field_value in field_values:
         setattr(self.job_item, field_name, field_value)

     # NOTE(review): this attribute is set on the test case itself, not on
     # the job item -- confirm that is intended.
     self.source = "unit_test"
Example #10
0
    def run_emailer(cls):
        """Send the extracted job records by email to subscribed users.

        Opens an SMTP connection using the ``config`` settings (EHLO,
        STARTTLS, login), collects the addresses of all users whose
        ``subscription_status`` is ``'subscribed'`` and mails them a
        multipart message: a short text body plus the result of
        ``JobItem.extract_records_as_bytes('xlsx')`` as an attachment.

        Errors raised after the connection is opened are logged with their
        traceback and swallowed; the connection is always shut down.
        """
        from email.mime.base import MIMEBase
        from email.mime.multipart import MIMEMultipart
        from email.mime.text import MIMEText
        from email import Encoders
        import smtplib

        logger.info('start sending email to subscribers...')
        smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)

        try:
            smtp.set_debuglevel(4)
            smtp.ehlo()
            smtp.starttls()
            smtp.ehlo()
            smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)

            logger.info('established secure connection to smtp server...')

            toaddrs = [user.email for user in User.findall() if user.subscription_status == 'subscribed']
            # Goes through the logger rather than a bare print to stdout.
            logger.debug('subscriber addresses: %s', toaddrs)
            if not toaddrs:
                # Nobody to deliver to: skip building the attachment and
                # calling sendmail() with an empty recipient list.
                logger.info('no subscribed users found, nothing to send...')
                return
            fromaddr = config.FROM_ADDR

            current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
            message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
            message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
                config.APP_NAME, current_date_string)

            msg = MIMEMultipart()
            msg['From'] = fromaddr
            msg['To'] = ''
            # NOTE(review): every subscriber address ends up in the visible
            # Cc header -- confirm this exposure is intended.
            msg['Cc'] = ','.join(toaddrs)
            msg['Subject'] = message_subject
            msg.attach(MIMEText(message_text))

            part = MIMEBase('application', "octet-stream")
            file_format = 'xlsx'
            part.set_payload(JobItem.extract_records_as_bytes(file_format))
            logger.info('attached extracted files to the mail...waiting to be sent..')
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition',
                            'attachment; filename="extracted_jobs_%s.%s"' % (current_date_string, file_format))
            msg.attach(part)

            smtp.sendmail(fromaddr, toaddrs, msg.as_string())
            logger.info('done sending email to subscribers...')
        except Exception:
            # logger.exception keeps the traceback, which logger.error(e)
            # dropped.
            logger.exception('failed sending email to subscribers')
        finally:
            try:
                smtp.quit()
            except smtplib.SMTPException:
                # The connection may already be gone (e.g. after a failed
                # handshake); make sure the socket is released and do not
                # let the cleanup error escape.
                smtp.close()
Example #11
0
    def test_find_all(self):
        self.job_item.save()
        another_job_item = JobItem()
        another_job_item.job_title = 'Another Test Job'
        another_job_item.save()

        records = JobItem.findall()
        print 'Job Items', records
        self.assertEqual(2, len(records))
Example #12
0
    def parse_item_requests_callback(self, response, item_xpath_selector=''):
        """Build the detail requests for the job entries found in ``response``.

        Each node selected by ``item_xpath_selector`` is turned into a
        ``JobItem`` through ``populate_job_crawler_item``. Items accepted by
        ``should_load_details`` get a ``Request`` for their
        ``job_details_link``, with the item attached under ``meta['item']``
        and ``retrieve_job_details`` as the callback.

        :param response: response object providing ``xpath``.
        :param item_xpath_selector: XPath selecting one node per job entry.
        :return: list of ``Request`` objects, possibly empty.
        """
        detail_requests = []
        for entry in response.xpath(item_xpath_selector):
            crawled = JobItem()
            self.populate_job_crawler_item(entry, crawled)

            if not self.should_load_details(crawled):
                continue

            detail_request = Request(
                url=crawled.job_details_link,
                callback=self.retrieve_job_details,
                meta={'item': crawled},
                dont_filter=True)
            detail_requests.append(detail_request)

        return detail_requests
Example #13
0
    def test_find_with_pagination(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title='job_item_%d' % i
            job_item.save()

        records = JobItem.find_with_pagination(page_request={'page_no': 2, 'size': 10})

        print 'Job Items', records
        self.assertEqual(10, len(records))
Example #14
0
    def parse_item(self, response):
        """Build the detail requests for the table rows found in ``response``.

        Every ``//tr`` row yields a fresh ``JobItem`` that is filled cell by
        cell (``./td``) through ``populate_job_crawler_item``. Once the cell
        at index 4 has been processed, the item is offered to
        ``should_load_details``; accepted items get a ``Request`` for their
        ``job_details_link`` with the item stored under ``meta['item']``.

        :param response: response object providing ``xpath``.
        :return: list of ``Request`` objects, possibly empty.
        """
        detail_requests = []
        for row in response.xpath('//tr'):
            crawled = JobItem()
            for position, cell in enumerate(row.xpath('./td')):
                self.populate_job_crawler_item(position, cell, crawled)

                # Only the cell at index 4 triggers the load decision;
                # presumably it is the last populated column -- TODO confirm.
                if position != 4:
                    continue
                if not self.should_load_details(crawled):
                    continue

                detail_request = Request(
                    url=crawled.job_details_link,
                    callback=self.retrieve_job_details,
                    meta={'item': crawled},
                    dont_filter=True)
                detail_requests.append(detail_request)

        return detail_requests
Example #15
0
    def test_remove_blocked_records(self):
        # Save 20 items whose contacts are all blocked, run the removal and
        # expect the JobItem table to be empty afterwards.
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title=u'人员_%d' % i
            job_item.contact = str(random.randint(90000000, 99999999))
            job_item.save()

            # mark the contact as blocked
            BlockedContact(job_item.contact, u'人员').save()

        # run the remove action
        JobItem.remove_blocked_records()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name)
            # No except clause here: the previous bare "except: pass"
            # swallowed the AssertionError, so the test could never fail.
            self.assertEqual(c.fetchone()[0], 0, 'Count of job items should be 0')
        finally:
            conn.close()
Example #16
0
    def run_housekeeper(cls):
        """Run the three housekeeping removals on ``JobItem`` in sequence.

        In order: ``remove_old_records`` with the retention taken from
        ``config.HOUSEKEEPING_RECORD_ORDLER_THAN``, ``remove_blocked_records``
        and ``remove_records_matches_rejection_pattern``. Each step is
        bracketed by INFO log lines.
        """
        retention_days = config.HOUSEKEEPING_RECORD_ORDLER_THAN

        logger.info('start running housekeeper..')

        # Step 1: records older than the retention period.
        logger.info('start removing records older than %s days..' % retention_days)
        JobItem.remove_old_records(retention_days=retention_days)
        logger.info('done removing records older than %s days..' % retention_days)

        # Step 2: records posted by blocked contacts.
        logger.info('start removing records posted by blocked contacts..')
        JobItem.remove_blocked_records()
        logger.info('done removing records posted by blocked contacts..')

        # Step 3: records matching a rejection pattern.
        logger.info('start removing records should have been rejected..')
        JobItem.remove_records_matches_rejection_pattern()
        logger.info('done removing records should have been rejected..')

        logger.info('done running housekeeper..')
Example #17
0
    def test_remove_records_matches_rejection_pattern(self):
        # Save 20 items whose titles all match a rejection pattern, run the
        # removal and expect the JobItem table to be empty afterwards.
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title=u'人员_%d' % i
            job_item.save()

        # mark the title as blocked
        RejectionPattern(u'人员_\d+', 'For Testing').save()

        # run the remove action
        JobItem.remove_records_matches_rejection_pattern()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name)
            # No except clause here: the previous bare "except: pass"
            # swallowed the AssertionError, so the test could never fail.
            self.assertEqual(c.fetchone()[0], 0, 'Count of job items should be 0')
        finally:
            conn.close()
Example #18
0
    def run_housekeeper(cls):
        """Remove outdated, blocked and rejected job records, in that order.

        Calls ``JobItem.remove_old_records`` (retention read from
        ``config.HOUSEKEEPING_RECORD_ORDLER_THAN``),
        ``JobItem.remove_blocked_records`` and
        ``JobItem.remove_records_matches_rejection_pattern``, logging at INFO
        level before and after each call.
        """
        logger.info('start running housekeeper..')

        days = config.HOUSEKEEPING_RECORD_ORDLER_THAN
        start_old = 'start removing records older than %s days..' % days
        logger.info(start_old)
        JobItem.remove_old_records(retention_days=days)
        done_old = 'done removing records older than %s days..' % days
        logger.info(done_old)

        logger.info('start removing records posted by blocked contacts..')
        JobItem.remove_blocked_records()
        logger.info('done removing records posted by blocked contacts..')

        logger.info('start removing records should have been rejected..')
        JobItem.remove_records_matches_rejection_pattern()
        logger.info('done removing records should have been rejected..')

        logger.info('done running housekeeper..')
Example #19
0
    def test_remove_blocked_records(self):
        # Save 20 items whose contacts are all blocked, run the removal and
        # expect the JobItem table to be empty afterwards.
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title = u"人员_%d" % i
            job_item.contact = str(random.randint(90000000, 99999999))
            job_item.save()

            # mark the contact as blocked
            BlockedContact(job_item.contact, u"人员").save()

        # run the remove action
        JobItem.remove_blocked_records()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute("SELECT COUNT(*) FROM " + JobItem.table_name)
            # No except clause here: the previous bare "except: pass"
            # swallowed the AssertionError, so the test could never fail.
            self.assertEqual(c.fetchone()[0], 0, "Count of job items should be 0")
        finally:
            conn.close()
Example #20
0
    def test_remove_records_matches_rejection_pattern(self):
        # Save 20 items whose titles all match a rejection pattern, run the
        # removal and expect the JobItem table to be empty afterwards.
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title = u"人员_%d" % i
            job_item.save()

        # mark the title as blocked
        RejectionPattern(u"人员_\d+", "For Testing").save()

        # run the remove action
        JobItem.remove_records_matches_rejection_pattern()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute("SELECT COUNT(*) FROM " + JobItem.table_name)
            # No except clause here: the previous bare "except: pass"
            # swallowed the AssertionError, so the test could never fail.
            self.assertEqual(c.fetchone()[0], 0, "Count of job items should be 0")
        finally:
            conn.close()
Example #21
0
 def test_find(self):
     self.job_item.save()
     print JobItem.find(self.job_item)
Example #22
0
 def process_item(self, item, spider):
     """Pass ``item`` through unless ``JobItem.is_exists`` reports it.

     :param item: the item being processed.
     :param spider: the spider that produced the item (not used here).
     :return: ``item`` unchanged when it is not reported as existing.
     :raises DropItem: when ``JobItem.is_exists(item)`` is truthy.
     """
     already_stored = JobItem.is_exists(item)
     if not already_stored:
         return item
     raise DropItem('Duplicated Job title. Removing...')
Example #23
0
 def process_item(self, item, spider):
     """Drop items that ``JobItem.is_exists`` reports as already present.

     :param item: the item being processed.
     :param spider: the spider that produced the item (not used here).
     :return: the same ``item`` when it is not reported.
     :raises DropItem: when the item is reported as existing.
     """
     # Guard clause: the raise ends the method, so no else branch is needed.
     if JobItem.is_exists(item):
         raise DropItem('Duplicated Job title. Removing...')
     return item
Example #24
0
 def test_is_exists(self):
     # A saved item must be reported by JobItem.is_exists.
     self.job_item.save()
     exists = JobItem.is_exists(self.job_item)
     failure_message = '%s should exist' % self.job_item.job_title
     self.assertTrue(exists, failure_message)
Example #25
0
class JobItemTest(BaseTestCase):
    def setUp(self):
        self.clean_db()
        self.job_item = JobItem()
        self.job_item.job_title = "Test Job"
        self.job_item.employer_name = "Test Job Employer"
        # self.job_item.crawled_date = datetime.datetime.now()
        # self.job_item.publish_date = datetime.datetime.strptime('2014-10-31', '%Y-%m-%d')
        self.job_item.job_country = "Singapore"
        self.job_item.job_desc = "This is a test job"
        self.job_item.contact = "88888888"
        self.source = "unit_test"

    def tearDown(self):
        pass

    def test_save(self):
        self.job_item.save()
        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute("SELECT COUNT(*) FROM " + JobItem.table_name)
            self.assertEqual(c.fetchone()[0], 1, "Count of job items should be 1")
        except:
            pass
        finally:
            conn.close()

    def test_find_all(self):
        self.job_item.save()
        another_job_item = JobItem()
        another_job_item.job_title = "Another Test Job"
        another_job_item.save()

        records = JobItem.findall()
        print "Job Items", records
        self.assertEqual(2, len(records))

    def test_find(self):
        self.job_item.save()
        print JobItem.find(self.job_item)

    def test_find_with_pagination(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title = "job_item_%d" % i
            job_item.save()

        records = JobItem.find_with_pagination(page_request={"page_no": 2, "size": 10})

        print "Job Items", records
        self.assertEqual(10, len(records))

    def test_iter_listOfTuple(self):
        list_of_tuples = [("key", "value"), ("key1", "value1")]

        print [key + " " + value for (key, value) in list_of_tuples]

    def test_is_exists(self):
        self.job_item.save()
        self.assertTrue(JobItem.is_exists(self.job_item), "%s should exist" % self.job_item.job_title)

    def test_remove_blocked_records(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title = u"人员_%d" % i
            job_item.contact = str(random.randint(90000000, 99999999))
            job_item.save()

            # mark the contact as blocked
            BlockedContact(job_item.contact, u"人员").save()

        # run the remove action
        JobItem.remove_blocked_records()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute("SELECT COUNT(*) FROM " + JobItem.table_name)
            self.assertEqual(c.fetchone()[0], 0, "Count of job items should be 0")
        except:
            pass
        finally:
            conn.close()

    def test_remove_records_matches_rejection_pattern(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title = u"人员_%d" % i
            job_item.save()

        # mark the title as blocked
        RejectionPattern(u"人员_\d+", "For Testing").save()

        # run the remove action
        JobItem.remove_records_matches_rejection_pattern()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute("SELECT COUNT(*) FROM " + JobItem.table_name)
            self.assertEqual(c.fetchone()[0], 0, "Count of job items should be 0")
        except:
            pass
        finally:
            conn.close()
Example #26
0
 def test_is_exists(self):
     # After saving, JobItem.is_exists must report the fixture.
     fixture = self.job_item
     fixture.save()
     found = JobItem.is_exists(fixture)
     self.assertTrue(found, "%s should exist" % fixture.job_title)
Example #27
0
class JobItemTest(BaseTestCase):
    def setUp(self):
        self.clean_db()
        self.job_item = JobItem()
        self.job_item.job_title="Test Job"
        self.job_item.employer_name="Test Job Employer"
        # self.job_item.crawled_date = datetime.datetime.now()
        # self.job_item.publish_date = datetime.datetime.strptime('2014-10-31', '%Y-%m-%d')
        self.job_item.job_country = "Singapore"
        self.job_item.job_desc = "This is a test job"
        self.job_item.contact = "88888888"
        self.source = 'unit_test'

    def tearDown(self):
        pass


    def test_save(self):
        self.job_item.save()
        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name)
            self.assertEqual(c.fetchone()[0], 1, 'Count of job items should be 1')
        except:
            pass
        finally:
            conn.close()

    def test_find_all(self):
        self.job_item.save()
        another_job_item = JobItem()
        another_job_item.job_title = 'Another Test Job'
        another_job_item.save()

        records = JobItem.findall()
        print 'Job Items', records
        self.assertEqual(2, len(records))

    def test_find(self):
        self.job_item.save()
        print JobItem.find(self.job_item)

    def test_find_with_pagination(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title='job_item_%d' % i
            job_item.save()

        records = JobItem.find_with_pagination(page_request={'page_no': 2, 'size': 10})

        print 'Job Items', records
        self.assertEqual(10, len(records))

    def test_iter_listOfTuple(self):
        list_of_tuples = [('key', 'value'), ('key1', 'value1')]

        print [key + ' ' + value for (key, value) in list_of_tuples]


    def test_is_exists(self):
        self.job_item.save()
        self.assertTrue(JobItem.is_exists(self.job_item), '%s should exist' % self.job_item.job_title)

    def test_remove_blocked_records(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title=u'人员_%d' % i
            job_item.contact = str(random.randint(90000000, 99999999))
            job_item.save()

            # mark the contact as blocked
            BlockedContact(job_item.contact, u'人员').save()

        # run the remove action
        JobItem.remove_blocked_records()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name)
            self.assertEqual(c.fetchone()[0], 0, 'Count of job items should be 0')
        except:
            pass
        finally:
            conn.close()

    def test_remove_records_matches_rejection_pattern(self):
        for i in range(0, 20):
            job_item = JobItem()
            job_item.job_title=u'人员_%d' % i
            job_item.save()

        # mark the title as blocked
        RejectionPattern(u'人员_\d+', 'For Testing').save()

        # run the remove action
        JobItem.remove_records_matches_rejection_pattern()

        conn = self.connect_db()
        try:
            c = conn.cursor()
            c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name)
            self.assertEqual(c.fetchone()[0], 0, 'Count of job items should be 0')
        except:
            pass
        finally:
            conn.close()
Example #28
0
 def test_find(self):
     self.job_item.save()
     print JobItem.find(self.job_item)