def test_on_sites_by_buid(self):
    """
    Check that each job's ``on_sites`` follows the site packages attached
    to the business unit: ``[0]`` while none are attached, then the primary
    keys of every attached package.
    """
    feed_path = 'seo/tests/data/dseo_feed_0.xml'
    business_unit = BusinessUnitFactory(pk=77)

    def assert_on_sites(expected):
        # A new feed object is built for every stage of the scenario.
        feed = DEv2JobFeed(feed_path,
                           jsid=business_unit.id,
                           markdown=business_unit.enable_markdown)
        for job in feed.solr_jobs():
            self.assertItemsEqual(job['on_sites'], expected)

    # Stage 1: no site packages attached.
    assert_on_sites([0])

    # Stage 2: a single site package.
    site_package = SitePackageFactory(owner=self.company)
    business_unit.site_packages.add(site_package)
    assert_on_sites([site_package.pk])

    # Stage 3: two site packages.
    site_package2 = SitePackageFactory(owner=self.company)
    business_unit.site_packages.add(site_package2)
    assert_on_sites([site_package.pk, site_package2.pk])

    # Remove the objects created above.
    site_package2.delete()
    site_package.delete()
    business_unit.delete()
def test_dev2_feed(self):
    """
    Parse the feed file returned by ``download_feed_file`` and check the
    job source id, the job source name and the number of parsed jobs.
    """
    feed = DEv2JobFeed(download_feed_file(self.buid_id))
    parsed_jobs = feed.jobparse()

    self.assertEqual(feed.jsid, self.buid_id)
    self.assertEqual(feed.job_source_name, self.businessunit.title)
    self.assertEqual(len(parsed_jobs), self.numjobs)
def test_mocs(self):
    """
    Create two MOC records (codes "2" and "3"), each linked to its own
    onet, then check the ``moc`` values of the first job in the feed and
    the number of MOCs ``job_mocs`` returns for a job listing both onets
    plus a code that is not created here.
    """
    for moc_code, onet_code in (("2", "22222222"), ("3", "33333333")):
        onet = OnetFactory(code=onet_code)
        # These factories are called only for the records they create.
        CustomCareerFactory(object_id=1, onet_id=onet_code)
        MocDetailFactory()
        moc = MocFactory(code=moc_code)
        moc.onets = [onet]
        moc.save()

    here = os.path.abspath(os.path.dirname(__file__))
    feed = DEv2JobFeed(os.path.join(here, 'dseo_feed_2.xml'),
                       jsid=self.mapping.object_id)
    solr_jobs = feed.solr_jobs()
    self.assertEqual(solr_jobs[0]['moc'], ['2', '3'])

    job = {'onet_code': ['99999999', '22222222', '33333333']}
    mocs = feed.job_mocs(job)
    # The mapped result is not asserted on; the call is kept so that it is
    # still exercised.
    feed.mapped_mocs(mocs, job)
    self.assertEqual(len(mocs), 2)
def test_parse_invalid_feed(self):
    """
    Constructing a feed from a file that fails validation must not raise
    an unhandled exception.
    """
    # There is nothing to assert: the test passes if construction returns.
    DEv2JobFeed(self.invalid_feed, jsid=0)
def test_markdown_no_newline_breaks(self):
    """
    With the business unit's markdown setting passed through, the first
    job's html description must not contain 'Operations<br />'.
    """
    business_unit = self.businessunit
    feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                       jsid=business_unit.id,
                       markdown=business_unit.enable_markdown)
    first_job = feed.solr_jobs()[0]
    self.assertNotIn('Operations<br />', first_job['html_description'])
def test_no_null_values(self):
    """No job in the markdown feed may have the literal 'null' as its city."""
    feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.markdown.xml', jsid=0)
    for job in feed.solr_jobs():
        self.assertNotEqual(job['city'], 'null')
def test_unescape(self):
    """
    Escaped html characters must be unescaped on import: neither the job
    source name nor the first job's description may contain the searched
    fragments.
    """
    feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.escaped_chars.xml',
                       jsid=0)
    jobs = feed.solr_jobs()

    # NOTE(review): the two search strings below look like they may have
    # been HTML entities before this file was extracted; confirm against
    # the fixture file.
    position_in_name = feed.job_source_name.find('¢')
    self.assertEqual(position_in_name, -1)

    position_in_description = jobs[0]['description'].find('&')
    self.assertEqual(position_in_description, -1)
def test_mapped_mocs(self):
    """
    For a job carrying only ``self.onet.code``, ``job_mocs`` must return
    nothing while ``mapped_mocs`` must yield exactly one code.
    """
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    feed = DEv2JobFeed(os.path.join(tests_dir, 'dseo_feed_1.xml'),
                       jsid=self.mapping.object_id)

    job = {'onet_code': [self.onet.code]}
    direct_mocs = feed.job_mocs(job)
    custom_mocs = feed.mapped_mocs(direct_mocs, job)

    self.assertEqual(len(direct_mocs), 0)
    self.assertEqual(len(custom_mocs.codes), 1)
def test_no_markdown_newline_breaks(self):
    """
    For a business unit with markdown disabled, the first job's html
    description must contain 'Operations<br />'.
    """
    plain_bu = BusinessUnitFactory.build(id=5, enable_markdown=False)
    feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                       jsid=plain_bu.id,
                       markdown=plain_bu.enable_markdown)
    first_job = feed.solr_jobs()[0]
    self.assertIn('Operations<br />', first_job['html_description'])
def test_empty_feed(self):
    """
    The v2 DirectEmployers feed schema must accept empty feed files:
    parsing one yields zero jobs.
    """
    feed = DEv2JobFeed(self.emptyfeed)
    # If the schema treated empty feed files as invalid, jobparse() would
    # throw an exception here.
    parsed_jobs = feed.jobparse()
    self.assertEqual(len(parsed_jobs), 0)
def test_company_member(self):
    """
    The ``company_member`` flag on the first job must follow the company's
    ``member`` attribute: true at first, false once it is switched off.
    """
    def first_job():
        # Build a new feed so the company is looked up again on each call.
        feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                           jsid=self.businessunit.id,
                           company=self.businessunit.company_set.all()[0])
        return feed.solr_jobs()[0]

    self.assertTrue(first_job()['company_member'])

    self.company.member = False
    self.company.save()

    self.assertFalse(first_job()['company_member'])
def test_company_digital_strategies_customer(self):
    """
    The ``company_digital_strategies_customer`` flag on the first job must
    follow the company's ``digital_strategies_customer`` attribute: false
    at first, true once it is switched on.
    """
    solr_key = 'company_digital_strategies_customer'

    def first_job():
        # Build a new feed so the company is looked up again on each call.
        feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                           jsid=self.businessunit.id,
                           company=self.businessunit.company_set.all()[0])
        return feed.solr_jobs()[0]

    self.assertFalse(first_job()[solr_key])

    self.company.digital_strategies_customer = True
    self.company.save()

    self.assertTrue(first_job()[solr_key])
def test_company_canonical_microsite(self):
    """
    ``company_canonical_microsite_exact`` on the first job must be None at
    first and equal the company's canonical microsite once one is set.
    """
    solr_key = 'company_canonical_microsite_exact'

    def first_job():
        # Build a new feed so the company is looked up again on each call.
        feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                           jsid=self.businessunit.id,
                           company=self.businessunit.company_set.all()[0])
        return feed.solr_jobs()[0]

    self.assertEqual(first_job()[solr_key], None)

    self.company.canonical_microsite = "http://test.jobs"
    self.company.save()

    self.assertEqual(first_job()[solr_key], 'http://test.jobs')
def test_markdown_code_blocks(self):
    """
    Every html job description from the markdown feed must be free of
    code/pre/h1 markup and raw '##', and must contain h4, h6 and list
    item markup.
    """
    # Checked in this order for each job.
    must_be_absent = ('<code>', '</code>', '<pre>', '<h1>', '##')
    must_be_present = ('<h4>', '<h6>', '<li>', '</li>')

    feed = DEv2JobFeed('seo/tests/data/dseo_feed_0.markdown.xml', jsid=0)
    for job in feed.solr_jobs():
        for fragment in must_be_absent:
            self.assertEqual(job['html_description'].find(fragment), -1)
        for fragment in must_be_present:
            self.assertNotEqual(job['html_description'].find(fragment), -1)
def test_salt_date(self):
    """
    Index the feed's jobs and check that sorting by ``salted_date``
    returns the same set of documents as sorting by ``date_new``.
    """
    feed = DEv2JobFeed(download_feed_file(self.buid_id))
    self.conn.add(feed.solr_jobs())

    by_salted_date = self.conn.search(q="*:*", sort="salted_date asc")
    self.assertEqual(self.numjobs, by_salted_date.hits)

    # The two orderings are not required to differ, so only the contents
    # of the result sets are compared, not their order.
    by_date_new = self.conn.search(q="*:*", sort="date_new asc")
    self.assertItemsEqual(by_date_new.docs, by_salted_date.docs)
def test_zipcode(self):
    """
    The zipcodes of the jobs built for Solr must match, irrespective of
    order, the values expected from the feed file.
    """
    zips_from_feedfile = ['30269', '30269', '48332', '30269', '30269',
                          '30269', '30269', '30269', '48332', '48332',
                          '30269', None, '30269', '30269']

    feed = DEv2JobFeed(download_feed_file(self.buid_id))
    solr_zips = [job['zipcode'] for job in feed.solr_jobs()]

    self.assertItemsEqual(zips_from_feedfile, solr_zips)
def test_mocids(self):
    """
    Tests that mocid fields exist when jobs are imported from a feed and
    added to a solr connection.
    """
    feed = DEv2JobFeed(download_feed_file(self.buid_id))
    jobs = feed.solr_jobs()

    # self.conn is presumably bound to a local/test Solr instance so that
    # production data is never altered; verify in the test setup.
    self.conn.add(jobs)

    # NOTE(review): the filter query negates ``mocid:[* TO *]``, so it
    # appears to count documents WITHOUT an indexed mocid value; confirm
    # that this is the intended check.
    filter_query = "buid:%s -mocid:[* TO *]" % self.buid_id
    response = self.conn.search(q="*:*", fq=filter_query)
    self.assertEqual(response.hits, self.numjobs)

    for job in jobs:
        self.assertIn('mocid', job)
def test_date_updated(self):
    """
    Every job built for Solr must carry a ``date_updated`` equal to one of
    the three expected timestamps.
    """
    stamp_format = "%m/%d/%Y %I:%M:%S %p"
    expected_stamps = ("4/16/2015 11:35:13 PM",
                       "4/16/2015 11:35:14 PM",
                       "4/16/2015 11:35:15 PM")

    feed = DEv2JobFeed(download_feed_file(self.buid_id))
    solr_jobs = feed.solr_jobs()
    self.conn.add(solr_jobs)

    dates_updated = [datetime.datetime.strptime(stamp, stamp_format)
                     for stamp in expected_stamps]
    # Collect all values first, then assert on each of them.
    solr_dates = [job['date_updated'] for job in solr_jobs]
    for solr_date in solr_dates:
        self.assertIn(solr_date, dates_updated)
def seoxml_to_mongo(buid, data_dir=DATA_DIR):
    """
    Load the feed file for a business unit and upsert its jobs into the
    MongoDB ``jobs`` collection, matching existing documents on ``guid``.

    Inputs:
    :buid: The ID of the business unit whose feed file is imported.
    :data_dir: Directory handed to ``download_feed_file``.

    Returns:
    None.

    Raises:
    FeedImportError when the feed file did not pass validation.
    """
    filepath = download_feed_file(buid, data_dir=data_dir)
    jobfeed = DEv2JobFeed(filepath, jsid=buid, markdown=False, company=None)

    # A feed file that did not pass validation aborts the import with an
    # exception; nothing is written.
    if jobfeed.errors:
        error = jobfeed.error_messages
        logging.error("BUID:%s - Feed file has failed validation on line %s. "
                      "Exception: %s" % (buid, error['line'],
                                         error['exception']))
        raise FeedImportError(error)

    # The parse result is not used here; the call is kept as in the
    # original flow in case solr_jobs() depends on it.
    jobfeed.jobparse()
    jobs = jobfeed.solr_jobs()

    # A feed may contain no jobs. Executing a pymongo bulk operation with
    # nothing queued raises InvalidOperation, so stop before building one.
    if not jobs:
        return

    collection = connect_db().db.jobs
    bulk = collection.initialize_unordered_bulk_op()
    for job in jobs:
        # Replace the stored document with the same guid, or insert it.
        bulk.find({'guid': job['guid']}).upsert().replace_one(job)
    bulk.execute()
def update_solr(buid, download=True, force=True, set_title=False,
                delete_feed=True, data_dir=DATA_DIR, clear_cache=False):
    """
    Update the Solr master index with the data contained in a feed file
    for a given buid/jsid.

    This is meant to be a standalone function such that the state of the
    Solr index is not tied to the state of the database.

    Inputs:
    :buid: An integer; the ID for a particular business unit.
    :download: Boolean. If False, this process will not download a new
        feedfile, but instead use the one on disk. Should only be false
        for the purposes of our test suite.
    :force: Boolean. If True, every job seen in the feed file will be
        updated in the index. Otherwise, only the jobs seen in the feed
        file but not seen in the index will be updated. This latter
        option will soon be deprecated.
    :set_title: Boolean. If True, the business unit title is always
        overwritten with the feed's job source name.
    :delete_feed: Boolean. If True, the feed file is removed from disk
        once the update has finished.
    :data_dir: Directory used to download or locate the feed file.
    :clear_cache: Accepted for interface compatibility; not referenced
        in this function body.

    Returns:
    A 2-tuple consisting of the number of jobs added and the number
    deleted.

    Raises:
    FeedImportError when the feed file did not pass validation.

    Writes/Modifies:
    Job data found in the feed file is used to modify the Solr index. This
    includes adds & deletes. (Solr does not have a discrete equivalent to
    SQL's UPDATE; by adding a document with the same UID as a document in
    the index, the equivalent of an update operation is performed.) The
    business unit's title, modified dates and job count are saved as well.

    """
    if download:
        filepath = download_feed_file(buid, data_dir=data_dir)
    else:
        # Get current worker process id, to prevent race conditions.
        try:
            p = current_process()
            process_id = p.index
        except Exception:
            # Best effort: fall back to 0 when no worker index is
            # available. ``Exception`` (rather than a bare except) keeps
            # KeyboardInterrupt and SystemExit propagating.
            process_id = 0
        filepath = os.path.join(data_dir, str(process_id),
                                FEED_FILE_PREFIX + str(buid) + '.xml')

    bu = BusinessUnit.objects.get(id=buid)
    try:
        co = bu.company_set.all()[0]
    except IndexError:
        # The business unit has no company.
        co = None

    jobfeed = DEv2JobFeed(filepath, jsid=buid, markdown=bu.enable_markdown,
                          company=co)

    # A feed file that did not pass validation aborts the update with an
    # exception; the index is left untouched.
    if jobfeed.errors:
        error = jobfeed.error_messages
        logging.error("BUID:%s - Feed file has failed validation on line %s. "
                      "Exception: %s" % (buid, error['line'],
                                         error['exception']))
        raise FeedImportError(error)

    # The jobs parsed from the feed file.
    jobs = jobfeed.jobparse()
    # Build a set of all the UIDs for all those instances.
    job_uids = set([long(i.get('uid')) for i in jobs if i.get('uid')])
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    step1 = 1024

    # Get the count of all the results in the Solr index for this BUID.
    hits = conn.search("*:*", fq="buid:%s" % buid, facet="false",
                       mlt="false").hits

    # Create (start-index, stop-index) tuples to facilitate handling results
    # in ``step1``-sized chunks. So if ``hits`` returns 2048 results,
    # ``job_slices`` will look like ``[(0,1024), (1024, 2048)]``. Those
    # values are then used to slice up the total results.
    #
    # This was put in place because part of the logic to figuring out what
    # jobs to delete from and add jobs to the Solr index is using set
    # algebra. We convert the total list of UIDs in the index and the UIDs
    # in the XML feed to sets, then compare them via ``.difference()``
    # (seen below). However for very large feed files, say 10,000+ jobs,
    # this process was taking so long that the connection would time out. To
    # address this problem we break up the comparisons as described above.
    # This results in more requests but it alleviates the connection timeout
    # issue.
    job_slices = slices(range(hits), step=step1)
    results = [_solr_results_chunk(tup, buid, step1) for tup in job_slices]
    solr_uids = reduce(lambda x, y: x | y, results) if results else set()

    # Return the job UIDs that are in the Solr index but not in the feed
    # file.
    solr_del_uids = solr_uids.difference(job_uids)

    if not force:
        # Return the job UIDs that are in the feed file but not in the Solr
        # index.
        solr_add_uids = job_uids.difference(solr_uids)
        # ``jobfeed.solr_jobs()`` yields a list of dictionaries. We want to
        # filter out any dictionaries whose "uid" key is not in
        # ``solr_add_uids``. This is because by default we only want to add
        # new documents (which each ``solr_jobs()`` dictionary represents),
        # not update.
        add_docs = filter(lambda x: int(x.get("uid", 0)) in solr_add_uids,
                          jobfeed.solr_jobs())
    else:
        # Instead of adding only the documents with UIDs that are in the feed
        # file but not in the Solr index, we're going to add ALL the documents
        # in the feed file. This will add the new documents of course, but it
        # will also update existing documents with any new data. Uniqueness of
        # the documents is ensured by the ``id`` field defined in the Solr
        # schema (the template for which can be seen in
        # templates/search_configuration/solr.xml). At the very bottom you'll
        # see <uniqueKey>id</uniqueKey>. This serves as the equivalent of the pk
        # (i.e. globally unique) in a database.
        solr_add_uids = job_uids
        add_docs = jobfeed.solr_jobs()

    # Slice up ``add_docs`` in chunks of 4096. This is because the
    # maxBooleanClauses setting in solrconfig.xml is set to 4096. This means
    # if we used any more than that Solr would throw an error and our
    # updates wouldn't get processed.
    add_steps = slices(range(len(solr_add_uids)), step=4096)
    # Same concept as ``add_docs``.
    del_steps = slices(range(len(solr_del_uids)), step=4096)
    # Create a generator that yields 2-tuples with each invocation. The
    # 2-tuples consist of one tuple each from del_steps & add_steps. Any
    # mismatched values (e.g. there are more del_steps than add_steps)
    # will be compensated for with the ``fillvalue``.
    zipped_steps = izip_longest(del_steps, add_steps, fillvalue=(0, 0))

    # The set is not modified below, so it is converted to a list once
    # and sliced per chunk.
    del_uid_list = list(solr_del_uids)

    # NOTE(review): the ``+ 1`` on the slice ends assumes ``slices``
    # returns inclusive stop indexes; if the stops are exclusive, adjacent
    # chunks overlap by one element. Confirm against ``slices``.
    for tup in zipped_steps:
        update_chunk = add_docs[tup[1][0]:tup[1][1] + 1]
        if update_chunk:
            logging.debug("BUID:%s - SOLR - Update chunk: %s" %
                          (buid, [i['uid'] for i in update_chunk]))
            # Pass 'commitWithin' so that Solr doesn't try to commit the new
            # docs right away. This will help relieve some of the resource
            # stress during the daily update. The value is expressed in
            # milliseconds.
            conn.add(update_chunk, commitWithin="30000")

        delete_chunk = _build_solr_delete_query(
            del_uid_list[tup[0][0]:tup[0][1] + 1])
        if delete_chunk:
            # Post-a-job jobs should not be deleted during import
            delete_chunk = "(%s) AND -is_posted:true" % delete_chunk
            logging.debug("BUID:%s - SOLR - Delete chunk: %s" %
                          (buid, del_uid_list))
            conn.delete(q=delete_chunk)

    # delete any jobs that may have been added via etl_to_solr
    conn.delete(q="buid:%s AND !uid:[0 TO *]" % buid)

    # Update business unit information: title, dates, and associated_jobs
    if set_title or not bu.title or (bu.title != jobfeed.job_source_name and
                                     jobfeed.job_source_name):
        bu.title = jobfeed.job_source_name
    updated = bool(solr_add_uids) or bool(solr_del_uids)
    _update_business_unit_modified_dates(bu, jobfeed.crawled_date,
                                         updated=updated)
    bu.associated_jobs = len(jobs)
    bu.save()

    # Update the Django database to reflect company additions and name changes
    add_company(bu)

    if delete_feed:
        os.remove(filepath)
        logging.info("BUID:%s - Deleted feed file." % buid)
    return len(solr_add_uids), len(solr_del_uids)
def test_parse_malformed_feed(self):
    """
    Constructing a feed from a malformed feed file must not raise an
    unhandled exception.
    """
    # There is nothing to assert: the test passes if construction returns.
    DEv2JobFeed(self.malformed_feed, jsid=0)
def test_unused_fields(self):
    """
    A feed containing fields the importer does not use must still build
    its Solr jobs and parse to exactly one job.
    """
    feed = DEv2JobFeed(self.unused_field_feed, jsid=self.businessunit.id)
    # The return value is not inspected; the call only has to succeed.
    feed.solr_jobs()
    parsed_jobs = feed.jobparse()
    self.assertEqual(len(parsed_jobs), 1)
def test_no_onets(self):
    """The first job of the no-onet feed must have an empty ``onet`` value."""
    feed = DEv2JobFeed(self.no_onet_feed, jsid=0)
    first_job = feed.solr_jobs()[0]
    self.assertEqual(first_job['onet'], '')