def test_analytics_delete_old_data(self):
    """ When Solr is updated with analytics data, we should delete all
    docs from the "current" collection older than 30 days. """
    solr = Solr()

    def analytics_state():
        # Fetch every analytics doc plus the set of its uids in one place,
        # since the test needs both after each mutation of the index.
        found = solr.search(q='doc_type:analytics')
        return found, {doc['uid'] for doc in found.docs}

    # Index logs back-dated 31 days; these are the ones delete should prune
    stale_logs = [MockLog(log_type=kind, delta=datetime.timedelta(days=-31))
                  for kind in ['analytics', 'redirect']]
    parse_log(stale_logs, self.test_solr)
    results, old_uids = analytics_state()
    self.assertEqual(results.hits, 2, 'Old logs were not added')

    # Index logs timestamped for today; these must survive the delete
    fresh_logs = [MockLog(log_type=kind)
                  for kind in ['analytics', 'redirect']]
    parse_log(fresh_logs, self.test_solr)
    results, all_uids = analytics_state()
    self.assertEqual(results.hits, 4, 'New logs were not added')

    # delete_old_analytics_docs is called after parse_logs in
    # read_new_logs and has not been called yet. Call it now
    delete_old_analytics_docs()
    results, new_uids = analytics_state()
    self.assertEqual(results.hits, 2, 'Old logs were not deleted')

    # Ensure that the correct documents have been added/removed:
    # the pruned (old) and surviving (new) uid sets must share nothing...
    self.assertTrue(
        old_uids.isdisjoint(new_uids),
        'Sets are not disjoint; Intersecting elements: %s' % str(
            old_uids.intersection(new_uids)))
    # ...and, being disjoint, must jointly account for every uid seen
    self.assertEqual(
        old_uids.union(new_uids), all_uids,
        'Sets are not equal; difference: %s' % str(
            old_uids.union(new_uids).symmetric_difference(all_uids)))
def test_analytics_log_parsing(self):
    """ Ensure that analytics logs are parsed and stored in solr
    correctly """
    company = CompanyFactory(id=1)
    business_unit = BusinessUnitFactory(id=1000)
    company.job_source_ids.add(business_unit)

    # match and no_match will be used later to ensure that the correct
    # number of documents were associated with a company or associated
    # with the default company.
    # NOTE: the lambdas deliberately close over ``doc``, which is only
    # bound by the ``for doc in results.docs`` loop below — by the time
    # either mock is invoked, ``doc`` refers to the current document.
    match = Mock(
        wraps=lambda: self.assertEqual(doc['company_id'], company.pk))
    no_match = Mock(
        wraps=lambda: self.assertEqual(doc['company_id'], 999999))

    for log_type in ['analytics', 'redirect']:
        log = MockLog(log_type=log_type)
        parse_log([log], self.test_solr)

        solr = Solr()
        results = solr.search(q='uid:analytics*')

        # fake logs contain two lines - one human and one bot hit
        # If it is getting processed correctly, there should be only one
        # hit recorded
        self.assertEqual(results.hits, 1)

        multi_field = 'facets'
        if log_type == 'redirect':
            # redirect logs carry no facet data at all
            with self.assertRaises(KeyError):
                results.docs[0][multi_field]
        else:
            self.assertEqual(len(results.docs[0][multi_field]), 2)

        for field in results.docs[0].keys():
            if field != multi_field:
                # BUGFIX: was ``assertTrue(type(doc[field] != list))``,
                # which asserted the truthiness of ``bool`` (the type of
                # the comparison result) and therefore could never fail.
                # The intent is that only ``multi_field`` may be
                # list-valued; every other field must be scalar.
                self.assertNotIsInstance(results.docs[0][field], list)

        # aguid must always be a well-formed UUID...
        uuid.UUID(results.docs[0]['aguid'])
        # ...but no user guid should be present for an anonymous hit
        with self.assertRaises(KeyError):
            results.docs[0]['User_user_guid']

        for doc in results.docs:
            if doc['job_view_buid'] == business_unit.pk:
                # If business units match, company ids should match
                match()
            else:
                # Business units don't match; company id should be set
                # to the default company
                no_match()
        solr.delete()

        # Re-parse the same log with a known user present; the user's
        # guid should now be attached to the stored document.
        user = UserFactory(email="*****@*****.**")
        user.user_guid = '1e5f7e122156483f98727366afe06e0b'
        user.save()
        parse_log([log], self.test_solr)
        results = solr.search(q='uid:analytics*')
        for guid in ['aguid', 'User_user_guid']:
            uuid.UUID(results.docs[0][guid])
        solr.delete()
        user.delete()

    # We have already determined that there are only two documents.
    # Ensure that there is exactly one document that matches a specific
    # company and one document that was given the default company
    self.assertEqual(match.call_count, 1)
    self.assertEqual(no_match.call_count, 1)