Example 1
def bread_box_title_heading(title_slug_value, jobs=None):
    if (not title_slug_value and not jobs) or not title_slug_value:
        return None

    if jobs:
        job = jobs[0]
        if title_slug_value == job.title_slug:
            return job.title
        else:
            for job in jobs:
                if title_slug_value == job.title_slug:
                    return job.title

    # Try searching solr for a matching title.
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    try:
        search_terms = {
            'q': u'title_slug:%s' % title_slug_value,
            'fl': 'title, title_slug',
            'rows': 1,
        }
        res = conn.search(**search_terms)
    except SolrError:
        # Poorly formatted title_slug_values can sometimes cause Solr errors.
        res = None

    if res and res.docs[0].get('title_slug') == title_slug_value:
        return res.docs[0]['title']
    else:
        if title_slug_value:
            return title_slug_value.replace('-', ' ').title()
        else:
            return None
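A minimal usage sketch of the fallback order above; the call site and the job objects are hypothetical and only assume the `title`/`title_slug` attributes the function itself relies on.

# Hypothetical call: ``page_jobs`` is any iterable of job objects exposing
# ``title`` and ``title_slug``, as the function above expects.
heading = bread_box_title_heading('retail-sales-associate', jobs=page_jobs)
# Resolution order:
#   1. the title of a job whose title_slug matches,
#   2. otherwise the title of a Solr document with that title_slug,
#   3. otherwise 'Retail Sales Associate', derived from the slug itself.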
Example 2
def bread_box_title_heading(title_slug_value, jobs=None):
    if (not title_slug_value and not jobs) or not title_slug_value:
        return None

    if jobs:
        job = jobs[0]
        if title_slug_value == job.title_slug:
            return job.title
        else:
            for job in jobs:
                if title_slug_value == job.title_slug:
                    return job.title

    # Try searching solr for a matching title.
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    try:
        search_terms = {
            'q': u'title_slug:%s' % title_slug_value,
            'fl': 'title, title_slug',
            'rows': 1,
        }
        res = conn.search(**search_terms)
    except SolrError:
        # Poorly formatted title_slug_values can sometimes cause Solr errors.
        res = None

    if res and res.docs[0].get('title_slug') == title_slug_value:
        return res.docs[0]['title']
    else:
        if title_slug_value:
            return title_slug_value.replace('-', ' ').title()
        else:
            return None
Example 3
def clear_solr(buid):
    """Delete all jobs for a given business unit/job source."""
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    hits = conn.search(q="*:*", rows=1, mlt="false", facet="false").hits
    logging.info("BUID:%s - SOLR - Deleting all %s jobs" % (buid, hits))
    conn.delete(q="buid:%s" % buid)
    logging.info("BUID:%s - SOLR - All jobs deleted." % buid)
Example 4
def add_jobs(jobs, upload_chunk_size=1024):
    """
    Loads a solr-ready json list of jobs into solr.

    inputs:
        :jobs: A list of solr-ready, json-formatted jobs.

    outputs:
        The number of jobs loaded into solr.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    num_jobs = len(jobs)
    # AT&T showed that large numbers of MOCs can cause import issues due to
    # the size of documents. Therefore, lower the document chunk size when
    # processing AT&T.
    for job in jobs:
        if int(job.get('buid', 0)) == 19389:
            logger.warn("AT&T has large amounts of mapped_mocs, that cause problems.  Reducing chunk size.")
            upload_chunk_size = 64
            break

    # Chunk them
    jobs = chunk(jobs, upload_chunk_size)
    for job_group in jobs:
        conn.add(list(job_group))
    return num_jobs
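The `chunk` helper is not shown in these examples. A minimal sketch of what it might look like, assuming it simply yields successive groups of at most `size` items from an iterable; the real helper may differ (for instance it may yield generators rather than lists), which is why `add_jobs` wraps each group in `list()`.

from itertools import islice

def chunk(iterable, size=1024):
    # Hypothetical sketch: yield successive groups of at most ``size`` items.
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, size))
        if not group:
            break
        yield group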
Example 5
def clear_solr(buid):
    """Delete all jobs for a given business unit/job source."""
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    hits = conn.search(q="*:*", rows=1, mlt="false", facet="false").hits
    logging.info("BUID:%s - SOLR - Deleting all %s jobs" % (buid, hits))
    conn.delete(q="buid:%s" % buid)
    logging.info("BUID:%s - SOLR - All jobs deleted." % buid)
Example 6
class DirectSEOBase(TestCase):
    def setUp(self):
        db_backend = settings.DATABASES['default']['ENGINE'].split('.')[-1]

        # Set columns that are utf8 in production to utf8
        if db_backend == 'mysql':
            cursor = connections['default'].cursor()
            cursor.execute("alter table seo_customfacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositefacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_company convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_queryredirect convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_tag convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_taggeditem convert to "
                           "character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositeredirect convert to "
                           "character set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table django_redirect convert to "
                           "character set utf8 collate utf8_unicode_ci")

        setattr(settings, 'ROOT_URLCONF', 'dseo_urls')
        setattr(settings, "PROJECT", 'dseo')
        clear_url_caches()

        self.base_middleware_classes = settings.MIDDLEWARE_CLASSES
        middleware_classes = self.base_middleware_classes + (
            'wildcard.middleware.WildcardMiddleware',
            'middleware.RedirectOverrideMiddleware')
        setattr(settings, 'MIDDLEWARE_CLASSES', middleware_classes)

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            "social_links.context_processors.social_links_context",
            "seo.context_processors.site_config_context",
        )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        context._standard_context_processors = None

        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.delete(q="*:*")
        cache.clear()
        clear_url_caches()

        setattr(settings, 'MEMOIZE', False)

    def tearDown(self):
        from django.conf import settings
        from django.template import context

        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS',
                self.base_context_processors)
        context._standard_context_processors = None
        setattr(settings, 'MIDDLEWARE_CLASSES',
                self.base_middleware_classes)
Example 7
def _solr_results_chunk(tup, buid, step):
    """
    Takes a (start_index, stop_index) tuple and gets the results in that
    range from the Solr index.

    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    results = conn.search("*:*", fq="buid:%s" % buid, fl="uid",
                          rows=step, start=tup[0], facet="false",
                          mlt="false").docs
    return set([i['uid'] for i in results if 'uid' in i])
Example 8
    def __init__(self, connection_alias, **connection_options):
        """
        Inputs:
        :HTTP_AUTH_USERNAME: Username used for http authentication
        :HTTP_AUTH_PASSWORD: Password used for http authentication

        """
        super(DESolrSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        user = connection_options.get("HTTP_AUTH_USERNAME")
        passwd = connection_options.get("HTTP_AUTH_PASSWORD")
        self.conn = Solr(connection_options['URL'], auth=(user, passwd),
                         timeout=self.timeout)
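For reference, the backend pulls its credentials out of the Haystack connection options, so the settings entry would look roughly like the sketch below; the engine dotted path and credentials are placeholders, not values taken from these examples.

# Hypothetical Haystack settings sketch for the backend above.
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'path.to.DESolrEngine',        # assumed dotted path
        'URL': 'http://127.0.0.1:8983/solr/seo',
        'HTTP_AUTH_USERNAME': 'solr_user',       # placeholder
        'HTTP_AUTH_PASSWORD': 'solr_password',   # placeholder
    },
}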
Example 9
    def setUp(self):
        super(SiteTestCase, self).setUp()
        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.delete(q="*:*")
        self.businessunit = factories.BusinessUnitFactory(id=0)
        self.buid = self.businessunit.id
        self.filepath = os.path.join(import_jobs.DATA_DIR,
                                     'dseo_feed_%s.xml' % self.buid)
        SeoSite.objects.all().delete()
        self.site = factories.SeoSiteFactory(id=1)

        self.configuration = factories.ConfigurationFactory(status=2)
        self.configuration.save()
        self.site.configurations.clear()
        self.site.configurations.add(self.configuration)
Example 10
def _solr_results_chunk(tup, buid, step):
    """
    Takes a (start_index, stop_index) tuple and gets the results in that
    range from the Solr index.

    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    results = conn.search("*:*",
                          fq="buid:%s" % buid,
                          fl="uid",
                          rows=step,
                          start=tup[0],
                          facet="false",
                          mlt="false").docs
    return set([i['uid'] for i in results if 'uid' in i])
Example 11
    def setUp(self):
        super(MyJobsBase, self).setUp()
        settings.ROOT_URLCONF = "myjobs_urls"
        settings.PROJECT = "myjobs"

        self.app_access = AppAccessFactory()
        self.activities = [
            ActivityFactory(name=activity, app_access=self.app_access)
            for activity in [
                "create communication record", "create contact",
                "create partner saved search", "create partner", "create role",
                "create tag", "create user", "delete tag", "delete partner",
                "delete role", "delete user", "read contact",
                "read communication record", "read partner saved search",
                "read partner", "read role", "read user", "read tag",
                "update communication record", "update contact",
                "update partner", "update role", "update tag", "update user",
                "read outreach email address", "create outreach email address",
                "delete outreach email address",
                "update outreach email address", "read outreach record",
                "convert outreach record", "view analytics"
            ]
        ]

        self.company = CompanyFactory(app_access=[self.app_access])
        # this role will be populated by activities on a test-by-test basis
        self.role = RoleFactory(company=self.company, name="Admin")
        self.user = UserFactory(roles=[self.role], is_staff=True)

        cache.clear()
        clear_url_caches()
        self.ms_solr = Solr(settings.SOLR['seo_test'])
        self.ms_solr.delete(q='*:*')

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            'mymessages.context_processors.message_lists', )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        setattr(settings, 'MEMOIZE', False)

        self.patcher = patch('urllib2.urlopen', return_file())
        self.mock_urlopen = self.patcher.start()

        self.client = TestClient()
        self.client.login_user(self.user)
Example 12
    def setUp(self):
        super(ImportJobsTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid_id = self.businessunit.id
        self.filepath = os.path.join(DATA_DIR, '0',
                                     'dseo_feed_%s.xml' % self.buid_id)
        self.solr_settings = {
            'default': {'URL': 'http://127.0.0.1:8983/solr/seo'}
        }
        self.solr = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
Example 13
def remove_expired_jobs(buid, active_jobs, upload_chunk_size=1024):
    """
    Given a job source id and a list of active jobs for that job source,
    remove the jobs in solr that are not among the active jobs.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    count = conn.search("*:*", fq="buid:%s" % buid, facet="false",
                        mlt="false").hits
    old_jobs = conn.search("*:*", fq="buid:%s" % buid, facet="false",
                           rows=count, mlt="false").docs
    active_ids = set(j['id'] for j in active_jobs)
    old_ids = set(j['id'] for j in old_jobs)
    expired = old_ids - active_ids
    chunks = chunk(list(expired), upload_chunk_size)
    for jobs in chunks:
        query = "id:(%s)" % " OR ".join([str(x) for x in jobs])
        conn.delete(q=query)
    return expired
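A worked sketch of the id arithmetic above, with purely hypothetical ids.

# Hypothetical ids, only to illustrate the set difference used above.
active_jobs = [{'id': 1}, {'id': 2}, {'id': 3}]
old_ids = set([1, 2, 3, 4, 5])                    # ids currently in Solr for the buid
active_ids = set(j['id'] for j in active_jobs)    # set([1, 2, 3])
expired = old_ids - active_ids                    # set([4, 5])
# remove_expired_jobs() would then issue a delete query like "id:(4 OR 5)".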
Example 14
def add_jobs(jobs, upload_chunk_size=1024):
    """
    Loads a solr-ready json list of jobs into solr.

    inputs:
        :jobs: A list of solr-ready, json-formatted jobs.

    outputs:
        The ids of jobs loaded into solr.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])

    # Chunk them
    jobs = chunk(jobs, upload_chunk_size)
    job_ids = list()
    for job_group in jobs:
        job_group = list(job_group)
        conn.add(job_group)
        job_ids.extend(j['id'] for j in job_group)
    return job_ids
Example 15
def add_jobs(jobs, upload_chunk_size=1024):
    """
    Loads a solr-ready json list of jobs into solr.

    inputs:
        :jobs: A list of solr-ready, json-formatted jobs.

    outputs:
        The ids of jobs loaded into solr.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS["default"]["URL"])

    # Chunk them
    jobs = chunk(jobs, upload_chunk_size)
    job_ids = list()
    for job_group in jobs:
        job_group = list(job_group)
        conn.add(job_group)
        job_ids.extend(j["id"] for j in job_group)
    return job_ids
Example 16
def delete_by_guid(guids):
    """
    Removes jobs from solr by guid.

    inputs:
        :guids: A list of guids

    outputs:
        The number of jobs that were requested to be deleted. This may
        be higher than the number of actual jobs deleted if a guid
        passed in did not correspond to a job in solr.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    if not guids:
        return 0
    num_guids = len(guids)
    guids = chunk(guids)
    for guid_group in guids:
        delete_str = " OR ".join(guid_group)
        conn.delete(q="guid: (%s)" % delete_str)
    return num_guids
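An illustrative call with made-up guids; note that the return value counts requested deletions, not documents actually removed.

# Hypothetical guids, purely illustrative.
requested = delete_by_guid(['ce2ca701eeca4c1396bae6bde9cb7060',
                            '0f47c11113344d9e920d4b3fe1a4f0a1'])
# Sends a delete query of the form ``guid: (<guid1> OR <guid2>)`` per chunk
# and returns 2, even if one of the guids was never in the index.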
Example 17
    def setUp(self):
        super(JobFeedTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid_id = self.businessunit.id
        self.numjobs = 14
        self.testdir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                    'data')
        self.company = CompanyFactory()
        self.company.job_source_ids.add(self.businessunit)
        self.company.save()
        self.conn = Solr("http://127.0.0.1:8983/solr/seo")
        self.emptyfeed = os.path.join(self.testdir, "dseo_feed_0.no_jobs.xml")
        self.malformed_feed = os.path.join(self.testdir, 'dseo_malformed_feed_0.xml')
        self.invalid_feed = os.path.join(self.testdir, 'dseo_invalid_feed_0.xml')
        self.unused_field_feed = os.path.join(self.testdir, 'dseo_feed_1.xml')
        self.no_onet_feed = os.path.join(self.testdir, 'dseo_feed_no_onets.xml')

        # Ensures DATA_DIR used by import_jobs.download_feed_file exists
        data_path = DATA_DIR
        if not os.path.exists(data_path):
            os.mkdir(data_path)
Example 18
class MyJobsBase(TestCase):
    def setUp(self):
        from django.conf import settings
        setattr(settings, 'ROOT_URLCONF', 'myjobs_urls')
        cache.clear()
        clear_url_caches()
        self.ms_solr = Solr('http://127.0.0.1:8983/solr/seo')
        self.ms_solr.delete(q='*:*')
        setattr(settings, "PROJECT", 'myjobs')

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            'mymessages.context_processors.message_lists',
        )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        setattr(settings, 'MEMOIZE', False)

    def tearDown(self):
        self.ms_solr.delete(q='*:*')
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS',
                self.base_context_processors)
Example 19
def delete_by_guid(guids):
    """
    Removes jobs from solr by guid.

    inputs:
        :guids: A list of guids

    outputs:
        The number of jobs that were requested to be deleted. This may
        be higher than the number of actual jobs deleted if a guid
        passed in did not correspond to a job in solr.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    if not guids:
        return 0
    num_guids = len(guids)
    guids = chunk(guids)
    for guid_group in guids:
        delete_str = " OR ".join(guid_group)
        conn.delete(q="guid: (%s)" % delete_str)
    return num_guids
Example 20
    def setUp(self):
        db_backend = settings.DATABASES['default']['ENGINE'].split('.')[-1]

        # Set columns that are utf8 in production to utf8
        if db_backend == 'mysql':
            cursor = connections['default'].cursor()
            cursor.execute("alter table seo_customfacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositefacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_company convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_tag convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_taggeditem convert to "
                           "character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositeredirect convert to "
                           "character set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table django_redirect convert to "
                           "character set utf8 collate utf8_unicode_ci")

        setattr(settings, 'ROOT_URLCONF', 'dseo_urls')
        setattr(settings, "PROJECT", 'dseo')
        clear_url_caches()

        self.base_middleware_classes = settings.MIDDLEWARE_CLASSES
        middleware_classes = self.base_middleware_classes + (
            'wildcard.middleware.WildcardMiddleware', )
        setattr(settings, 'MIDDLEWARE_CLASSES', middleware_classes)

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            "social_links.context_processors.social_links_context",
            "seo.context_processors.site_config_context",
        )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        context._standard_context_processors = None

        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.delete(q="*:*")
        cache.clear()
        clear_url_caches()

        # Change the solr engine to one that has been extended
        # for testing purposes.
        self.default_engine = settings.HAYSTACK_CONNECTIONS['default']['ENGINE']
        self.engine = 'seo.tests.setup.TestDESolrEngine'
        settings.HAYSTACK_CONNECTIONS['default']['ENGINE'] = self.engine
        haystack_connections.reload('default')

        setattr(settings, 'MEMOIZE', False)
Example 21
    def __init__(self, connection_alias, **connection_options):
        """
        Inputs:
        :HTTP_AUTH_USERNAME: Username used for http authentication
        :HTTP_AUTH_PASSWORD: Password used for http authentication

        """
        super(DESolrSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        user = connection_options.get("HTTP_AUTH_USERNAME")
        passwd = connection_options.get("HTTP_AUTH_PASSWORD")
        self.conn = Solr(connection_options['URL'], auth=(user, passwd),
                         timeout=self.timeout)
Example 22
    def setUp(self):
        super(SiteTestCase, self).setUp()
        self.conn = Solr("http://127.0.0.1:8983/solr/seo")
        self.conn.delete(q="*:*")
        self.businessunit = factories.BusinessUnitFactory(id=0)
        self.buid = self.businessunit.id
        self.filepath = os.path.join(import_jobs.DATA_DIR, "dseo_feed_%s.xml" % self.buid)
        SeoSite.objects.all().delete()
        self.site = factories.SeoSiteFactory(id=1)

        self.configuration = factories.ConfigurationFactory(status=2)
        self.configuration.save()
        self.site.configurations.clear()
        self.site.configurations.add(self.configuration)
Example 23
    def setUp(self):
        self.solr_settings = {
            'default': {'URL': 'http://127.0.0.1:8983/solr/seo'}
        }
        self.solr = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
        self.solr.delete(q="*:*")

        self.zipfile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    'data',
                                    'ActiveDirectory_ce2ca701-eeca-4c13-96ba-e6bde9cb7060.zip') 
        
        with open(self.zipfile) as zf:
            self.jobs = list(get_jobs_from_zipfile(zf, "ce2ca701-eeca-4c13-96ba-e6bde9cb7060"))
            
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid = self.businessunit.id
        self.guid = 'ce2ca701-eeca-4c13-96ba-e6bde9cb7060'
        self.name = "Test"
Example 24
    def setUp(self):
        settings.ROOT_URLCONF = "myjobs_urls"
        settings.PROJECT = "myjobs"

        self.app_access = AppAccessFactory()
        self.activities = [
            ActivityFactory(name=activity, app_access=self.app_access)
            for activity in [
                "create communication record", "create contact",
                "create partner saved search", "create partner", "create role",
                "create tag", "create user", "delete tag", "delete partner",
                "delete role", "delete user", "read contact",
                "read communication record", "read partner saved search",
                "read partner", "read role", "read user", "read tag",
                "update communication record", "update contact",
                "update partner", "update role", "update tag", "update user",
                "read outreach email address", "create outreach email address",
                "delete outreach email address",
                "update outreach email address", "read outreach record",
                "convert outreach record"]]

        self.company = CompanyFactory(app_access=[self.app_access])
        # this role will be populated by activities on a test-by-test basis
        self.role = RoleFactory(company=self.company, name="Admin")
        self.user = UserFactory(roles=[self.role], is_staff=True)

        cache.clear()
        clear_url_caches()
        self.ms_solr = Solr(settings.SOLR['seo_test'])
        self.ms_solr.delete(q='*:*')

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            'mymessages.context_processors.message_lists',
        )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        setattr(settings, 'MEMOIZE', False)

        self.patcher = patch('urllib2.urlopen', return_file())
        self.mock_urlopen = self.patcher.start()

        self.client = TestClient()
        self.client.login_user(self.user)
Example 25
    def setUp(self):
        super(JobFeedTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid_id = self.businessunit.id
        self.numjobs = 14
        self.testdir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 
                                    'data')
        self.company = CompanyFactory()
        self.company.job_source_ids.add(self.businessunit)
        self.company.save()
        self.conn = Solr("http://127.0.0.1:8983/solr/seo")
        self.emptyfeed = os.path.join(self.testdir, "dseo_feed_0.no_jobs.xml")
        self.malformed_feed = os.path.join(self.testdir, 'dseo_malformed_feed_0.xml')
        self.invalid_feed = os.path.join(self.testdir, 'dseo_invalid_feed_0.xml')
        self.unused_field_feed = os.path.join(self.testdir, 'dseo_feed_1.xml')
        self.no_onet_feed = os.path.join(self.testdir, 'dseo_feed_no_onets.xml')

        # Ensures DATA_DIR used by import_jobs.download_feed_file exists
        data_path = DATA_DIR
        if not os.path.exists(data_path):
            os.mkdir(data_path)
Example 26
def remove_expired_jobs(buid, active_ids, upload_chunk_size=1024):
    """
    Given a job source id and a list of active job ids for that job source,
    remove the jobs in solr that are not among the active jobs.
    """
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])

    active_ids = set(active_ids)

    count = conn.search("*:*", fq="buid:%s" % buid, facet="false",
                        mlt="false").hits
    old_jobs = conn.search("*:*",
                           fq="buid:%s" % buid,
                           facet="false",
                           rows=count,
                           mlt="false").docs
    old_ids = set(j['id'] for j in old_jobs)
    expired = old_ids - active_ids
    chunks = chunk(list(expired), upload_chunk_size)
    for jobs in chunks:
        query = "id:(%s)" % " OR ".join([str(x) for x in jobs])
        conn.delete(q=query)
    return expired
Example 27
class SitemapTestCase(DirectSEOBase):
    def setUp(self):
        super(SitemapTestCase, self).setUp()
        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.add(SOLR_FIXTURE)

    def test_index(self):
        resp = self.client.get("/sitemap.xml")
        self.assertEqual(resp.status_code, 200)
        
    def test_no_buid_sitemap(self):
        """
        Test to verify that a sitemap is generated with sites that have no
        BUID.
        
        """
        site = SeoSite.objects.get(id=1)
        site.business_units = []
        site.save()
        today = datetime.datetime.today()
        dt = datetime.date(*today.timetuple()[0:3]).isoformat()
        resp = self.client.get("/sitemap-" + dt + ".xml")
        self.assertTrue("<url>" in resp.content)

    def test_noreverse(self):
        """
        Test to ensure that jobs with bad/ugly data do not block the
        creation of a sitemap page, but instead are just skipped over in
        `SolrSitemap.get_urls()`.

        This is a regression test. It was prompted by a job in a job feed
        file having "~" in the "city" field. Because our URL pattern
        doesn't recognize that character in its regex, it caused a
        `NoReverseMatch` exception to be thrown. Instead of adding a
        tilde, we want to be able to handle any weird characters not
        specified in our URL config.
        
        """
        # Sometimes the site settings are messed up from other tests. Ensure
        # that the settings are compatible with actually searching for the
        # jobs we're adding.
        settings.SITE_BUIDS = []
        site = SeoSite.objects.get(pk=1)
        site.business_units = []
        site.save()

        # These are the kwargs from the job that originally triggered this
        # error.
        kwargs = {
            'location': '~, WV',
            'title': '911 Coordinator',
            'uid': '25901630'
        }
        job = dict(SOLR_FIXTURE[0])
        job.update(kwargs)
        self.conn.add([job])
        today = datetime.datetime.now()
        dt = today.date().isoformat()
        resp = self.client.get("/sitemap-" + dt + ".xml")
        self.assertEqual(resp.status_code, 200)
        self.assertTrue("<url>" in resp.content)
        
    def tearDown(self):
        super(SitemapTestCase, self).tearDown()
        self.conn.delete("*:*")
Example 28
    def setUp(self):
        db_backend = settings.DATABASES['default']['ENGINE'].split('.')[-1]

        # Set columns that are utf8 in production to utf8
        if db_backend == 'mysql':
            cursor = connections['default'].cursor()
            cursor.execute("alter table seo_customfacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositefacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_company convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_queryredirect convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_tag convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_taggeditem convert to "
                           "character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositeredirect convert to "
                           "character set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table django_redirect convert to "
                           "character set utf8 collate utf8_unicode_ci")
            # We have a data migration that does this, but we don't run
            # migrations during tests (Django 1.6.5).
            cursor.execute("ALTER TABLE django_flatpage CONVERT TO "
                           "CHARACTER SET utf8 COLLATE utf8_general_ci")
            cursor.execute("ALTER TABLE seo_custompage CONVERT TO "
                           "CHARACTER SET utf8 COLLATE utf8_general_ci")

        setattr(settings, 'ROOT_URLCONF', 'dseo_urls')
        setattr(settings, "PROJECT", 'dseo')
        clear_url_caches()

        self.base_middleware_classes = settings.MIDDLEWARE_CLASSES
        middleware_classes = self.base_middleware_classes + (
            'wildcard.middleware.WildcardMiddleware',
            'middleware.RedirectOverrideMiddleware')
        setattr(settings, 'MIDDLEWARE_CLASSES', middleware_classes)

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            "social_links.context_processors.social_links_context",
            "seo.context_processors.site_config_context",
        )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        context._standard_context_processors = None

        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.delete(q="*:*")
        cache.clear()
        clear_url_caches()

        setattr(settings, 'MEMOIZE', False)

        # As we added tests that created more and more companies, we
        # approached the hardcoded companies in import_jobs_testdata.json.
        # When we hit those ids, we began to get IntegrityErrors during
        # testing. Reset the sequence used by CompanyFactory to clear this
        # build-up.
        CompanyFactory.reset_sequence()
Example 29
class DESolrSearchBackend(SolrSearchBackend):
    def __init__(self, connection_alias, **connection_options):
        """
        Inputs:
        :HTTP_AUTH_USERNAME: Username used for http authentication
        :HTTP_AUTH_PASSWORD: Password used for http authentication

        """
        super(DESolrSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        user = connection_options.get("HTTP_AUTH_USERNAME")
        passwd = connection_options.get("HTTP_AUTH_PASSWORD")
        self.conn = Solr(connection_options['URL'], auth=(user, passwd),
                         timeout=self.timeout)

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               within=None, dwithin=None, distance_point=None,
               limit_to_registered_models=None, result_class=None,
               facet_mincount=None, facet_limit=None, facet_prefix=None,
               facet_sort=None, facet_offset=None, bf=None, **kwargs):
        """
        Overrides both search() and build_search_kwargs().

        """

        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        kwargs = {
            'fl': '* score',
            'mlt': 'false'
        }

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)
            kwargs['fl'] = fields
        # This code was causing sort_by to break, but we're keeping it as a
        # reference in case we want to enable geographic sorting in the future.
        # Haystack does have an order_by_distance function, so this code might
        # not be necessary
        # Jason McLaughlin 10/30/2012
        # geo_sort = False
        # if sort_by is not None:
        #     if sort_by in ['distance asc', 'distance desc'] and distance_point:
        #         # Do the geo-enabled sort.
        #         lng, lat = distance_point['point'].get_coords()
        #         kwargs['sfield'] = distance_point['field']
        #         kwargs['pt'] = '%s,%s' % (lat, lng)
        #         geo_sort = True
        #
        #         if sort_by == 'distance asc':
        #             kwargs['sort'] = 'geodist() asc'
        #         else:
        #             kwargs['sort'] = 'geodist() desc'
        #     else:
        #         if sort_by.startswith('distance '):
        #              warnings.warn("In order to sort by distance, "
        #                            "you must call the '.distance(...)' "
        #                            "method.")

        if sort_by is not None:
            # Regular sorting.
            kwargs['sort'] = sort_by

        if bf is not None:
            kwargs['bf'] = bf

        if start_offset is not None:
            kwargs['start'] = start_offset

        if end_offset is not None:
            kwargs['rows'] = end_offset - start_offset

        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fl'] = 'description'
            kwargs['hl.alternateField'] = 'description'
            kwargs['hl.maxAlternateFieldLength'] = \
                settings.SEARCH_FRAGMENT_SIZE
            kwargs['hl.requireFieldMatch'] = 'true'
            kwargs['hl.fragsize'] = settings.SEARCH_FRAGMENT_SIZE
            kwargs['hl.snippets'] = settings.SEARCH_SNIPPETS
            kwargs['hl.mergeContiguous'] = 'true'
            kwargs['hl.simple.pre'] = '###{{{###'
            kwargs['hl.simple.post'] = '###}}}###'

        if self.include_spelling is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1

            if spelling_query:
                kwargs['spellcheck.q'] = spelling_query

        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value)

        if facet_mincount is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.mincount'] = facet_mincount

        if facet_limit is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.limit'] = facet_limit

        if facet_prefix is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.prefix'] = facet_prefix

        if facet_sort is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.sort'] = facet_sort

        if facet_offset is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.offset'] = facet_offset

        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            kwargs['facet.date.other'] = 'none'

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string)

        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = query_facets

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_models_list()

            if len(registered_models) > 0:
                narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(registered_models)))

        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)

        # if within is not None:
        #     from haystack.utils.geo import generate_bounding_box
        #
        #     kwargs.setdefault('fq', [])
        #     ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2'])
        #     # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
        #     # very clear on this.
        #     bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng)
        #     kwargs['fq'].append(bbox)

        # if dwithin is not None:
        #     kwargs.setdefault('fq', [])
        #     lng, lat = dwithin['point'].get_coords()
        #     geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km)
        #     kwargs['fq'].append(geofilt)

        # # Check to see if the backend should try to include distances
        # # (Solr 4.X+) in the results.
        # if self.distance_available and distance_point:
        #     # In early testing, you can't just hand Solr 4.X a proper bounding box
        #     # & request distances. To enable native distance would take calculating
        #     # a center point & a radius off the user-provided box, which kinda
        #     # sucks. We'll avoid it for now, since Solr 4.x's release will be some
        #     # time yet.
        #     # kwargs['fl'] += ' _dist_:geodist()'
        #     pass

        try:
            raw_results = self.conn.search(query_string, **kwargs)
        except (IOError, SolrError), e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s", query_string, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, highlight=highlight,
                                     result_class=result_class)
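An illustrative call showing how a few of the arguments above map onto Solr parameters; the backend instance and the field names are assumptions, not values taken from these examples.

# Illustrative only: assumes a configured DESolrSearchBackend named ``backend``
# and hypothetical field names.
results = backend.search(
    u'title:engineer',
    sort_by='date_new desc',          # -> sort=date_new desc
    start_offset=0,                   # -> start=0
    end_offset=20,                    # -> rows=20
    facets={'state': {'limit': 10}},  # -> facet=on, facet.field=['state'],
                                      #    f.state.facet.limit=10
    facet_mincount=1,                 # -> facet.mincount=1
)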
Example 30
def update_solr(buid, download=True, force=True, set_title=False,
                delete_feed=True, data_dir=DATA_DIR, clear_cache=False):
    """
    Update the Solr master index with the data contained in a feed file
    for a given buid/jsid.

    This is meant to be a standalone function such that the state of the
    Solr index is not tied to the state of the database.

    Inputs:
    :buid: An integer; the ID for a particular business unit.
    :download: Boolean. If False, this process will not download a new
    feedfile, but instead use the one on disk. Should only be false for
    the purposes of our test suite.
    :force: Boolean. If True, every job seen in the feed file will be
    updated in the index. Otherwise, only the jobs seen in the feed file
    but not seen in the index will be updated. This latter option will
    soon be deprecated.

    Returns:
    A 2-tuple consisting of the number of jobs added and the number deleted.

    Writes/Modifies:
    Job data found in the feed file is used to modify the Solr index. This
    includes adds & deletes. (Solr does not have a discrete equivalent to
    SQL's UPDATE; by adding a document with the same UID as a document in
    the index, the equivalent of an update operation is performed.)

    """
    if download:
        filepath = download_feed_file(buid, data_dir=data_dir)
    else:
        # Get current worker process id, to prevent race conditions.
        try:
            p = current_process()
            process_id = p.index
        except:
            process_id = 0
        filepath = os.path.join(data_dir, str(process_id), FEED_FILE_PREFIX + str(buid) +
                                '.xml')
    bu = BusinessUnit.objects.get(id=buid)
    try:
        co = bu.company_set.all()[0]
    except IndexError:
        co = None
    jobfeed = DEv2JobFeed(filepath, jsid=buid, markdown=bu.enable_markdown,
                          company=co)
    # If the feed file did not pass validation, log the errors and raise a
    # FeedImportError.
    if jobfeed.errors:
        error = jobfeed.error_messages
        logging.error("BUID:%s - Feed file has failed validation on line %s. "
                      "Exception: %s" % (buid, error['line'],
                                         error['exception']))
        raise FeedImportError(error)

    # A dictionary of uids
    jobs = jobfeed.jobparse()

    # Build a set of all the UIDs for all those instances.
    job_uids = set([long(i.get('uid')) for i in jobs if i.get('uid')])
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    step1 = 1024

    # Get the count of all the results in the Solr index for this BUID.
    hits = conn.search("*:*", fq="buid:%s" % buid, facet="false",
                       mlt="false").hits
    # Create (start-index, stop-index) tuples to facilitate handling results
    # in ``step1``-sized chunks. So if ``hits`` returns 2048 results,
    # ``job_slices`` will look like ``[(0,1024), (1024, 2048)]``. Those
    # values are then used to slice up the total results.
    #
    # This was put in place because part of the logic for figuring out which
    # jobs to delete from and which to add to the Solr index uses set
    # algebra. We convert the total list of UIDs in the index and the UIDs
    # in the XML feed to sets, then compare them via ``.difference()``
    # (seen below). However for very large feed files, say 10,000+ jobs,
    # this process was taking so long that the connection would time out. To
    # address this problem we break up the comparisons as described above.
    # This results in more requests but it alleviates the connection timeout
    # issue.
    job_slices = slices(range(hits), step=step1)
    results = [_solr_results_chunk(tup, buid, step1) for tup in job_slices]
    solr_uids = reduce(lambda x, y: x | y, results) if results else set()
    # Return the job UIDs that are in the Solr index but not in the feed
    # file.
    solr_del_uids = solr_uids.difference(job_uids)

    if not force:
        # Return the job UIDs that are in the feed file but not in the Solr
        # index.
        solr_add_uids = job_uids.difference(solr_uids)
        # ``jobfeed.solr_jobs()`` yields a list of dictionaries. We want to
        # filter out any dictionaries whose "uid" key is not in
        # ``solr_add_uids``. This is because by default we only want to add
        # new documents (which each ``solr_jobs()`` dictionary represents),
        # not update.
        add_docs = filter(lambda x: int(x.get("uid", 0)) in solr_add_uids,
                          jobfeed.solr_jobs())
    else:
        # This might seem redundant to refer to the same value
        # twice with two different variable names. However, this decision
        # was made during the implementation of the "force Solr update"
        # feature to this function.
        #
        # Instead of adding only the documents with UIDs that are in the feed
        # file but not in the Solr index, we're going to add ALL the documents
        # in the feed file. This will add the new documents of course, but it
        # will also update existing documents with any new data. Uniqueness of
        # the documents is ensured by the ``id`` field defined in the Solr
        # schema (the template for which can be seen in
        # templates/search_configuration/solr.xml). At the very bottom you'll
        # see <uniqueKey>id</uniqueKey>. This serves as the equivalent of the pk
        # (i.e. globally unique) in a database.
        solr_add_uids = job_uids
        add_docs = jobfeed.solr_jobs()

    # Slice up ``add_docs`` in chunks of 4096. This is because the
    # maxBooleanClauses setting in solrconfig.xml is set to 4096. This means
    # if we used any more than that Solr would throw an error and our
    # updates wouldn't get processed.
    add_steps = slices(range(len(solr_add_uids)), step=4096)
    # Same concept as ``add_docs``.
    del_steps = slices(range(len(solr_del_uids)), step=4096)
    # Create a generator that yields 2-tuples with each invocation. The
    # 2-tuples consist of one tuple each from del_steps & add_steps. Any
    # mismatched values (e.g. there are more del_steps than add_steps)
    # will be compensated for with the ``fillvalue``.
    zipped_steps = izip_longest(del_steps, add_steps, fillvalue=(0, 0))

    for tup in zipped_steps:
        update_chunk = add_docs[tup[1][0]:tup[1][1] + 1]

        if update_chunk:
            logging.debug("BUID:%s - SOLR - Update chunk: %s" %
                         (buid, [i['uid'] for i in update_chunk]))
            # Pass 'commitWithin' so that Solr doesn't try to commit the new
            # docs right away. This will help relieve some of the resource
            # stress during the daily update. The value is expressed in
            # milliseconds.
            conn.add(update_chunk, commitWithin="30000")

        delete_chunk = _build_solr_delete_query(
            list(solr_del_uids)[tup[0][0]:tup[0][1] + 1])

        if delete_chunk:
            # Post-a-job jobs should not be deleted during import
            delete_chunk = "(%s) AND -is_posted:true" % delete_chunk
            logging.debug("BUID:%s - SOLR - Delete chunk: %s" %
                         (buid, list(solr_del_uids)))
            conn.delete(q=delete_chunk)

    # delete any jobs that may have been added via etl_to_solr
    conn.delete(q="buid:%s AND !uid:[0  TO *]" % buid)

    # Update business unit information: title, dates, and associated_jobs
    if set_title or not bu.title or (bu.title != jobfeed.job_source_name and
                                     jobfeed.job_source_name):
        bu.title = jobfeed.job_source_name
    updated = bool(solr_add_uids) or bool(solr_del_uids)
    _update_business_unit_modified_dates(bu, jobfeed.crawled_date,
                                         updated=updated)
    bu.associated_jobs = len(jobs)
    bu.save()
    # Update the Django database to reflect company additions and name changes
    add_company(bu)
    if delete_feed:
        os.remove(filepath)
        logging.info("BUID:%s - Deleted feed file." % buid)
    return len(solr_add_uids), len(solr_del_uids)
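The `slices` helper used above is not included in these examples. Below is a minimal sketch matching the behavior described in the comments (a 2048-hit result set with step=1024 yields [(0, 1024), (1024, 2048)]); the real helper may differ, for example it may produce inclusive stop indices, given the `+ 1` used when slicing `add_docs`.

def slices(sequence, step=1024):
    # Hypothetical sketch: yield (start, stop) tuples covering ``sequence``
    # in ``step``-sized windows, as described in the comments above.
    length = len(sequence)
    return [(start, min(start + step, length))
            for start in range(0, length, step)]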
Example 31
class LoadETLTestCase(DirectSEOBase):
    fixtures = ['countries.json']
    
    def setUp(self):
        self.solr_settings = {
            'default': {'URL': 'http://127.0.0.1:8983/solr/seo'}
        }
        self.solr = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
        self.solr.delete(q="*:*")

        self.zipfile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    'data',
                                    'ActiveDirectory_ce2ca701-eeca-4c13-96ba-e6bde9cb7060.zip') 
        
        with open(self.zipfile) as zf:
            self.jobs = list(get_jobs_from_zipfile(zf, "ce2ca701-eeca-4c13-96ba-e6bde9cb7060"))
            
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid = self.businessunit.id
        self.guid = 'ce2ca701-eeca-4c13-96ba-e6bde9cb7060'
        self.name = "Test"
    
    def tearDown(self):
        pass
    
    @patch('import_jobs.get_jobsfs_zipfile')
    def test_update_job_source(self, mock_jobsfs):
        mock_jobsfs.return_value = open(self.zipfile, 'rb')
        
        count = self.solr.search('*:*').hits
        self.assertEqual(count, 0, "Jobs for buid in solr before the test.  Cannot guarantee correct behavior.")
        self.assertEqual(self.businessunit.associated_jobs, 4, "Initial Job Count does not match the factory")

        update_job_source(self.guid, self.buid, self.name)

        count = self.solr.search('buid:%s' % self.buid).hits
        # Note: the job count is one lower here because one job is filtered
        # out by the include_in_index bit.
        self.assertEqual(count, 38, "38 Jobs not in solr after call to update job source. Found %s" % count)
        self.assertEqual(BusinessUnit.objects.get(id=self.buid).associated_jobs, 38, 
                         "Job Count not updated after imports: Should be 38 was %s" % self.businessunit.associated_jobs)
    
    def test_filtering_on_includeinindex_bit(self):
        """Test that filtering on the include_in_index bit works"""
        
        # Prove we have the expected number of jobs in the zipfile itself.
        self.assertEqual(len(self.jobs), 39,
                         "Expected to find 39 jobs in the test zipfile, instead found %s" % len(self.jobs))
        
        # Prove that filtering works.
        filtered_jobs = list(filter_current_jobs(self.jobs, self.businessunit))
        self.assertEqual(len(filtered_jobs), 38,
                         "filter_current_jobs should rmeove jobs with the includeinindex bit set, "
                         "it's expected to return %s.  Instead it returned %s" % (38, len(filtered_jobs)))
        
    
    def test_businessunit_ignore_includeinindex(self):
        """Test that filtering on the include_in_index bit can be overridden on a per business unit basis."""
        # Set ignore_includeinindex on the test BusinessUnit
        self.businessunit.ignore_includeinindex = True
        self.businessunit.save()
        
        # Prove we have the expected number of jobs in the zipfile itself.
        self.assertEqual(len(self.jobs), 39,
                         "Expected to find 39 jobs in the test zipfile, instead found %s" % len(self.jobs))
        
        # Prove that filtering works.
        filtered_jobs = list(filter_current_jobs(self.jobs, self.businessunit))
        self.assertEqual(len(filtered_jobs), 39,
                         "filter_current_jobs should ignore the includeinindex bit, returning 39 jobs.  "
                         "Instead returned %s." % len(filtered_jobs))
Example 32
def update_solr(buid,
                download=True,
                force=True,
                set_title=False,
                delete_feed=True,
                data_dir=DATA_DIR,
                clear_cache=False):
    """
    Update the Solr master index with the data contained in a feed file
    for a given buid/jsid.

    This is meant to be a standalone function such that the state of the
    Solr index is not tied to the state of the database.

    Inputs:
    :buid: An integer; the ID for a particular business unit.
    :download: Boolean. If False, this process will not download a new
    feedfile, but instead use the one on disk. Should only be false for
    the purposes of our test suite.
    :force: Boolean. If True, every job seen in the feed file will be
    updated in the index. Otherwise, only the jobs seen in the feed file
    but not seen in the index will be updated. This latter option will
    soon be deprecated.

    Returns:
    A 2-tuple consisting of the number of jobs added and the number deleted.

    Writes/Modifies:
    Job data found in the feed file is used to modify the Solr index. This
    includes adds & deletes. (Solr does not have a discrete equivalent to
    SQL's UPDATE; by adding a document with the same UID as a document in
    the index, the equivalent of an update operation is performed.)

    """
    if download:
        filepath = download_feed_file(buid, data_dir=data_dir)
    else:
        # Get current worker process id, to prevent race conditions.
        try:
            p = current_process()
            process_id = p.index
        except:
            process_id = 0
        filepath = os.path.join(data_dir, str(process_id),
                                FEED_FILE_PREFIX + str(buid) + '.xml')
    bu = BusinessUnit.objects.get(id=buid)
    try:
        co = bu.company_set.all()[0]
    except IndexError:
        co = None
    jobfeed = DEv2JobFeed(filepath,
                          jsid=buid,
                          markdown=bu.enable_markdown,
                          company=co)
    # If the feed file did not pass validation, log the errors and raise a
    # FeedImportError.
    if jobfeed.errors:
        error = jobfeed.error_messages
        logging.error("BUID:%s - Feed file has failed validation on line %s. "
                      "Exception: %s" %
                      (buid, error['line'], error['exception']))
        raise FeedImportError(error)

    # A dictionary of uids
    jobs = jobfeed.jobparse()

    # Build a set of all the UIDs for all those instances.
    job_uids = set([long(i.get('uid')) for i in jobs if i.get('uid')])
    conn = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    step1 = 1024

    # Get the count of all the results in the Solr index for this BUID.
    hits = conn.search("*:*", fq="buid:%s" % buid, facet="false",
                       mlt="false").hits
    # Create (start-index, stop-index) tuples to facilitate handling results
    # in ``step1``-sized chunks. So if ``hits`` returns 2048 results,
    # ``job_slices`` will look like ``[(0,1024), (1024, 2048)]``. Those
    # values are then used to slice up the total results.
    #
    # This was put in place because part of the logic for figuring out which
    # jobs to delete from and which to add to the Solr index uses set
    # algebra. We convert the total list of UIDs in the index and the UIDs
    # in the XML feed to sets, then compare them via ``.difference()``
    # (seen below). However for very large feed files, say 10,000+ jobs,
    # this process was taking so long that the connection would time out. To
    # address this problem we break up the comparisons as described above.
    # This results in more requests but it alleviates the connection timeout
    # issue.
    job_slices = slices(range(hits), step=step1)
    results = [_solr_results_chunk(tup, buid, step1) for tup in job_slices]
    solr_uids = reduce(lambda x, y: x | y, results) if results else set()
    # Return the job UIDs that are in the Solr index but not in the feed
    # file.
    solr_del_uids = solr_uids.difference(job_uids)

    if not force:
        # Return the job UIDs that are in the feed file but not in the Solr
        # index.
        solr_add_uids = job_uids.difference(solr_uids)
        # ``jobfeed.solr_jobs()`` yields a list of dictionaries. We want to
        # filter out any dictionaries whose "uid" key is not in
        # ``solr_add_uids``. This is because by default we only want to add
        # new documents (which each ``solr_jobs()`` dictionary represents),
        # not update.
        add_docs = filter(lambda x: int(x.get("uid", 0)) in solr_add_uids,
                          jobfeed.solr_jobs())
    else:
        # This might seem redundant to refer to the same value
        # twice with two different variable names. However, this decision
        # was made during the implementation of the "force Solr update"
        # feature to this function.
        #
        # Instead of adding only the documents with UIDs that are in the feed
        # file but not in the Solr index, we're going to add ALL the documents
        # in the feed file. This will add the new documents of course, but it
        # will also update existing documents with any new data. Uniqueness of
        # the documents is ensured by the ``id`` field defined in the Solr
        # schema (the template for which can be seen in
        # templates/search_configuration/solr.xml). At the very bottom you'll
        # see <uniqueKey>id</uniqueKey>. This serves as the equivalent of the pk
        # (i.e. globally unique) in a database.
        solr_add_uids = job_uids
        add_docs = jobfeed.solr_jobs()

    # Slice up ``add_docs`` in chunks of 4096. This is because the
    # maxBooleanClauses setting in solrconfig.xml is set to 4096. This means
    # if we used any more than that Solr would throw an error and our
    # updates wouldn't get processed.
    add_steps = slices(range(len(solr_add_uids)), step=4096)
    # Same concept as ``add_docs``.
    del_steps = slices(range(len(solr_del_uids)), step=4096)
    # Create a generator that yields 2-tuples with each invocation. The
    # 2-tuples consist of one tuple each from del_steps & add_steps. Any
    # mismatched values (e.g. there are more del_steps than add_steps)
    # will be compensated for with the ``fillvalue``.
    zipped_steps = izip_longest(del_steps, add_steps, fillvalue=(0, 0))

    for tup in zipped_steps:
        update_chunk = add_docs[tup[1][0]:tup[1][1] + 1]

        if update_chunk:
            logging.debug("BUID:%s - SOLR - Update chunk: %s" %
                          (buid, [i['uid'] for i in update_chunk]))
            # Pass 'commitWithin' so that Solr doesn't try to commit the new
            # docs right away. This will help relieve some of the resource
            # stress during the daily update. The value is expressed in
            # milliseconds.
            conn.add(update_chunk, commitWithin="30000")

        delete_chunk = _build_solr_delete_query(
            list(solr_del_uids)[tup[0][0]:tup[0][1] + 1])

        if delete_chunk:
            # Post-a-job jobs should not be deleted during import
            delete_chunk = "(%s) AND -is_posted:true" % delete_chunk
            logging.debug("BUID:%s - SOLR - Delete chunk: %s" %
                          (buid, list(solr_del_uids)))
            conn.delete(q=delete_chunk)

    # delete any jobs that may have been added via etl_to_solr
    conn.delete(q="buid:%s AND !uid:[0  TO *]" % buid)

    # Update business unit information: title, dates, and associated_jobs
    if set_title or not bu.title or (bu.title != jobfeed.job_source_name
                                     and jobfeed.job_source_name):
        bu.title = jobfeed.job_source_name
    updated = bool(solr_add_uids) or bool(solr_del_uids)
    _update_business_unit_modified_dates(bu,
                                         jobfeed.crawled_date,
                                         updated=updated)
    bu.associated_jobs = len(jobs)
    bu.save()
    # Update the Django database to reflect company additions and name changes
    add_company(bu)
    if delete_feed:
        os.remove(filepath)
        logging.info("BUID:%s - Deleted feed file." % buid)
    return len(solr_add_uids), len(solr_del_uids)
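
# A minimal sketch of the chunked set-difference scheme used above. The
# ``toy_slices`` helper below is an assumption standing in for the real
# ``slices`` utility, and the UID values are illustrative only.
from itertools import izip_longest


def toy_slices(seq, step):
    # Yield (start, stop) index pairs covering ``seq`` in ``step``-sized chunks.
    return [(i, min(i + step, len(seq))) for i in range(0, len(seq), step)]


solr_uids = set([1, 2, 3, 4])       # UIDs currently in the Solr index
feed_uids = set([3, 4, 5, 6, 7])    # UIDs present in the feed file
to_delete = sorted(solr_uids - feed_uids)   # [1, 2] -> delete from Solr
to_add = sorted(feed_uids - solr_uids)      # [5, 6, 7] -> add to Solr

# Pair up delete/add chunks the same way the function above does, padding the
# shorter sequence with a (0, 0) fill value.
for del_step, add_step in izip_longest(toy_slices(to_delete, 2),
                                       toy_slices(to_add, 2),
                                       fillvalue=(0, 0)):
    print del_step, add_step    # (0, 2) (0, 2), then (0, 0) (2, 3)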
Esempio n. 33
0
class SiteTestCase(DirectSEOBase):
    """
    We're adding these tests to ensure unicode job descriptions and titles
    make it through the import process and work with high-level features.
    We should use http requests wherever possible since it's difficult to
    predict which modules will have issues with unicode.

    """

    def setUp(self):
        super(SiteTestCase, self).setUp()
        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.delete(q="*:*")
        self.businessunit = factories.BusinessUnitFactory(id=0)
        self.buid = self.businessunit.id
        self.filepath = os.path.join(import_jobs.DATA_DIR,
                                     'dseo_feed_%s.xml' % self.buid)
        SeoSite.objects.all().delete()
        self.site = factories.SeoSiteFactory(id=1)

        self.configuration = factories.ConfigurationFactory(status=2)
        self.configuration.save()
        self.site.configurations.clear()
        self.site.configurations.add(self.configuration)

    def tearDown(self):
        super(SiteTestCase, self).tearDown()
        self.conn.delete(q="*:*")
    
    def test_unicode_title(self):
        # Test imports
        group = factories.GroupFactory()
        self.site.group = group
        self.site.business_units.add(self.businessunit)
        self.site.save()
        import_jobs.update_solr(self.buid, download=False, delete_feed=False,
                                data_dir='seo/tests/data/')
        solr_jobs = self.conn.search("*:*")
        resp = self.client.get('/')
        self.assertEqual(resp.context['total_jobs_count'], solr_jobs.hits)

        # test standard facets against Haystack query
        standard_cf = factories.CustomFacetFactory.build(
            # default facet will return both jobs
            name="Keyword Facet",
            group=group,
            show_production=True)
        standard_cf.save()
        standard_cf.keyword.add(u'Ключевые')
        standard_cf.save()
        standard_site_facet = factories.SeoSiteFacetFactory(
            seosite=self.site,
            customfacet=standard_cf,
            facet_type=factories.SeoSiteFacet.STANDARD)
        standard_site_facet.save()

        # test standard facets against Haystack query
        standard_cf2 = factories.CustomFacetFactory.build(
            # default facet will return both jobs
            name='Country Facet',
            country='United States',
            group=group,
            show_production=True)
        standard_cf2.save()
        standard_site_facet2 = factories.SeoSiteFacetFactory(
            seosite=self.site,
            customfacet=standard_cf2,
            facet_type=factories.SeoSiteFacet.STANDARD)
        standard_site_facet2.save()

        resp = self.client.get('/keyword-facet/new-jobs/',
                               HTTP_HOST=self.site.domain, follow=True)
        sqs = DESearchQuerySet().filter(text=u'Ключевые')
        self.assertEqual(len(resp.context['default_jobs']), sqs.count())
        for facet_widget in resp.context['widgets']:
            # Ensure that no standard facet has more results than current
            # search results
            for count_tuple in facet_widget.items:
                self.assertTrue(sqs.count() >= count_tuple[1])
        
        # Test default site facets against PySolr query
        from django.core.cache import cache
        cache.clear()
        default_cf = factories.CustomFacetFactory.build(
            name="Default Facet",
            title=u"Специалист",
            group=group,
            show_production=True)
        default_cf.save()
        default_site_facet = factories.SeoSiteFacetFactory(
            seosite=self.site,
            facet_type=factories.SeoSiteFacet.DEFAULT,
            customfacet=default_cf)
        default_site_facet.save()
        resp = self.client.get('/jobs/', HTTP_HOST=self.site.domain,
                               follow=True)
        total_jobs = resp.context['total_jobs_count']
        solr_jobs = self.conn.search(q=u"title:Специалист")
        self.assertEqual(total_jobs, solr_jobs.hits)
        self.assertEqual(len(resp.context['default_jobs']), total_jobs)
        for facet_widget in resp.context['widgets']:
            for count_tuple in facet_widget.items:
                self.assertTrue(sqs.count() >= count_tuple[1])

        # Feed test
        resp = self.client.get('/feed/json', HTTP_HOST=self.site.domain)
        jobs = json.loads(resp.content)
        self.assertEqual(len(jobs), total_jobs)
        for job in jobs:
            resp = self.client.get(job['url'], HTTP_HOST=self.site.domain,
                                   follow=False)
            self.assertEqual(resp.status_code, 302)
            expected = 'http://my.jobs/%s%d?my.jobs.site.id=%s' %\
                       (job['guid'],
                        settings.FEED_VIEW_SOURCES['json'],
                        str(self.site.pk))
            self.assertEqual(resp['Location'], expected)

        # Sitemap index Test - Since sitemap only builds out updates from the
        # last 30 days, this test will eventually be checking 0 jobs in sitemap
        # TODO, find a way to keep feed dates current. We might be able to use
        # the mock library to override datetime functions
        resp = self.client.get('/sitemap.xml', HTTP_HOST=self.site.domain)  
        root = etree.fromstring(resp.content)
        self.assertGreater(len(root), 0)
        crawled_jobs = 0
        for loc, lastmod in root:
            self.assertTrue(loc.text)
            resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)  
            self.assertEqual(resp.status_code, 200)
            # Get the first daily sitemap
            urlset = etree.fromstring(resp.content)
            # Check each job in daily sitemap - I'm a bot
            for loc, _, _, _ in urlset:
                resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)
                self.assertEqual(resp.status_code, 200)
                self.assertIn(str(resp.context['the_job'].uid), loc.text)
                crawled_jobs += 1
Esempio n. 34
0
class ImportJobsTestCase(DirectSEOBase):
    fixtures = ['import_jobs_testdata.json']

    def setUp(self):
        super(ImportJobsTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid_id = self.businessunit.id        
        self.filepath = os.path.join(DATA_DIR, '0', 'dseo_feed_%s.xml' % self.buid_id)
        self.solr_settings = {
            'default': {'URL': 'http://127.0.0.1:8983/solr/seo'}
        }
        self.solr = Solr(settings.HAYSTACK_CONNECTIONS['default']['URL'])

    def tearDown(self):
        super(ImportJobsTestCase, self).tearDown()
        self.solr.delete(q='*:*')

    def test_solr_rm_feedfile(self):
        """
        Test that at the end of Solr parsing, the feed file is deleted.
        
        """
        update_solr(self.buid_id)
        self.assertFalse(os.access(self.filepath, os.F_OK))

    def test_subsidiary_rename(self):
        company1 = CompanyFactory()
        company1.save()
        bu1 = self.businessunit
        bu1.title = "Acme corp"
        bu2 = BusinessUnitFactory(title=bu1.title)
        bu2.save()
        self.businessunit.company_set.add(company1)

        # Test that a company was created for both business units
        add_company(bu1)
        companies = bu1.company_set.all()
        self.assertEqual(len(companies), 1)
        co = companies[0]
        self.assertEqual(co.name, bu1.title)

        # Add the 2nd business unit
        add_company(bu2)

        # Both units should be attached to that company
        self.assertEqual(bu1.company_set.all()[0], bu2.company_set.all()[0])
        self.assertEqual(bu1.company_set.all().count(), 1) 
        self.assertIn(bu1, co.job_source_ids.all())
        self.assertIn(bu2, co.job_source_ids.all())
        self.assertEqual(co.name, bu1.title)
        self.assertEqual(co.name, bu2.title)

        bu2.title = "New company name"
        add_company(bu1)
        add_company(bu2)
        self.assertEqual(len(co.job_source_ids.all()), 1)
        self.assertNotEqual(bu1.company_set.all(), bu2.company_set.all())
        self.assertEqual(co.name, bu1.title)
        self.assertEqual(len(bu2.company_set.all()), 1)
        co2 = bu2.company_set.all()[0]
        self.assertEqual(co2.name, bu2.title)
        self.assertNotEqual(co2.name, bu1.title)
        self.assertNotEqual(co.name, bu2.title)

    def test_duplicate_company(self):
        company1 = CompanyFactory()
        company1.save()
        company2 = CompanyFactory(name="Acme corp")
        company2.save()
        self.businessunit.company_set.add(company1)
        self.businessunit.title = "Acme corp"
        add_company(self.businessunit)
        self.assertEqual(self.businessunit.company_set.all()[0], company2)

    def test_set_bu_title(self):
        """
        Ensure that if a feedfile for a BusinessUnit comes through, and
        the `title` attribute for that BusinessUnit is not set, that
        `helpers.update_solr` sets the `title` attribute properly.

        """
        bu = BusinessUnit.objects.get(id=self.buid_id)
        bu.title = None
        bu.save()
        # Since the BusinessUnit title is None, the intent is that update_solr
        # will set its title to match the company name found in the feed file.
        results = update_solr(self.buid_id)
        # We have to get the updated state of the BusinessUnit instance, since
        # changes to the database won't be reflected by our in-memory version of
        # the data.
        bu = BusinessUnit.objects.get(id=self.buid_id)
        # The title attribute should now equal the initial value established in
        # the setUp method.
        self.assertEquals(self.businessunit.title, bu.title)

    def test_add_company(self):
        """
        Create environment to test for every possible case--
        
         - Existing relationship but the name is different                 pk=10
         - No existing relationship, but the company exists in the database (as
           established by the BusinessUnit title matching a company name)  pk=11
         - No relationship and the company is not in the database          pk=12
          
        Start with  2 Company objects and 3 BusinessUnit objects
        End up with 3 Company objects and 3 BusinessUnit objects

        """

        for i in range(10, 13):
            add_company(BusinessUnit.objects.get(id=i))

            # The names of the BU and the Co should be the same
            self.assertEquals(BusinessUnit.objects.get(id=i).title,
                              Company.objects.get(id=i).name,
                              msg="Company names do not match")

            # ensure the relationship was formed
            self.assertIn(Company.objects.get(id=i),
                          BusinessUnit.objects.get(id=i).company_set.all(),
                          msg="Company is not related to job feed")

    def test_remove_expired_jobs(self):
        buid = 12345
        active_jobs = [{'id': 'seo.%s' % i, 'buid': buid} for i in range(4)]
        old_jobs = [{'id': 'seo.%s' % i, 'buid': buid} for i in range(2, 10)]

        with self.settings(HAYSTACK_CONNECTIONS=self.solr_settings):
            self.solr.add(old_jobs)
            self.solr.commit()

            removed = remove_expired_jobs(buid, [d['id'] for d in active_jobs])
            self.assertEqual(len(removed), 6, "Removed jobs %s" % removed)
            ids = [d['id'] for d in self.solr.search('*:*').docs]
            for i in range(4, 10):
                self.assertNotIn('seo.%s' % i, ids)
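
# The contract the test above exercises, sketched with a toy stand-in. This
# is an assumption for illustration only; the real ``remove_expired_jobs``
# helper lives in the import code and is not reproduced here.
def toy_remove_expired_jobs(solr_conn, buid, active_ids):
    # Remove every document for ``buid`` whose id is not in ``active_ids``
    # and return the ids that were deleted.
    docs = solr_conn.search('buid:%s' % buid, rows=10000).docs
    expired = [doc['id'] for doc in docs if doc['id'] not in active_ids]
    for doc_id in expired:
        solr_conn.delete(id=doc_id)
    return expired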
Esempio n. 35
0
class JobFeedTestCase(DirectSEOBase):

    def setUp(self):
        super(JobFeedTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid_id = self.businessunit.id
        self.numjobs = 14
        self.testdir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                    'data')
        self.company = CompanyFactory()
        self.company.job_source_ids.add(self.businessunit)
        self.company.save()
        self.conn = Solr("http://127.0.0.1:8983/solr/seo")
        self.emptyfeed = os.path.join(self.testdir, "dseo_feed_0.no_jobs.xml")
        self.malformed_feed = os.path.join(self.testdir, 'dseo_malformed_feed_0.xml')
        self.invalid_feed = os.path.join(self.testdir, 'dseo_invalid_feed_0.xml')
        self.unused_field_feed = os.path.join(self.testdir, 'dseo_feed_1.xml')
        self.no_onet_feed = os.path.join(self.testdir, 'dseo_feed_no_onets.xml')

        # Ensures DATA_DIR used by import_jobs.download_feed_file exists
        data_path = DATA_DIR
        if not os.path.exists(data_path):
            os.mkdir(data_path)

    def test_company_canonical_microsite(self):
        # Test that canonical microsites is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertEqual(jobs[0]['company_canonical_microsite_exact'], None)

        self.company.canonical_microsite = "http://test.jobs"
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertEqual(jobs[0]['company_canonical_microsite_exact'],
                         'http://test.jobs')

    def test_company_enhanced(self):
        # Test that company enhanced check is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertFalse(jobs[0]['company_enhanced'])

        self.company.enhanced = True
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertTrue(jobs[0]['company_enhanced'])

    def test_company_member(self):
        # Test that company member check is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])
        jobs = results.solr_jobs()
        self.assertTrue(jobs[0]['company_member'])

        self.company.member = False
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertFalse(jobs[0]['company_member'])

    def test_company_digital_strategies_customer(self):
        # Test that digital strategies customer check is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertFalse(jobs[0]['company_digital_strategies_customer'])

        self.company.digital_strategies_customer = True
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertTrue(jobs[0]['company_digital_strategies_customer'])

    def test_no_markdown_newline_breaks(self):
        # Test that non-markdown businessunits have newlines converted to breaks
        no_markdown_bu = BusinessUnitFactory.build(id=5, enable_markdown=False)
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.xml',
                    jsid=no_markdown_bu.id,
                    markdown=no_markdown_bu.enable_markdown)
        jobs = results.solr_jobs()
        self.assertNotEqual(jobs[0]['html_description'].find('Operations<br />'), -1)

    def test_markdown_no_newline_breaks(self):
        # Test that markdown businessunits do not have newlines converted to breaks
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.xml',
                    jsid=self.businessunit.id,
                    markdown = self.businessunit.enable_markdown)
        jobs = results.solr_jobs()
        self.assertEqual(jobs[0]['html_description'].find('Operations<br />'), -1)

    def test_unused_fields(self):
        # Test that new fields don't break existing code
        results = DEv2JobFeed(self.unused_field_feed,
                                        jsid=self.businessunit.id)
        jobs = results.solr_jobs()
        self.assertEqual(len(results.jobparse()), 1)

    def test_unescape(self):
        """Tests that escaped html characters are unescaped when imported"""
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.escaped_chars.xml',
                    jsid=0)
        jobs = results.solr_jobs()
        self.assertEqual(results.job_source_name.find('&#162;'), -1)
        self.assertEqual(jobs[0]['description'].find('&amp;'), -1)

    def test_markdown_code_blocks(self):
        # test that code blocks are not in html job descriptions
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.markdown.xml',
                    jsid=0)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertEqual(job['html_description'].find('<code>'), -1)
            self.assertEqual(job['html_description'].find('</code>'), -1)
            self.assertEqual(job['html_description'].find('<pre>'), -1)
            self.assertEqual(job['html_description'].find('<h1>'), -1)
            self.assertEqual(job['html_description'].find('##'), -1)
            self.assertNotEqual(job['html_description'].find('<h4>'), -1)
            self.assertNotEqual(job['html_description'].find('<h6>'), -1)
            self.assertNotEqual(job['html_description'].find('<li>'), -1)
            self.assertNotEqual(job['html_description'].find('</li>'), -1)

    def test_no_null_values(self):
        # test that there's no literal null in html 'city' entry
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.markdown.xml',
                    jsid=0)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertNotEqual(job['city'], 'null')

    def test_dev2_feed(self):
        filepath = download_feed_file(self.buid_id)
        results = DEv2JobFeed(filepath)
        jobs = results.jobparse()
        self.assertEqual(results.jsid, self.buid_id)
        self.assertEqual(results.job_source_name, self.businessunit.title)
        self.assertEqual(len(jobs), self.numjobs)

    def test_mocids(self):
        """
        Tests that mocid fields exist when jobs are imported from a feed and
        added to a solr connection

        """
        filepath = download_feed_file(self.buid_id)
        results = DEv2JobFeed(filepath)
        jobs = results.solr_jobs()
        # Since we're going to be adding/updating data in the Solr index, we're
        # hardcoding in the local Solr instance so that we don't accidentally
        # alter production data.
        self.conn.add(jobs)
        num_hits = self.conn.search(q="*:*",
                                    fq="buid:%s -mocid:[* TO *]" % self.buid_id)
        self.assertEqual(num_hits.hits, self.numjobs)
        for job in jobs:
            self.assertTrue('mocid' in job)

    def test_empty_feed(self):
        """
        Test that the v2 DirectEmployers feed file schema allows for empty
        feed files.

        """
        results = DEv2JobFeed(self.emptyfeed)
        # If the schema is such that empty feed files are considered invalid,
        # trying to run jobparse() will throw an exception.
        self.assertEqual(len(results.jobparse()), 0)

    def test_empty_solr(self):
        """
        Tests for the proper behavior when encountering a job-less, but
        otherwise valid, feed file. The proper behavior is to delete any
        jobs associated with that BusinessUnit from the Solr index.

        """
        # Normal download-and-parse operation on a feed file with jobs.
        update_solr(self.buid_id)
        results = self.conn.search(q="*:*", fq="buid:%s" % self.buid_id)
        self.assertEqual(results.hits, self.numjobs)

        # Download-and-parse operation on a feed file with no jobs. Expected
        # behavior is to delete all jobs.
        self._get_feedfile()
        update_solr(self.buid_id, download=False)
        results = self.conn.search(q="*:*", fq="buid:%s" % self.buid_id)
        self.assertEqual(results.hits, 0)

    def test_zipcode(self):
        """
        Tests to ensure proper behavior of zipcode field in being entered in
        Solr.

        """
        filepath = download_feed_file(self.buid_id)
        dbresults = DEv2JobFeed(filepath)
        solrresults = dbresults.solr_jobs()

        zips_from_feedfile = ['30269', '30269', '48332', '30269', '30269',
                              '30269', '30269', '30269', '48332', '48332',
                              '30269', None, '30269', '30269']

        solrzips = [i['zipcode'] for i in solrresults]
        for coll in [solrzips]:
            self.assertItemsEqual(zips_from_feedfile, coll)

    def test_salt_date(self):
        """
        Test to ensure that job postings show up in a quasi-random
        fashion by sorting by the `salted_date` attribute in the index
        rather than strictly by `date_new`.

        """
        filepath = download_feed_file(self.buid_id)
        jobs = DEv2JobFeed(filepath)
        solrjobs = jobs.solr_jobs()
        self.conn.add(solrjobs)
        results = self.conn.search(q="*:*", sort="salted_date asc")
        self.assertEqual(self.numjobs, results.hits)
        # We can't really test for inequality between the two result sets,
        # since sometimes results.docs will equal results2.docs.
        results2 = self.conn.search(q="*:*", sort="date_new asc")
        self.assertItemsEqual(results2.docs, results.docs)

    def test_date_updated(self):
        """
        Test to ensure proper behavior of date updated field when added to
        Solr.

        """
        filepath = download_feed_file(self.buid_id)
        jobs = DEv2JobFeed(filepath)
        solrjobs = jobs.solr_jobs()
        self.conn.add(solrjobs)
        dates_updated = [datetime.datetime.strptime("4/16/2015 11:35:13 PM",
                                                    "%m/%d/%Y %I:%M:%S %p"),
                         datetime.datetime.strptime("4/16/2015 11:35:14 PM",
                                                    "%m/%d/%Y %I:%M:%S %p"),
                         datetime.datetime.strptime("4/16/2015 11:35:15 PM",
                                                    "%m/%d/%Y %I:%M:%S %p")]
        solr_dates = [i['date_updated'] for i in solrjobs]
        for solr_date in solr_dates:
            self.assertIn(solr_date, dates_updated)

    def _get_feedfile(self):
        # Download the 'real' feed file then copy the empty feed file in its
        # place.
        realfeed = download_feed_file(self.buid_id)
        shutil.copyfile(realfeed, "%s.bak" % realfeed)
        shutil.copyfile(self.emptyfeed, realfeed)

    def test_parse_malformed_feed(self):
        """
        Test that a malformed feedfile does not cause an unhandled exception.

        """
        result = DEv2JobFeed(self.malformed_feed, jsid=0)

    def test_parse_invalid_feed(self):
        """
        Test that a feed file that fails validation does not cause an unhandled
        exception.

        """
        result = DEv2JobFeed(self.invalid_feed, jsid=0)

    def test_no_onets(self):
        result = DEv2JobFeed(self.no_onet_feed, jsid=0)
        jobs = result.solr_jobs()
        self.assertEqual(jobs[0]['onet'], '')

    def test_on_sites_by_buid(self):
        business_unit = BusinessUnitFactory(pk=77)

        results = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                              jsid=business_unit.id,
                              markdown=business_unit.enable_markdown)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertItemsEqual(job['on_sites'], [0])

        site_package = SitePackageFactory(owner=self.company)
        business_unit.site_packages.add(site_package)

        results = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                              jsid=business_unit.id,
                              markdown=business_unit.enable_markdown)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertItemsEqual(job['on_sites'], [site_package.pk])

        site_package2 = SitePackageFactory(owner=self.company)
        business_unit.site_packages.add(site_package2)

        results = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                              jsid=business_unit.id,
                              markdown=business_unit.enable_markdown)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertItemsEqual(job['on_sites'], [site_package.pk,
                                                    site_package2.pk])

        site_package2.delete()
        site_package.delete()
        business_unit.delete()
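
# ``salted_date`` (exercised in test_salt_date above) gives each job a stable
# but quasi-random sort position. One way such a value could be derived (an
# assumption for illustration, not the parser's actual implementation) is to
# offset ``date_new`` by a deterministic, per-job amount:
import datetime
import hashlib


def example_salted_date(date_new, guid):
    # Hash the job GUID into a repeatable offset of up to roughly 30 days.
    offset_seconds = int(hashlib.md5(guid).hexdigest()[:8], 16) % (30 * 24 * 3600)
    return date_new - datetime.timedelta(seconds=offset_seconds)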
Esempio n. 36
0
class MyJobsBase(TestCase):
    def setUp(self):
        settings.ROOT_URLCONF = "myjobs_urls"
        settings.PROJECT = "myjobs"

        self.app_access = AppAccessFactory()
        self.activities = [
            ActivityFactory(name=activity, app_access=self.app_access)
            for activity in [
                "create communication record", "create contact",
                "create partner saved search", "create partner", "create role",
                "create tag", "create user", "delete tag", "delete partner",
                "delete role", "delete user", "read contact",
                "read communication record", "read partner saved search",
                "read partner", "read role", "read user", "read tag",
                "update communication record", "update contact",
                "update partner", "update role", "update tag", "update user",
                "read outreach email address", "create outreach email address",
                "delete outreach email address",
                "update outreach email address", "read outreach record",
                "convert outreach record"]]

        self.company = CompanyFactory(app_access=[self.app_access])
        # this role will be populated by activities on a test-by-test basis
        self.role = RoleFactory(company=self.company, name="Admin")
        self.user = UserFactory(roles=[self.role], is_staff=True)

        cache.clear()
        clear_url_caches()
        self.ms_solr = Solr(settings.SOLR['seo_test'])
        self.ms_solr.delete(q='*:*')

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            'mymessages.context_processors.message_lists',
        )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        setattr(settings, 'MEMOIZE', False)

        self.patcher = patch('urllib2.urlopen', return_file())
        self.mock_urlopen = self.patcher.start()

        self.client = TestClient()
        self.client.login_user(self.user)

    def tearDown(self):
        self.ms_solr.delete(q='*:*')
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS',
                self.base_context_processors)
        try:
            self.patcher.stop()
        except RuntimeError:
            # patcher was stopped in a test
            pass

    def assertRequires(self, view_name, *activities, **kwargs):
        """
        Asserts that the given view is only accessible when a user has a role
        with the given activities.

        """
        url = reverse(view_name, kwargs=kwargs.get('kwargs'))
        method = kwargs.get("method", "get").lower()

        response = getattr(self.client, method)(path=url)
        self.assertEqual(type(response), MissingActivity)

        self.role.activities = [activity for activity in self.activities
                                if activity.name in activities]

        response = getattr(self.client, method)(path=url)
        self.assertNotEqual(type(response), MissingActivity)

        self.role.activities.clear()
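
# Example of how a subclass would use ``assertRequires``. The view name below
# is a hypothetical placeholder; only the activity string comes from the list
# defined in setUp above.
class ExampleActivityTests(MyJobsBase):
    def test_read_contact_requires_activity(self):
        # Accessible only when the user's role grants "read contact".
        self.assertRequires("contact_details", "read contact", method="get")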
Esempio n. 37
0
class JobFeedTestCase(DirectSEOBase):

    def setUp(self):
        super(JobFeedTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory(id=0)
        self.buid_id = self.businessunit.id
        self.numjobs = 14
        self.testdir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 
                                    'data')
        self.company = CompanyFactory()
        self.company.job_source_ids.add(self.businessunit)
        self.company.save()
        self.conn = Solr("http://127.0.0.1:8983/solr/seo")
        self.emptyfeed = os.path.join(self.testdir, "dseo_feed_0.no_jobs.xml")
        self.malformed_feed = os.path.join(self.testdir, 'dseo_malformed_feed_0.xml')
        self.invalid_feed = os.path.join(self.testdir, 'dseo_invalid_feed_0.xml')
        self.unused_field_feed = os.path.join(self.testdir, 'dseo_feed_1.xml')
        self.no_onet_feed = os.path.join(self.testdir, 'dseo_feed_no_onets.xml')

        # Ensures DATA_DIR used by import_jobs.download_feed_file exists
        data_path = DATA_DIR
        if not os.path.exists(data_path):
            os.mkdir(data_path)

    def test_company_canonical_microsite(self):
        # Test that canonical microsites is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertEqual(jobs[0]['company_canonical_microsite_exact'], None)

        self.company.canonical_microsite = "http://test.jobs"
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertEqual(jobs[0]['company_canonical_microsite_exact'],
                         'http://test.jobs')

    def test_company_enhanced(self):
        # Test that company enhanced check is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertFalse(jobs[0]['company_enhanced'])

        self.company.enhanced = True
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertTrue(jobs[0]['company_enhanced'])

    def test_company_member(self):
        # Test that company member check is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])
        jobs = results.solr_jobs()
        self.assertTrue(jobs[0]['company_member'])

        self.company.member = False
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertFalse(jobs[0]['company_member'])

    def test_company_digital_strategies_customer(self):
        # Test that digital strategies customer check is correctly added to solr
        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertFalse(jobs[0]['company_digital_strategies_customer'])

        self.company.digital_strategies_customer = True
        self.company.save()

        results = DEv2JobFeed(
            'seo/tests/data/dseo_feed_0.xml',
            jsid=self.businessunit.id,
            company=self.businessunit.company_set.all()[0])

        jobs = results.solr_jobs()
        self.assertTrue(jobs[0]['company_digital_strategies_customer'])

    def test_no_markdown_newline_breaks(self):
        # Test that non-markdown businessunits have newlines converted to breaks
        no_markdown_bu = BusinessUnitFactory.build(id=5, enable_markdown=False)
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.xml', 
                    jsid=no_markdown_bu.id,
                    markdown=no_markdown_bu.enable_markdown) 
        jobs = results.solr_jobs()
        self.assertNotEqual(jobs[0]['html_description'].find('Operations<br />'), -1)
        
    def test_markdown_no_newline_breaks(self):
        # Test that markdown businessunits do not have newlines converted to breaks
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.xml', 
                    jsid=self.businessunit.id,
                    markdown = self.businessunit.enable_markdown) 
        jobs = results.solr_jobs()
        self.assertEqual(jobs[0]['html_description'].find('Operations<br />'), -1)

    def test_unused_fields(self):
        # Test that new fields don't break existing code
        results = DEv2JobFeed(self.unused_field_feed,
                                        jsid=self.businessunit.id)
        jobs = results.solr_jobs()
        self.assertEqual(len(results.jobparse()), 1)

    def test_unescape(self):
        """Tests that escaped html characters are unescaped when imported"""
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.escaped_chars.xml', 
                    jsid=0) 
        jobs = results.solr_jobs()
        self.assertEqual(results.job_source_name.find('&#162;'), -1)
        self.assertEqual(jobs[0]['description'].find('&amp;'), -1)

    def test_markdown_code_blocks(self):
        # test that code blocks are not in html job descriptions
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.markdown.xml', 
                    jsid=0) 
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertEqual(job['html_description'].find('<code>'), -1)
            self.assertEqual(job['html_description'].find('</code>'), -1)
            self.assertEqual(job['html_description'].find('<pre>'), -1)
            self.assertEqual(job['html_description'].find('<h1>'), -1)
            self.assertEqual(job['html_description'].find('##'), -1)
            self.assertNotEqual(job['html_description'].find('<h4>'), -1)
            self.assertNotEqual(job['html_description'].find('<h6>'), -1)
            self.assertNotEqual(job['html_description'].find('<li>'), -1)
            self.assertNotEqual(job['html_description'].find('</li>'), -1)

    def test_no_null_values(self):
        # test that there's no literal null in html 'city' entry
        results = DEv2JobFeed(
                    'seo/tests/data/dseo_feed_0.markdown.xml', 
                    jsid=0) 
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertNotEqual(job['city'], 'null')
        
    def test_dev2_feed(self):
        filepath = download_feed_file(self.buid_id)
        results = DEv2JobFeed(filepath)
        jobs = results.jobparse()
        self.assertEqual(results.jsid, self.buid_id)
        self.assertEqual(results.job_source_name, self.businessunit.title)
        self.assertEqual(len(jobs), self.numjobs)

    def test_mocids(self):
        """
        Tests that mocid fields exist when jobs are imported from a feed and
        added to a solr connection
        
        """
        filepath = download_feed_file(self.buid_id)
        results = DEv2JobFeed(filepath)
        jobs = results.solr_jobs()
        # Since we're going to be adding/updating data in the Solr index, we're
        # hardcoding in the local Solr instance so that we don't accidentally
        # alter production data.
        self.conn.add(jobs)
        num_hits = self.conn.search(q="*:*",
                                    fq="buid:%s -mocid:[* TO *]" % self.buid_id)
        self.assertEqual(num_hits.hits, self.numjobs)
        for job in jobs:
            self.assertTrue('mocid' in job)

    def test_empty_feed(self):
        """
        Test that the schema for the v2 DirectEmployers feed file schema
        allows for empty feed files.
        
        """
        results = DEv2JobFeed(self.emptyfeed)
        # If the schema is such that empty feed files are considered invalid,
        # trying to run jobparse() will throw an exception.
        self.assertEqual(len(results.jobparse()), 0)

    def test_empty_solr(self):
        """
        Tests for the proper behavior when encountering a job-less, but
        otherwise valid, feed file. The proper behavior is to delete any
        jobs associated with that BusinessUnit from the Solr index.

        """
        # Normal download-and-parse operation on a feed file with jobs.
        update_solr(self.buid_id)
        results = self.conn.search(q="*:*", fq="buid:%s" % self.buid_id)
        self.assertEqual(results.hits, self.numjobs)

        # Download-and-parse operation on a feed file with no jobs. Expected
        # behavior is to delete all jobs.
        self._get_feedfile()
        update_solr(self.buid_id, download=False)
        results = self.conn.search(q="*:*", fq="buid:%s" % self.buid_id)
        self.assertEqual(results.hits, 0)

    def test_zipcode(self):
        """
        Tests to ensure proper behavior of zipcode field in being entered in
        Solr.

        """
        filepath = download_feed_file(self.buid_id)
        dbresults = DEv2JobFeed(filepath)
        solrresults = dbresults.solr_jobs()

        zips_from_feedfile = ['30269', '30269', '48332', '30269', '30269',
                              '30269', '30269', '30269', '48332', '48332',
                              '30269', None, '30269', '30269']

        solrzips = [i['zipcode'] for i in solrresults]
        for coll in [solrzips]:
            self.assertItemsEqual(zips_from_feedfile, coll)

    def test_salt_date(self):
        """
        Test to ensure that job postings show up in a quasi-random
        fashion by sorting by the `salted_date` attribute in the index
        rather than strictly by `date_new`.
        
        """
        filepath = download_feed_file(self.buid_id)
        jobs = DEv2JobFeed(filepath)
        solrjobs = jobs.solr_jobs()
        self.conn.add(solrjobs)
        results = self.conn.search(q="*:*", sort="salted_date asc")
        self.assertEqual(self.numjobs, results.hits)
        # We can't really test for inequality between the two result sets,
        # since sometimes results.docs will equal results2.docs.
        results2 = self.conn.search(q="*:*", sort="date_new asc")
        self.assertItemsEqual(results2.docs, results.docs)

    def test_date_updated(self):
        """
        Test to ensure proper behavior of date updated field when added to
        Solr.

        """
        filepath = download_feed_file(self.buid_id)
        jobs = DEv2JobFeed(filepath)
        solrjobs = jobs.solr_jobs()
        self.conn.add(solrjobs)
        dates_updated = [datetime.datetime.strptime("4/16/2015 11:35:13 PM",
                                                    "%m/%d/%Y %I:%M:%S %p"),
                         datetime.datetime.strptime("4/16/2015 11:35:14 PM",
                                                    "%m/%d/%Y %I:%M:%S %p"),
                         datetime.datetime.strptime("4/16/2015 11:35:15 PM",
                                                    "%m/%d/%Y %I:%M:%S %p")]
        solr_dates = [i['date_updated'] for i in solrjobs]
        for solr_date in solr_dates:
            self.assertIn(solr_date, dates_updated)
        
    def _get_feedfile(self):
        # Download the 'real' feed file then copy the empty feed file in its
        # place.
        realfeed = download_feed_file(self.buid_id)
        shutil.copyfile(realfeed, "%s.bak" % realfeed)
        shutil.copyfile(self.emptyfeed, realfeed)

    def test_parse_malformed_feed(self):
        """
        Test that a malformed feedfile does not cause an unhandled exception.

        """
        result = DEv2JobFeed(self.malformed_feed, jsid=0)

    def test_parse_invalid_feed(self):
        """
        Test that a feed file that fails validation does not cause an unhandled
        exception. 

        """
        result = DEv2JobFeed(self.invalid_feed, jsid=0)

    def test_no_onets(self):
        result = DEv2JobFeed(self.no_onet_feed, jsid=0)
        jobs = result.solr_jobs()
        self.assertEqual(jobs[0]['onet'], '')

    def test_on_sites_by_buid(self):
        business_unit = BusinessUnitFactory(pk=77)

        results = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                              jsid=business_unit.id,
                              markdown=business_unit.enable_markdown)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertItemsEqual(job['on_sites'], [0])

        site_package = SitePackageFactory(owner=self.company)
        business_unit.site_packages.add(site_package)

        results = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                              jsid=business_unit.id,
                              markdown=business_unit.enable_markdown)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertItemsEqual(job['on_sites'], [site_package.pk])

        site_package2 = SitePackageFactory(owner=self.company)
        business_unit.site_packages.add(site_package2)

        results = DEv2JobFeed('seo/tests/data/dseo_feed_0.xml',
                              jsid=business_unit.id,
                              markdown=business_unit.enable_markdown)
        jobs = results.solr_jobs()
        for job in jobs:
            self.assertItemsEqual(job['on_sites'], [site_package.pk,
                                                    site_package2.pk])

        site_package2.delete()
        site_package.delete()
        business_unit.delete()
Esempio n. 38
0
    def setUp(self):
        super(SitemapTestCase, self).setUp()
        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.add(SOLR_FIXTURE)
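
# ``SOLR_FIXTURE`` is defined elsewhere in the test module. A minimal
# stand-in with assumed field names (based on the job documents used in the
# other tests here, not the actual fixture contents) might look like:
EXAMPLE_SOLR_FIXTURE = [
    {'id': 'seo.1', 'uid': 1, 'buid': 0, 'guid': 'abc123',
     'title': 'Retail Associate', 'city': 'Atlanta'},
]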
Esempio n. 39
0
class MyJobsBase(TestCase):
    def setUp(self):
        settings.ROOT_URLCONF = "myjobs_urls"
        settings.PROJECT = "myjobs"

        self.app_access = AppAccessFactory()
        self.activities = [
            ActivityFactory(name=activity, app_access=self.app_access)
            for activity in [
                "create communication record", "create contact",
                "create partner saved search", "create partner", "create role",
                "create tag", "create user", "delete tag", "delete partner",
                "delete role", "delete user", "read contact",
                "read communication record", "read partner saved search",
                "read partner", "read role", "read user", "read tag",
                "update communication record", "update contact",
                "update partner", "update role", "update tag", "update user",
                "read outreach email address", "create outreach email address",
                "delete outreach email address",
                "update outreach email address", "read outreach record",
                "convert outreach record"
            ]
        ]

        self.company = CompanyFactory(app_access=[self.app_access])
        # this role will be populated by activities on a test-by-test basis
        self.role = RoleFactory(company=self.company, name="Admin")
        self.user = UserFactory(roles=[self.role], is_staff=True)

        cache.clear()
        clear_url_caches()
        self.ms_solr = Solr(settings.SOLR['seo_test'])
        self.ms_solr.delete(q='*:*')

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            'mymessages.context_processors.message_lists', )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        setattr(settings, 'MEMOIZE', False)

        self.patcher = patch('urllib2.urlopen', return_file())
        self.mock_urlopen = self.patcher.start()

        self.client = TestClient()
        self.client.login_user(self.user)

    def tearDown(self):
        self.ms_solr.delete(q='*:*')
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS',
                self.base_context_processors)
        try:
            self.patcher.stop()
        except RuntimeError:
            # patcher was stopped in a test
            pass

    def assertRequires(self, view_name, *activities, **kwargs):
        """
        Asserts that the given view is only accessible when a user has a role
        with the given activities.

        """
        url = reverse(view_name, kwargs=kwargs.get('kwargs'))
        method = kwargs.get("method", "get").lower()

        response = getattr(self.client, method)(path=url)
        self.assertEqual(type(response), MissingActivity)

        self.role.activities = [
            activity for activity in self.activities
            if activity.name in activities
        ]

        response = getattr(self.client, method)(path=url)
        self.assertNotEqual(type(response), MissingActivity)

        self.role.activities.clear()
Esempio n. 40
0
class DESolrSearchBackend(SolrSearchBackend):
    def __init__(self, connection_alias, **connection_options):
        """
        Inputs:
        :HTTP_AUTH_USERNAME: Username used for http authentication
        :HTTP_AUTH_PASSWORD: Password used for http authentication

        """
        super(DESolrSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        user = connection_options.get("HTTP_AUTH_USERNAME")
        passwd = connection_options.get("HTTP_AUTH_PASSWORD")
        self.conn = Solr(connection_options['URL'], auth=(user, passwd),
                         timeout=self.timeout)

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               within=None, dwithin=None, distance_point=None,
               limit_to_registered_models=None, result_class=None,
               facet_mincount=None, facet_limit=None, facet_prefix=None,
               facet_sort=None, facet_offset=None, bf=None, **kwargs):
        """
        Overrides both search() and build_search_kwargs().

        """

        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        kwargs = {
            'fl': '* score',
            'mlt': 'false'
        }

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)
            kwargs['fl'] = fields
        # This code was causing sort_by to break, but we're keeping it as a
        # reference in case we want to enable geographic sorting in the future.
        # Haystack does have an order_by_distance function, so this code might
        # not be necessary
        # Jason McLaughlin 10/30/2012
        # geo_sort = False
        # if sort_by is not None:
        #     if sort_by in ['distance asc', 'distance desc'] and distance_point:
        #         # Do the geo-enabled sort.
        #         lng, lat = distance_point['point'].get_coords()
        #         kwargs['sfield'] = distance_point['field']
        #         kwargs['pt'] = '%s,%s' % (lat, lng)
        #         geo_sort = True
        #
        #         if sort_by == 'distance asc':
        #             kwargs['sort'] = 'geodist() asc'
        #         else:
        #             kwargs['sort'] = 'geodist() desc'
        #     else:
        #         if sort_by.startswith('distance '):
        #              warnings.warn("In order to sort by distance, "
        #                            "you must call the '.distance(...)' "
        #                            "method.")

        if sort_by is not None:
            # Regular sorting.
            kwargs['sort'] = sort_by

        if bf is not None:
            kwargs['bf'] = bf

        if start_offset is not None:
            kwargs['start'] = start_offset

        if end_offset is not None:
            kwargs['rows'] = end_offset - start_offset

        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '100'
            kwargs['hl.snippets'] = '2'
            kwargs['hl.mergeContiguous'] = 'true'
            kwargs['hl.simple.pre'] = '<b>'
            kwargs['hl.simple.post'] = '</b>'

        if self.include_spelling is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1

            if spelling_query:
                kwargs['spellcheck.q'] = spelling_query

        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value)

        if facet_mincount is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.mincount'] = facet_mincount

        if facet_limit is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.limit'] = facet_limit

        if facet_prefix is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.prefix'] = facet_prefix

        if facet_sort is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.sort'] = facet_sort

        if facet_offset is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.offset'] = facet_offset

        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            kwargs['facet.date.other'] = 'none'

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string)

        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = query_facets

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_models_list()

            if len(registered_models) > 0:
                narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(registered_models)))

        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)

        # if within is not None:
        #     from haystack.utils.geo import generate_bounding_box
        #
        #     kwargs.setdefault('fq', [])
        #     ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2'])
        #     # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
        #     # very clear on this.
        #     bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng)
        #     kwargs['fq'].append(bbox)

        # if dwithin is not None:
        #     kwargs.setdefault('fq', [])
        #     lng, lat = dwithin['point'].get_coords()
        #     geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km)
        #     kwargs['fq'].append(geofilt)

        # # Check to see if the backend should try to include distances
        # # (Solr 4.X+) in the results.
        # if self.distance_available and distance_point:
        #     # In early testing, you can't just hand Solr 4.X a proper bounding box
        #     # & request distances. To enable native distance would take calculating
        #     # a center point & a radius off the user-provided box, which kinda
        #     # sucks. We'll avoid it for now, since Solr 4.x's release will be some
        #     # time yet.
        #     # kwargs['fl'] += ' _dist_:geodist()'
        #     pass

        try:
            raw_results = self.conn.search(query_string, **kwargs)
        except (IOError, SolrError), e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s", query_string, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, highlight=highlight,
                                     result_class=result_class)
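
The date-facet handling above turns each entry of date_facets into per-field
facet.date.* parameters. A minimal standalone sketch of that transformation,
using an illustrative field name and dates assumed to be Solr-formatted
already (so the _from_python() conversion is skipped):

date_facets = {
    'date_new': {
        'start_date': '2014-01-01T00:00:00Z',
        'end_date': '2014-02-01T00:00:00Z',
        'gap_by': 'day',
        'gap_amount': 7,
    },
}

kwargs = {}
for key, value in date_facets.items():
    kwargs["f.%s.facet.date.start" % key] = value['start_date']
    kwargs["f.%s.facet.date.end" % key] = value['end_date']
    gap_by_string = value['gap_by'].upper()
    gap_string = "%d%s" % (value['gap_amount'], gap_by_string)
    if value['gap_amount'] != 1:
        gap_string += "S"
    kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string)

# kwargs now contains:
# {'f.date_new.facet.date.start': '2014-01-01T00:00:00Z',
#  'f.date_new.facet.date.end': '2014-02-01T00:00:00Z',
#  'f.date_new.facet.date.gap': '+7DAYS/DAY'}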
Esempio n. 41
0
class SiteTestCase(DirectSEOBase):
    """
    We're adding these tests to ensure unicode job descriptions and titles
    make it through the import process and work with high-level features.
    We should use HTTP requests wherever possible since it's difficult to
    predict which modules will have issues with unicode.

    """
    def setUp(self):
        super(SiteTestCase, self).setUp()
        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.delete(q="*:*")
        self.businessunit = factories.BusinessUnitFactory(id=0)
        self.buid = self.businessunit.id
        self.filepath = os.path.join(import_jobs.DATA_DIR,
                                     'dseo_feed_%s.xml' % self.buid)
        SeoSite.objects.all().delete()
        self.site = factories.SeoSiteFactory(id=1)

        self.configuration = factories.ConfigurationFactory(status=2)
        self.configuration.save()
        self.site.configurations.clear()
        self.site.configurations.add(self.configuration)

    def tearDown(self):
        super(SiteTestCase, self).tearDown()
        self.conn.delete(q="*:*")

    def test_unicode_title(self):
        # Test imports
        group = factories.GroupFactory()
        self.site.group = group
        self.site.business_units.add(self.businessunit)
        self.site.save()
        import_jobs.update_solr(self.buid,
                                download=False,
                                delete_feed=False,
                                data_dir='seo/tests/data/')
        solr_jobs = self.conn.search("*:*")
        resp = self.client.get('/')
        self.assertEqual(resp.context['total_jobs_count'], solr_jobs.hits)

        # test standard facets against Haystack query
        standard_cf = factories.CustomFacetFactory.build(
            # default facet will return both jobs
            name="Keyword Facet",
            group=group,
            show_production=True)
        standard_cf.save()
        standard_cf.keyword.add(u'Ключевые')
        standard_cf.save()
        standard_site_facet = factories.SeoSiteFacetFactory(
            seosite=self.site,
            customfacet=standard_cf,
            facet_type=factories.SeoSiteFacet.STANDARD)
        standard_site_facet.save()

        # Add a second standard facet (country) alongside the keyword facet
        standard_cf2 = factories.CustomFacetFactory.build(
            # default facet will return both jobs
            name='Country Facet',
            country='United States',
            group=group,
            show_production=True)
        standard_cf2.save()
        standard_site_facet2 = factories.SeoSiteFacetFactory(
            seosite=self.site,
            customfacet=standard_cf2,
            facet_type=factories.SeoSiteFacet.STANDARD)
        standard_site_facet2.save()

        resp = self.client.get('/keyword-facet/new-jobs/',
                               HTTP_HOST=self.site.domain,
                               follow=True)
        sqs = DESearchQuerySet().filter(text=u'Ключевые')
        self.assertEqual(len(resp.context['default_jobs']), sqs.count())
        for facet_widget in resp.context['widgets']:
            # Ensure that no standard facet has more results than current
            # search results
            for count_tuple in facet_widget.items:
                self.assertTrue(sqs.count() >= count_tuple[1])

        # Test default site facets against PySolr query
        from django.core.cache import cache
        cache.clear()
        default_cf = factories.CustomFacetFactory.build(name="Default Facet",
                                                        title=u"Специалист",
                                                        group=group,
                                                        show_production=True)
        default_cf.save()
        default_site_facet = factories.SeoSiteFacetFactory(
            seosite=self.site,
            facet_type=factories.SeoSiteFacet.DEFAULT,
            customfacet=default_cf)
        default_site_facet.save()
        resp = self.client.get('/jobs/',
                               HTTP_HOST=self.site.domain,
                               follow=True)
        total_jobs = resp.context['total_jobs_count']
        solr_jobs = self.conn.search(q=u"title:Специалист")
        self.assertEqual(total_jobs, solr_jobs.hits)
        self.assertEqual(len(resp.context['default_jobs']), total_jobs)
        for facet_widget in resp.context['widgets']:
            for count_tuple in facet_widget.items:
                self.assertTrue(sqs.count() >= count_tuple[1])

        # Feed test
        resp = self.client.get('/feed/json', HTTP_HOST=self.site.domain)
        jobs = json.loads(resp.content)
        self.assertEqual(len(jobs), total_jobs)
        for job in jobs:
            resp = self.client.get(job['url'],
                                   HTTP_HOST=self.site.domain,
                                   follow=False)
            self.assertEqual(resp.status_code, 302)
            expected = 'https://my.jobs/%s%d?my.jobs.site.id=%s' %\
                       (job['guid'],
                        settings.FEED_VIEW_SOURCES['json'],
                        str(self.site.pk))
            self.assertEqual(resp['Location'], expected)

        # Sitemap index test - Since the sitemap only builds out updates from
        # the last 30 days, this test will eventually be checking 0 jobs in
        # the sitemap.
        # TODO: find a way to keep feed dates current. We might be able to use
        # the mock library to override datetime functions (see the mock sketch
        # at the end of this example).
        resp = self.client.get('/sitemap.xml', HTTP_HOST=self.site.domain)
        root = etree.fromstring(resp.content)
        self.assertGreater(len(root), 0)
        crawled_jobs = 0
        for loc, lastmod in root:
            self.assertTrue(loc.text)
            resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)
            self.assertEqual(resp.status_code, 200)
            # Get the first daily sitemap
            urlset = etree.fromstring(resp.content)
            # Check each job in the daily sitemap, the way a crawler would
            for loc, _, _, _ in urlset:
                resp = self.client.get(loc.text, HTTP_HOST=self.site.domain)
                self.assertEqual(resp.status_code, 200)
                self.assertIn(str(resp.context['the_job'].uid), loc.text)
                crawled_jobs += 1
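
One possible shape for the mock-based approach mentioned in the TODO above,
as a standalone sketch: subclass datetime so isinstance() checks still pass,
then patch the module-level name. The pinned date is arbitrary, and code that
did "from datetime import datetime" at import time would need its own module
patched instead of the stdlib name used here.

import datetime

import mock


class _FixedDateTime(datetime.datetime):
    """A datetime whose now() is pinned to a fixed instant."""

    @classmethod
    def now(cls, tz=None):
        return cls(2014, 6, 1)


with mock.patch('datetime.datetime', _FixedDateTime):
    # Anything resolving datetime.datetime.now() inside this block sees the
    # pinned instant, so 30-day windows can be made to line up with fixtures.
    assert datetime.datetime.now() == datetime.datetime(2014, 6, 1)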
Esempio n. 42
0
    def setUp(self):
        db_backend = settings.DATABASES['default']['ENGINE'].split('.')[-1]

        # Convert columns that are utf8 in production to utf8 in the test DB too
        if db_backend == 'mysql':
            cursor = connections['default'].cursor()
            cursor.execute("alter table seo_customfacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositefacet convert to character "
                           "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_company convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute(
                "alter table seo_queryredirect convert to character "
                "set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_tag convert to character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table taggit_taggeditem convert to "
                           "character set "
                           "utf8 collate utf8_unicode_ci")
            cursor.execute("alter table seo_seositeredirect convert to "
                           "character set utf8 collate utf8_unicode_ci")
            cursor.execute("alter table django_redirect convert to "
                           "character set utf8 collate utf8_unicode_ci")
            # We have a data migration that does this, but we don't run
            # migrations during tests (Django 1.6.5).
            cursor.execute("ALTER TABLE django_flatpage CONVERT TO "
                           "CHARACTER SET utf8 COLLATE utf8_general_ci")
            cursor.execute("ALTER TABLE seo_custompage CONVERT TO "
                           "CHARACTER SET utf8 COLLATE utf8_general_ci")

        setattr(settings, 'ROOT_URLCONF', 'dseo_urls')
        setattr(settings, "PROJECT", 'dseo')
        clear_url_caches()

        self.base_middleware_classes = settings.MIDDLEWARE_CLASSES
        middleware_classes = self.base_middleware_classes + (
            'wildcard.middleware.WildcardMiddleware',
            'middleware.RedirectOverrideMiddleware')
        setattr(settings, 'MIDDLEWARE_CLASSES', middleware_classes)

        self.base_context_processors = settings.TEMPLATE_CONTEXT_PROCESSORS
        context_processors = self.base_context_processors + (
            "social_links.context_processors.social_links_context",
            "seo.context_processors.site_config_context",
        )
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS', context_processors)
        context._standard_context_processors = None

        self.conn = Solr('http://127.0.0.1:8983/solr/seo')
        self.conn.delete(q="*:*")
        cache.clear()
        clear_url_caches()

        setattr(settings, 'MEMOIZE', False)

        # As we added tests that created more and more companies, we
        # approached the hardcoded companies in import_jobs_testdata.json.
        # When we hit those ids, we began to get IntegrityErrors during
        # testing. Reset the sequence used by CompanyFactory to clear this
        # build-up.
        CompanyFactory.reset_sequence()
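
The example stops at setUp; a plausible counterpart tearDown (not shown in the
source, so every line below is an assumption) would undo the overrides so
later test cases see the stock configuration:

    def tearDown(self):
        # Hypothetical cleanup mirroring the setUp above.
        setattr(settings, 'MIDDLEWARE_CLASSES', self.base_middleware_classes)
        setattr(settings, 'TEMPLATE_CONTEXT_PROCESSORS',
                self.base_context_processors)
        context._standard_context_processors = None

        self.conn.delete(q="*:*")
        cache.clear()
        clear_url_caches()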