Ejemplos de rebuild en Python, ejemplos de ckan.lib.search.rebuild en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: test_controllers.py Proyecto: starsinmypockets/ckanext-ed

 def setup(self):
     """Reset the database and search index, then create the fixtures.

     Creates three users, one organization in which they hold the admin,
     editor and reader capacities, and one private dataset owned by that
     organization. The created dataset is stored on ``self.package``.
     """
     super(TestStateUpdateController, self).setup()
     self.pkg = 'test-dataset-1'
     helpers.reset_db()
     rebuild()

     # (user name, capacity inside the organization)
     memberships = (('george', 'admin'),
                    ('john', 'editor'),
                    ('paul', 'reader'))
     for username, _capacity in memberships:
         factories.User(name=username, id=username)
     org_users = [{'name': username, 'capacity': capacity}
                  for username, capacity in memberships]
     factories.Organization(users=org_users, name='us-ed-1', id='us-ed-1')

     # Datasets created through the factories seem to be created as a
     # sysadmin, which forces approval_state to "approved". Calling the
     # package_create action directly avoids that.
     action_context = {'user': '******'}
     dataset_fields = _create_dataset_dict(self.pkg, 'us-ed-1', private=True)
     self.package = helpers.call_action('package_create', action_context,
                                        **dataset_fields)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: cli.py Proyecto: skvisha/hdx-ckan

    def command(self):
        """Rebuild the search index of every dataset selected for a refresh.

        The candidate datasets come from Solr and Mixpanel (via the
        ``_find_potential_datasets_*`` helpers) and are narrowed down by
        ``_decide_which_datasets_need_update``. A dataset that cannot be
        found is logged and skipped; Ctrl-C stops the run; any other error
        propagates to the caller.
        """
        from ckan.lib.search import rebuild
        from ckan.logic import NotFound

        self._load_config()
        # hours_to_check = int(config.get('hdx.analytics.hours_to_check_for_refresh', 24))

        self.log = logging.getLogger(__name__)

        solr_ds = self._find_potential_datasets_in_solr()
        pageviews_ds, downloads_ds = self._find_potential_datasets_in_mixpanel()
        ds_to_update = self._decide_which_datasets_need_update(solr_ds, pageviews_ds, downloads_ds)

        total = len(ds_to_update)
        self.log.info('Rebuilding index for {} datasets.'.format(total))

        for position, dataset_id in enumerate(ds_to_update, 1):
            self.log.info('Rebuilding index for dataset {}. {}/{}'.format(dataset_id, position, total))
            try:
                rebuild(dataset_id)
                self.log.info('Done')
            except NotFound:
                self.log.error("Error: package {} not found.".format(dataset_id))
            except KeyboardInterrupt:
                self.log.error("Stopped.")
                return

Ejemplo n.º 3

0

Mostrar archivo

Archivo: publisher_rename.py Proyecto: datagovuk/ckanext-dgu

    def rename(self, old_name, new_name, new_title=None):
        """Change the slug (and optionally the title) of a group.

        The previous name is kept in a ``previous-name-<n>`` extra and, when
        a new title is given, the previous title in ``previous-title``.
        Every active package member of the group is then reindexed.

        :param old_name: current name of the group
        :param new_name: name the group should get
        :param new_title: optional new title for the group

        Nothing is changed when ``new_name`` is already taken or when no
        group is called ``old_name``; a message is printed instead.
        """
        import ckan.model as model
        import ckan.lib.search as search

        print("Converting '{0}' to '{1}'".format(old_name, new_name))

        if model.Group.by_name(new_name):
            print("'{0}' is already in use, please choose another name".format(new_name))
            # Abort here: carrying on would give two groups the same name.
            return

        group = model.Group.by_name(old_name)
        if not group:
            print("Group {g} not found".format(g=old_name))
            return

        model.repo.new_revision()
        group.name = new_name
        # Number the new "previous-name-N" extra after the existing ones.
        x = sum(1 for k in group.extras.keys() if k.startswith('previous-name-'))
        group.extras['previous-name-%d' % (x+1)] = old_name
        if new_title:
            group.extras['previous-title'] = group.title
            group.title = new_title
        group.save()

        print("Updating search index ....")
        members = model.Session.query(model.Member).filter(model.Member.group_id==group.id).\
            filter(model.Member.state=='active').filter(model.Member.table_name=='package')
        for member in members:
            search.rebuild(member.table_id)

        self._display_group(group)

Ejemplo n.º 4

0

Mostrar archivo

def resource_view_create(context, data_dict):
    ''' Wraps the default resource_view_create ALSO reindexing the package

    :param resource_id: id of the resource
    :type resource_id: string
    :param title: the title of the view
    :type title: string
    :param description: a description of the view (optional)
    :type description: string
    :param view_type: type of view
    :type view_type: string
    :param config: options necessary to recreate a view state (optional)
    :type config: JSON string

    :returns: the newly created resource view
    :rtype: dictionary

    '''

    from ckan.lib.search import rebuild

    result = core_create.resource_view_create(context, data_dict)

    if data_dict.get('view_type') == 'hdx_hxl_preview':
        # Reindexing is best-effort: the view already exists at this point,
        # so a failure here is logged instead of failing the whole action.
        package_id = None
        try:
            resource = context.get('resource')
            if not resource and context.get('model') is not None:
                # The resource may be missing from the context; fall back
                # to loading it from the id the caller supplied.
                resource = context['model'].Resource.get(
                    data_dict.get('resource_id'))
            if resource:
                package_id = resource.package_id
                rebuild(package_id)
            else:
                log.error("Error: resource {} not found, package not "
                          "reindexed.".format(data_dict.get('resource_id')))
        except NotFound:
            log.error("Error: package {} not found.".format(package_id))
        except Exception as e:
            log.error(str(e))

    return result

Ejemplo n.º 5

0

Mostrar archivo

Archivo: cli.py Proyecto: allanglen/ckan

    def command(self):
        """Run the search sub-command named by the first CLI argument.

        Supported sub-commands: ``rebuild [package]``, ``check``,
        ``show <package>`` and ``clear``. The usage text is printed when no
        argument is given or when ``show`` gets the wrong number of
        arguments.
        """
        self._load_config()
        from ckan.lib.search import rebuild, check, show, clear

        if not self.args:
            # default to printing help
            print(self.usage)
            return

        cmd = self.args[0]
        if cmd == 'rebuild':
            if len(self.args) > 1:
                rebuild(self.args[1])
            else:
                rebuild()
        elif cmd == 'check':
            check()
        elif cmd == 'show':
            if len(self.args) != 2:
                # "show" needs exactly one package reference. Print the
                # usage instead of dropping into a debugger.
                print(self.usage)
                return
            show(self.args[1])
        elif cmd == 'clear':
            clear()
        else:
            print('Command %s not recognized' % cmd)

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test_admin.py Proyecto: okfn/ckanext-unhcr

    def test_search_geographies(self, app):
        """Searching for a geography's name or pcode finds its dataset.

        Terms that belong to the '20DEU010004' geography must return
        ``gis_dataset2``; every other term must return ``gis_dataset1``.
        """
        # Start from a cleared and freshly rebuilt index.
        search.index_for(model.Package).clear()
        search.rebuild()

        search_terms = []
        for geography in self.geogs.values():
            search_terms.append(geography.gis_name)
            search_terms.append(geography.pcode)

        queries = [{'q': term} for term in search_terms]
        context = {'ignore_auth': True}
        package_search = toolkit.get_action('package_search')
        for data_dict in queries:
            packages = package_search(context, data_dict)

            # Terms of this geography belong to the second dataset.
            other_geography = self.unrelated['20DEU010004']
            from_gis2 = [other_geography.pcode, other_geography.gis_name]
            dataset = (self.gis_dataset2 if data_dict['q'] in from_gis2
                       else self.gis_dataset1)
            should_be = dataset['id']

            found_ids = [result['id'] for result in packages['results']]
            assert should_be in found_ids

Ejemplo n.º 7

0

Mostrar archivo

    def command(self):
        """Run the search sub-command named by the first CLI argument.

        Supported sub-commands: ``rebuild [package]``, ``check``,
        ``show <package>`` and ``clear``. The usage text is printed when no
        argument is given or when ``show`` gets the wrong number of
        arguments.
        """
        self._load_config()
        from ckan.lib.search import rebuild, check, show, clear

        if not self.args:
            # default to printing help
            print(self.usage)
            return

        cmd = self.args[0]
        if cmd == 'rebuild':
            if len(self.args) > 1:
                rebuild(self.args[1])
            else:
                rebuild()
        elif cmd == 'check':
            check()
        elif cmd == 'show':
            if len(self.args) != 2:
                # "show" needs exactly one package reference. Print the
                # usage instead of dropping into a debugger.
                print(self.usage)
                return
            show(self.args[1])
        elif cmd == 'clear':
            clear()
        else:
            print('Command %s not recognized' % cmd)

Ejemplo n.º 8

0

Mostrar archivo

def reindex_package_on_hdx_hxl_preview_view(view_type, context, data_dict):
    """Reindex the package owning a resource when its view is an HXL preview.

    Does nothing for any ``view_type`` other than ``'hdx_hxl_preview'``.
    Reindexing is best-effort: failures are logged, never raised.

    :param view_type: type of the resource view being handled
    :param context: action context; ``context['resource']`` is used when
        present, otherwise ``context['model']`` is used to load the resource
    :param data_dict: action data; must contain ``resource_id`` when the
        resource is not already in the context
    """
    if view_type != 'hdx_hxl_preview':
        return

    from ckan.lib.search import rebuild

    resource = context.get('resource')

    # resource is in context only when the auth is run. But that doesn't happen for sysadmins
    if not resource:
        resource_id = _get_or_bust(data_dict, 'resource_id')
        resource = context['model'].Resource.get(resource_id)

    package_id = resource.package_id if resource else None
    if not package_id:
        return

    try:
        rebuild(package_id)
    except NotFound:
        log.error("Error: package {} not found.".format(package_id))
    except Exception as e:
        log.error(str(e))

Ejemplo n.º 9

0

Mostrar archivo

Archivo: action.py Proyecto: DataShades/ckanext-syndicate

def set_syndicated_id(local_id: str, remote_id: str, field: str):
    """Store ``remote_id`` in the ``field`` extra of a local package.

    An existing extra is updated and marked active; otherwise a new extra
    is created and committed. The package is reindexed afterwards.
    """
    extras_of_package = model.Session.query(model.PackageExtra.id).join(
        model.Package, model.Package.id == model.PackageExtra.package_id
    )
    ext_id = extras_of_package.filter(
        model.Package.id == local_id,
        model.PackageExtra.key == field,
    ).first()

    if ext_id:
        # NOTE(review): this branch issues the UPDATE but does not commit,
        # unlike the create branch below; confirm the caller commits.
        # NOTE(review): ``ext_id`` is the row returned by ``first()``, not
        # the bare id value; verify ``filter_by`` handles that as intended.
        matching_extra = model.Session.query(model.PackageExtra).filter_by(
            id=ext_id
        )
        matching_extra.update({"value": remote_id, "state": "active"})
    else:
        new_extra = model.PackageExtra(
            package_id=local_id,
            key=field,
            value=remote_id,
        )
        model.Session.add(new_extra)
        model.Session.commit()
        model.Session.flush()

    rebuild(local_id)

Ejemplo n.º 10

0

Mostrar archivo

Archivo: jobs.py Proyecto: dvulanov/ckanext-knowledgehub

    def refresh_index(self, query):
        """Reindex every package returned by ``find_documents(query)``.

        Nothing is reindexed when the lookup returns an empty or falsy
        result.
        """
        package_ids = self.find_documents(query)
        self.logger.debug('Rebuilding index for packages: %s', package_ids)

        if not package_ids:
            return
        for package_id in package_ids:
            ckan_search.rebuild(package_id=package_id)

Ejemplo n.º 11

0

Mostrar archivo

    def rename(self, old_name, new_name, new_title=None):
        """Change the slug (and optionally the title) of a group.

        The previous name is kept in a ``previous-name-<n>`` extra and, when
        a new title is given, the previous title in ``previous-title``.
        Every active package member of the group is then reindexed.

        :param old_name: current name of the group
        :param new_name: name the group should get
        :param new_title: optional new title for the group

        Nothing is changed when ``new_name`` is already taken or when no
        group is called ``old_name``; a message is printed instead.
        """
        import ckan.model as model
        import ckan.lib.search as search

        print("Converting '{0}' to '{1}'".format(old_name, new_name))

        if model.Group.by_name(new_name):
            print("'{0}' is already in use, please choose another name".format(
                new_name))
            # Abort here: carrying on would give two groups the same name.
            return

        group = model.Group.by_name(old_name)
        if not group:
            print("Group {g} not found".format(g=old_name))
            return

        model.repo.new_revision()
        group.name = new_name
        # Number the new "previous-name-N" extra after the existing ones.
        x = sum(1 for k in group.extras.keys()
                if k.startswith('previous-name-'))
        group.extras['previous-name-%d' % (x + 1)] = old_name
        if new_title:
            group.extras['previous-title'] = group.title
            group.title = new_title
        group.save()

        print("Updating search index ....")
        members = model.Session.query(model.Member).filter(model.Member.group_id==group.id).\
            filter(model.Member.state=='active').filter(model.Member.table_name=='package')
        for member in members:
            search.rebuild(member.table_id)

        self._display_group(group)

Ejemplo n.º 12

0

Mostrar archivo

Archivo: test_helpers.py Proyecto: datopian/ckanext-us_ed_theme

    def setup(self):
        """Reset the database and search index; ensure the theme plugin is loaded."""
        test_helpers.reset_db()
        rebuild()

        plugin_name = 'us_ed_theme'
        if plugins.plugin_loaded(plugin_name):
            return
        plugins.load(plugin_name)

Ejemplo n.º 13

0

Mostrar archivo

Archivo: base.py Proyecto: csu-anzai/ckanext-unhcr

    def setup(self):
        """Prepare a clean environment for a functional test.

        Runs the parent class setup, resets the database, rebuilds the
        search index and stores a test application on ``self.app``.
        """
        super(FunctionalTestBase, self).setup()
        core_helpers.reset_db()
        # Rebuild the index after the reset so it matches the fresh database.
        rebuild()

        # Get app
        self.app = self._get_test_app()

Ejemplo n.º 14

0

Mostrar archivo

Archivo: test_plugin.py Proyecto: EnviDat/ckanext-oaipmh_repository

 def teardown(self):
     '''Restore a clean database and search index.

     Nose runs this method after each test method in our test class.
     '''
     # Rebuild CKAN's database after each test method, so that each test
     # method runs with a clean slate.
     model.repo.rebuild_db()
     # Clear the package index, then rebuild it from the fresh database.
     search.index_for('Package').clear()
     search.rebuild()

Ejemplo n.º 15

0

Mostrar archivo

Archivo: test_workflow.py Proyecto: starsinmypockets/ckanext-ed

    def setup(self):
        """Reset the database and search index, then create the fixtures.

        Creates three users, one organization in which they hold the admin,
        editor and reader capacities, and two datasets owned by that
        organization (``self.package_approved`` and ``self.package_pending``).
        """
        self.pkg_1 = 'test-dataset-1'
        self.pkg_2 = 'test-dataset-2'
        test_helpers.reset_db()
        rebuild()

        # (user name, capacity inside the organization)
        memberships = (('george', 'admin'),
                       ('john', 'editor'),
                       ('paul', 'reader'))
        for username, _capacity in memberships:
            core_factories.User(name=username)
        org_users = [{'name': username, 'capacity': capacity}
                     for username, capacity in memberships]
        core_factories.Organization(users=org_users,
                                    name='us-ed-1',
                                    id='us-ed-1')

        # Datasets created through the factories seem to be created as a
        # sysadmin, which forces approval_state to "approved". Calling the
        # package_create action directly avoids that.
        first_context = {'user': '******'}
        first_fields = _create_dataset_dict(self.pkg_1, 'us-ed-1')
        self.package_approved = call_action('package_create', first_context,
                                            **first_fields)

        # 1 dataset above "approval_pending" and 1 below "approved"
        second_context = {'user': '******'}
        second_fields = _create_dataset_dict(self.pkg_2, 'us-ed-1')
        self.package_pending = call_action('package_create', second_context,
                                           **second_fields)

Ejemplo n.º 16

0

Mostrar archivo

Archivo: tracking.py Proyecto: larrybabb/ckan

def update_tracking_solr(engine, start_date):
    u'''Reindex every package that has tracking data since ``start_date``.

    Packages that no longer exist are reported and counted; Ctrl-C stops
    the run; any other error is reported through ``error_shout`` and the
    run continues with the next package.
    '''
    sql = u'''SELECT package_id FROM tracking_summary
            where package_id!='~~not~found~~'
            and tracking_date >= %s;'''
    rows = engine.execute(sql, start_date)

    # A set removes the duplicates produced by one summary row per day.
    package_ids = {row[u'package_id'] for row in rows}

    total = len(package_ids)
    plural_suffix = '' if total < 2 else 'es'
    click.echo('{} package index{} to be rebuilt starting from {}'.format(
        total, plural_suffix, start_date))

    from ckan.lib.search import rebuild
    not_found = 0
    for package_id in package_ids:
        try:
            rebuild(package_id)
        except logic.NotFound:
            click.echo(u'Error: package {} not found.'.format(package_id))
            not_found += 1
        except KeyboardInterrupt:
            click.echo(u'Stopped.')
            return
        except Exception as e:
            error_shout(e)

    summary = u'search index rebuilding done.'
    if not_found:
        summary = summary + u' {} not found.'.format(not_found)
    click.echo(summary)

Ejemplo n.º 17

0

Mostrar archivo

Archivo: group.py Proyecto: ASIMCC/ckan-1

def _force_reindex(grp):
    u''' Reindex every dataset that belongs to a group.

    When the group name has changed, we need to force a reindex
    of the datasets within the group, otherwise they will stop
    appearing on the read page for the group (as they're connected via
    the group name).

    :param grp: group dictionary; only its ``name`` key is read
    '''
    # NOTE(review): no guard for a failed lookup - confirm that
    # ``model.Group.get`` cannot return None for the names passed here.
    group = model.Group.get(grp['name'])
    for dataset in group.packages():
        search.rebuild(dataset.name)

Ejemplo n.º 18

0

Mostrar archivo

 def _force_reindex(self, grp):
     ''' Reindex every dataset that belongs to a group.

     When the group name has changed, we need to force a reindex
     of the datasets within the group, otherwise they will stop
     appearing on the read page for the group (as they're connected via
     the group name).

     :param grp: group dictionary; only its ``name`` key is read
     '''
     # NOTE(review): no guard for a failed lookup - confirm that
     # ``model.Group.get`` cannot return None for the names passed here.
     group = model.Group.get(grp['name'])
     for dataset in group.packages():
         search.rebuild(dataset.name)

Ejemplo n.º 19

0

Mostrar archivo

Archivo: test_most_viewed.py Proyecto: neoflex/ckanext-ecportal

 def test_05_view_counts_updated(self):
     """After a reindex, the most viewed datasets come back with their totals."""
     search.rebuild()

     most_viewed = helpers.most_viewed_datasets(10)
     titles_and_views = [(dataset['title'], dataset['views_total'])
                         for dataset in most_viewed]
     expected = [
         ('Most Viewed 4', 8),
         ('Most Viewed 3', 6),
         ('Most Viewed 2', 4),
         ('Most Viewed 1', 2),
     ]
     self.assert_equal(titles_and_views, expected)

Ejemplo n.º 20

0

Mostrar archivo

def package_collaborator_org_create(context, data_dict):
    '''Make an organization a collaborator on a dataset.

    If the organization is already a collaborator on the dataset then its
    capacity is updated.

    Access is controlled by the ``package_collaborator_org_create`` auth
    function.

    :param id: the id or name of the dataset
    :type id: string
    :param org_id: the id or name of the organization to add or edit
    :type org_id: string
    :param capacity: the capacity of the membership; must be one of
        ``ALLOWED_CAPACITIES``
    :type capacity: string

    :returns: the newly created (or updated) collaborator
    :rtype: dictionary

    '''
    model = context.get('model', core_model)

    dataset_id, org_id, capacity = toolkit.get_or_bust(
        data_dict, ['id', 'org_id', 'capacity'])

    dataset = model.Package.get(dataset_id)
    if not dataset:
        raise toolkit.ObjectNotFound('Dataset not found')

    toolkit.check_access('package_collaborator_org_create', context, data_dict)

    org = model.Group.get(org_id)
    if not org:
        raise toolkit.ObjectNotFound('Organization not found')

    if capacity not in ALLOWED_CAPACITIES:
        allowed = ', '.join(ALLOWED_CAPACITIES)
        raise toolkit.ValidationError(
            'Capacity must be one of "{}"'.format(allowed))

    # Reuse the membership row when the organization already collaborates.
    existing = model.Session.query(PackageOrgMember).\
        filter(PackageOrgMember.dataset_id == dataset.id).\
        filter(PackageOrgMember.org_id == org.id)
    member = existing.one_or_none()
    if not member:
        member = PackageOrgMember(dataset_id=dataset.id, org_id=org.id)
    member.capacity = capacity
    member.modified = datetime.datetime.utcnow()

    model.Session.add(member)
    model.repo.commit()

    # Rebuild search index for package to reflect updated permissions
    rebuild(dataset_id)

    message = 'Organization {} added as collaborator in dataset {} ({})'
    log.info(message.format(org.name, dataset.id, capacity))

    return member.as_dict()

Ejemplo n.º 21

0

Mostrar archivo

    def _rebuild_search_index(self):
        """Rebuild the whole CKAN search index.

        Equivalent to running `paster search-index rebuild` on the command
        line.

        """
        import ckan.lib.search as search_lib
        search_lib.rebuild()

Ejemplo n.º 22

0

Mostrar archivo

Archivo: cli.py Proyecto: arkka/ckan

    def rebuild(self):
        """Reindex one package when a second CLI argument is given, else all.

        For the full rebuild the ``only_missing``, ``force`` and
        ``refresh`` command-line options are passed through.
        """
        from ckan.lib.search import rebuild

        if len(self.args) > 1:
            # Second positional argument is the package reference.
            rebuild(self.args[1])
            return

        opts = self.options
        rebuild(only_missing=opts.only_missing,
                force=opts.force,
                refresh=opts.refresh)

Ejemplo n.º 23

0

Mostrar archivo

Archivo: actions.py Proyecto: atehwa/ckanext-kata

def package_show(context, data_dict):
    '''
    Return the metadata of a dataset (package) and its resources.

    Called before showing the dataset in some interface (browser, API),
    or when adding package to Solr index (no validation / conversions then).

    Besides returning the metadata this action has side effects: it
    creates missing related items for the ``erelated`` extra, and it
    syncs the package title with the ``title_0`` extra (reindexing the
    dataset when the title changed).

    :param id: the id or name of the dataset
    :type id: string

    :rtype: dictionary
    '''

    if data_dict.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_show_package_schema()

    if not data_dict.get('id') and not data_dict.get('name'):
        # Get package by data PIDs
        data_dict['id'] = utils.get_package_id_by_data_pids(data_dict)

    pkg_dict1 = ckan.logic.action.get.package_show(context, data_dict)
    pkg_dict1 = utils.resource_to_dataset(pkg_dict1)

    # Remove empty agents that come from padding the agent list in converters.
    # A list comprehension always yields a real list; ``filter`` returns a
    # lazy, always-truthy iterator on Python 3.
    if 'agent' in pkg_dict1:
        pkg_dict1['agent'] = [agent
                              for agent in (pkg_dict1.get('agent') or [])
                              if agent]

    # Normally logic function should not catch the raised errors
    # but here it is needed so action package_show won't catch it instead
    # Hiding information from API calls
    try:
        check_access('package_update', context)
    except NotAuthorized:
        pkg_dict1 = utils.hide_sensitive_fields(pkg_dict1)

    pkg = Package.get(pkg_dict1['id'])
    erelated = pkg.extras.get('erelated')
    if erelated:
        for value in erelated.split(';'):
            # Only the existence of a match matters, so fetch one row.
            already_related = Session.query(Related).filter(
                Related.title == value).first()
            if already_related is None:
                # Separate name so the ``data_dict`` argument is not shadowed.
                related_dict = {'title': value,
                                'type': _("Paper"),
                                'dataset_id': pkg.id}
                related_create(context, related_dict)

    # Update package.title to match package.extras.title_0
    extras_title = pkg.extras.get(u'title_0')
    if extras_title and extras_title != pkg.title:
        repo.new_revision()
        pkg.title = extras_title
        pkg.save()
        rebuild(pkg.id)  # Rebuild solr-index for this dataset

    return pkg_dict1

Ejemplo n.º 24

0

Mostrar archivo

Archivo: test_plugin.py Proyecto: EnviDat/ckanext-oaipmh_repository

 def setup_class(cls):
     '''Load the plugin and start from a clean database and search index.

     Nose runs this method once to setup our test class.
     '''
     # Test code should use CKAN's plugins.load() function to load plugins
     # to be tested.
     ckan.plugins.load('oaipmh_repository')
     
     # Recreate the database, then clear and rebuild the package index.
     model.repo.rebuild_db()
     search.index_for('Package').clear()
     search.rebuild()
     
     # NOTE(review): these statements act on freshly constructed Converters
     # objects; presumably Converters shares state between instances
     # (e.g. a singleton) - confirm, otherwise they have no lasting effect.
     Converters().converters_dict = {}
     Converters().set_converter(TestOAIDCConverter())

Ejemplo n.º 25

0

Mostrar archivo

Archivo: search_index.py Proyecto: terrabrasilis-research-data/data-manager-ckan

def rebuild(verbose, force, refresh, only_missing, quiet, commit_each):
    u''' Rebuild search index '''
    # The docstring is kept as is: presumably it is shown as the command's
    # help text - verify against the decorators.
    # ``verbose`` is accepted but not used here.
    from ckan.lib.search import rebuild, commit

    # Unless a commit after every package is requested, commits are
    # deferred and a single commit is issued at the end.
    defer_commit = not commit_each
    try:
        rebuild(only_missing=only_missing,
                force=force,
                refresh=refresh,
                defer_commit=defer_commit,
                quiet=quiet)
    except Exception as error:
        tk.error_shout(error)
    if defer_commit:
        commit()

Ejemplo n.º 26

0

Mostrar archivo

Archivo: cli.py Proyecto: okfn/ckan-old

    def command(self):
        """Run the requested sub-command; ``rebuild`` is the default.

        Any sub-command other than ``rebuild`` is reported as not
        recognized.
        """
        self._load_config()
        from ckan.lib.search import rebuild

        # default to run
        cmd = self.args[0] if self.args else "rebuild"

        if cmd != "rebuild":
            print("Command %s not recognized" % cmd)
            return
        rebuild()

Ejemplo n.º 27

0

Mostrar archivo

Archivo: commands.py Proyecto: ngds/ckanext-geoserver

    def publish_ogc_worker(self):
        '''
        Publish dataset wms/wfs to geoserver by pop-ing
        an element (dataset id) from the publis_ogc_queue(redis)

        Runs until the process is stopped. Each published dataset is
        reindexed so it does not show up twice in search results. The
        worker exits right away when Redis cannot be reached; a failure
        while handling one dataset is reported and retried after 30 seconds.
        '''

        def _report(message):
            # flush stdout see https://github.com/Supervisor/supervisor/issues/13
            print(str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: ' + message)
            sys.stdout.flush()

        _report('Started the worker process')
        try:
            r = self._redis_connection()
        except Exception:
            _report('ERROR, could not connect to Redis ')
            # Without a connection the loop below can never do any work.
            return

        # Lovely infinite loop ;P, we do need them from time to time
        while True:
            # Reset per iteration so the error report below never refers to
            # a dataset handled by an earlier iteration (or to nothing).
            package_id = None
            # POP an element (package_id) from publis_ogc_queue and publish it to ogc
            try:
                # we need to slow down this loop by setting the blpop timeout to 5 seconds
                # when publish_ogc_queue is empty
                queue_task = r.blpop('publish_ogc_queue', 5)
                if queue_task is None:
                    continue

                package_id = queue_task[1]
                _report('Start publishing dataset: ' + package_id)

                self.publish_ogc(package_id)
                _report('finished publishing now index: ' + package_id)

                # rebuild solr index for this dataset to avoid duplicate datasets in search results
                rebuild(package_id)
                commit()

            except Exception:
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt / SystemExit can still stop the worker.
                _report('An Error has occured while publishing dataset:'
                        + str(package_id) + ' to GeoServer')
                # retry in 30 seconds if something went south
                time.sleep(30)

Ejemplo n.º 28

0

Mostrar archivo

def rebuild(ctx, verbose, force, refresh, only_missing, quiet, commit_each):
    u''' Rebuild search index '''
    # The docstring is kept as is: presumably it is shown as the command's
    # help text - verify against the decorators.
    # ``verbose`` is accepted but not used here.
    flask_app = ctx.obj.app.apps['flask_app']._wsgi_app
    from ckan.lib.search import rebuild, commit

    # Unless a commit after every package is requested, commits are
    # deferred and a single commit is issued at the end.
    defer_commit = not commit_each
    try:
        # The rebuild runs inside a Flask test request context.
        with flask_app.test_request_context():
            rebuild(only_missing=only_missing,
                    force=force,
                    refresh=refresh,
                    defer_commit=defer_commit,
                    quiet=quiet)
    except Exception as error:
        error_shout(error)
    if defer_commit:
        commit()

Ejemplo n.º 29

0

Mostrar archivo

Archivo: search_index.py Proyecto: PublicaMundi/ckan

def rebuild(ctx, verbose, force, refresh, only_missing, quiet, commit_each):
    u''' Rebuild search index '''
    # The docstring is kept as is: presumably it is shown as the command's
    # help text - verify against the decorators.
    # ``verbose`` is accepted but not used here.
    flask_app = ctx.obj.app.apps['flask_app']._wsgi_app
    from ckan.lib.search import rebuild, commit

    # Unless a commit after every package is requested, commits are
    # deferred and a single commit is issued at the end.
    defer_commit = not commit_each
    try:
        # The rebuild runs inside a Flask test request context.
        with flask_app.test_request_context():
            rebuild(only_missing=only_missing,
                    force=force,
                    refresh=refresh,
                    defer_commit=defer_commit,
                    quiet=quiet)
    except Exception as error:
        error_shout(error)
    if defer_commit:
        commit()

Ejemplo n.º 30

0

Mostrar archivo

 def fix_missed_licenses(self):
     """Set ``license_id`` to 'notspecified' on packages that have none.

     A package counts as missing a license when its ``license_id`` is
     NULL or an empty string. Every updated package is reindexed and its
     id printed.
     """
     license_missing = (model.Package.license_id.is_(None)
                        | (model.Package.license_id == ''))
     q = model.Session.query(model.Package).filter(license_missing)

     # Collect the ids first: after the update the filter matches nothing.
     package_ids = [pkg.id for pkg in q]
     if not package_ids:
         print('There are no packages with missed license_id')
         return

     broken_count = q.update({'license_id': 'notspecified'},
                             synchronize_session=False)
     model.Session.commit()
     print('{} packages were updated:'.format(broken_count))
     for package_id in package_ids:
         search.rebuild(package_id)
         print('\t' + package_id)

Ejemplo n.º 31

0

Mostrar archivo

Archivo: cli.py Proyecto: abulte/ckan

    def rebuild(self):
        """Reindex one package when a second CLI argument is given, else all.

        Unless the ``commit_each`` option is set, one commit is issued
        after the rebuild call.
        """
        from ckan.lib.search import rebuild, commit

        # By default we don't commit after each request to Solr, as it is
        # a really heavy operation and slows things a lot

        if len(self.args) > 1:
            rebuild(self.args[1])
        else:
            opts = self.options
            rebuild(only_missing=opts.only_missing,
                    force=opts.force,
                    refresh=opts.refresh,
                    defer_commit=(not opts.commit_each))

        if self.options.commit_each:
            return
        commit()

Ejemplo n.º 32

0

Mostrar archivo

Archivo: cli.py Proyecto: rostock/opendata.hro

    def rebuild(self):
        """Reindex one package when a second CLI argument is given, else all.

        Unless the ``commit_each`` option is set, one commit is issued
        after the rebuild call.
        """
        from ckan.lib.search import rebuild, commit

        # By default we don't commit after each request to Solr, as it is
        # a really heavy operation and slows things a lot

        if len(self.args) > 1:
            rebuild(self.args[1])
        else:
            opts = self.options
            rebuild(only_missing=opts.only_missing,
                    force=opts.force,
                    refresh=opts.refresh,
                    defer_commit=(not opts.commit_each))

        if self.options.commit_each:
            return
        commit()

Ejemplo n.º 33

0

Mostrar archivo

def rebuild(verbose: bool, force: bool, only_missing: bool, quiet: bool,
            commit_each: bool, package_id: str, clear: bool):
    u''' Rebuild search index '''
    # The docstring is kept as is: presumably it is shown as the command's
    # help text - verify against the decorators.
    # ``verbose`` is accepted but not used here.
    from ckan.lib.search import rebuild, commit

    # Unless a commit after every package is requested, commits are
    # deferred and a single commit is issued at the end.
    defer_commit = not commit_each
    try:
        rebuild(package_id,
                only_missing=only_missing,
                force=force,
                defer_commit=defer_commit,
                quiet=quiet,
                clear=clear)
    except Exception as error:
        error_shout(error)
    if defer_commit:
        commit()

Ejemplo n.º 34

0

Mostrar archivo

def group_dict_save(group_dict, context, prevent_packages_update=False):
    """Save a group dictionary, its memberships and its extras.

    :param group_dict: dictionary describing the group; may contain
        ``packages``, ``users``, ``groups``, ``tags`` and ``extras``
    :param context: must provide ``model`` and ``session``; an existing
        group object under ``group`` is updated instead of created. The
        saved group is written back to ``context['group']``.
    :param prevent_packages_update: when true, package memberships are
        left untouched
    :returns: the saved group object

    When packages were added to or removed from the group, the session is
    committed and those packages are reindexed.
    """
    from ckan.lib.search import rebuild

    model = context["model"]
    session = context["session"]
    group = context.get("group")

    Group = model.Group
    if group:
        group_dict["id"] = group.id

    group = d.table_dict_save(group_dict, Group, context)
    if not group.id:
        group.id = str(uuid.uuid4())

    context['group'] = group

    # Under the new org rules we do not want to be able to update datasets
    # via group edit so we need a way to prevent this.  It may be more
    # sensible in future to send a list of allowed/disallowed updates for
    # groups, users, tabs etc.
    if not prevent_packages_update:
        pkgs_edited = group_member_save(context, group_dict, 'packages')
    else:
        pkgs_edited = {
            'added': [],
            'removed': []
        }
    group_users_changed = group_member_save(context, group_dict, 'users')
    group_groups_changed = group_member_save(context, group_dict, 'groups')
    group_tags_changed = group_member_save(context, group_dict, 'tags')
    log.debug('Group save membership changes - Packages: %r  Users: %r  '
            'Groups: %r  Tags: %r', pkgs_edited, group_users_changed,
            group_groups_changed, group_tags_changed)

    extras = group_dict.get("extras", [])
    new_extras = {i['key'] for i in extras}
    if extras:
        # ``old_extras`` is the same object as ``group.extras``, so the
        # membership test below sees the deletions made in this block.
        old_extras = group.extras
        # Drop the extras that are no longer listed.
        for key in set(old_extras) - new_extras:
            del group.extras[key]
        for x in extras:
            if 'deleted' in x and x['key'] in old_extras:
                del group.extras[x['key']]
                continue
            group.extras[x['key']] = x['value']

    # We will get a list of packages that we have either added or
    # removed from the group, and trigger a re-index.
    # Build a new list so the lists in ``pkgs_edited`` are not mutated.
    package_ids = list(pkgs_edited['removed'])
    package_ids.extend(pkgs_edited['added'])
    if package_ids:
        # Commit first so the reindex sees the new memberships.
        session.commit()
        for package_id in package_ids:
            rebuild(package_id)

    return group

Ejemplo n.º 35

0

Mostrar archivo

Archivo: commands.py Proyecto: ykhadilkar/ckanext-geodatagov

    def solr_tracking_update(self, start_date=None):
        """Reindex every package that has tracking data since ``start_date``.

        :param start_date: optional ``YYYY-MM-DD`` string. When omitted,
            the date two days before the latest tracking summary is used,
            or 2013-09-01 when there is no tracking data at all.

        Progress is printed per package. Packages that cannot be found are
        counted and reported at the end; Ctrl-C stops the run; any other
        error propagates to the caller.
        """
        import sys

        if start_date:
            start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        else:
            # No date given. See when we last have data for and get data
            # from 2 days before then in case new data is available.
            # If no date here then use 2013-09-01 as the start date
            sql = '''SELECT tracking_date from tracking_summary
                     ORDER BY tracking_date DESC LIMIT 1;'''
            result = model.Session.execute(sql).fetchall()
            if result:
                start_date = result[0]['tracking_date']
                start_date += datetime.timedelta(-2)
            else:
                start_date = '2013-09-01'

        sql = '''SELECT package_id FROM tracking_summary
                where package_id!='~~not~found~~'
                and tracking_date >= :start_date;'''
        q = model.Session.execute(sql, {'start_date': start_date})

        # A set removes the duplicates produced by one summary row per day.
        package_ids = set()
        for row in q:
            package_ids.add(row['package_id'])

        total = len(package_ids)
        not_found = 0
        print('updating %i records on solr starting from %s' % (total, start_date))
        for index, package_id in enumerate(package_ids):
            # Written without a newline so the outcome lands on the same line.
            sys.stdout.write("updating %i/%i %s ... " % (index+1, total, package_id))
            try:
                search.rebuild(package_id)
            except ckan.logic.NotFound:
                print("Error: Not Found.")
                not_found += 1
            except KeyboardInterrupt:
                print("Stopped.")
                return
            else:
                print("Done.")
        # The parentheses matter: without them the conditional applies to
        # the whole message and an empty line is printed when nothing was
        # missing.
        print('All Done!' + (" %i Not Found." % not_found if not_found else ""))

Ejemplo n.º 36

0

Mostrar archivo

Archivo: delete.py Proyecto: alexandru-m-g/hdx-ckan

def resource_view_delete(context, data_dict):
    ''' Delete a resource view through the core action, then reindex the
    owning package when the view was an ``hdx_hxl_preview``.

    :param id: the id of the resource_view
    :type id: string

    '''
    from ckan.lib.search import rebuild

    core_delete.resource_view_delete(context, data_dict)

    # Reindexing is best effort: any failure is logged, never raised.
    try:
        deleted_view = context.get('resource_view')
        if deleted_view.view_type == 'hdx_hxl_preview':
            package_id = context.get('resource').package_id
            rebuild(package_id)
    except NotFound:
        log.error("Error: package {} not found.".format(package_id))
    except Exception as e:
        log.error(str(e))

Ejemplo n.º 37

0

Mostrar archivo

def package_collaborator_org_delete(context, data_dict):
    '''Remove a collaborator organization from a dataset.

    Currently you must be an Admin on the dataset owner organization to
    manage collaborators.

    :param id: the id or name of the dataset
    :type id: string
    :param org_id: the organization to remove; it is compared as-is with
        the ``org_id`` stored on the collaborator record
    :type org_id: string

    :raises ObjectNotFound: if the dataset does not exist or the
        organization is not a collaborator on it

    '''
    model = context.get('model', core_model)

    dataset_id, org_id = toolkit.get_or_bust(data_dict, ['id', 'org_id'])
    dataset = model.Package.get(dataset_id)

    if not dataset:
        raise toolkit.ObjectNotFound('Dataset not found')

    toolkit.check_access('package_collaborator_org_delete', context, data_dict)

    member = model.Session.query(PackageOrgMember).\
        filter(PackageOrgMember.dataset_id == dataset.id).\
        filter(PackageOrgMember.org_id == org_id).one_or_none()

    if not member:
        raise toolkit.ObjectNotFound(
            '{} is not a collaborator on dataset {}'.format(
                org_id, dataset_id))

    model.Session.delete(member)
    model.repo.commit()

    # Rebuild search index for package to reflect updated permissions
    rebuild(dataset_id)

    # The organization is only looked up for the log line. Fall back to the
    # raw identifier if it cannot be loaded, so that logging never fails
    # after the removal has already been committed.
    org = model.Group.get(org_id)
    org_label = org.name if org else org_id

    log.info('Organization {} removed as collaborator from dataset {}'.format(
        org_label, dataset.id))

Ejemplo n.º 38

0

Mostrar archivo

Archivo: commands.py Proyecto: GeoinformationSystems/ckanext-geoserver

    def publish_ogc_worker(self):
        '''
        Publish dataset wms/wfs to geoserver by pop-ing
        an element (dataset id) from the publis_ogc_queue(redis)
        ''' 

        print str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: Started the worker process'
        # flush stdout see https://github.com/Supervisor/supervisor/issues/13
        sys.stdout.flush()
        try:
            r = self._redis_connection()
        except:
            print str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: ERROR, could not connect to Redis '
            sys.stdout.flush()
            

        # Lovely infinite loop ;P, we do need them from time to time
        while True:
            # POP an element (package_id) from publis_ogc_queue and publish it to ogc
            try:
                # we need to slow down this loop by setting the blpop timeout to 5 seconds
                # when publish_ogc_queue is empty
                
                queue_task = r.blpop('publish_ogc_queue', 5) 

                if queue_task is not None:
                    package_id = queue_task[1] 
                    print str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: Start publishing dataset: ' + package_id
                    sys.stdout.flush()
                    self.publish_ogc(package_id)

                    # rebuild solr index for this dataset to avoid duplicate datasets in search results
                    rebuild(package_id)
                    commit()


            except:
                print str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: An Error has occured while publishing dataset:' + package_id + ' to GeoServer'
                sys.stdout.flush()
                # retry in 30 seconds if something went south
                time.sleep(30)

Ejemplo n.º 39

0

Mostrar archivo

Archivo: test_helpers.py Proyecto: keitaroinc/ckanext-datagovmk

    def setup(self):
        """Reset the database, create the extension tables, rebuild the
        search index and load the plugins that are not loaded yet."""
        test_helpers.reset_db()
        init_tables_ga()
        setup_user_authority_table()
        setup_user_authority_dataset_table()
        setup_featured_charts_table()
        setup_most_active_organizations_table()

        rebuild()

        # Load order is significant and matches the listing below.
        required_plugins = (
            'c3charts',
            'datagovmk',
            'scheming_organizations',
            'fluent',
        )
        for plugin_name in required_plugins:
            if not plugins.plugin_loaded(plugin_name):
                plugins.load(plugin_name)

Ejemplo n.º 40

0

Mostrar archivo

Archivo: cli.py Proyecto: AdamJensen-dk/ckan-drupal

    def command(self):
        self._load_config()
        from ckan.lib.search import rebuild, check, show

        if not self.args:
            # default to run
            cmd = 'rebuild'
        else:
            cmd = self.args[0]
        
        if cmd == 'rebuild':
            rebuild()
        elif cmd == 'check':
            check()
        elif cmd == 'show':
            if not len(self.args) == 2:
                import pdb; pdb.set_trace()
                self.args
            show(self.args[1])
        else:
            print 'Command %s not recognized' % cmd

Ejemplo n.º 41

0

Mostrar archivo

Archivo: tasks.py Proyecto: stadt-karlsruhe/ckanext-extractor

def extract(ini_path, res_dict):
    """
    Download resource, extract and store metadata.

    The extracted metadata is stored in the database.

    Note that this task does not check whether the resource exists in the
    database, whether the resource's format is indexed or whether there
    is an existing task working on the resource's metadata. This is the
    responsibility of the caller.

    The task does check which metadata fields are configured to be
    indexed and only stores those in the database.

    Any previously stored metadata for the resource is cleared.

    :param ini_path: passed unchanged to ``load_config``
    :param res_dict: resource dict; the keys ``id``, ``package_id``,
        ``url`` and ``format`` are read
    """
    load_config(ini_path)

    # Get package data before doing any hard work so that we can fail
    # early if the package is private.
    try:
        toolkit.get_action('package_show')({'validate': False},
                                           {'id': res_dict['package_id']})
    except toolkit.NotAuthorized:
        log.debug(('Not extracting resource {} since it belongs to the ' +
                  'private dataset {}.').format(res_dict['id'],
                  res_dict['package_id']))
        return

    # Reuse the metadata record of the resource, creating it on first use.
    try:
        metadata = ResourceMetadata.one(resource_id=res_dict['id'])
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    try:
        metadata.last_url = res_dict['url']
        metadata.last_format = res_dict['format']
        metadata.last_extracted = datetime.datetime.now()
        # Old values are dropped before downloading, so a failed download
        # leaves the record with no metadata fields.
        metadata.meta.clear()
        extracted = download_and_extract(res_dict['url'])
        for plugin in PluginImplementations(IExtractorPostprocessor):
            plugin.extractor_after_extract(res_dict, extracted)
        for key, value in extracted.iteritems():
            if not is_field_indexed(key):
                continue

            # Some documents contain multiple values for the same field. This
            # is not supported in our database model, hence we collapse these
            # into a single value.
            if isinstance(value, list):
                log.debug('Collapsing multiple values for metadata field ' +
                          '"{}" in resource {} into a single value.'.format(key,
                          res_dict['id']))
                value = ', '.join(value)

            metadata.meta[key] = value
    except RequestException as e:
        # A download failure is logged only; execution continues below.
        log.warn('Failed to download resource data from "{}": {}'.format(
                 res_dict['url'], e.message))
    finally:
        # Runs on success and on any exception: the task marker is reset
        # and whatever was collected so far is saved.
        metadata.task_id = None
        metadata.save()

    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_save(res_dict, metadata.as_dict())

    # We need to update the search index for the package here. Note that
    # we cannot rely on the automatic update that happens when a resource
    # is changed, since our extraction task runs asynchronously and may
    # be finished only when the automatic index update has already run.
    search.rebuild(package_id=res_dict['package_id'])

    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_index(res_dict, metadata.as_dict())

Ejemplo n.º 42

0

Mostrar archivo

Archivo: test_solr_search_index.py Proyecto: HatemAlSum/ckan

 def setup_class(cls):
     """Create the search test data, store the Solr connection and the
     site filter query on the class, then call ``search.rebuild()``."""
     setup_test_search_index()
     CreateTestData.create_search_test_data()
     cls.solr = search.make_connection()
     site_id = config['ckan.site_id']
     cls.fq = " +site_id:\"%s\" " % site_id
     search.rebuild()

Ejemplo n.º 43

0

Mostrar archivo

Archivo: search_index.py Proyecto: PublicaMundi/ckan

 def start(ids):
     """Rebuild the search index for the package ids in *ids*, then commit."""
     from ckan.lib import search as search_lib
     search_lib.rebuild(package_ids=ids)
     search_lib.commit()

Ejemplo n.º 44

0

Mostrar archivo

 def setup_class(cls):
     """Create the search test data, store the Solr connection and the
     site filter query on the class, then call ``search.rebuild()``."""
     setup_test_search_index()
     CreateTestData.create_search_test_data()
     cls.solr = search.make_connection()
     site_id = config["ckan.site_id"]
     cls.fq = ' +site_id:"%s" ' % site_id
     search.rebuild()

Ejemplo n.º 45

0

Mostrar archivo

Archivo: commands.py Proyecto: sobakavich/ckanext-geodatagov

    def db_solr_sync(self):

        print str(datetime.datetime.now()) + ' Entering Database Solr Sync function.'

        url = config.get('solr_url') + "/select?q=*%3A*&sort=id+asc&fl=id%2Cmetadata_modified&wt=json&indent=true"
        response = get_response(url)
    
        if (response != 'error'):

          print str(datetime.datetime.now()) + ' Deleting records from solr_pkg_ids.'		
          sql = '''delete from solr_pkg_ids'''
          model.Session.execute(sql)
          model.Session.commit()
		
          f = response.read()
          data = json.loads(f)
          rows = data.get('response').get('numFound')

          start = 0
          chunk_size = 1000         

          print str(datetime.datetime.now()) + ' Starting insertion of records in solr_pkg_ids .'
 
          for x in range(0, int(math.ceil(rows/chunk_size))+1):
		  
            if(x == 0):
               start = 0
			
            print str(datetime.datetime.now()) + ' Fetching ' + url + "&rows=" + str(chunk_size) + "&start=" + str(start)			  
			  
            response = get_response(url + "&rows=" + str(chunk_size) + "&start=" + str(start))
            f = response.read()
            data = json.loads(f)
            results = data.get('response').get('docs')

            print str(datetime.datetime.now()) + ' Inserting ' + str(start) + ' - ' + str(start + int(data.get('responseHeader').get('params').get('rows')) - 1) + ' of ' + str(rows)			
			
            for x in range(0, len(results)):
                sql = '''select count(id) as count from package where id = :pkg_id;'''
                q = model.Session.execute(sql, {'pkg_id' : results[x]['id']})            
                for row in q:
                   if(row['count'] == 0):
                     sql = '''insert into solr_pkg_ids (pkg_id, action) values (:pkg_id, :action);'''
                     model.Session.execute(sql, {'pkg_id' : results[x]['id'], 'action' : 'notfound' })
                     model.Session.commit()			
                   else:
                     pkg_dict = logic.get_action('package_show')(
                                    {'model': model, 'ignore_auth': True, 'validate': False},
                                    {'id': results[x]['id']})
                     if(str(results[x]['metadata_modified'])[:19] != pkg_dict['metadata_modified'][:19]):
                       print str(datetime.datetime.now()) + ' Action Type : outsync for Package Id: ' + results[x]['id']
                       print ' ' * 26 +                     ' Modified Date from Solr: ' + str(results[x]['metadata_modified'])
                       print ' ' * 26 +                     ' Modified Date from Db: ' + pkg_dict['metadata_modified']
                       sql = '''insert into solr_pkg_ids (pkg_id, action) values (:pkg_id, :action);'''
                       model.Session.execute(sql, {'pkg_id' : results[x]['id'], 'action' : 'outsync' })
                       model.Session.commit()
                     else:
                       sql = '''insert into solr_pkg_ids (pkg_id, action) values (:pkg_id, :action);'''
                       model.Session.execute(sql, {'pkg_id' : results[x]['id'], 'action' : 'insync' })
                       model.Session.commit()
                     
            start = int(data.get('responseHeader').get('params').get('start')) + chunk_size			       
          
          print str(datetime.datetime.now()) + ' Starting Database to Solr Sync'           
          
          #sql = '''Select id from package where id not in (select pkg_id from solr_pkg_ids); '''
          sql = '''Select p.id as pkg_id from package p
                   left join solr_pkg_ids sp on sp.pkg_id = p.id
                   where sp.pkg_id is null; '''
          
          q = model.Session.execute(sql)
          pkg_ids = set()
          for row in q:
            pkg_ids.add(row['pkg_id'])
          for pkg_id in pkg_ids:
            try:
              print str(datetime.datetime.now()) + ' Building Id: ' + pkg_id
              search.rebuild(pkg_id)
            except ckan.logic.NotFound:
              print "Error: Not Found."
            except KeyboardInterrupt:
              print "Stopped."
              return
            except:
              raise
          
          sql = '''Select pkg_id from solr_pkg_ids where action = 'outsync'; '''
          q = model.Session.execute(sql)          
          pkg_ids = set()
          for row in q:
            pkg_ids.add(row['pkg_id'])
          for pkg_id in pkg_ids:
            try:
              print str(datetime.datetime.now()) + ' Rebuilding Id: ' + pkg_id
              search.rebuild(pkg_id)
            except ckan.logic.NotFound:
              print "Error: Not Found."
            except KeyboardInterrupt:
              print "Stopped."
              return
            except:
              raise
          
          print str(datetime.datetime.now()) + ' Starting Solr to Database Sync'
          
          sql = '''Select pkg_id from solr_pkg_ids where action = 'notfound'; '''
          q = model.Session.execute(sql)
          pkg_ids = set()
          for row in q:
            pkg_ids.add(row['pkg_id'])
          for pkg_id in pkg_ids:
            try:
              search.clear(pkg_id)
            except ckan.logic.NotFound:
              print "Error: Not Found."
            except KeyboardInterrupt:
              print "Stopped."
              return
            except:
              raise
          
          print str(datetime.datetime.now()) + " All Sync Done."

Ejemplo n.º 46

0

Mostrar archivo

Archivo: commands.py Proyecto: jjjohnst/ckanext-geodatagov

    def db_solr_sync(self):

        print str(datetime.datetime.now()) + " Entering Database Solr Sync function."

        url = config.get("solr_url") + "/select?q=*%3A*&sort=id+asc&fl=id%2Cmetadata_modified&wt=json&indent=true"
        response = get_response(url)

        if response != "error":

            print str(datetime.datetime.now()) + " Deleting records from solr_pkg_ids."
            sql = """delete from solr_pkg_ids"""
            model.Session.execute(sql)
            model.Session.commit()

            f = response.read()
            data = json.loads(f)
            rows = data.get("response").get("numFound")

            start = 0
            chunk_size = 1000

            print str(datetime.datetime.now()) + " Starting insertion of records in solr_pkg_ids ."

            for x in range(0, int(math.ceil(rows / chunk_size)) + 1):

                if x == 0:
                    start = 0

                print str(datetime.datetime.now()) + " Fetching " + url + "&rows=" + str(chunk_size) + "&start=" + str(
                    start
                )

                response = get_response(url + "&rows=" + str(chunk_size) + "&start=" + str(start))
                f = response.read()
                data = json.loads(f)
                results = data.get("response").get("docs")

                print str(datetime.datetime.now()) + " Inserting " + str(start) + " - " + str(
                    start + int(data.get("responseHeader").get("params").get("rows")) - 1
                ) + " of " + str(rows)

                for x in range(0, len(results)):
                    sql = """select count(id) as count from package where id = :pkg_id;"""
                    q = model.Session.execute(sql, {"pkg_id": results[x]["id"]})
                    for row in q:
                        if row["count"] == 0:
                            sql = """insert into solr_pkg_ids (pkg_id, action) values (:pkg_id, :action);"""
                            model.Session.execute(sql, {"pkg_id": results[x]["id"], "action": "notfound"})
                            model.Session.commit()
                        else:
                            pkg_dict = logic.get_action("package_show")(
                                {"model": model, "ignore_auth": True, "validate": False}, {"id": results[x]["id"]}
                            )
                            if str(results[x]["metadata_modified"])[:19] != pkg_dict["metadata_modified"][:19]:
                                print str(
                                    datetime.datetime.now()
                                ) + " Action Type : outsync for Package Id: " + results[x]["id"]
                                print " " * 26 + " Modified Date from Solr: " + str(results[x]["metadata_modified"])
                                print " " * 26 + " Modified Date from Db: " + pkg_dict["metadata_modified"]
                                sql = """insert into solr_pkg_ids (pkg_id, action) values (:pkg_id, :action);"""
                                model.Session.execute(sql, {"pkg_id": results[x]["id"], "action": "outsync"})
                                model.Session.commit()
                            else:
                                sql = """insert into solr_pkg_ids (pkg_id, action) values (:pkg_id, :action);"""
                                model.Session.execute(sql, {"pkg_id": results[x]["id"], "action": "insync"})
                                model.Session.commit()

                start = int(data.get("responseHeader").get("params").get("start")) + chunk_size

            print str(datetime.datetime.now()) + " Starting Database to Solr Sync"

            # sql = '''Select id from package where id not in (select pkg_id from solr_pkg_ids); '''
            sql = """Select p.id as pkg_id from package p
                   left join solr_pkg_ids sp on sp.pkg_id = p.id
                   where sp.pkg_id is null; """

            q = model.Session.execute(sql)
            pkg_ids = set()
            for row in q:
                pkg_ids.add(row["pkg_id"])
            for pkg_id in pkg_ids:
                try:
                    print str(datetime.datetime.now()) + " Building Id: " + pkg_id
                    search.rebuild(pkg_id)
                except ckan.logic.NotFound:
                    print "Error: Not Found."
                except KeyboardInterrupt:
                    print "Stopped."
                    return
                except:
                    raise

            sql = """Select pkg_id from solr_pkg_ids where action = 'outsync'; """
            q = model.Session.execute(sql)
            pkg_ids = set()
            for row in q:
                pkg_ids.add(row["pkg_id"])
            for pkg_id in pkg_ids:
                try:
                    print str(datetime.datetime.now()) + " Rebuilding Id: " + pkg_id
                    search.rebuild(pkg_id)
                except ckan.logic.NotFound:
                    print "Error: Not Found."
                except KeyboardInterrupt:
                    print "Stopped."
                    return
                except:
                    raise

            print str(datetime.datetime.now()) + " Starting Solr to Database Sync"

            sql = """Select pkg_id from solr_pkg_ids where action = 'notfound'; """
            q = model.Session.execute(sql)
            pkg_ids = set()
            for row in q:
                pkg_ids.add(row["pkg_id"])
            for pkg_id in pkg_ids:
                try:
                    search.clear(pkg_id)
                except ckan.logic.NotFound:
                    print "Error: Not Found."
                except KeyboardInterrupt:
                    print "Stopped."
                    return
                except:
                    raise

            print str(datetime.datetime.now()) + " All Sync Done."

Ejemplo n.º 47

0

Mostrar archivo

Archivo: commands.py Proyecto: jjjohnst/ckanext-geodatagov

    def harvest_object_relink(self):
        print "%s: Fix packages which lost harvest objects." % datetime.datetime.now()

        pkgs_problematic = set()
        # find packages that has no current harvest object
        sql = """
            SELECT DISTINCT package_id
            FROM harvest_object
            WHERE
                state = 'COMPLETE'
            AND
                package_id NOT IN (
                    SELECT DISTINCT package_id
                    FROM harvest_object
                    WHERE current='t'
                )
        """
        results = model.Session.execute(sql)
        for row in results:
            pkgs_problematic.add(row["package_id"])
        total = len(pkgs_problematic)
        print "%s packages to be fixed." % total

        # set last complete harvest object to be current
        sql = """
            UPDATE harvest_object
            SET current = 't'
            WHERE
                package_id = :id
            AND
                state = 'COMPLETE'
            AND
                import_finished = (
                    SELECT MAX(import_finished)
                    FROM harvest_object
                    WHERE
                        state = 'COMPLETE'
                    AND
                        package_id = :id
                )
            RETURNING 1
        """
        count = 0
        for id in pkgs_problematic:
            result = model.Session.execute(sql, {"id": id}).fetchall()
            model.Session.commit()
            count = count + 1
            if result:
                print "%s: %s/%s id %s fixed. Now pushing to solr... " % (datetime.datetime.now(), count, total, id),
                try:
                    search.rebuild(id)
                except KeyboardInterrupt:
                    print "Stopped."
                    return
                except:
                    raise
                print "Done."
            else:
                print "%s: %s/%s id %s has no valid harvest object. Need to inspect mannully. " % (
                    datetime.datetime.now(),
                    count,
                    total,
                    id,
                )

        if not pkgs_problematic:
            print "%s: All looks good. Nothing to do. " % datetime.datetime.now()

Ejemplo n.º 48

0

Mostrar archivo

    def harvest_object_relink(self, harvest_source_id=None):
        print '%s: Fix packages which lost harvest objects for harvest source %s.' % \
                (datetime.datetime.now(), harvest_source_id if harvest_source_id else 'all')

        pkgs_problematic = set()
        # find packages that has no current harvest object
        sql = '''
            WITH temp_ho AS (
              SELECT DISTINCT package_id
                      FROM harvest_object
                      WHERE current
            )
            SELECT DISTINCT harvest_object.package_id
            FROM harvest_object
            LEFT JOIN temp_ho
            ON harvest_object.package_id = temp_ho.package_id
            WHERE
                temp_ho.package_id IS NULL
            AND
                harvest_object.state = 'COMPLETE'
        '''
        if harvest_source_id:
            sql += '''
            AND
                harvest_object.harvest_source_id = :harvest_source_id
            '''
            results = model.Session.execute(sql,
                    {'harvest_source_id': harvest_source_id})
        else:
            results = model.Session.execute(sql)

        for row in results:
            pkgs_problematic.add(row['package_id'])
        total = len(pkgs_problematic)
        print '%s packages to be fixed.' % total

        # set last complete harvest object to be current
        sql = '''
            UPDATE harvest_object
            SET current = 't'
            WHERE
                package_id = :id
            AND
                state = 'COMPLETE'
            AND
                import_finished = (
                    SELECT MAX(import_finished)
                    FROM harvest_object
                    WHERE
                        state = 'COMPLETE'
                    AND
                        package_id = :id
                )
            RETURNING 1
        '''
        count = 0
        for id in pkgs_problematic:
            result = model.Session.execute(sql, {'id': id}).fetchall()
            model.Session.commit()
            count = count + 1
            if result:
                print '%s: %s/%s id %s fixed. Now pushing to solr... ' % (datetime.datetime.now(), count, total, id),
                try:
                    search.rebuild(id)
                except KeyboardInterrupt:
                    print "Stopped."
                    return
                except:
                    raise
                print 'Done.'
            else:
                print '%s: %s/%s id %s has no valid harvest object. Need to inspect mannully. ' % (
                    datetime.datetime.now(), count, total, id)

        if not pkgs_problematic:
            print '%s: All harvest objects look good. Nothing to do. ' % datetime.datetime.now()