Ejemplo n.º 1
0
 def get_es_db_missing_pids(cls,
                            doc_type,
                            with_deleted=False,
                            verbose=False):
     """Get PIDs missing between Elasticsearch and the database.

     :param doc_type: resource type from RECORDS_REST_ENDPOINTS config.
     :param with_deleted: also consider deleted records in the database.
     :param verbose: display a progress bar while scanning.
     :returns: tuple of (pids only in ES, pids only in DB,
         duplicated ES pids, ES index name).
     """
     endpoint = current_app.config.get('RECORDS_REST_ENDPOINTS').get(
         doc_type, {})
     index = endpoint.get('search_index')
     pids_es_double = []
     pids_es = []
     pids_db = []
     if index:
         pids_es = {}
         # Build the search once instead of constructing it twice
         # (the original built identical searches for items and length).
         search = RecordsSearch(index=index).source('pid')
         progress = progressbar(
             items=search.scan(),
             length=search.count(),
             verbose=verbose)
         for hit in progress:
             if pids_es.get(hit.pid):
                 pids_es_double.append(hit.pid)
             pids_es[hit.pid] = 1
         agent_class = get_agent_class(doc_type)
         pids_db = []
         progress = progressbar(
             items=agent_class.get_all_pids(with_deleted=with_deleted),
             length=agent_class.count(with_deleted=with_deleted),
             verbose=verbose)
         for pid in progress:
             if pids_es.get(pid):
                 pids_es.pop(pid)
             else:
                 pids_db.append(pid)
         # Keys remaining in the dict are pids present in ES but not in DB.
         pids_es = list(pids_es)
     return pids_es, pids_db, pids_es_double, index
Ejemplo n.º 2
0
def test_cli_full_reindex(app, db, es, capsys, es_acl_prepare, test_users):
    """Full reindex adds the ACL representation to a matching record."""
    # Create and index a record whose keywords match the ACL selector below.
    pid, record = create_record(
        {
            '$schema': RECORD_SCHEMA,
            'keywords': ['blah']
        },
        clz=SchemaEnforcingRecord)
    RecordIndexer().index(record)
    current_search_client.indices.flush()
    # Register an ACL (term match on 'keywords') with a single user actor.
    with db.session.begin_nested():
        acl = ElasticsearchACL(name='test',
                               schemas=[RECORD_SCHEMA],
                               priority=0,
                               operation='get',
                               originator=test_users.u1,
                               record_selector={'term': {
                                   'keywords': 'blah'
                               }})
        db.session.add(acl)
        u = UserActor(name='test',
                      acl=acl,
                      originator=test_users.u1,
                      users=[test_users.u1])
        db.session.add(u)

    # now the record is not indexed and ACL is not in the helper index, check it ...
    retrieved = RecordsSearch(
        index=schema_to_index(RECORD_SCHEMA)[0]).get_record(
            record.id).execute().hits[0].to_dict()
    assert '_invenio_explicit_acls' not in retrieved

    # just a precaution test
    assert current_explicit_acls.enabled_schemas == {RECORD_SCHEMA}

    # and run the reindex - should reindex one record
    from invenio_explicit_acls.cli import full_reindex_impl
    full_reindex_impl(verbose=True, records=True, in_bulk=False)

    # Verify the CLI output reports exactly one collected/queued record.
    captured = capsys.readouterr()
    assert captured.out.strip() == """
Reindexing ACLs
Updating ACL representation for "test" (%s) on schemas ['records/record-v1.0.0.json']
Getting records for schema records/record-v1.0.0.json
   ... collected 1 records
Adding 1 records to indexing queue""".strip() % (acl.id)

    current_search_client.indices.flush()

    # After reindexing, the ACL entry must appear on the record document.
    retrieved = RecordsSearch(
        index=schema_to_index(RECORD_SCHEMA)[0]).get_record(
            record.id).execute().hits[0].to_dict()
    assert clear_timestamp(retrieved['_invenio_explicit_acls']) == [{
        'id':
        str(acl.id),
        'operation':
        'get',
        'timestamp':
        'cleared',
        'user': [1]
    }]
Ejemplo n.º 3
0
def get_elasticsearch_records_data_by_indexes(index_ids, start_date, end_date):
    """Get data from elastic search.

    Arguments:
        index_ids -- index tree identifier list

    Returns:
        dictionary -- elastic search data, or None if the index is missing

    """
    search = RecordsSearch().with_preference_param().params(version=False)
    search._index[0] = current_app.config['SEARCH_UI_SEARCH_INDEX']
    try:
        from weko_search_ui.query import item_search_factory

        search_instance, _ = item_search_factory(
            None, search, start_date, end_date, index_ids)
        return search_instance.execute().to_dict()
    except NotFoundError:
        current_app.logger.debug('Indexes do not exist yet!')
        return None
Ejemplo n.º 4
0
def cleanup_index_batch(hepsubmission_record_ids, index):
    """Delete index documents for superseded data submissions.

    For the given HEPSubmission ids, find datasubmission entries whose
    version is not the highest version present (i.e. a newer version with
    the same associated_recid exists) and delete them from the index.

    :param hepsubmission_record_ids: list of HEPSubmission ids to process.
    :param index: name of the Elasticsearch index to clean up.
    """
    # Guard against an empty batch: the log line below indexes [0] and [-1].
    if not hepsubmission_record_ids:
        return
    log.info('Cleaning up index for data records for hepsubmission IDs {0} to {1}'.format(hepsubmission_record_ids[0], hepsubmission_record_ids[-1]))
    # Find all datasubmission entries matching the given hepsubmission ids,
    # where the version is not the highest version present (i.e. there is not
    # a v2 record with the same associated_recid)
    d1 = aliased(DataSubmission)
    d2 = aliased(DataSubmission)
    # NOTE: `d2.id == None` is intentional — SQLAlchemy needs `== None`
    # to generate an IS NULL clause (do not replace with `is None`).
    qry = db.session.query(d1.associated_recid) \
        .join(HEPSubmission,
              and_(HEPSubmission.publication_recid == d1.publication_recid,
                   HEPSubmission.version == d1.version),
              isouter=True) \
        .join(d2,
              and_(d1.associated_recid == d2.associated_recid,
                   d1.version < d2.version),
              isouter=True) \
        .filter(HEPSubmission.id.in_(hepsubmission_record_ids), d2.id == None) \
        .order_by(d1.id)
    res = qry.all()

    ids = [x[0] for x in res]
    if ids:
        log.info(f'Deleting entries from index with ids {ids}')
        s = RecordsSearch(index=index).filter('terms', _id=ids)
        s.delete()
Ejemplo n.º 5
0
def get_elasticsearch_result_by_date(start_date, end_date):
    """Get data from elastic search.

    Arguments:
        start_date {string} -- start date
        end_date {string} -- end date

    Returns:
        dictionary -- elastic search data, or None if the index is missing

    """
    search = RecordsSearch().with_preference_param().params(version=False)
    search._index[0] = current_app.config['SEARCH_UI_SEARCH_INDEX']
    try:
        search_instance, _ = item_search_factory(
            None, search, start_date, end_date)
        return search_instance.execute().to_dict()
    except NotFoundError:
        current_app.logger.debug('Indexes do not exist yet!')
        return None
Ejemplo n.º 6
0
 def get_es_db_missing_pids(self, doc_type, with_deleted=False):
     """Get PIDs missing between Elasticsearch and the database.

     Only compares records created before ``self.time_delta`` minutes ago
     — presumably to skip records still in the indexing queue (confirm).

     :param doc_type: resource type from RECORDS_REST_ENDPOINTS config.
     :param with_deleted: also consider deleted records in the database.
     :returns: tuple of (pids only in ES, pids only in DB,
         duplicated ES pids, ES index name).
     """
     endpoint = current_app.config.get(
         'RECORDS_REST_ENDPOINTS'
     ).get(doc_type, {})
     index = endpoint.get('search_index')
     pids_es_double = []
     pids_es = []
     pids_db = []
     if index and doc_type not in self.has_no_db:
         date = datetime.utcnow() - timedelta(minutes=self.time_delta)
         pids_es = {}
         es_query = RecordsSearch(index=index) \
             .filter('range', _created={'lte': date})
         for hit in es_query.source('pid').scan():
             if pids_es.get(hit.pid):
                 pids_es_double.append(hit.pid)
             pids_es[hit.pid] = 1
         pids_db = []
         for pid in self.get_all_pids(
             doc_type,
             with_deleted=with_deleted,
             date=date
         ):
             if pids_es.get(pid):
                 pids_es.pop(pid)
             else:
                 pids_db.append(pid)
         # Keys remaining in the dict are pids present in ES but not in DB.
         pids_es = list(pids_es)
     return pids_es, pids_db, pids_es_double, index
Ejemplo n.º 7
0
def get_entry_uuid_by_unique_field(index, dict_unique_field_value):
    """Return the ES uuid of the entry matching the given unique field."""
    hits = (RecordsSearch(index=index)
            .query(Q('match', **dict_unique_field_value))
            .execute().hits.hits)

    if not hits:
        raise DepositDoesNotExist
    return hits[0]['_id']
Ejemplo n.º 8
0
Archivo: oai.py Proyecto: xbee/zenodo
 def _es_identifiers(self):
     """Return a set of the Community OAI Set recids from Elasticsearch."""
     oai_query = Q(
         'bool',
         filter=Q('exists', field='_oai.id'),
         must=Q('match', **{'_oai.sets': self.community.oaiset_spec}),
     )
     search = RecordsSearch(
         index=current_app.config['OAISERVER_RECORD_INDEX']
     ).source(['_oai.id']).query(oai_query)
     # The recid is the last ':'-separated component of the OAI id.
     return {int(hit._oai.id.rsplit(':', 1)[-1]) for hit in search.scan()}
Ejemplo n.º 9
0
 def _es_identifiers(self):
     """Return a set of the Community OAI Set recids from Elasticsearch."""
     oai_query = Q(
         'bool',
         filter=Q('exists', field='_oai.id'),
         must=Q('match', **{'_oai.sets': self.community.oaiset_spec}),
     )
     index = current_app.config['OAISERVER_RECORD_INDEX']
     search = RecordsSearch(index=index).fields(['recid']).query(oai_query)
     return {int(hit.meta.fields['recid'][0]) for hit in search.scan()}
def get_entry_uuid_by_unique_field(index, dict_unique_field_value):
    """Return record by uuid."""
    search = RecordsSearch(index=index).query(
        Q('match', **dict_unique_field_value))
    matches = search.execute().hits.hits

    if not matches:
        raise DepositDoesNotExist
    return matches[0]['_id']
Ejemplo n.º 11
0
def get_feedback_mail_list():
    """Get tree items."""
    search = RecordsSearch().with_preference_param().params(version=False)
    search._index[0] = current_app.config['SEARCH_UI_SEARCH_INDEX']
    response = feedback_email_search_factory(None, search).execute()
    aggregations = response.to_dict().get('aggregations')
    return (aggregations
            .get('feedback_mail_list')
            .get('email_list')
            .get('buckets'))
Ejemplo n.º 12
0
def get_items_by_index_tree(index_tree_id):
    """Get tree items."""
    search = RecordsSearch().with_preference_param().params(version=False)
    search._index[0] = current_app.config['SEARCH_UI_SEARCH_INDEX']
    search_instance = item_path_search_factory(
        search=search, index_id=index_tree_id)
    response = search_instance.execute().to_dict()
    return response.get('hits').get('hits')
Ejemplo n.º 13
0
def index():
    """Home Page.

    Renders the landing page with institution and document-type facets
    aggregated from the '_collections' field.
    """
    # [0:0]: we only need aggregations, not hits.
    search = RecordsSearch(index='records')[0:0]
    # Raw strings: '\.' inside a normal string is an invalid escape
    # sequence (DeprecationWarning since Python 3.6).
    search.aggs.bucket('institutions', 'terms', field='_collections',
                       size=1000, include=r'RERO_DOC\.NAVSITE\.[A-Z]+')
    # Escape the first dot as well, consistent with the bucket above
    # (an unescaped '.' matches any character in the regex).
    search.aggs.bucket('doc_type', 'terms', field='_collections',
                       size=1000, include=r'RERO_DOC\.NAVDOCTYPE\.[A-Z]+')
    results = search.execute()
    institutions = results.aggregations.institutions.to_dict().get('buckets')
    doc_types = results.aggregations.doc_type.to_dict().get('buckets')
    return render_template('rerodoc_app/index.html', institutions=institutions,
                           doc_types=doc_types, n_documents=results.hits.total)
Ejemplo n.º 14
0
def collection_records(collection=None):
    """Return serialized records belonging to the given collection tree.

    :param collection: collection name to drill down from.
    :returns: serialized search hits for all queries in the tree.
    """
    collections = Collection.query.filter(Collection.name.in_(
        [collection])).one().drilldown_tree()

    query_array = get_collections_queries(collections)
    query_string = ' or '.join(query_array)

    search = RecordsSearch().params(version=True).query(
        QueryString(query=query_string))
    response = search.execute().to_dict()
    # Removed the dead `records = {'records': recs}` assignment; the
    # function has always returned the serialized hits directly.
    return json_v1.serialize_search(cap_record_fetcher, response)
Ejemplo n.º 15
0
 def __init__(self,
              read_permission_factory=None,
              create_permission_factory=None,
              workflow_object_serializers=None,
              search_index=None,
              search_type=None,
              record_loaders=None,
              search_serializers=None,
              default_media_type=None,
              max_result_window=None,
              search_factory=None,
              item_links_factory=None,
              record_class=None,
              **kwargs):
     """Constructor."""
     # GET uses the search serializers, POST the workflow-object ones;
     # both share the same default media type.
     method_serializers = {
         'GET': search_serializers,
         'POST': workflow_object_serializers,
     }
     default_method_media_type = {
         'GET': default_media_type,
         'POST': default_media_type,
     }
     super(WorkflowsListResource, self).__init__(
         method_serializers=method_serializers,
         default_method_media_type=default_method_media_type,
         default_media_type=default_media_type,
         **kwargs)
     self.searcher = RecordsSearch(
         index=search_index,
         doc_type=search_type,
     ).params(version=True)
     self.max_result_window = max_result_window
     self.search_factory = partial(search_factory, self)
Ejemplo n.º 16
0
    def get_es_count(cls, index, date=None):
        """Get elasticsearch count.

        Get count of items in elasticsearch for the given index,
        optionally restricted to items created on or before ``date``.

        :param index: index.
        :param date: optional upper bound on the '_created' field.
        :return: items count, or an error string if the index is missing.
        """
        try:
            search = RecordsSearch(index=index).query()
            if date:
                search = search.filter('range', _created={'lte': date})
            return search.count()
        except NotFoundError:
            return f'No >>{index}<< in ES'
Ejemplo n.º 17
0
    def missing(cls, doc_type):
        """Get missing pids.

        Get missing pids in database and elasticsearch and find duplicate
        pids in elasticsearch.

        :param doc_type: doc type to get missing pids.
        :return: dictionary with all missing pids.
        """
        from collections import Counter

        endpoint = current_app.config.get('RECORDS_REST_ENDPOINTS').get(
            doc_type, {})
        index = endpoint.get('search_index', '')
        if not index:
            return {
                'ERROR':
                'Document type not found: {doc_type}'.format(doc_type=doc_type)
            }
        pids_es = [
            v.pid
            for v in RecordsSearch(index=index).source(['pid']).scan()
        ]
        pids_db = [
            v.pid_value for v in PersistentIdentifier.query.filter(
                PersistentIdentifier.pid_type == doc_type).all()
        ]
        # Counter makes duplicate detection O(n) instead of the O(n^2)
        # list.count() scan, while preserving each duplicate occurrence.
        counts = Counter(pids_es)
        return {
            'DB': list(set(pids_es).difference(pids_db)),
            'ES': list(set(pids_db).difference(pids_es)),
            'ES duplicate': [x for x in pids_es if counts[x] > 1]
        }
Ejemplo n.º 18
0
 def get_es_db_missing_pids(cls, doc_type, with_deleted=False):
     """Get PIDs missing between Elasticsearch and the database.

     :param doc_type: pid type to check.
     :param with_deleted: include non-REGISTERED pids from the database.
     :returns: tuple of (pids only in ES, pids only in DB,
         duplicated ES pids, ES index name).
     """
     endpoint = current_app.config.get('RECORDS_REST_ENDPOINTS').get(
         doc_type, {})
     index = endpoint.get('search_index')
     pids_es_double = []
     pids_es = []
     pids_db = []
     if index:
         pids_es = {}
         for hit in RecordsSearch(index=index).source('pid').scan():
             if pids_es.get(hit.pid):
                 pids_es_double.append(hit.pid)
             pids_es[hit.pid] = 1
         query = PersistentIdentifier.query.filter_by(pid_type=doc_type)
         if not with_deleted:
             query = query.filter_by(status=PIDStatus.REGISTERED)
         pids_db = []
         for identifier in query:
             if pids_es.get(identifier.pid_value):
                 pids_es.pop(identifier.pid_value)
             else:
                 pids_db.append(identifier.pid_value)
         # Keys remaining in the dict are pids present in ES but not in DB.
         pids_es = list(pids_es)
     return pids_es, pids_db, pids_es_double, index
def collection_records(collection=None):
    """Return serialized records belonging to the given collection tree.

    :param collection: collection name to drill down from.
    :returns: serialized search hits for all queries in the tree.
    """
    collections = Collection.query.filter(
            Collection.name.in_([collection])).one().drilldown_tree()

    query_array = get_collections_queries(collections)
    query_string = ' or '.join(query_array)

    search = RecordsSearch().params(version=True).query(
        QueryString(query=query_string))
    response = search.execute().to_dict()
    # Removed the dead `records = {'records': recs}` assignment; the
    # function has always returned the serialized hits directly.
    return json_v1.serialize_search(cap_record_fetcher, response)
Ejemplo n.º 20
0
def pending_in_holding_pen(obj, eng):
    """Check if a record exists in HP by looking in given KB.

    :param obj: workflow object being processed.
    :param eng: workflow engine (unused here).
    :returns: True if non-COMPLETED matching objects exist in Holding Pen.
    """
    from elasticsearch_dsl import Q
    from invenio_db import db
    from invenio_search import RecordsSearch
    from invenio_workflows.models import WorkflowObjectModel, ObjectStatus

    # Versioned search over the configured Holding Pen index/type.
    config = current_app.config['WORKFLOWS_UI_REST_ENDPOINT']
    index = config.get('search_index')
    doc_type = config.get('search_type')
    searcher = RecordsSearch(
        index=index, doc_type=doc_type
    ).params(version=True)

    # Build 'field:"value"' query terms from the configured mapping.
    identifiers = []
    for field, lookup in six.iteritems(
            current_app.config.get("HOLDING_PEN_MATCH_MAPPING", {})):
        # Add quotes around to make the search exact
        identifiers += ['{0}:"{1}"'.format(field, i)
                        for i in get_value(obj.data, lookup, [])]
    # Search for any existing record in Holding Pen, exclude self
    if identifiers:
        search = searcher.query(Q('query_string',
                                query=" OR ".join(identifiers),
                                allow_leading_wildcard=False))
        search_result = search.execute()
        id_list = [int(hit.id) for hit in search_result.hits]
        matches_excluding_self = set(id_list) - set([obj.id])
        if matches_excluding_self:
            obj.extra_data["holdingpen_ids"] = list(matches_excluding_self)
            # Only matches still being worked on count as pending.
            pending_records = db.session.query(
                WorkflowObjectModel
            ).with_entities(WorkflowObjectModel.id).filter(
                WorkflowObjectModel.status != ObjectStatus.COMPLETED,
                WorkflowObjectModel.id.in_(matches_excluding_self)
            ).all()
            if pending_records:
                pending_ids = [o[0] for o in pending_records]
                obj.extra_data['pending_holdingpen_ids'] = pending_ids
                obj.log.info(
                    "Pending records already found in Holding Pen ({0})"
                    .format(
                        pending_ids
                    )
                )
                return True
    return False
Ejemplo n.º 21
0
def pending_in_holding_pen(obj, eng):
    """Check if a record exists in HP by looking in given KB."""
    from elasticsearch_dsl import Q
    from invenio_db import db
    from invenio_search import RecordsSearch
    from invenio_workflows.models import WorkflowObjectModel, ObjectStatus

    config = current_app.config['WORKFLOWS_UI_REST_ENDPOINT']
    searcher = RecordsSearch(
        index=config.get('search_index'),
        doc_type=config.get('search_type'),
    ).params(version=True)

    mapping = current_app.config.get("HOLDING_PEN_MATCH_MAPPING", {})
    # Quote each value so the search matches exactly.
    identifiers = []
    for field, lookup in six.iteritems(mapping):
        identifiers.extend(
            '{0}:"{1}"'.format(field, value)
            for value in get_value(obj.data, lookup, [])
        )
    if not identifiers:
        return False

    # Search for any existing record in Holding Pen, excluding self.
    result = searcher.query(
        Q('query_string',
          query=" OR ".join(identifiers),
          allow_leading_wildcard=False)
    ).execute()
    matches = {int(hit.id) for hit in result.hits} - {obj.id}
    if not matches:
        return False

    obj.extra_data["holdingpen_ids"] = list(matches)
    # Only matches that have not completed yet count as pending.
    pending = db.session.query(
        WorkflowObjectModel
    ).with_entities(WorkflowObjectModel.id).filter(
        WorkflowObjectModel.status != ObjectStatus.COMPLETED,
        WorkflowObjectModel.id.in_(matches)
    ).all()
    if not pending:
        return False

    pending_ids = [row[0] for row in pending]
    obj.extra_data['pending_holdingpen_ids'] = pending_ids
    obj.log.info(
        "Pending records already found in Holding Pen ({0})".format(
            pending_ids
        )
    )
    return True
Ejemplo n.º 22
0
def get_record(record_id, index=None):
    """ Fetch a given record from ES.

    :param record_id: [int] ES record id
    :param index: [string] name of the index. If None a default is used

    :return: [dict] Fetched record, or None if absent/unreachable
    """
    try:
        search = RecordsSearch(using=es, index=index).source(includes="*")
        response = search.get_record(record_id).execute()
        if response.hits.total.value > 0:
            return response.hits[0].to_dict()
        return None
    except TransportError:
        return None
Ejemplo n.º 23
0
def get_deposit_by_cadi_id(cadi_id):
    """Return deposit with given cadi id.

    :params str cadi_id: CADI identifier

    :rtype `cap.modules.deposits.api:CAPDeposit`
    """
    search = RecordsSearch(index='deposits-records').query(
        Q('match', basic_info__cadi_id__keyword=cadi_id))
    hits = search.execute().hits.hits

    if not hits:
        raise DepositDoesNotExist
    return CAPDeposit.get_record(hits[0]['_id'])
def get_deposit_by_cadi_id(cadi_id):
    """Return deposit with given cadi id.

    :params str cadi_id: CADI identifier

    :rtype `cap.modules.deposits.api:CAPDeposit`
    """
    matches = (RecordsSearch(index='deposits-records')
               .query(Q('match', basic_info__cadi_id__keyword=cadi_id))
               .execute().hits.hits)

    if not matches:
        raise DepositDoesNotExist

    record_uuid = matches[0]['_id']
    return CAPDeposit.get_record(record_uuid)
Ejemplo n.º 25
0
    def get_record(cls, _id):
        """Retrieve the record by ID.

        Raise a database exception if the record does not exist.
        :param id_: record ID.
        :returns: The :class:`Record` instance.
        """
        # The elasticsearch get API cannot be used with an index alias,
        # so scan for the document matching this _id instead.
        search = RecordsSearch(index=cls.index_name).filter('term', _id=_id)
        hit = next(search.scan())
        return cls(hit.to_dict())
Ejemplo n.º 26
0
    def get_es_count(self, index):
        """Get elasticsearch count.

        Get count of items in elasticsearch for the given index.

        :param index: Elasticsearch index.
        :return: Items count.
        """
        try:
            count = RecordsSearch(index=index).query().count()
        except NotFoundError:
            raise Exception('No index found for "{type}"'.format(type=index))
        return count
Ejemplo n.º 27
0
    def get_es_count(cls, index):
        """Get elasticsearch count.

        Get count of items in elasticsearch for the given index.

        :param index: index.
        :return: items count, or an error string if the index is missing.
        """
        try:
            return RecordsSearch(index=index).query().count()
        except NotFoundError:
            return 'No >>{index}<< in ES'.format(index=index)
Ejemplo n.º 28
0
    def get_logs_by_record_pid(cls, pid):
        """Get all logs for a given record PID.

        :param str pid: record PID.
        :returns: List of logs.
        :rtype: list
        """
        # Only documents that have a 'loan' field and match the record pid.
        search = RecordsSearch(index=cls.index_name) \
            .filter('bool', must={'exists': {'field': 'loan'}}) \
            .filter('term', record__value=pid)
        return list(search.scan())
Ejemplo n.º 29
0
def assert_es_equals_db():
    """Assert that the relationships in ES match the GroupRelationships in DB.

    NOTE: This test takes the state of the DB as the reference for comparison.
    """
    # Wait for ES to be available
    current_search.flush_and_refresh('relationships')

    # Fetch all ES objects and all DB objects, normalize both sides,
    # then compare as sets (order does not matter).
    es_results = RecordsSearch(index='relationships').query().scan()
    db_results = GroupRelationship.query.all()

    es_normalized = {normalize_es_result(r) for r in es_results}
    db_normalized = {normalize_db_result(r) for r in db_results}
    assert es_normalized == db_normalized
Ejemplo n.º 30
0
    def missing_pids(self, doc_type, with_deleted=False):
        """Get ES and DB counts.

        :param doc_type: Resource type.
        :param with_deleted: Check also delete items in database.
        :returns: dict with keys 'es' (pids only in ES), 'db' (pids only
            in DB) and 'es_double' (duplicated ES pids).
        :raises Exception: if no search index is configured for doc_type.
        """
        index = current_app.config.get('RECORDS_REST_ENDPOINTS').get(
            doc_type, {}).get('search_index')

        if not index:
            raise Exception(
                'No "search_index" configured for resource "{type}"'.format(
                    type=doc_type))

        result = {'es': [], 'es_double': [], 'db': []}

        # Elastic search PIDs
        es_pids = {}
        for hit in RecordsSearch(index=index).source('pid').scan():
            if es_pids.get(hit.pid):
                result['es_double'].append(hit.pid)
            es_pids[hit.pid] = 1

        # Database PIDs
        query = PersistentIdentifier.query.filter_by(pid_type=doc_type)
        if not with_deleted:
            query = query.filter_by(status=PIDStatus.REGISTERED)

        for identifier in query:
            if es_pids.get(identifier.pid_value):
                es_pids.pop(identifier.pid_value)
            else:
                result['db'].append(identifier.pid_value)

        # Keys remaining in the dict are pids present in ES but not in DB.
        result['es'] = list(es_pids)

        return result
Ejemplo n.º 31
0
 def get_es_db_missing_pids(cls, doc_type, with_deleted=False):
     """Get PIDs missing between Elasticsearch and the database.

     :param doc_type: pid type used to resolve the record class and index.
     :param with_deleted: also consider deleted records in the database.
     :returns: tuple of (pids only in ES, pids only in DB,
         duplicated ES pids, ES index name).
     """
     endpoint = current_app.config.get('RECORDS_REST_ENDPOINTS').get(
         doc_type, {})
     index = endpoint.get('search_index')
     record_class = get_record_class_from_schema_or_pid_type(
         pid_type=doc_type)
     pids_es_double = []
     pids_es = []
     pids_db = []
     if index:
         pids_es = {}
         for hit in RecordsSearch(index=index).source('pid').scan():
             if pids_es.get(hit.pid):
                 pids_es_double.append(hit.pid)
             pids_es[hit.pid] = 1
         pids_db = []
         for pid in record_class.get_all_pids(with_deleted=with_deleted):
             if pids_es.get(pid):
                 pids_es.pop(pid)
             else:
                 pids_db.append(pid)
         # Keys remaining in the dict are pids present in ES but not in DB.
         pids_es = list(pids_es)
     return pids_es, pids_db, pids_es_double, index
Ejemplo n.º 32
0
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """ Perform a search query.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date)
    :param size: [int] max number of hits that should be returned
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: post filter applied after aggregations

    :return: [dict] dictionary with processed results and facets
    """
    # Avoid the shared mutable default argument (`filters=list()` is
    # evaluated once at definition time and shared between calls).
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
                       Q('nested', query=fuzzy_query, path='authors') | \
                       Q('has_child', type="child_datatable", query=fuzzy_query)

    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({mapped_sort_field: {
        "order": calculate_sort_order(sort_order, sort_field)}})
    search = add_default_aggregations(search, filters)

    if post_filter:
        search = search.post_filter(post_filter)

    search = search.source(includes=include, excludes=exclude)
    search = search[offset:offset + size]
    pub_result = search.execute().to_dict()

    # Fetch child data tables belonging to the publications found above.
    parent_filter = {
        "terms": {
                    "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
        }
    }

    data_search = RecordsSearch(using=es, index=index)
    data_search = data_search.query('has_parent',
                                    parent_type="parent_publication",
                                    query=parent_filter)
    if query:
        data_search = data_search.query(QueryString(query=query))

    data_search = data_search[0:size * 50]
    data_result = data_search.execute().to_dict()

    merged_results = merge_results(pub_result, data_result)
    return map_result(merged_results, filters)
Ejemplo n.º 33
0
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """ Perform a search query.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date)
    :param size: [int] max number of hits that should be returned
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: post filter applied after aggregations

    :return: [dict] dictionary with processed results and facets, or a
             dict with an 'error' key if the search fails
    """
    # Avoid the shared mutable default argument (`filters=list()` is
    # evaluated once at definition time and shared between calls).
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
                       Q('nested', query=fuzzy_query, path='authors') | \
                       Q('has_child', type="child_datatable", query=fuzzy_query)

    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({
        mapped_sort_field: {
            "order": calculate_sort_order(sort_order, sort_field)
        }
    })
    search = add_default_aggregations(search, filters)

    if post_filter:
        search = search.post_filter(post_filter)

    search = search.source(includes=include, excludes=exclude)
    search = search[offset:offset + size]

    try:
        pub_result = search.execute().to_dict()

        # Fetch child data tables belonging to the publications found above.
        parent_filter = {
            "terms": {
                "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
            }
        }

        data_search = RecordsSearch(using=es, index=index)
        data_search = data_search.query('has_parent',
                                        parent_type="parent_publication",
                                        query=parent_filter)
        if query:
            data_search = data_search.query(QueryString(query=query))

        data_search_size = size * ELASTICSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
        data_search = data_search[0:data_search_size]
        data_result = data_search.execute().to_dict()

        merged_results = merge_results(pub_result, data_result)
        return map_result(merged_results, filters)
    except TransportError as e:
        # For search phase execution exceptions we pass the reason as it's
        # likely to be user error (e.g. invalid search query)
        if e.error == 'search_phase_execution_exception' and e.info \
                and "error" in e.info and isinstance(e.info['error'], dict):
            reason = e.info['error']['root_cause'][0]['reason']
        # Otherwise we hide the details from the user
        else:
            log.error(f'An unexpected error occurred when searching: {e}')
            reason = f'An unexpected error occurred: {e.error}'
        return {'error': reason}
Ejemplo n.º 34
0
def create_blueprint(config, url_endpoint, context_processors):
    """Create UI blueprint for invenio-workflows-ui.

    :param config: mapping providing ``search_index``, ``search_type`` and
        optionally ``search_factory`` (a callable or an import path).
    :param url_endpoint: URL prefix the blueprint is mounted under.
    :param context_processors: iterable of context processors (callables or
        import paths) to register on the blueprint.
    :returns: the configured :class:`flask.Blueprint`.
    """
    blueprint = Blueprint(
        'invenio_workflows_ui',
        __name__,
        url_prefix=url_endpoint,
        template_folder='../templates',
        static_folder='../static',
    )

    # Named ``search_index`` (not ``index``) so the value is not shadowed by
    # the ``index`` view function defined below.
    search_index = config.get('search_index')
    doc_type = config.get('search_type')
    search_factory = config.get('search_factory', default_query_factory)
    search_factory = obj_or_import_string(search_factory)

    # ``version=True`` makes Elasticsearch return document versions with hits.
    searcher = RecordsSearch(index=search_index,
                             doc_type=doc_type).params(version=True)

    def _search(**kwargs):
        """Build a query via the configured factory and execute it."""
        search, dummy = search_factory(blueprint, searcher, **kwargs)
        return search.execute()

    @blueprint.route('/', methods=['GET', 'POST'])
    @blueprint.route('/index', methods=['GET', 'POST'])
    @login_required
    def index():
        """Display basic dashboard interface of Workflows UI."""
        q = '_workflow.status:"{0}"'
        error_state_total = _search(q=q.format(ObjectStatus.labels[
            ObjectStatus.ERROR.value])).hits.total
        halted_state_total = _search(q=q.format(ObjectStatus.labels[
            ObjectStatus.HALTED.value])).hits.total
        return render_template(
            current_app.config['WORKFLOWS_UI_INDEX_TEMPLATE'],
            error_state_total=error_state_total,
            halted_state_total=halted_state_total)

    @blueprint.route('/load', methods=['GET', 'POST'])
    @login_required
    def load():
        """Load objects for the table as JSON (rows + pagination)."""
        query_string = request.args.get("search") or ""  # empty to show all
        sort_key = request.args.get('sort_key', "_workflow.modified")
        page = request.args.get('page', 1, type=int)
        per_page = request.args.get('per_page', 25, type=int)

        # Slice the base search to the requested page, then let the factory
        # apply the query string and sorting.
        search = searcher[(page - 1) * per_page:page * per_page]
        search, dummy = search_factory(blueprint,
                                       search,
                                       sort=sort_key,
                                       q=query_string)
        search_result = search.execute()

        current_app.logger.debug("Total hits: {0}".format(
            search_result.hits.total))
        pagination = Pagination(page, per_page, search_result.hits.total)

        # Make sure requested page is within limits.
        if pagination.page > pagination.pages:
            pagination.page = pagination.pages

        # Pre-compute the page list for the template, flagging the active one.
        pages_iteration = []
        for iter_page in pagination.iter_pages():
            res = {"page": iter_page}
            if iter_page == pagination.page:
                res["active"] = True
            else:
                res["active"] = False
            pages_iteration.append(res)

        table_data = {
            'rows': [],
            'pagination': {
                "page": pagination.page,
                "pages": pagination.pages,
                "iter_pages": pages_iteration,
                "per_page": pagination.per_page,
                "total_count": pagination.total_count
            }
        }

        # Remember the current view settings so other routes (e.g.
        # ``list_objects``) can restore them from the session.
        # NOTE(review): ``details()`` reads ``workflows_ui_current_ids``,
        # which is never stored here — presumably set elsewhere; verify.
        session['workflows_ui_sort_key'] = sort_key
        session['workflows_ui_per_page'] = per_page
        session['workflows_ui_page'] = page
        session['workflows_ui_search'] = query_string

        table_data["rows"] = get_rows(search_result)
        table_data["rendered_rows"] = "".join(table_data["rows"])
        return jsonify(table_data)

    @blueprint.route('/list', methods=[
        'GET',
    ])
    @blueprint.route('/list/', methods=[
        'GET',
    ])
    @blueprint.route('/list/<search_value>', methods=[
        'GET',
    ])
    @login_required
    def list_objects(search_value=None):
        """Display main table interface of workflows UI."""
        # Default to the last search from the session, or to HALTED objects.
        search_value = search_value or session.get(
            "workflows_ui_search", '_workflow.status:"{0}"'.format(
                ObjectStatus.labels[ObjectStatus.HALTED.value]))
        sort_key = request.args.get('sort_key', "_workflow.modified")
        page = request.args.get('page', session.get('workflows_ui_page', 1))
        per_page = request.args.get('per_page',
                                    session.get('workflows_ui_per_page', 25))
        return render_template(
            current_app.config['WORKFLOWS_UI_LIST_TEMPLATE'],
            search=search_value,
            total=_search(q=search_value, sort=sort_key).hits.total,
            type_list=get_data_types(),
            name_list=get_workflow_names(),
            per_page=per_page)

    @blueprint.route('/<int:objectid>', methods=['GET', 'POST'])
    @blueprint.route('/details/<int:objectid>', methods=['GET', 'POST'])
    @login_required
    def details(objectid):
        """Display info about the object."""
        workflow_object = WorkflowObject.query.get_or_404(objectid)

        previous_object_id, next_object_id = get_previous_next_objects(
            session.get("workflows_ui_current_ids"), objectid)

        formatted_data = workflow_object.get_formatted_data()
        # Render any pending action widget (e.g. approval) for this object.
        action_name = workflow_object.get_action()
        if action_name:
            action = actions[action_name]
            rendered_actions = action().render(workflow_object)
        else:
            rendered_actions = {}

        return render_template(
            current_app.config['WORKFLOWS_UI_DETAILS_TEMPLATE'],
            workflow_object=workflow_object,
            rendered_actions=rendered_actions,
            data_preview=formatted_data,
            workflow_name=workflow_object.get_workflow_name() or "",
            previous_object_id=previous_object_id,
            next_object_id=next_object_id,
        )

    # Context processors may be given as callables or dotted import paths.
    for proc in context_processors:
        blueprint.context_processor(obj_or_import_string(proc))

    return blueprint
Ejemplo n.º 35
0
def test_publish(app, db, schemas, mappings, prepare_es):
    """Publishing a draft rewrites its draft refs and moves it to the published index."""
    with disable_test_authenticated():
        with db.session.begin_nested():
            # A first draft record, referenced by the draft published below.
            first_draft_uuid = uuid.uuid4()
            DraftRecord.create({
                'id': '1',
                'title': 'rec1'
            }, id_=first_draft_uuid)
            PersistentIdentifier.create(
                pid_type='drecid', pid_value='1', status=PIDStatus.REGISTERED,
                object_type='rec', object_uuid=first_draft_uuid
            )

            # An already-published record, also referenced by the draft.
            published_uuid = uuid.uuid4()
            PublishedRecord.create({
                'id': '3',
                'title': 'rec1a'
            }, id_=published_uuid)
            PersistentIdentifier.create(
                pid_type='recid', pid_value='3', status=PIDStatus.REGISTERED,
                object_type='rec', object_uuid=published_uuid
            )

            # The draft under test: points at both records above.
            second_draft_uuid = uuid.uuid4()
            second_draft = DraftRecord.create({
                'id': '2',
                'title': 'rec2',
                'ref': {'$ref': 'http://localhost/drafts/records/1'},
                'ref_pub': {'$ref': 'http://localhost/records/3'}
            }, id_=second_draft_uuid)
            second_draft_pid = PersistentIdentifier.create(
                pid_type='drecid', pid_value='2', status=PIDStatus.REGISTERED,
                object_type='rec', object_uuid=second_draft_uuid
            )
            RecordIndexer().index(second_draft)

        current_search_client.indices.refresh()
        current_search_client.indices.flush()

        # The draft must be searchable before publishing.
        hits_before = RecordsSearch(index='draft-records-record-v1.0.0').\
            get_record(second_draft_pid.object_uuid).execute()
        assert len(hits_before.hits) == 1

        current_drafts.publish(
            RecordContext(record=second_draft, record_pid=second_draft_pid))

        # Publishing registers a 'recid' PID with the same pid_value; the
        # stored record now carries published-style references.
        new_pid = PersistentIdentifier.get(
            pid_type='recid', pid_value=second_draft_pid.pid_value)
        published_record = PublishedRecord.get_record(new_pid.object_uuid)
        assert published_record.dumps() == {
            '$schema': 'https://localhost/schemas/records/record-v1.0.0.json',
            'id': '2',
            'ref': {'$ref': 'http://localhost/records/1'},
            'ref_pub': {'$ref': 'http://localhost/records/3'},
            'title': 'rec2'
        }

        current_search_client.indices.refresh()
        current_search_client.indices.flush()

        # The published copy is indexed in the published-records index.
        published_hits = RecordsSearch(index='records-record-v1.0.0').\
            get_record(new_pid.object_uuid).execute()
        assert len(published_hits.hits) == 1
        published_doc = published_hits.hits[0].to_dict()
        # Timestamps vary per run, so strip them before comparing.
        published_doc.pop('_created')
        published_doc.pop('_updated')
        assert published_doc == {
            '$schema': 'https://localhost/schemas/records/record-v1.0.0.json',
            'id': '2',
            'ref': {'published': '1'},
            'ref_pub': {'published': '3'},
            'title': 'rec2'}

        # The draft copy is gone from the draft index after publishing.
        remaining_drafts = RecordsSearch(index='draft-records-record-v1.0.0').\
            get_record(second_draft_pid.object_uuid).execute()
        assert len(remaining_drafts.hits) == 0