def submit_edit_set(spec):
    """Edit an existing set."""
    form = get_NewSetForm(request.form)
    if request.method == 'POST' and form.validate():
        old_set = Set.query.filter_by(spec=spec).first()
        query = Query(old_set.search_pattern)
        old_response = current_search_client.search(
            index="records",
            doc_type="record",
            body=query.body,
            fields="_id, oaiid"
        )
        query = Query(form.search_pattern.data)
        new_response = current_search_client.search(
            index="records",
            doc_type="record",
            body=query.body,
            fields="_id, oaiid"
        )
        # compare the record ids matched by the old and the new pattern
        old_recid = {hit['_id'] for hit in old_response['hits']['hits']}
        new_recid = {hit['_id'] for hit in new_response['hits']['hits']}
        recids_to_delete = old_recid - new_recid
        # TODO: mark records as deleted from set
        remove_recids_from_set(recids_to_delete)
        add_records_to_set(new_recid)
        flash('Set was changed')
        return redirect(url_for('.manage_sets'))
    return render_template('edit_set.html', edit_set_form=form, spec=spec)
def push_data_keywords(pub_ids=None, index=None):
    """Go through all the publications and their datatables
    and move data keywords from tables to their parent publications.
    """
    if not pub_ids:
        body = {"query": {"match_all": {}}}
        results = es.search(index=index, doc_type=CFG_PUB_TYPE,
                            body=body, _source=False)
        pub_ids = [i["_id"] for i in results["hits"]["hits"]]

    for pub_id in pub_ids:
        query_builder = QueryBuilder()
        query_builder.add_child_parent_relation(
            "publication",
            relation="parent",
            must=True,
            related_query={"match": {"recid": pub_id}}
        )
        tables = es.search(index=index, doc_type=CFG_DATA_TYPE,
                           body=query_builder.query,
                           _source_include="keywords")
        keywords = [d["_source"].get("keywords", None)
                    for d in tables["hits"]["hits"]]

        # Flatten the list, skipping tables without keywords
        keywords = [i for inner in keywords if inner for i in inner]

        # Aggregate
        agg_keywords = defaultdict(list)
        for kw in keywords:
            agg_keywords[kw["name"]].append(kw["value"])

        # Remove duplicates
        for k, v in agg_keywords.items():
            agg_keywords[k] = list(set(v))

        body = {"doc": {"data_keywords": dict(agg_keywords)}}

        try:
            es.update(index=index, doc_type=CFG_PUB_TYPE, id=pub_id, body=body)
        except Exception as e:
            log.error(str(e))
def match(record, config=None):
    """Given a record, yield the records in INSPIRE most similar to it.

    This method can be used to detect if a record that we are ingesting as a
    submission or as a harvest is already present in the system, or to find
    out which record a reference should be pointing to.
    """
    if config is None:
        current_app.logger.debug(
            'No configuration provided. Falling back to the default configuration.')
        config = current_app.config['MATCHER_DEFAULT_CONFIGURATION']

    try:
        algorithm, doc_type, index = config['algorithm'], config['doc_type'], config['index']
    except KeyError as e:
        raise KeyError('Malformed configuration: %s.' % repr(e))

    source = config.get('source', [])

    for i, step in enumerate(algorithm):
        try:
            queries = step['queries']
        except KeyError:
            raise KeyError('Malformed algorithm: step %d has no queries.' % i)

        validator = _get_validator(step.get('validator'))

        for j, query in enumerate(queries):
            try:
                body = compile(query, record)
            except Exception as e:
                raise ValueError(
                    'Malformed query. Query %d of step %d does not compile: %s.'
                    % (j, i, repr(e)))

            if not body:
                continue

            if source:
                result = es.search(index=index, doc_type=doc_type,
                                   body=body, _source=source)
            else:
                result = es.search(index=index, doc_type=doc_type, body=body)

            for hit in result['hits']['hits']:
                if validator(record, hit):
                    yield hit
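# A minimal sketch of the configuration shape that `match` consumes, inferred
# from the parsing above. The index, doc_type, validator path and query entry
# keys are illustrative assumptions, not values mandated by the source:
EXAMPLE_MATCHER_CONFIG = {
    'algorithm': [
        {
            'queries': [
                # each entry is compiled against the record by `compile`;
                # the exact schema of a query is defined by the matcher itself
                {'type': 'exact', 'match': 'arxiv_eprints.value',
                 'search': 'arxiv_eprints.value.raw'},
            ],
            'validator': 'inspire_matcher.validators:default_validator',
        },
    ],
    'doc_type': 'hep',
    'index': 'records-hep',
}
# matches = list(match(record, EXAMPLE_MATCHER_CONFIG))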
def test_index_institutions_record(base_app, es_clear, db, datadir, create_record):
    data = json.loads((datadir / "902725.json").read_text())
    record = create_record("ins", data=data)

    expected_count = 1
    expected_metadata = deepcopy(record)
    expected_metadata["affiliation_suggest"] = {
        "input": [
            "CERN, Geneva",
            "CERN",
            "European Organization for Nuclear Research",
            "CERN",
            "Centre Européen de Recherches Nucléaires",
            "01631",
            "1211",
        ]
    }
    expected_metadata["_created"] = utils.isoformat(record.created)
    expected_metadata["_updated"] = utils.isoformat(record.updated)

    response = es.search("records-institutions")

    assert response["hits"]["total"] == expected_count
    assert response["hits"]["hits"][0]["_source"] == expected_metadata
def test_index_record_manually(app, celery_app_with_context,
                               celery_session_worker, retry_until_matched):
    data = faker.record("lit")
    rec = LiteratureRecord.create(data)
    models_committed.disconnect(index_after_commit)
    db.session.commit()
    models_committed.connect(index_after_commit)

    es.indices.refresh("records-hep")
    result = es.search("records-hep")
    assert result["hits"]["total"] == 0

    rec.index()
    steps = [
        {"step": es.indices.refresh, "args": ["records-hep"]},
        {
            "step": es.search,
            "args": ["records-hep"],
            "expected_result": {
                "expected_key": "hits.total",
                "expected_result": 1,
            },
        },
    ]
    retry_until_matched(steps)
def get_record_acls(clz, record: Record) -> Iterable['ACL']:
    """
    Yields the ACL objects applicable for the given record.

    :param record: Invenio record
    :return:
    """
    # run percolate query on the record's index
    query = clz._get_percolate_query(record)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('get_record_acls: query %s',
                     json.dumps(query, indent=4, ensure_ascii=False))
    index, _doc_type = current_record_to_index(record)
    try:
        for r in current_search_client.search(
                index=clz.get_acl_index_name(index),
                **add_doc_type(current_app.config['INVENIO_EXPLICIT_ACLS_DOCTYPE_NAME']),
                body=query
        )['hits']['hits']:
            yield clz.query.get(r['_id'])
    except elasticsearch.TransportError as e:
        logger.error(
            'Error running ACL query on index %s, doctype %s, query %s',
            clz.get_acl_index_name(index),
            current_app.config['INVENIO_EXPLICIT_ACLS_DOCTYPE_NAME'],
            query)
        if e.status_code == 404:
            raise RuntimeError('Explicit ACLs were not prepared for the given schema. '
                               'Please run invenio explicit-acls prepare %s'
                               % record.get('$schema', ''))
        else:  # pragma: no cover
            raise
def test_index_literature_record(es_clear, db, datadir, create_record):
    author_data = json.loads((datadir / "1032336.json").read_text())
    author = create_record("aut", data=author_data)

    data = json.loads((datadir / "1630825.json").read_text())
    record = create_record("lit", data=data)

    expected_count = 1
    expected_metadata = json.loads((datadir / "es_1630825.json").read_text())
    expected_metadata_ui_display = json.loads(
        expected_metadata.pop("_ui_display"))
    expected_facet_author_name = expected_metadata.pop("facet_author_name")
    expected_metadata.pop("authors")

    response = es.search("records-hep")

    result = response["hits"]["hits"][0]["_source"]
    result_ui_display = json.loads(result.pop("_ui_display"))
    result_authors = result.pop("authors")
    result_facet_author_name = result.pop("facet_author_name")
    del result["_created"]
    del result["_updated"]

    assert response["hits"]["total"] == expected_count
    assert result == expected_metadata
    assert result_ui_display == expected_metadata_ui_display
    assert len(record.get("authors")) == len(result_facet_author_name)
    assert sorted(result_facet_author_name) == sorted(
        expected_facet_author_name)
def load_records(app, filename, schema, tries=5):
    """Try to index records."""
    indexer = RecordIndexer()
    records = []
    with app.app_context():
        with mock.patch('invenio_records.api.Record.validate',
                        return_value=None):
            data_filename = pkg_resources.resource_filename(
                'invenio_records', filename)
            records_data = load(data_filename)
            with db.session.begin_nested():
                for item in records_data:
                    record_id = uuid.uuid4()
                    item_dict = dict(marc21.do(item))
                    item_dict['$schema'] = schema
                    recid_minter(record_id, item_dict)
                    oaiid_minter(record_id, item_dict)
                    record = Record.create(item_dict, id_=record_id)
                    indexer.index(record)
                    records.append(record.id)
            db.session.commit()

        # Wait for indexer to finish
        for i in range(tries):
            response = current_search_client.search()
            if response['hits']['total'] >= len(records):
                break
            current_search.flush_and_refresh('_all')
    return records
def get_n_latest_records(n_latest, field="last_updated", index=None):
    """Gets the latest N records from the index."""
    query = {
        "size": n_latest,
        "query": QueryBuilder.generate_query_string(),
        "sort": [{field: {"order": "desc"}}]
    }
    query_result = es.search(index=index, doc_type=CFG_PUB_TYPE, body=query)
    return query_result["hits"]["hits"]
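# Usage sketch for the helper above (the index name is an assumption for
# illustration only):
#
#     latest = get_n_latest_records(10, index='hepdata-main')
#     ids = [hit['_id'] for hit in latest]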
def api():
    """Search API for search UI demo.

    .. note::

        WARNING! This search API is for demo purposes only.
    """
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 1, type=int)
    query = Query(request.values.get('q', ''))[(page-1)*size:page*size]
    # dummy facets
    query.body["aggs"] = {
        "by_body": {
            "terms": {"field": "summary.summary"}
        },
        "by_title": {
            "terms": {"field": "title_statement.title"}
        }
    }
    response = current_search_client.search(
        index=request.values.get('index', 'records'),
        doc_type=request.values.get('type'),
        body=query.body,
    )
    return jsonify(**response)
def load_records(app, filename, schema, tries=5):
    """Try to index records."""
    indexer = RecordIndexer()
    records = []
    with app.app_context():
        with mock.patch("invenio_records.api.Record.validate",
                        return_value=None):
            data_filename = pkg_resources.resource_filename(
                "invenio_records", filename)
            records_data = load(data_filename)
            with db.session.begin_nested():
                for item in records_data:
                    record_id = uuid.uuid4()
                    item_dict = dict(marc21.do(item))
                    item_dict["$schema"] = schema
                    recid_minter(record_id, item_dict)
                    oaiid_minter(record_id, item_dict)
                    record = current_oaiserver.record_cls.create(
                        item_dict, id_=record_id)
                    indexer.index(record)
                    records.append(record.id)
            db.session.commit()

        # Wait for indexer to finish
        for i in range(tries):
            response = current_search_client.search()
            if response["hits"]["total"] >= len(records):
                break
            current_search.flush_and_refresh("_all")
    return records
def submit_set():
    """Insert a new set."""
    form = get_NewSetForm(request.form)
    if request.method == 'POST' and form.validate():
        new_set = Set(spec=form.spec.data,
                      name=form.name.data,
                      description=form.description.data,
                      search_pattern=form.search_pattern.data,
                      # collection=form.collection.data,
                      parent=form.parent.data)
        db.session.add(new_set)

        # this should be moved to the UPDATER (celery task) and it should
        # always take care of adding records to sets.
        ##########
        query = Query(form.query.data)
        response = current_search_client.search(
            index="records",  # make configurable PER SET
            doc_type="record",  # make configurable PER SET
            body=query.body,
            fields="_id, oaiid"  # path to oaiid as a configurable
        )
        ids = [(a['_id'], a['oaiid']) for a in response['hits']['hits']]
        add_records_to_set(ids)
        #########

        db.session.commit()
        flash('New set was added.')
        return redirect(url_for('.manage_sets'))
    return render_template('make_set.html', new_set_form=form)
def delete_halted_workflows_for_doi(doi):
    """Delete all workflows that contain the given doi and are in HALTED state.

    The workflow index is only updated when a WorkflowObjectModel instance is
    saved. When a workflow is halted, the connected object's status won't be
    changed, hence the index won't be updated. Because of all this, we cannot
    filter for HALTED state in ElasticSearch.
    """
    current_search_client.indices.refresh("scoap3-workflows-harvesting")
    search_result = current_search_client.search(
        index='scoap3-workflows-harvesting',
        q='metadata.dois.value:"%s"' % doi)

    workflow_ids = {
        x['_source']['_workflow']['id_workflow']
        for x in search_result['hits']['hits']
    }

    for wid in workflow_ids:
        if wid:
            w = Workflow.query.get(wid)
            if w and w.status == WorkflowStatus.HALTED:
                db.session.delete(w)

    db.session.commit()
def search_export(es_dict):
    """Exports basic record data for all filtered records.

    :param es_dict: defines the ElasticSearch data in order to filter the records.
    """
    fields = current_app.config.get('SEARCH_EXPORT_FIELDS')
    source_fields = [field for _, field, _ in fields]
    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')

    result_data = []
    index = 0
    total_hits = None

    while total_hits is None or index < total_hits:
        # query ElasticSearch for result
        search_results = current_search_client.search(
            body=es_dict,
            index=search_index,
            _source=source_fields,
            size=size,
            from_=index)

        total_hits = search_results['hits']['total']['value']
        index += len(search_results['hits']['hits'])

        # extract and add data to result list
        for hit in search_results['hits']['hits']:
            record = hit['_source']
            result_data.append(
                [get_value(record, key, '') for _, _, key in fields])

    return {'header': [name for name, _, _ in fields], 'data': result_data}
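# The export above unpacks each SEARCH_EXPORT_FIELDS entry three ways
# (column header, ES source field, path passed to get_value), so the setting
# is presumably a list of 3-tuples. A hypothetical configuration:
SEARCH_EXPORT_FIELDS_EXAMPLE = [
    # (header, _source field to fetch, value path for get_value)
    ('Control number', 'control_number', 'control_number'),
    ('First title', 'titles', 'titles[0].title'),
]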
def load_records(app, filename, schema, tries=5):
    """Try to index records."""
    indexer = RecordIndexer()
    records = []
    with app.app_context():
        with mock.patch('invenio_records.api.Record.validate',
                        return_value=None):
            data_filename = pkg_resources.resource_filename(
                'invenio_records', filename)
            records_data = load(data_filename)
            with db.session.begin_nested():
                for item in records_data:
                    record_id = uuid.uuid4()
                    item_dict = dict(marc21.do(item))
                    item_dict['$schema'] = schema
                    recid_minter(record_id, item_dict)
                    oaiid_minter(record_id, item_dict)
                    record = Record.create(item_dict, id_=record_id)
                    indexer.index(record)
                    records.append(record.id)
            db.session.commit()

        # Wait for indexer to finish
        for i in range(tries):
            response = current_search_client.search()
            if response['hits']['total'] >= len(records):
                break
            sleep(5)
    return records
def already_pending_in_holdingpen_validator(property_name, value):
    """Check if there's a submission in the holdingpen with the same
    arXiv ID or DOI.
    """
    if property_name == 'arXiv ID':
        query_should = {
            'metadata.arxiv_eprints.value.raw': value,
        }
    elif property_name == 'DOI':
        query_should = {
            'metadata.dois.value.raw': value,
        }

    query = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            "metadata.acquisition_source.source": "submitter"
                        },
                    },
                    {
                        "bool": {
                            "must_not": {
                                "term": {
                                    "_workflow.status": "COMPLETED"
                                }
                            }
                        }
                    }
                ],
                "must": [
                    {
                        "term": query_should,
                    }
                ]
            }
        },
        "_source": {
            "includes": ["_id"]
        }
    }

    hits = es.search(
        index='holdingpen-hep',
        doc_type='hep',
        body=query,
    )['hits']['hits']
    matches = dedupe_list(hits)
    holdingpen_ids = [int(el['_id']) for el in matches]

    if holdingpen_ids:
        raise ValidationError(
            'There already exists a pending suggestion with the same %s '
            '"%s"; it will be attended to shortly.' % (property_name, value)
        )
def validate_record_selector(self, form, field):
    """Check that the record selector is valid and can be used to perform
    a query against the elasticsearch index."""
    schemas = form.schemas.data
    record_selector = field.data
    if not record_selector:
        raise StopValidation(
            'Record selector must not be empty. If you want to match all '
            'resources, use {"match_all": {}}')
    try:
        for index in schemas:
            current_search_client.search(
                index=index,
                size=0,
                body={'query': record_selector}
            )
    except Exception as e:
        raise StopValidation(str(e))
def submit_set():
    """Insert a new set."""
    form = get_NewSetForm(request.form)
    if request.method == 'POST' and form.validate():
        new_set = OAISet(spec=form.spec.data,
                         name=form.name.data,
                         description=form.description.data,
                         search_pattern=form.search_pattern.data,
                         parent=form.parent.data)
        db.session.add(new_set)

        # this should be moved to the UPDATER (celery task) and it should
        # always take care of adding records to sets.
        ##########
        query = Query(form.query.data)
        response = current_search_client.search(
            index='records',  # make configurable PER SET
            doc_type='record',  # make configurable PER SET
            body=query.body,
            fields='_id, oaiid'  # path to oaiid as a configurable
        )
        ids = [(a['_id'], a['oaiid']) for a in response['hits']['hits']]
        add_records_to_set(ids)
        #########

        db.session.commit()
        flash(_('New set %(spec)s was added.', spec=new_set.spec))
        return redirect(url_for('.manage_sets'))
    return render_template('make_set.html', new_set_form=form)
def _percolate_query(index, doc_type, percolator_doc_type, document):
    """Get results for a percolate query."""
    if ES_VERSION[0] in (2, 5):
        results = current_search_client.percolate(
            index=index,
            doc_type=doc_type,
            allow_no_indices=True,
            ignore_unavailable=True,
            body={'doc': document}
        )
        return results['matches']
    elif ES_VERSION[0] == 6:
        results = current_search_client.search(
            index=index,
            doc_type=percolator_doc_type,
            allow_no_indices=True,
            ignore_unavailable=True,
            body={
                'query': {
                    'percolate': {
                        'field': 'query',
                        'document_type': percolator_doc_type,
                        'document': document,
                    }
                }
            }
        )
        return results['hits']['hits']
def _percolate_query(index, doc_type, percolator_doc_type, document):
    """Get results for a percolate query."""
    index = _build_percolator_index_name(index)
    if ES_VERSION[0] in (2, 5):
        results = current_search_client.percolate(
            index=index,
            doc_type=doc_type,
            allow_no_indices=True,
            ignore_unavailable=True,
            body={'doc': document}
        )
        return results['matches']
    elif ES_VERSION[0] in (6, 7):
        es_client_params = dict(
            index=index,
            doc_type=percolator_doc_type,
            allow_no_indices=True,
            ignore_unavailable=True,
            body={
                'query': {
                    'percolate': {
                        'field': 'query',
                        'document_type': percolator_doc_type,
                        'document': document,
                    }
                }
            }
        )
        if ES_VERSION[0] == 7:
            es_client_params.pop('doc_type')
        results = current_search_client.search(**es_client_params)
        return results['hits']['hits']
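# Usage sketch for the version-aware helper above (all values illustrative).
# Note the ES 7 branch: mapping types were removed in Elasticsearch 7, which
# is why `doc_type` is popped from the request parameters before searching.
#
#     matches = _percolate_query(
#         index='records-hep',
#         doc_type='record',
#         percolator_doc_type='percolators',
#         document={'title': 'A sample record'},
#     )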
def get_records_matching_field(field, id, index=None, doc_type=None, source=None):
    """Checks if a record with a given ID exists in the index."""
    query = {
        "size": 9999,
        'query': {
            "bool": {
                "must": [
                    {
                        "match": {
                            field: id
                        }
                    }
                ]
            }
        }
    }

    if doc_type:
        query["query"]["bool"]["must"].append({
            "match": {
                "doc_type": doc_type
            }
        })

    if source:
        query["_source"] = source

    return es.search(index=index, body=query)
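# For illustration, the body the helper above builds for
# get_records_matching_field('recid', 12345, doc_type='publication')
# is the following literal (field and values are made up):
EXAMPLE_MATCHING_FIELD_BODY = {
    "size": 9999,
    "query": {
        "bool": {
            "must": [
                {"match": {"recid": 12345}},
                {"match": {"doc_type": "publication"}},
            ]
        }
    },
}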
def push_data_keywords(pub_ids=None, index=None):
    """Go through all the publications and their datatables
    and move data keywords from tables to their parent publications.
    """
    if not pub_ids:
        body = {'query': {'match_all': {}}}
        results = es.search(index=index, doc_type=CFG_PUB_TYPE,
                            body=body, _source=False)
        pub_ids = [i['_id'] for i in results['hits']['hits']]

    for pub_id in pub_ids:
        query_builder = QueryBuilder()
        query_builder.add_child_parent_relation(
            'publication',
            relation='parent',
            must=True,
            related_query={'match': {'recid': pub_id}})
        tables = es.search(index=index, doc_type=CFG_DATA_TYPE,
                           body=query_builder.query,
                           _source_include='keywords')
        keywords = [
            d['_source'].get('keywords', None)
            for d in tables['hits']['hits']
        ]

        # Flatten the list, skipping tables without keywords
        keywords = [i for inner in keywords if inner for i in inner]

        # Aggregate
        agg_keywords = defaultdict(list)
        for kw in keywords:
            agg_keywords[kw['name']].append(kw['value'])

        # Remove duplicates
        for k, v in agg_keywords.items():
            agg_keywords[k] = list(set(v))

        body = {"doc": {'data_keywords': dict(agg_keywords)}}

        try:
            es.update(index=index, doc_type=CFG_PUB_TYPE, id=pub_id, body=body)
        except Exception as e:
            log.error(str(e))
def references(self):
    """Reference export for single record in datatables format.

    :returns: list
        List of lists where every item represents a datatables row.
        A row consists of [reference_number, reference, num_citations].
    """
    out = []
    references = self.record.get('references')
    if references:
        refs_to_get_from_es = [
            ref['recid'] for ref in references if ref.get('recid')
        ]
        query = IQ(' OR '.join('recid:' + str(ref)
                               for ref in refs_to_get_from_es))
        records_from_es = current_search_client.search(
            index='records-hep',
            doc_type='hep',
            body={"query": query.to_dict()},
            size=9999,
            _source=[
                'control_number',
                'citation_count',
                'titles',
                'earliest_date',
                'authors',
                'collaboration',
                'corporate_author',
                'publication_info'
            ]
        )['hits']['hits']

        refs_from_es = {
            str(ref['_source']['control_number']): ref['_source']
            for ref in records_from_es
        }

        for reference in references:
            row = []
            recid = reference.get('recid')
            ref_record = refs_from_es.get(str(recid)) if recid else None

            if recid and ref_record:
                ref_record = Record(ref_record)

            if ref_record:
                row.append(render_template_to_string(
                    "inspirehep_theme/references.html",
                    record=ref_record,
                    reference=reference
                ))
                row.append(ref_record.get('citation_count', ''))
                out.append(row)
            else:
                row.append(render_template_to_string(
                    "inspirehep_theme/references.html",
                    reference=reference))
                row.append('')
                out.append(row)
    return out
def index():
    """Query Elasticsearch using Invenio query syntax."""
    page = request.values.get("page", 1, type=int)
    size = request.values.get("size", 1, type=int)
    query = Query(request.values.get("q", ""))[(page - 1) * size : page * size]
    response = current_search_client.search(
        index=request.values.get("index", "demo"),
        doc_type=request.values.get("type"),
        body=query.body
    )
    return jsonify(**response)
def search_authors(name, size=20):
    """Search for authors in the author index."""
    from hepdata.config import CFG_ES_AUTHORS

    index, doc_type = CFG_ES_AUTHORS
    query = {
        "size": size,
        "query": {
            "match": {
                "full_name": {"query": name, "fuzziness": "AUTO"}
            }
        }
    }
    results = es.search(index=index, doc_type=doc_type, body=query)
    return [x["_source"] for x in results["hits"]["hits"]]
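# Usage sketch (the name is an arbitrary example): fuzziness "AUTO" lets the
# match tolerate small spelling differences in author names.
#
#     authors = search_authors('J. Ellis', size=5)
#     names = [a['full_name'] for a in authors]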
def index(index_name='demo'):
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 1, type=int)
    query = Query(request.values.get('q', ''))[(page-1)*size:page*size]
    response = current_search_client.search(
        index=index_name,
        doc_type=request.values.get('type', 'example'),
        body=query.body,
    )
    return jsonify(**response)
def test_regression_index_literature_record_with_related_records(
        es_clear, db, datadir, create_record):
    data = json.loads((datadir / "1503270.json").read_text())
    record = create_record("lit", data=data)

    response = es.search("records-hep")
    result = response["hits"]["hits"][0]["_source"]

    assert data["related_records"] == result["related_records"]
def test_cli_delete_edit_article_workflows(app_cli_runner):
    wf_to_be_deleted = build_workflow({}, data_type='hep')
    wf_to_be_deleted.save()
    start('edit_article', object_id=wf_to_be_deleted.id)
    wf_to_be_deleted = workflow_object_class.get(wf_to_be_deleted.id)
    wf_to_be_deleted.status = ObjectStatus.WAITING
    wf_to_be_deleted.created = datetime.datetime(2020, 7, 8, 12, 31, 8, 299777)
    wf_to_be_deleted.save()

    wf_in_error = build_workflow({}, data_type='hep')
    wf_in_error.status = ObjectStatus.ERROR
    wf_in_error.extra_data["_error_msg"] = "Error in WebColl"
    wf_in_error.created = datetime.datetime(2020, 7, 8, 12, 31, 8, 299777)
    wf_in_error.save()

    recent_wf = build_workflow({}, data_type='hep')
    recent_wf.save()
    start('edit_article', object_id=recent_wf.id)
    recent_wf = workflow_object_class.get(recent_wf.id)
    recent_wf.status = ObjectStatus.WAITING
    recent_wf.created = datetime.datetime(2020, 7, 11, 12, 31, 8, 299777)
    recent_wf.save()

    indices = ['holdingpen-hep']
    es.indices.refresh(indices)
    es_result = es.search(indices)
    assert es_result['hits']['total']['value'] == 3
    wf_count = WorkflowObjectModel.query.count()
    assert wf_count == 3

    result = app_cli_runner.invoke(workflows,
                                   ['delete_edit_article_older_than'])

    assert "Found 1 workflows to delete older than 48 hours" in result.output_bytes

    es.indices.refresh(indices)
    es_result = es.search(indices)
    assert es_result['hits']['total']['value'] == 2
    wf_count = WorkflowObjectModel.query.count()
    assert wf_count == 2
    assert WorkflowObjectModel.query.filter_by(
        id=wf_to_be_deleted.id).one_or_none() is None
def get(self, **kwargs):
    """Search records.

    :returns: the search result containing hits and aggregations as
              returned by invenio-search.
    """
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 10, type=int)
    if page*size >= self.max_result_window:
        raise MaxResultWindowRESTError()

    # Parse and slice query
    try:
        query = Query(request.values.get('q', ''))[(page-1)*size:page*size]
    except SyntaxError:
        raise InvalidQueryRESTError()

    # Arguments that must be added in prev/next links
    urlkwargs = dict()

    # Facets
    query, qs_kwargs = self.facets_factory(query, self.search_index)
    urlkwargs.update(qs_kwargs)

    # Sort
    query, qs_kwargs = self.sorter_factory(query, self.search_index)
    urlkwargs.update(qs_kwargs)

    # Execute search
    response = current_search_client.search(
        index=self.search_index,
        doc_type=self.search_type,
        body=query.body,
        version=True,
    )

    # Generate links for prev/next
    urlkwargs.update(
        size=size,
        q=request.values.get('q', ''),
        _external=True,
    )
    endpoint = 'invenio_records_rest.{0}_list'.format(self.pid_type)
    links = dict(self=url_for(endpoint, page=page, **urlkwargs))
    if page > 1:
        links['prev'] = url_for(endpoint, page=page-1, **urlkwargs)
    if size * page < int(response['hits']['total']) and \
            size * page < self.max_result_window:
        links['next'] = url_for(endpoint, page=page+1, **urlkwargs)

    return self.make_response(
        pid_fetcher=self.pid_fetcher,
        search_result=response,
        links=links,
    )
def test_migrate_mirror_broken_migrates_invalid(app_cli_runner):
    index = 'holdingpen-hep'
    build_workflow({})
    wf_count = WorkflowObjectModel.query.count()
    assert wf_count == 1
    es.indices.refresh(index)
    es_result = es.search(index)
    assert es_result['hits']['total'] == 1

    result = app_cli_runner.invoke(workflows, ['purge', '--yes-i-know'])

    assert result.exit_code == 0
    wf_count = WorkflowObjectModel.query.count()
    assert wf_count == 0
    es.indices.refresh(index)
    es_result = es.search(index)
    assert es_result['hits']['total'] == 0
def index():
    """Query Elasticsearch using Invenio query syntax."""
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 1, type=int)
    query = Query(request.values.get('q', ''))[(page - 1) * size:page * size]
    response = current_search_client.search(
        index=request.values.get('index', 'demo'),
        doc_type=request.values.get('type'),
        body=query.body,
    )
    return jsonify(**response)
def perform_es_search(query_string, page, size, collection, sort=''):
    query, qs_kwargs = perform_query(query_string, page, size)
    search_result = current_search_client.search(
        index='records-{0}'.format(collection),
        doc_type=collection,
        sort=sort,
        body=query.body,
        version=True)
    results = [hit['_source'] for hit in search_result['hits']['hits']]
    return results
def index():
    """Query Elasticsearch using Invenio query syntax."""
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 1, type=int)
    query = Query(request.values.get('q', ''))[(page-1)*size:page*size]
    response = current_search_client.search(
        index=request.values.get('index', 'demo'),
        doc_type=request.values.get('type'),
        body=query.body,
    )
    return jsonify(**response)
def get(self, **kwargs):
    """Search records.

    :returns: the search result containing hits and aggregations as
              returned by invenio-search.
    """
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 10, type=int)
    if page * size >= self.max_result_window:
        raise MaxResultWindowRESTError()

    # Arguments that must be added in prev/next links
    urlkwargs = dict()

    query, qs_kwargs = self.query_factory(self.search_index, page, size)
    urlkwargs.update(qs_kwargs)

    # Facets
    query, qs_kwargs = self.facets_factory(query, self.search_index)
    urlkwargs.update(qs_kwargs)

    # Sort
    query, qs_kwargs = self.sorter_factory(query, self.search_index)
    urlkwargs.update(qs_kwargs)

    # Execute search
    search_result = current_search_client.search(
        index=self.search_index,
        doc_type=self.search_type,
        body=query.body,
        version=True,
    )

    # Generate links for prev/next
    urlkwargs.update(
        size=size,
        q=request.values.get('q', ''),
        _external=True,
    )
    endpoint = 'invenio_records_rest.{0}_list'.format(self.pid_type)
    links = dict(self=url_for(endpoint, page=page, **urlkwargs))
    if page > 1:
        links['prev'] = url_for(endpoint, page=page-1, **urlkwargs)
    if size * page < int(search_result['hits']['total']) and \
            size * page < self.max_result_window:
        links['next'] = url_for(endpoint, page=page+1, **urlkwargs)

    return self.make_response(
        pid_fetcher=self.pid_fetcher,
        search_result=search_result,
        links=links,
        item_links_factory=self.item_links_factory,
    )
def search(cls, query):
    """Search for objects using the invenio query syntax."""
    from flask import current_app as app
    from invenio_search import Query, current_search_client

    index = app.config['INDEXER_DEFAULT_INDEX']
    res = current_search_client.search(index=index,
                                       body=Query(query).body,
                                       size=1000)
    return [cls.get(x['_id']) for x in res['hits']['hits']
            if x['_score'] > 0.3]
def test_cli_purges_db_and_es(app_cli_runner):
    indices = ['holdingpen-hep', 'holdingpen-authors']
    build_workflow({}, data_type='hep')
    build_workflow({}, data_type='authors')
    wf_count = WorkflowObjectModel.query.count()
    assert wf_count == 2
    es.indices.refresh(indices)
    es_result = es.search(indices)
    assert es_result['hits']['total'] == 2

    result = app_cli_runner.invoke(workflows, ['purge', '--yes-i-know'])

    assert result.exit_code == 0
    wf_count = WorkflowObjectModel.query.count()
    assert wf_count == 0
    es.indices.refresh(indices)
    es_result = es.search(indices)
    assert es_result['hits']['total'] == 0
def test_index_data_record(base_app, es_clear, db, datadir, create_record):
    record = create_record("dat")

    expected_count = 1
    expected_metadata = deepcopy(record)
    expected_metadata["_created"] = utils.isoformat(record.created)
    expected_metadata["_updated"] = utils.isoformat(record.updated)

    response = es.search("records-data")

    assert response["hits"]["total"] == expected_count
    assert response["hits"]["hits"][0]["_source"] == expected_metadata
def get_records(**kwargs):
    """Get records."""
    page = kwargs.get('resumptionToken', {}).get('page', 1)
    size = current_app.config['OAISERVER_PAGE_SIZE']
    query = Query()[(page-1)*size:page*size]

    body = {}
    if 'set' in kwargs:
        body['must'] = [{'match': {'_oai.sets': kwargs['set']}}]

    time_range = {}
    if 'from_' in kwargs:
        time_range['gte'] = kwargs['from_']
    if 'until' in kwargs:
        time_range['lte'] = kwargs['until']
    if time_range:
        body['filter'] = [{'range': {'_oai.updated': time_range}}]

    if body:
        query.body = {'query': {'bool': body}}

    response = current_search_client.search(
        index=current_app.config['OAISERVER_RECORD_INDEX'],
        body=query.body,
    )

    class Pagination(object):
        """Dummy pagination class."""

        @property
        def total(self):
            """Return number of hits found."""
            return response['hits']['total']

        @property
        def has_next(self):
            """Return True if there are more results."""
            return page*size <= self.total

        @property
        def items(self):
            """Return iterator."""
            for result in response['hits']['hits']:
                yield {
                    'id': result['_id'],
                    'json': result['_source'],
                    # FIXME use ES
                    'updated': RecordMetadata.query.filter_by(
                        id=result['_id']).one().updated,
                }

    return Pagination()
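# Usage sketch for the OAI-PMH pager above (the set name and date bound are
# illustrative arguments only):
#
#     pagination = get_records(set='openaire', from_='2015-01-01')
#     for item in pagination.items:
#         print(item['id'], item['updated'])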
def get_records_matching_field(field, id, index=None, doc_type=None, source=None):
    """Checks if a record with a given ID exists in the index."""
    query = {
        "size": 9999,
        'query': {'match': {field: id}}
    }

    if source:
        query["_source"] = source

    return es.search(index=index, doc_type=doc_type, body=query)
def submit_edit_set(spec):
    """Edit an existing set."""
    form = get_NewSetForm(request.form)
    if request.method == 'POST' and form.validate():
        old_set = OAISet.query.filter_by(spec=spec).first()
        query = Query(old_set.search_pattern)
        old_response = current_search_client.search(index='records',
                                                    doc_type='record',
                                                    body=query.body,
                                                    fields='_id, oaiid')
        query = Query(form.search_pattern.data)
        new_response = current_search_client.search(index='records',
                                                    doc_type='record',
                                                    body=query.body,
                                                    fields='_id, oaiid')
        # compare the record ids matched by the old and the new pattern
        old_recid = {hit['_id'] for hit in old_response['hits']['hits']}
        new_recid = {hit['_id'] for hit in new_response['hits']['hits']}
        recids_to_delete = old_recid - new_recid
        # TODO: mark records as deleted from set
        remove_recids_from_set(recids_to_delete)
        add_records_to_set(new_recid)
        flash('Set was changed')
        return redirect(url_for('.manage_sets'))
    return render_template('edit_set.html', edit_set_form=form, spec=spec)
def index():
    """Frontpage blueprint."""
    query = Query("")
    query.body["size"] = 10
    query.body["sort"] = [{"creation_date": "desc"}]

    response = current_search_client.search(
        index='records',
        body=query.body,
    )

    return render_template(
        'zenodo_frontpage/index.html',
        records=(h['_source'] for h in response['hits']['hits'])
    )
def aggregate_and_check_version(expected_version):
    # Aggregate events
    StatAggregator(name='file-download-agg',
                   event='file-download',
                   aggregation_field='file_id',
                   aggregation_interval='day',
                   query_modifiers=[]).run()
    current_search_client.indices.refresh(index='*')
    res = current_search_client.search(
        index='stats-file-download',
        doc_type='file-download-day-aggregation',
        version=True)
    for hit in res['hits']['hits']:
        assert hit['_version'] == expected_version
def test_index_author_record(base_app, es_clear, db, datadir, create_record):
    data = json.loads((datadir / "999108.json").read_text())
    record = create_record("aut", data=data)

    expected_count = 1
    expected_metadata = json.loads(
        (datadir / "999108_expected.json").read_text())
    expected_metadata["_created"] = utils.isoformat(record.created)
    expected_metadata["_updated"] = utils.isoformat(record.updated)

    response = es.search("records-authors")

    assert response["hits"]["total"] == expected_count
    assert response["hits"]["hits"][0]["_source"] == expected_metadata
def index():
    """Frontpage blueprint."""
    query = Query("communities:zenodo AND access_right:open")
    query.body["size"] = 10
    query.body["sort"] = [{"creation_date": "desc"}]

    response = current_search_client.search(
        index='records',
        body=query.body,
    )

    return render_template(
        'zenodo_frontpage/index.html',
        records=(h['_source'] for h in response['hits']['hits'])
    )
def get(self, **kwargs):
    """Search records.

    :returns: the search result containing hits and aggregations as
              returned by invenio-search.
    """
    page = request.values.get("page", 1, type=int)
    size = request.values.get("size", 10, type=int)
    if page * size >= self.max_result_window:
        raise MaxResultWindowRESTError()

    # Parse and slice query
    try:
        query = Query(request.values.get("q", ""))[(page - 1) * size : page * size]
    except SyntaxError:
        raise InvalidQueryRESTError()

    # Arguments that must be added in prev/next links
    urlkwargs = dict()

    # Facets
    query, qs_kwargs = self.facets_factory(query, self.search_index)
    urlkwargs.update(qs_kwargs)

    # Sort
    query, qs_kwargs = self.sorter_factory(query, self.search_index)
    urlkwargs.update(qs_kwargs)

    # Execute search
    response = current_search_client.search(
        index=self.search_index, doc_type=self.search_type,
        body=query.body, version=True
    )

    # Generate links for prev/next
    urlkwargs.update(size=size, q=request.values.get("q", ""), _external=True)
    endpoint = "invenio_records_rest.{0}_list".format(self.pid_type)
    links = dict(self=url_for(endpoint, page=page, **urlkwargs))
    if page > 1:
        links["prev"] = url_for(endpoint, page=page - 1, **urlkwargs)
    if size * page < int(response["hits"]["total"]) and \
            size * page < self.max_result_window:
        links["next"] = url_for(endpoint, page=page + 1, **urlkwargs)

    return self.make_response(pid_fetcher=self.pid_fetcher,
                              search_result=response, links=links)
def get_records(page=1):
    """Get records."""
    size = current_app.config['OAISERVER_PAGE_SIZE']
    query = Query()[(page-1)*size:page*size]

    response = current_search_client.search(
        index=current_app.config['OAISERVER_RECORD_INDEX'],
        body=query.body,
        # version=True,
    )

    for result in response['hits']['hits']:
        yield {
            # FIXME
            "id": result['_id'],
            "json": result['_source'],
            # FIXME retrieve from elastic search
            "updated": RecordMetadata.query.filter_by(
                id=result['_id']).one().updated,
        }
def _search(self):
    from invenio_search import current_search_client

    if self._results is None:
        if current_app.debug:
            import json
            json_body = json.dumps(self.body, indent=2)
            current_app.logger.debug(
                "index: {0} - doc_type: {1} - query: {2}".format(
                    self.index, self.doc_type, json_body
                )
            )
        self._results = current_search_client.search(
            index=self.index,
            doc_type=self.doc_type,
            body=self.body,
        )
    return self._results
def _search(query):
    """Make a call to the Elasticsearch instance.

    The Elasticsearch instance is queried with the given dictionary.

    :query: The query for the Elasticsearch instance.
        Example: query = {"query": {"match_all": {}}}

    :return: The Elasticsearch instance response.
    """
    elasticsearch_index = current_app.config.get('DISAMBIGUATION_RECORD_INDEX')

    return es.search(
        index=elasticsearch_index,
        body=query)['hits']['hits']
def get_author_collection_records_from_valid_authors(authors_refs):
    """Query elasticsearch for the authors matching the given author references."""
    es_query = {
        "filter": {
            "bool": {
                "must": [
                    {"terms": {"self.$ref": authors_refs}},
                    {"match": {"ids.type": "ORCID"}}
                ]
            }
        }
    }
    authors = current_search_client.search(
        index='records-authors',
        doc_type='authors',
        body=es_query
    )['hits']['hits']
    return authors
def search(self, size=25, page=1, query_string=None, sort_key=None):
    """Return search results for query."""
    # Arguments that must be added in prev/next links
    urlkwargs = dict()

    query, qs_kwargs = self.query_factory(
        self.search_index, page, size, query_string
    )
    urlkwargs.update(qs_kwargs)

    query, qs_kwargs = self.sorter_factory(
        query, self.search_index, sort_key
    )
    urlkwargs.update(qs_kwargs)

    search_result = current_search_client.search(
        index=self.search_index,
        doc_type=self.search_type,
        body=query.body,
        version=True,
    )
    return urlkwargs, search_result
def update():
    sets = Set.query.all()
    for oai_set in sets:
        # query each set's own search pattern (the original referenced an
        # out-of-scope `form.query.data` here)
        query = Query(oai_set.search_pattern)
        response = current_search_client.search(
            index=oai_set.search_index,
            doc_type=oai_set.search_doc_type,
            body=query.body
        )
        ids = {(a['_id'], _get_oaiid(a)) for a in response['hits']['hits']}
        # get all current records with this set
        current_ids = set()
        # new records that need to be added
        new_ids = ids - current_ids
        # records that were deleted from the set
        del_ids = current_ids - ids
        _add_records_to_set(new_ids, oai_set.spec)
        _del_records_from_set(del_ids, oai_set.spec)
def get(self, **kwargs):
    """Search records.

    :returns: the search result containing hits and aggregations as
              returned by invenio-search.
    """
    page = request.values.get("page", 1, type=int)
    size = request.values.get("size", 10, type=int)
    sort = request.values.get("sort", "", type=str)
    query = Query(request.values.get("q", ""))[(page - 1) * size : page * size]

    for sort_key in sort.split(","):
        if sort_key:
            query = query.sort(sort_key)

    response = current_search_client.search(
        index=self.search_index, doc_type=self.search_type,
        body=query.body, version=True
    )

    links = {}
    if page > 1:
        links["prev"] = url_for(
            "invenio_records_rest.{0}_list".format(self.pid_type),
            page=page - 1,
            size=size,
            sort=sort,
            q=request.values.get("q", ""),
            _external=True,
        )
    if size * page < int(response["hits"]["total"]):
        links["next"] = url_for(
            "invenio_records_rest.{0}_list".format(self.pid_type),
            page=page + 1,
            size=size,
            sort=sort,
            q=request.values.get("q", ""),
            _external=True,
        )

    return self.make_response(pid_fetcher=self.pid_fetcher,
                              search_result=response, links=links)
def _percolate_query(index, doc_type, percolator_doc_type, document):
    """Get results for a percolate query."""
    if ES_VERSION[0] in (2, 5):
        results = current_search_client.percolate(
            index=index,
            doc_type=doc_type,
            allow_no_indices=True,
            ignore_unavailable=True,
            body={'doc': document}
        )
        return results['matches']
    elif ES_VERSION[0] == 6:
        results = current_search_client.search(
            index=index,
            doc_type=percolator_doc_type,
            allow_no_indices=True,
            ignore_unavailable=True,
            body={
                'query': {
                    'percolate': {
                        'field': 'query',
                        'document_type': percolator_doc_type,
                        'document': document,
                    }
                }
            }
        )
        return results['hits']['hits']
def get_expired_embargos(cls):
    """Get records for which the embargo period has expired."""
    query_str = 'access_right:{0} AND embargo_date:{{* TO {1}}}'.format(
        cls.EMBARGOED,
        datetime.utcnow().isoformat()
    )
    query = Query()
    query.body['from'] = 0
    query.body['size'] = 1000
    query.body['query'] = {
        'query_string': {
            'query': query_str,
            'allow_leading_wildcard': False,
        },
    }

    endpoints = current_app.config['RECORDS_REST_ENDPOINTS']
    index = endpoints['recid']['search_index']

    response = current_search_client.search(
        index=index,
        body=query.body
    )
    return [hit['_id'] for hit in response['hits']['hits']]
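# For reference, assuming cls.EMBARGOED is the string 'embargoed', the Lucene
# query string built above looks like this (self-contained sketch):
from datetime import datetime

example_query_str = 'access_right:{0} AND embargo_date:{{* TO {1}}}'.format(
    'embargoed', datetime.utcnow().isoformat())
# -> access_right:embargoed AND embargo_date:{* TO 2024-01-01T00:00:00}
# i.e. an exclusive range matching embargo dates strictly before "now".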
def citations(self):
    """Return citation export for single record."""
    out = []
    row = []

    # Get citations
    es_query = IQ('refersto:' + self.record['control_number'])
    record_citations = es.search(
        index='records-hep',
        doc_type='hep',
        body={"query": es_query.to_dict()},
        size=10,
        _source=[
            'control_number',
            'citation_count',
            'titles',
            'earliest_date'
        ]
    )['hits']['hits']

    for citation in record_citations:
        citation_from_es = es.get_source(index='records-hep',
                                         id=citation['_id'],
                                         doc_type='hep',
                                         ignore=404)
        row.append(render_template_to_string(
            "inspirehep_theme/citations.html",
            record=citation_from_es))
        row.append(citation['_source'].get('citation_count', ''))
        out.append(row)
        row = []

    return out
def update_record(obj, eng):
    """Updates existing record."""
    doi = get_first_doi(obj)

    query = {'query': {'bool': {'must': [{'match': {'dois.value': doi}}]}}}
    search_result = es.search(index='records-record',
                              doc_type='record-v1.0.0',
                              body=query)

    recid = search_result['hits']['hits'][0]['_source']['control_number']

    obj.extra_data['recid'] = recid
    obj.data['control_number'] = recid

    pid = PersistentIdentifier.get('recid', recid)
    existing_record = Record.get_record(pid.object_uuid)

    if '_files' in existing_record:
        obj.data['_files'] = existing_record['_files']
    if '_oai' in existing_record:
        obj.data['_oai'] = existing_record['_oai']

    # preserving original creation date
    creation_date = existing_record['record_creation_date']
    obj.data['record_creation_date'] = creation_date
    obj.data['record_creation_year'] = parse_date(creation_date).year

    existing_record.clear()
    existing_record.update(obj.data)

    try:
        existing_record.commit()
        obj.save()
        db.session.commit()
    except ValidationError as err:
        __halt_and_notify("Validation error: %s." % err, eng)
    except SchemaError as err:
        __halt_and_notify('SchemaError during record validation! %s' % err, eng)
def get_institution_people_datatables_rows(recid):
    """Datatable rows to render people working in an institution.

    :param recid: id of the institution.
    :type recid: string
    """
    query = {
        "query": {
            "term": {
                "authors.affiliations.recid": recid
            }
        },
        "aggs": {
            "authors": {
                "nested": {
                    "path": "authors"
                },
                "aggs": {
                    "affiliated": {
                        "filter": {
                            "term": {"authors.affiliations.recid": recid}
                        },
                        "aggs": {
                            "byrecid": {
                                "terms": {
                                    "field": "authors.recid"
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    records_from_es = current_search_client.search(
        index='records-hep',
        doc_type='hep',
        body=query,
        search_type='count'
    )

    # Extract all the record ids from the aggregation
    papers_per_author = records_from_es[
        'aggregations']['authors']['affiliated']['byrecid']['buckets']
    recids = [int(paper['key']) for paper in papers_per_author]

    # Generate query to retrieve records from author index
    query = ""
    for i, recid in enumerate(recids):
        query += "recid:{}".format(recid)
        if i != len(recids) - 1:
            query += " OR "

    results = perform_es_search(
        query, 'records-authors', size=9999, fields=['control_number', 'name']
    )
    recid_map = dict(
        [(int(result.control_number), result.name) for result in results]
    )

    result = []
    author_html_link = "<a href='/authors/{recid}'>{name}</a>"
    for author in papers_per_author:
        row = []
        try:
            row.append(
                author_html_link.format(
                    recid=author['key'],
                    name=recid_map[author['key']].preferred_name
                )
            )
        except (KeyError, AttributeError):
            # No preferred name, use value
            row.append(
                author_html_link.format(
                    recid=author['key'],
                    name=recid_map[author['key']].value
                )
            )
        row.append(author['doc_count'])
        result.append(row)

    return result