Esempio n. 1
0
def push_data_keywords(pub_ids=None, index=None):
    """ Copy data keywords from data tables onto their parent publications.

    For each publication, the ``keywords`` of its data tables are collected,
    grouped by keyword name with duplicate values removed, and written to
    the publication document as ``data_keywords`` through a partial update.

    :param pub_ids: [iterable] ids of the publications to process. If None
                    or empty, the ids returned by a ``match_all`` search on
                    the publication type are used instead.
    :param index: [string] name of the index, passed through to the
                    Elasticsearch client calls.
    """
    if not pub_ids:
        # NOTE(review): no explicit size is passed, so this presumably only
        # yields the client's default page of hits rather than every
        # publication -- confirm against the Elasticsearch version in use.
        body = {"query": {"match_all": {}}}
        results = es.search(index=index, doc_type=CFG_PUB_TYPE, body=body, _source=False)
        pub_ids = [hit["_id"] for hit in results["hits"]["hits"]]

    for pub_id in pub_ids:
        query_builder = QueryBuilder()
        query_builder.add_child_parent_relation(
            "publication", relation="parent", must=True, related_query={"match": {"recid": pub_id}}
        )
        # NOTE(review): same default page size caveat as above applies to
        # the number of tables returned here -- confirm.
        tables = es.search(index=index, doc_type=CFG_DATA_TYPE, body=query_builder.query, _source_include="keywords")

        # Group keyword values by name; a set per name drops duplicates.
        # Tables whose "keywords" is missing or null contribute nothing
        # (previously a None entry made the flattening step raise TypeError).
        agg_keywords = defaultdict(set)
        for table in tables["hits"]["hits"]:
            for kw in table["_source"].get("keywords") or []:
                agg_keywords[kw["name"]].add(kw["value"])

        body = {"doc": {"data_keywords": {name: list(values) for name, values in agg_keywords.items()}}}

        try:
            es.update(index=index, doc_type=CFG_PUB_TYPE, id=pub_id, body=body)
        except Exception as e:
            # Best effort: a failed update is logged and the loop moves on.
            # str(e) is used because exceptions have no ``message``
            # attribute on Python 3.
            log.error(str(e))
Esempio n. 2
0
def push_data_keywords(pub_ids=None, index=None):
    """ Copy data keywords from data tables onto their parent publications.

    For each publication, the ``keywords`` of its data tables are collected,
    grouped by keyword name with duplicate values removed, and written to
    the publication document as ``data_keywords`` through a partial update.

    :param pub_ids: [iterable] ids of the publications to process. If None
                    or empty, the ids returned by a ``match_all`` search on
                    the publication type are used instead.
    :param index: [string] name of the index, passed through to the
                    Elasticsearch client calls.
    """
    if not pub_ids:
        # NOTE(review): no explicit size is passed, so this presumably only
        # yields the client's default page of hits rather than every
        # publication -- confirm against the Elasticsearch version in use.
        body = {'query': {'match_all': {}}}
        results = es.search(index=index,
                            doc_type=CFG_PUB_TYPE,
                            body=body,
                            _source=False)
        pub_ids = [hit['_id'] for hit in results['hits']['hits']]

    for pub_id in pub_ids:
        query_builder = QueryBuilder()
        query_builder.add_child_parent_relation(
            'publication',
            relation='parent',
            must=True,
            related_query={'match': {
                'recid': pub_id
            }})
        # NOTE(review): same default page size caveat as above applies to
        # the number of tables returned here -- confirm.
        tables = es.search(index=index,
                           doc_type=CFG_DATA_TYPE,
                           body=query_builder.query,
                           _source_include='keywords')

        # Group keyword values by name; a set per name drops duplicates.
        # Tables whose 'keywords' is missing or null contribute nothing
        # (previously a None entry made the flattening step raise TypeError).
        agg_keywords = defaultdict(set)
        for table in tables['hits']['hits']:
            for kw in table['_source'].get('keywords') or []:
                agg_keywords[kw['name']].add(kw['value'])

        data_keywords = {
            name: list(values) for name, values in agg_keywords.items()
        }
        body = {"doc": {'data_keywords': data_keywords}}

        try:
            es.update(index=index, doc_type=CFG_PUB_TYPE, id=pub_id, body=body)
        except Exception as e:
            # Best effort: a failed update is logged and the loop moves on.
            # str(e) is used because exceptions have no ``message``
            # attribute on Python 3.
            log.error(str(e))
Esempio n. 3
0
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """ Perform a search query.

    Two Elasticsearch searches are issued: one for publications matching
    the query, and one for the data tables whose parent is among the
    publications that were hit. Both results are merged and mapped.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    If None, an empty list is used
    :param size: [int] max number of hits that should be returned
    :param include: source filter value handed to
                    QueryBuilder.add_source_filter (presumably the fields
                    to include in the returned documents -- confirm)
    :param exclude: source filter value handed to
                    QueryBuilder.add_source_filter (presumably the fields
                    to leave out of the returned documents -- confirm)
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: value handed to QueryBuilder.add_post_filter

    :return: [dict] dictionary with processed results and facets
    """
    # A fresh list per call: a list in the signature would be created once
    # and shared between every call that relies on the default.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)

    # Build core query
    data_query = get_query_by_type(CFG_DATA_TYPE, query)
    pub_query = get_query_by_type(CFG_PUB_TYPE, query)
    authors_query = get_authors_query(query)

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(
        CFG_DATA_TYPE,
        relation="child",
        related_query=data_query,
        other_queries=[pub_query, authors_query])

    # Add additional options
    query_builder.add_pagination(size=size, offset=offset)
    query_builder.add_sorting(sort_field=sort_field, sort_order=sort_order)
    query_builder.add_filters(filters)
    query_builder.add_post_filter(post_filter)
    query_builder.add_aggregations()
    query_builder.add_source_filter(include, exclude)

    pub_result = es.search(index=index,
                           body=query_builder.query,
                           doc_type=CFG_PUB_TYPE)

    # Restrict the second search to children of the publications found above
    pub_hit_ids = [hit["_id"] for hit in pub_result['hits']['hits']]
    parent_filter = {
        "filtered": {
            "filter": {
                "terms": {
                    "_id": pub_hit_ids
                }
            }
        }
    }

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(CFG_PUB_TYPE,
                                            relation="parent",
                                            related_query=parent_filter,
                                            must=True,
                                            other_queries=[data_query])
    # Request up to size * 50 data tables in a single page
    query_builder.add_pagination(size=size * 50)

    data_result = es.search(index=index,
                            body=query_builder.query,
                            doc_type=CFG_DATA_TYPE)

    merged_results = merge_results(pub_result, data_result)

    return map_result(merged_results)
Esempio n. 4
0
def search(
    query,
    index=None,
    filters=None,
    size=10,
    include="*",
    exclude="",
    offset=0,
    sort_field=None,
    sort_order="",
    post_filter=None,
):
    """ Perform a search query.

    Two Elasticsearch searches are issued: one for publications matching
    the query, and one for the data tables whose parent is among the
    publications that were hit. Both results are merged and mapped.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    If None, an empty list is used
    :param size: [int] max number of hits that should be returned
    :param include: source filter value handed to
                    QueryBuilder.add_source_filter (presumably the fields
                    to include in the returned documents -- confirm)
    :param exclude: source filter value handed to
                    QueryBuilder.add_source_filter (presumably the fields
                    to leave out of the returned documents -- confirm)
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: value handed to QueryBuilder.add_post_filter

    :return: [dict] dictionary with processed results and facets
    """
    # A fresh list per call: a list in the signature would be created once
    # and shared between every call that relies on the default.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == "" and not sort_field:
        sort_field = "date"

    query = HEPDataQueryParser.parse_query(query)

    # Build core query
    data_query = get_query_by_type(CFG_DATA_TYPE, query)
    pub_query = get_query_by_type(CFG_PUB_TYPE, query)
    authors_query = get_authors_query(query)

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(
        CFG_DATA_TYPE, relation="child", related_query=data_query, other_queries=[pub_query, authors_query]
    )

    # Add additional options
    query_builder.add_pagination(size=size, offset=offset)
    query_builder.add_sorting(sort_field=sort_field, sort_order=sort_order)
    query_builder.add_filters(filters)
    query_builder.add_post_filter(post_filter)
    query_builder.add_aggregations()
    query_builder.add_source_filter(include, exclude)

    pub_result = es.search(index=index, body=query_builder.query, doc_type=CFG_PUB_TYPE)

    # Restrict the second search to children of the publications found above
    pub_hit_ids = [hit["_id"] for hit in pub_result["hits"]["hits"]]
    parent_filter = {"filtered": {"filter": {"terms": {"_id": pub_hit_ids}}}}

    query_builder = QueryBuilder()
    query_builder.add_child_parent_relation(
        CFG_PUB_TYPE, relation="parent", related_query=parent_filter, must=True, other_queries=[data_query]
    )
    # Request up to size * 50 data tables in a single page
    query_builder.add_pagination(size=size * 50)

    data_result = es.search(index=index, body=query_builder.query, doc_type=CFG_DATA_TYPE)

    merged_results = merge_results(pub_result, data_result)

    return map_result(merged_results)