Esempio n. 1
0
def index_record_ids(record_ids, index=None):
    """Index the records identified by ``record_ids`` with one bulk request.

    Each id is resolved with ``get_record_by_id``; ids that resolve to a
    falsy value are silently dropped. A document containing a
    ``related_publication`` key is treated as a data record, anything else
    as a publication.

    :param record_ids: [list of ints] list of record ids e.g. [1, 5, 2, 3]
    :param index: [string] name of the index. It is passed through unchanged
        to the bulk actions and to ``es.bulk``.
        NOTE(review): the original docstring said a default is used when
        None; that resolution is not visible in this function - confirm it
        happens in a decorator or in the caller.
    :return: dict with the keys ``CFG_DATA_TYPE`` and ``CFG_PUB_TYPE``, each
        mapping to the list of recids queued for indexing under that type
    :raises KeyError: if a resolved document has no ``recid`` or
        ``last_updated`` key
    """
    from hepdata.modules.records.utils.common import get_record_by_id

    # Materialise the resolved documents, dropping ids with no record.
    docs = [
        doc for doc in (get_record_by_id(recid) for recid in record_ids)
        if doc
    ]

    # Bulk body: alternating action dicts and source documents.
    to_index = []
    indexed_result = {CFG_DATA_TYPE: [], CFG_PUB_TYPE: []}

    for doc in docs:
        if "related_publication" in doc:
            # Remove unnecessary fields if it's a data record
            for field in ["authors", "_additional_authors", "_first_author"]:
                if field in doc:
                    del doc[field]

            enhance_data_document(doc)

            # Data records are indexed as children of their publication.
            op_dict = {
                "index": {
                    "_index": index,
                    "_type": CFG_DATA_TYPE,
                    "_id": doc["recid"],
                    "_parent": str(doc["related_publication"]),
                }
            }

            indexed_result[CFG_DATA_TYPE].append(doc["recid"])
            to_index.append(op_dict)

        else:
            # presumably returns ready-made action/source entries for the
            # publication's authors - verify against the helper
            author_docs = prepare_author_for_indexing(es, doc)
            to_index += author_docs

            enhance_publication_document(doc)

            op_dict = {
                "index": {
                    "_index": index,
                    "_type": CFG_PUB_TYPE,
                    "_id": doc["recid"],
                }
            }

            indexed_result[CFG_PUB_TYPE].append(doc["recid"])
            to_index.append(op_dict)

        if doc["last_updated"] is not None:
            doc["last_updated"] = parse(doc["last_updated"]).isoformat()
        # The source document must directly follow its action dict.
        to_index.append(doc)

    # An empty bulk body is rejected by Elasticsearch, so skip the request
    # when none of the ids resolved to a record.
    if to_index:
        es.bulk(index=index, body=to_index, refresh=True)

    return indexed_result
Esempio n. 2
0
def index_record_ids(record_ids, index=None):
    """Index the records identified by ``record_ids`` with one bulk request.

    Each id is resolved with ``get_record_by_id``; ids that resolve to a
    falsy value are silently dropped. A document containing a
    ``related_publication`` key is treated as a data record, anything else
    as a publication. Publications without a ``version`` key are reported
    on stdout and skipped.

    :param record_ids: [list of ints] list of record ids e.g. [1, 5, 2, 3]
    :param index: [string] name of the index. It is passed through unchanged
        to the bulk actions and to ``es.bulk``.
        NOTE(review): the original docstring said a default is used when
        None; that resolution is not visible in this function - confirm it
        happens in a decorator or in the caller.
    :return: dict with the keys ``CFG_DATA_TYPE`` and ``CFG_PUB_TYPE``, each
        mapping to the list of recids queued for indexing under that type
    :raises KeyError: if a resolved document has no ``recid`` or
        ``last_updated`` key
    """
    from hepdata.modules.records.utils.common import get_record_by_id

    # Build a real list: ``docs`` is traversed twice below (once for the
    # log line, once for indexing). A lazy ``filter`` object on Python 3
    # would be exhausted by the first pass and nothing would be indexed.
    docs = [
        doc for doc in (get_record_by_id(recid) for recid in record_ids)
        if doc
    ]

    existing_record_ids = [doc['recid'] for doc in docs]
    print('Indexing existing record IDs:', existing_record_ids)

    # Bulk body: alternating action dicts and source documents.
    to_index = []
    indexed_result = {CFG_DATA_TYPE: [], CFG_PUB_TYPE: []}

    for doc in docs:
        if 'related_publication' in doc:
            # Remove unnecessary fields if it's a data record
            for field in ['authors', '_additional_authors', '_first_author']:
                if field in doc:
                    del doc[field]

            enhance_data_document(doc)

            # Data records are indexed as children of their publication.
            op_dict = {
                "index": {
                    "_index": index,
                    "_type": CFG_DATA_TYPE,
                    "_id": doc['recid'],
                    "_parent": str(doc['related_publication'])
                }
            }

            indexed_result[CFG_DATA_TYPE].append(doc['recid'])
            to_index.append(op_dict)

        else:

            if 'version' not in doc:
                # Skip before anything is queued so the bulk body keeps its
                # strict action/source pairing.
                print('Skipping unfinished record ID {}'.format(doc['recid']))
                continue

            # presumably returns ready-made action/source entries for the
            # publication's authors - verify against the helper
            author_docs = prepare_author_for_indexing(doc)
            to_index += author_docs

            enhance_publication_document(doc)

            op_dict = {
                "index": {
                    "_index": index,
                    "_type": CFG_PUB_TYPE,
                    "_id": doc['recid']
                }
            }

            indexed_result[CFG_PUB_TYPE].append(doc['recid'])
            to_index.append(op_dict)

        if doc["last_updated"] is not None:
            doc["last_updated"] = parse(doc["last_updated"]).isoformat()
        # The source document must directly follow its action dict.
        to_index.append(doc)

    # An empty bulk body is rejected by Elasticsearch, so only send the
    # request when something was queued.
    if to_index:
        es.bulk(index=index, body=to_index, refresh=True)

    return indexed_result