def enhance_metajson(document):
    """Fill in missing bookkeeping fields on a metajson record.

    A plain ``dict`` is first passed through ``load_dict``. The record then
    receives a default for each of the following fields that is absent or
    ``None``:

    - ``rec_id``: a new ``uuid.uuid1()`` string
    - ``rec_status``: ``constants.REC_STATUS_PRIVATE``
    - ``rec_created_date`` and ``rec_modified_date``: ``datetime.now()`` in
      ISO 8601 form

    ``rec_deleted_date`` is stamped when the status is
    ``constants.REC_STATUS_DELETED`` and the record has no such key yet.
    Records whose ``rec_class`` equals ``constants.REC_CLASS_DOCUMENT`` are
    additionally handed to ``citations_manager.add_citations_to_metadata``
    and get a ``date_sort`` value parsed from ``document.get_date()``.

    Returns the enhanced record (the object produced by ``load_dict`` when a
    dict was passed in).
    """
    if isinstance(document, dict):
        document = load_dict(document)

    def is_unset(key):
        # A field counts as unset when it is absent or explicitly None.
        return key not in document or document[key] is None

    # rec_id
    if is_unset("rec_id"):
        document["rec_id"] = str(uuid.uuid1())

    # language
    # TODO: use language_service

    # title_non_sort
    manage_title_non_sort(document)

    # rec_status
    if is_unset("rec_status"):
        document["rec_status"] = constants.REC_STATUS_PRIVATE

    # rec_created_date
    if is_unset("rec_created_date"):
        document["rec_created_date"] = datetime.now().isoformat()

    # rec_modified_date
    if is_unset("rec_modified_date"):
        document["rec_modified_date"] = datetime.now().isoformat()

    # rec_deleted_date: the membership test must look at the record itself;
    # the previous code tested an unrelated name (`metajson`).
    if (document["rec_status"] == constants.REC_STATUS_DELETED
            and "rec_deleted_date" not in document):
        document["rec_deleted_date"] = datetime.now().isoformat()

    # For "rec_class": "Document" only
    if "rec_class" in document and document["rec_class"] == constants.REC_CLASS_DOCUMENT:
        # citations
        citations_manager.add_citations_to_metadata(document, None, None)

        # date_sort
        date_iso = document.get_date()
        document["date_sort"] = date_service.parse_date(date_iso)

    return document
def search(corpus, search_query):
    """Translate a metajson search query into a MongoDB query and run it.

    NOTE(review): this file defines ``search`` a second time further down
    with the same body; the later definition is the one bound at import time.

    Parameters:
        corpus: corpus name passed to ``database_name``; a falsy value
            selects ``default_corpus``.
        search_query: mapping describing the search. Keys read here:
            ``filter_class`` (required; one of "Document", "Agent",
            "Person", "OrgUnit", "Event", "Family"), ``filter_date_begin``,
            ``filter_date_end``, ``filter_languages``, ``filter_types``,
            ``filter_status`` and ``search_terms`` (a list of mappings with
            ``index``, ``value`` and optionally ``operator``).

    Returns:
        A ``SearchResponse`` with ``records``, ``records_total_count``,
        ``result_batch_size``, ``result_offset`` (always 0) and the
        original ``search_query``.

    Raises:
        ``exceptions.metajsonprc_error(40)`` when ``search_query`` is None,
        when ``filter_class`` is missing or not an accepted value, or when a
        search term carrying a value has no ``index``.
    """
    def regex(field, value):
        # Case-insensitive regex clause on a single field.
        return {field: {"$regex": value, "$options": 'i'}}

    def any_field(fields, words):
        # One clause per (word, field) pair, grouped word by word.
        return [regex(field, word) for word in words for field in fields]

    def add_group(operator, clauses):
        # MongoDB rejects "$and"/"$or" with an empty list, so an index whose
        # value contains no words contributes nothing to the query.
        if clauses:
            search_indexes.append({operator: clauses})

    if not corpus:
        corpus = default_corpus
    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    accepted_classes = ["Document", "Agent", "Person", "OrgUnit", "Event", "Family"]
    collection = None
    if "filter_class" not in search_query or search_query["filter_class"] not in accepted_classes:
        raise exceptions.metajsonprc_error(40)
    elif search_query["filter_class"] == "Document":
        collection = DOCUMENTS
    # NOTE(review): the other accepted classes have no collection mapping yet
    # (an AGENTS mapping was sketched but disabled), so `collection` stays
    # None for them and the lookup below presumably fails - confirm.

    # other filters
    # TODO: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        filter_date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": filter_date_end}})
    if "filter_date_begin" in search_query:
        filter_date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": filter_date_begin}})
    if "filter_languages" in search_query:
        filter_query.append({"languages": {"$in": search_query["filter_languages"]}})
    if "filter_types" in search_query:
        filter_query.append({"rec_type": {"$in": search_query["filter_types"]}})
    if "filter_status" in search_query:
        # "private", "pending", "rejected", "published", "deleted"
        filter_query.append({"rec_status": {"$in": search_query["filter_status"]}})

    # search_terms
    # TODO: per-term operators ("or", "and", "not") are accepted in the query
    # but not applied yet; every term is AND-combined below. Intended
    # semantics: a and b or c -> or(and(a,b),c); a or b and c -> and(or(a,b),c)
    title_fields = ["is_part_ofs.title", "is_part_ofs.is_part_ofs.title"]
    creator_fields = [
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
    ]
    publisher_fields = [
        "publishers",
        "is_part_ofs.publishers",
        "is_part_ofs.is_part_ofs.publishers",
    ]
    all_fields = (
        ["rec_id", "identifiers.value", "title", "title_sub", "publishers"]
        + title_fields
        + creator_fields
        + ["rec_type"]
    )

    search_indexes = []
    if "search_terms" in search_query:
        for search_term in search_query["search_terms"]:
            # value
            if "value" not in search_term or search_term["value"] is None:
                # NOTE(review): a term without a value stops processing of
                # all remaining terms, not just this one - confirm intended.
                break
            value = search_term["value"]
            # split value into words on commas and whitespace
            words = value.replace(",", " ").split()

            # index
            if "index" not in search_term:
                raise exceptions.metajsonprc_error(40)
            index = search_term["index"]

            if index == "all":
                add_group("$or", any_field(all_fields, words))
            elif index == "identifier":
                # Try the value (not the index name) as a MongoDB ObjectId;
                # anything else is matched against the record identifiers.
                try:
                    obid = ObjectId(value)
                except (InvalidId, TypeError):
                    search_indexes.append({"$or": [
                        regex("rec_id", value),
                        regex("identifiers.value", value),
                    ]})
                else:
                    search_indexes.append({"_id": obid})
            elif index == "title":
                # every word must appear in the title
                add_group("$and", any_field(["title"], words))
            elif index == "is_part_of":
                add_group("$or", any_field(title_fields, words))
            elif index == "creator":
                add_group("$or", any_field(creator_fields, words))
            elif index == "creator_id":
                search_indexes.append({"creators.agent.rec_id": value})
            elif index == "affiliation":
                search_indexes.append(regex("creators.affiliation.name", value))
            elif index == "affiliation_id":
                search_indexes.append({"creators.affiliation.rec_id": value})
            elif index == "publisher":
                add_group("$or", any_field(publisher_fields, words))
            elif index == "keyword":
                search_indexes.append(regex("keywords.value", value))
            elif index == "classification":
                search_indexes.append(regex("classifications.value", value))
            elif index == "research_area":
                search_indexes.append(regex("research_areas.value", value))
            elif index == "subject":
                search_indexes.append(regex("subjects.value", value))
            elif index == "set":
                search_indexes.append(regex("sets.value", value))

    # result_sorts : how to with this index ? ...
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes
    mongo_args = filter_query + search_indexes

    # Generate the mongo query
    if mongo_args:
        mongo_query = {"$and": mongo_args}
    else:
        # search all
        mongo_query = {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(mongo_query).sort(sort)
    logging.debug(mongo_response)

    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query
    return search_response
def search(corpus, search_query):
    """Translate a metajson search query into a MongoDB query and run it.

    NOTE(review): an earlier definition of ``search`` with the same body
    precedes this one in the file; this later definition shadows it.

    Parameters:
        corpus: corpus name passed to ``database_name``; a falsy value
            selects ``default_corpus``.
        search_query: mapping describing the search. Keys read here:
            ``filter_class`` (required; one of "Document", "Agent",
            "Person", "OrgUnit", "Event", "Family"), ``filter_date_begin``,
            ``filter_date_end``, ``filter_languages``, ``filter_types``,
            ``filter_status`` and ``search_terms`` (a list of mappings with
            ``index``, ``value`` and optionally ``operator``).

    Returns:
        A ``SearchResponse`` with ``records``, ``records_total_count``,
        ``result_batch_size``, ``result_offset`` (always 0) and the
        original ``search_query``.

    Raises:
        ``exceptions.metajsonprc_error(40)`` when ``search_query`` is None,
        when ``filter_class`` is missing or not an accepted value, or when a
        search term carrying a value has no ``index``.
    """
    def regex(field, value):
        # Case-insensitive regex clause on a single field.
        return {field: {"$regex": value, "$options": 'i'}}

    def any_field(fields, words):
        # One clause per (word, field) pair, grouped word by word.
        return [regex(field, word) for word in words for field in fields]

    def add_group(operator, clauses):
        # MongoDB rejects "$and"/"$or" with an empty list, so an index whose
        # value contains no words contributes nothing to the query.
        if clauses:
            search_indexes.append({operator: clauses})

    if not corpus:
        corpus = default_corpus
    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    accepted_classes = ["Document", "Agent", "Person", "OrgUnit", "Event", "Family"]
    collection = None
    if "filter_class" not in search_query or search_query["filter_class"] not in accepted_classes:
        raise exceptions.metajsonprc_error(40)
    elif search_query["filter_class"] == "Document":
        collection = DOCUMENTS
    # NOTE(review): the other accepted classes have no collection mapping yet
    # (an AGENTS mapping was sketched but disabled), so `collection` stays
    # None for them and the lookup below presumably fails - confirm.

    # other filters
    # TODO: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        filter_date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": filter_date_end}})
    if "filter_date_begin" in search_query:
        filter_date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": filter_date_begin}})
    if "filter_languages" in search_query:
        filter_query.append({"languages": {"$in": search_query["filter_languages"]}})
    if "filter_types" in search_query:
        filter_query.append({"rec_type": {"$in": search_query["filter_types"]}})
    if "filter_status" in search_query:
        # "private", "pending", "rejected", "published", "deleted"
        filter_query.append({"rec_status": {"$in": search_query["filter_status"]}})

    # search_terms
    # TODO: per-term operators ("or", "and", "not") are accepted in the query
    # but not applied yet; every term is AND-combined below. Intended
    # semantics: a and b or c -> or(and(a,b),c); a or b and c -> and(or(a,b),c)
    title_fields = ["is_part_ofs.title", "is_part_ofs.is_part_ofs.title"]
    creator_fields = [
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
    ]
    publisher_fields = [
        "publishers",
        "is_part_ofs.publishers",
        "is_part_ofs.is_part_ofs.publishers",
    ]
    all_fields = (
        ["rec_id", "identifiers.value", "title", "title_sub", "publishers"]
        + title_fields
        + creator_fields
        + ["rec_type"]
    )

    search_indexes = []
    if "search_terms" in search_query:
        for search_term in search_query["search_terms"]:
            # value
            if "value" not in search_term or search_term["value"] is None:
                # NOTE(review): a term without a value stops processing of
                # all remaining terms, not just this one - confirm intended.
                break
            value = search_term["value"]
            # split value into words on commas and whitespace
            words = value.replace(",", " ").split()

            # index
            if "index" not in search_term:
                raise exceptions.metajsonprc_error(40)
            index = search_term["index"]

            if index == "all":
                add_group("$or", any_field(all_fields, words))
            elif index == "identifier":
                # Try the value (not the index name) as a MongoDB ObjectId;
                # anything else is matched against the record identifiers.
                try:
                    obid = ObjectId(value)
                except (InvalidId, TypeError):
                    search_indexes.append({"$or": [
                        regex("rec_id", value),
                        regex("identifiers.value", value),
                    ]})
                else:
                    search_indexes.append({"_id": obid})
            elif index == "title":
                # every word must appear in the title
                add_group("$and", any_field(["title"], words))
            elif index == "is_part_of":
                add_group("$or", any_field(title_fields, words))
            elif index == "creator":
                add_group("$or", any_field(creator_fields, words))
            elif index == "creator_id":
                search_indexes.append({"creators.agent.rec_id": value})
            elif index == "affiliation":
                search_indexes.append(regex("creators.affiliation.name", value))
            elif index == "affiliation_id":
                search_indexes.append({"creators.affiliation.rec_id": value})
            elif index == "publisher":
                add_group("$or", any_field(publisher_fields, words))
            elif index == "keyword":
                search_indexes.append(regex("keywords.value", value))
            elif index == "classification":
                search_indexes.append(regex("classifications.value", value))
            elif index == "research_area":
                search_indexes.append(regex("research_areas.value", value))
            elif index == "subject":
                search_indexes.append(regex("subjects.value", value))
            elif index == "set":
                search_indexes.append(regex("sets.value", value))

    # result_sorts : how to with this index ? ...
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes
    mongo_args = filter_query + search_indexes

    # Generate the mongo query
    if mongo_args:
        mongo_query = {"$and": mongo_args}
    else:
        # search all
        mongo_query = {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(mongo_query).sort(sort)
    logging.debug(mongo_response)

    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query
    return search_response