def test_search():
    """Run a sample structured search and print the query and its result.

    This is a manual smoke test: it calls repository_service.search on the
    default corpus (corpus argument None) and prints what comes back; it
    makes no assertions.
    """
    search_query = {
        "filter_class": "Document",
        "filter_date_begin": "2010",
        "filter_date_end": "2013",
        "filter_languages": ["en", "fr"],
        "filter_types": ["Book", "BookPart"],
        "rec_class": "SearchQuery",
        "rec_metajson": 1,
        "result_batch_size": 100,
        "result_bibliographic_styles": ["mla"],
        "result_offset": 0,
        "result_sorts": [{"field": "rec_type", "order": "asc"}],
        "search_terms": [
            {"index": "title", "operator": "and", "value": "Cheyenne"},
            {"index": "title", "operator": "or", "value": "technique"},
        ],
    }

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("search_query:")
    print(jsonbson.dumps_json(search_query, True))

    search_result = repository_service.search(None, search_query)
    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
def load_dict(meta_dict):
    """Wrap a raw metadata dict in the class named by its "rec_class" entry.

    Args:
        meta_dict: dict-like record; its optional "rec_class" entry selects
            the wrapper class.

    Returns:
        An instance of the class matching "rec_class" (Document, Person,
        Orgunit, Project, Event, Family, Field, Resource, Target, Type or
        Collection), built from meta_dict. A Common instance is returned
        when "rec_class" is missing or not one of those names; the unknown
        case is also logged.
    """
    if "rec_class" not in meta_dict:
        return Common(meta_dict)

    rec_class = meta_dict["rec_class"]
    if rec_class == "Document":
        return Document(meta_dict)
    elif rec_class == "Person":
        return Person(meta_dict)
    elif rec_class == "Orgunit":
        return Orgunit(meta_dict)
    elif rec_class == "Project":
        return Project(meta_dict)
    elif rec_class == "Event":
        return Event(meta_dict)
    elif rec_class == "Family":
        return Family(meta_dict)
    elif rec_class == "Field":
        return Field(meta_dict)
    elif rec_class == "Resource":
        return Resource(meta_dict)
    elif rec_class == "Target":
        return Target(meta_dict)
    elif rec_class == "Type":
        return Type(meta_dict)
    elif rec_class == "Collection":
        return Collection(meta_dict)

    # Unknown class name: log the record and fall back to the generic wrapper.
    logging.debug(jsonbson.dumps_bson(meta_dict))
    # The placeholder must be the digit zero; the previous "{O}" (letter O)
    # made str.format raise KeyError instead of logging the warning.
    logging.warning("Unknown rec_class: {0}".format(rec_class))
    return Common(meta_dict)
def test_search():
    """Run a sample structured search and print the query and its result.

    This is a manual smoke test: it calls repository_service.search on the
    default corpus (corpus argument None) and prints what comes back; it
    makes no assertions.
    """
    search_query = {
        "filter_class": "Document",
        "filter_date_begin": "2010",
        "filter_date_end": "2013",
        "filter_languages": ["en", "fr"],
        "filter_types": ["Book", "BookPart"],
        "rec_class": "SearchQuery",
        "rec_metajson": 1,
        "result_batch_size": 100,
        "result_bibliographic_styles": ["mla"],
        "result_offset": 0,
        "result_sorts": [{"field": "rec_type", "order": "asc"}],
        "search_terms": [
            {"index": "title", "operator": "and", "value": "Cheyenne"},
            {"index": "title", "operator": "or", "value": "technique"},
        ],
    }

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("search_query:")
    print(jsonbson.dumps_json(search_query, True))

    search_result = repository_service.search(None, search_query)
    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
def test_search_mongo():
    """Run a sample raw MongoDB query and print its result.

    This is a manual smoke test: it calls repository_service.search_mongo on
    the default corpus (corpus argument None) and prints what comes back; it
    makes no assertions.
    """
    # Title contains "Cheyenne" or "technique", case-insensitively.
    title_matches = {
        "$or": [
            {"title": {"$options": "i", "$regex": "Cheyenne"}},
            {"title": {"$options": "i", "$regex": "technique"}},
        ]
    }
    # ... and the publisher contains "press". The title-only query used to be
    # assigned to mongo_query and immediately overwritten; it is now reused
    # as a sub-clause instead of being a dead store.
    mongo_query = {
        "$and": [
            title_matches,
            {"publishers": {"$regex": "press", "$options": 'i'}},
        ]
    }

    search_result = repository_service.search_mongo(None, mongo_query)
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
def write_json(item, output_file_path):
    """Serialise item with jsonbson.dumps_bson and write it to a file.

    The file at output_file_path is opened for writing before the item is
    serialised, so it is created (or emptied) even when nothing ends up
    being written. Nothing is written when the serialised dump is falsy.

    Args:
        item: object handed to jsonbson.dumps_bson.
        output_file_path: path of the file to write.
    """
    with open(output_file_path, "w") as handle:
        serialized = jsonbson.dumps_bson(item, True)
        if not serialized:
            return
        handle.write(serialized)
def print_document(document):
    """Log document, serialised with jsonbson.dumps_bson, at INFO level."""
    serialized = jsonbson.dumps_bson(document)
    message = "document : {}".format(serialized)
    logging.info(message)
def _case_insensitive_regex(field, value):
    """Build a clause matching value as a case-insensitive regex on field."""
    return {field: {"$regex": value, "$options": 'i'}}


def _any_word_clause(fields, words):
    """Build an "$or" over every (word, field) pair; None when there are no words."""
    alternatives = [
        _case_insensitive_regex(field, word)
        for word in words
        for field in fields
    ]
    if not alternatives:
        # MongoDB rejects an empty "$or" array, so there is no clause to add.
        return None
    return {"$or": alternatives}


def _search_term_clause(index, value, words):
    """Translate one search term into a MongoDB clause; None when it adds nothing.

    index is the term's "index" name, value its raw "value" string and
    words that value split on commas and whitespace. Unknown index names
    are ignored (None is returned).
    """
    if index == "all":
        return _any_word_clause(
            (
                "rec_id",
                "identifiers.value",
                "title",
                "title_sub",
                "publishers",
                "is_part_ofs.title",
                "is_part_ofs.is_part_ofs.title",
                "creators.agent.name_family",
                "creators.agent.name_given",
                "creators.agent.name",
                "creators.agent.title",
                "rec_type",
            ),
            words,
        )
    elif index == "identifier":
        try:
            # Fixed: this used to convert search_term["index"], i.e. the
            # literal "identifier", which is never a valid ObjectId, so the
            # "_id" lookup could never be reached.
            return {"_id": ObjectId(value)}
        except (InvalidId, TypeError):
            return {
                "$or": [
                    _case_insensitive_regex("rec_id", value),
                    _case_insensitive_regex("identifiers.value", value),
                ]
            }
    elif index == "title":
        if not words:
            # MongoDB rejects an empty "$and" array.
            return None
        # Every word must appear in the title.
        return {"$and": [_case_insensitive_regex("title", word) for word in words]}
    elif index == "is_part_of":
        return _any_word_clause(
            ("is_part_ofs.title", "is_part_ofs.is_part_ofs.title"), words)
    elif index == "creator":
        return _any_word_clause(
            (
                "creators.agent.name_family",
                "creators.agent.name_given",
                "creators.agent.name",
                "creators.agent.title",
            ),
            words,
        )
    elif index == "creator_id":
        return {"creators.agent.rec_id": value}
    elif index == "affiliation":
        return _case_insensitive_regex("creators.affiliation.name", value)
    elif index == "affiliation_id":
        return {"creators.affiliation.rec_id": value}
    elif index == "publisher":
        return _any_word_clause(
            (
                "publishers",
                "is_part_ofs.publishers",
                "is_part_ofs.is_part_ofs.publishers",
            ),
            words,
        )
    elif index == "keyword":
        return _case_insensitive_regex("keywords.value", value)
    elif index == "classification":
        return _case_insensitive_regex("classifications.value", value)
    elif index == "research_area":
        return _case_insensitive_regex("research_areas.value", value)
    elif index == "subject":
        return _case_insensitive_regex("subjects.value", value)
    elif index == "set":
        return _case_insensitive_regex("sets.value", value)
    return None


def search(corpus, search_query):
    """Run a structured search against the document collection of a corpus.

    The search_query dict is translated into one MongoDB query: every
    "filter_*" entry and every entry of "search_terms" becomes a clause, and
    all clauses are combined with "$and" (an empty query matches everything).

    Recognised search_query keys:
        filter_class: required; one of "Document", "Agent", "Person",
            "OrgUnit", "Event", "Family".
        filter_date_begin, filter_date_end: parsed with
            date_service.parse_date and compared against "date_sort".
        filter_languages, filter_types, filter_status: lists matched with
            "$in" against "languages", "rec_type" and "rec_status".
        search_terms: list of dicts with "index" and "value" entries.

    Current limitations: each term's "operator" is ignored, as are
    "result_sorts", "result_offset" and "result_batch_size"; results are
    always sorted by title then rec_type and returned in one batch.

    Args:
        corpus: corpus identifier; default_corpus is used when falsy.
        search_query: dict-like query as described above.

    Returns:
        The SearchResponse filled with "records", "records_total_count",
        "result_batch_size", "result_offset" and "search_query".

    Raises:
        The error built by exceptions.metajsonprc_error(40) when
        search_query is None, when "filter_class" is missing or not in the
        accepted list, or when a search term has no "index".
    """
    if not corpus:
        corpus = default_corpus
    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    accepted_classes = ["Document", "Agent", "Person", "OrgUnit", "Event", "Family"]
    collection = None
    if "filter_class" not in search_query or search_query["filter_class"] not in accepted_classes:
        raise exceptions.metajsonprc_error(40)
    elif search_query["filter_class"] == "Document":
        collection = DOCUMENTS
    # NOTE(review): the other accepted classes pass validation but are not
    # mapped to a collection yet, so collection stays None for them.

    # other filters
    # todo: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        filter_date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": filter_date_end}})
    if "filter_date_begin" in search_query:
        filter_date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": filter_date_begin}})
    # filter_status values: "private", "pending", "rejected", "published", "deleted"
    for query_key, field in (("filter_languages", "languages"),
                             ("filter_types", "rec_type"),
                             ("filter_status", "rec_status")):
        if query_key in search_query:
            filter_query.append({field: {"$in": search_query[query_key]}})

    # search_terms
    search_indexes = []
    if "search_terms" in search_query:
        for search_term in search_query["search_terms"]:
            if "value" not in search_term or search_term["value"] is None:
                # NOTE(review): a term without a value stops the processing
                # of all remaining terms (break, not continue) - confirm
                # this is intended.
                break
            # split value on commas and whitespace
            words = search_term["value"].replace(",", " ").split()
            if "index" not in search_term:
                raise exceptions.metajsonprc_error(40)
            clause = _search_term_clause(
                search_term["index"], search_term["value"], words)
            if clause is not None:
                search_indexes.append(clause)
            # TODO: honour search_term["operator"] ("or", "and", "not");
            # intended grouping: a and b or c -> or(and(a,b),c),
            #                    a or b and c -> and(or(a,b),c).

    # TODO: honour result_sorts instead of this fixed ordering.
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes into the mongo query
    mongo_args = filter_query + search_indexes
    if mongo_args:
        mongo_query = {"$and": mongo_args}
    else:
        # search all
        mongo_query = {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(mongo_query).sort(sort)
    logging.debug(mongo_response)

    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query
    return search_response
def test_search_mongo():
    """Run a sample raw MongoDB query and print its result.

    This is a manual smoke test: it calls repository_service.search_mongo on
    the default corpus (corpus argument None) and prints what comes back; it
    makes no assertions.
    """
    # Title contains "Cheyenne" or "technique", case-insensitively.
    title_matches = {
        "$or": [
            {"title": {"$options": "i", "$regex": "Cheyenne"}},
            {"title": {"$options": "i", "$regex": "technique"}},
        ]
    }
    # ... and the publisher contains "press". The title-only query used to be
    # assigned to mongo_query and immediately overwritten; it is now reused
    # as a sub-clause instead of being a dead store.
    mongo_query = {
        "$and": [
            title_matches,
            {"publishers": {"$regex": "press", "$options": 'i'}},
        ]
    }

    search_result = repository_service.search_mongo(None, mongo_query)
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
def _case_insensitive_regex(field, value):
    """Build a clause matching value as a case-insensitive regex on field."""
    return {field: {"$regex": value, "$options": 'i'}}


def _any_word_clause(fields, words):
    """Build an "$or" over every (word, field) pair; None when there are no words."""
    alternatives = [
        _case_insensitive_regex(field, word)
        for word in words
        for field in fields
    ]
    if not alternatives:
        # MongoDB rejects an empty "$or" array, so there is no clause to add.
        return None
    return {"$or": alternatives}


def _search_term_clause(index, value, words):
    """Translate one search term into a MongoDB clause; None when it adds nothing.

    index is the term's "index" name, value its raw "value" string and
    words that value split on commas and whitespace. Unknown index names
    are ignored (None is returned).
    """
    if index == "all":
        return _any_word_clause(
            (
                "rec_id",
                "identifiers.value",
                "title",
                "title_sub",
                "publishers",
                "is_part_ofs.title",
                "is_part_ofs.is_part_ofs.title",
                "creators.agent.name_family",
                "creators.agent.name_given",
                "creators.agent.name",
                "creators.agent.title",
                "rec_type",
            ),
            words,
        )
    elif index == "identifier":
        try:
            # Fixed: this used to convert search_term["index"], i.e. the
            # literal "identifier", which is never a valid ObjectId, so the
            # "_id" lookup could never be reached.
            return {"_id": ObjectId(value)}
        except (InvalidId, TypeError):
            return {
                "$or": [
                    _case_insensitive_regex("rec_id", value),
                    _case_insensitive_regex("identifiers.value", value),
                ]
            }
    elif index == "title":
        if not words:
            # MongoDB rejects an empty "$and" array.
            return None
        # Every word must appear in the title.
        return {"$and": [_case_insensitive_regex("title", word) for word in words]}
    elif index == "is_part_of":
        return _any_word_clause(
            ("is_part_ofs.title", "is_part_ofs.is_part_ofs.title"), words)
    elif index == "creator":
        return _any_word_clause(
            (
                "creators.agent.name_family",
                "creators.agent.name_given",
                "creators.agent.name",
                "creators.agent.title",
            ),
            words,
        )
    elif index == "creator_id":
        return {"creators.agent.rec_id": value}
    elif index == "affiliation":
        return _case_insensitive_regex("creators.affiliation.name", value)
    elif index == "affiliation_id":
        return {"creators.affiliation.rec_id": value}
    elif index == "publisher":
        return _any_word_clause(
            (
                "publishers",
                "is_part_ofs.publishers",
                "is_part_ofs.is_part_ofs.publishers",
            ),
            words,
        )
    elif index == "keyword":
        return _case_insensitive_regex("keywords.value", value)
    elif index == "classification":
        return _case_insensitive_regex("classifications.value", value)
    elif index == "research_area":
        return _case_insensitive_regex("research_areas.value", value)
    elif index == "subject":
        return _case_insensitive_regex("subjects.value", value)
    elif index == "set":
        return _case_insensitive_regex("sets.value", value)
    return None


def search(corpus, search_query):
    """Run a structured search against the document collection of a corpus.

    The search_query dict is translated into one MongoDB query: every
    "filter_*" entry and every entry of "search_terms" becomes a clause, and
    all clauses are combined with "$and" (an empty query matches everything).

    Recognised search_query keys:
        filter_class: required; one of "Document", "Agent", "Person",
            "OrgUnit", "Event", "Family".
        filter_date_begin, filter_date_end: parsed with
            date_service.parse_date and compared against "date_sort".
        filter_languages, filter_types, filter_status: lists matched with
            "$in" against "languages", "rec_type" and "rec_status".
        search_terms: list of dicts with "index" and "value" entries.

    Current limitations: each term's "operator" is ignored, as are
    "result_sorts", "result_offset" and "result_batch_size"; results are
    always sorted by title then rec_type and returned in one batch.

    Args:
        corpus: corpus identifier; default_corpus is used when falsy.
        search_query: dict-like query as described above.

    Returns:
        The SearchResponse filled with "records", "records_total_count",
        "result_batch_size", "result_offset" and "search_query".

    Raises:
        The error built by exceptions.metajsonprc_error(40) when
        search_query is None, when "filter_class" is missing or not in the
        accepted list, or when a search term has no "index".
    """
    if not corpus:
        corpus = default_corpus
    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    accepted_classes = ["Document", "Agent", "Person", "OrgUnit", "Event", "Family"]
    collection = None
    if "filter_class" not in search_query or search_query["filter_class"] not in accepted_classes:
        raise exceptions.metajsonprc_error(40)
    elif search_query["filter_class"] == "Document":
        collection = DOCUMENTS
    # NOTE(review): the other accepted classes pass validation but are not
    # mapped to a collection yet, so collection stays None for them.

    # other filters
    # todo: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        filter_date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": filter_date_end}})
    if "filter_date_begin" in search_query:
        filter_date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": filter_date_begin}})
    # filter_status values: "private", "pending", "rejected", "published", "deleted"
    for query_key, field in (("filter_languages", "languages"),
                             ("filter_types", "rec_type"),
                             ("filter_status", "rec_status")):
        if query_key in search_query:
            filter_query.append({field: {"$in": search_query[query_key]}})

    # search_terms
    search_indexes = []
    if "search_terms" in search_query:
        for search_term in search_query["search_terms"]:
            if "value" not in search_term or search_term["value"] is None:
                # NOTE(review): a term without a value stops the processing
                # of all remaining terms (break, not continue) - confirm
                # this is intended.
                break
            # split value on commas and whitespace
            words = search_term["value"].replace(",", " ").split()
            if "index" not in search_term:
                raise exceptions.metajsonprc_error(40)
            clause = _search_term_clause(
                search_term["index"], search_term["value"], words)
            if clause is not None:
                search_indexes.append(clause)
            # TODO: honour search_term["operator"] ("or", "and", "not");
            # intended grouping: a and b or c -> or(and(a,b),c),
            #                    a or b and c -> and(or(a,b),c).

    # TODO: honour result_sorts instead of this fixed ordering.
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes into the mongo query
    mongo_args = filter_query + search_indexes
    if mongo_args:
        mongo_query = {"$and": mongo_args}
    else:
        # search all
        mongo_query = {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(mongo_query).sort(sort)
    logging.debug(mongo_response)

    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query
    return search_response