def _process_doc(self, collection, doc):
    """Prepare a single document and index it into elasticsearch.

    The document is modified in place: ``_id`` is removed and used as the
    elasticsearch id, ``UnitHeaderDMSoundex`` is dropped, empty headers are
    replaced with a placeholder and lower-cased ``<lang>_lc`` header
    variants are added.

    :param collection: collection name, used as the elasticsearch doc_type.
    :param doc: the document to index; must contain ``_id`` and ``Header``.
    :returns: the response of ``app.es.index``, or None when indexing
              failed (the failure is logged).
    """
    import logging
    log = logging.getLogger(__name__)

    _id = doc.pop('_id')
    # not every document necessarily carries the soundex field
    doc.pop('UnitHeaderDMSoundex', None)

    # places and family names get phonetics added by self._add_phonetics
    if collection in ('places', 'familyNames'):
        self._add_phonetics(doc)

    # fill empty (or missing) headers as es completion fails on null values
    header = doc['Header']
    for lang in ('En', 'He'):
        if not header.get(lang):
            header[lang] = '1234567890'
        header["{}_lc".format(lang)] = header[lang].lower()

    res = None
    try:
        res = app.es.index(index=self.es_index_name, doc_type=collection,
                           id=_id, body=doc)
    except elasticsearch.exceptions.SerializationError:
        # UUID fields are causing es to crash, turn them to strings
        uuids_to_str(doc)
        try:
            res = app.es.index(index=self.es_index_name,
                               doc_type=collection, id=_id, body=doc)
        except elasticsearch.exceptions.SerializationError:
            # log instead of dropping into a debugger, so unattended runs
            # do not hang on a bad document
            log.exception("Failed to serialize %s:%s for elasticsearch",
                          collection, _id)
    except elasticsearch.exceptions.RequestError:
        log.exception("Elasticsearch rejected %s:%s", collection, _id)
    return res
def update_es(collection_name, doc, is_new, es_index_name=None, es=None, app=None):
    """Index or update a document in elasticsearch.

    Only documents accepted by ``doc_show_filter`` are sent to elasticsearch.
    The given ``doc`` is not modified; a deep copy is adjusted and indexed.

    :param collection_name: collection name, used as the elasticsearch doc_type.
    :param doc: the source document.
    :param is_new: True to index the document, False to update an existing one.
    :param es_index_name: index name; defaults to ``app.es_data_db_index_name``.
    :param es: elasticsearch client; defaults to ``app.es``.
    :param app: application object; defaults to ``current_app``.
    :returns: a ``(True, message)`` tuple describing what was done.
    :raises KeyError: for a persons document with an unknown ``SEX`` value or
                      without the ``name_lc``/``name``/tree fields.
    """
    app = current_app if not app else app
    es_index_name = app.es_data_db_index_name if not es_index_name else es_index_name
    es = app.es if not es else es

    # index only the docs that are publicly available
    if not doc_show_filter(collection_name, doc):
        return True, "item should not be shown - so not indexed"

    body = deepcopy(doc)
    is_person = collection_name == "persons"

    # adjust attributes for elasticsearch
    if is_person:
        body["person_id"] = body.get("id", body.get("ID"))
        body["first_name_lc"] = body["name_lc"][0]
        body["last_name_lc"] = body["name_lc"][1]
        # maps all known SEX values to normalized gender value;
        # a stored None (or a missing field) must not be stripped
        sex = body.get("SEX", "")
        if sex is not None:
            sex = sex.strip()
        body["gender"] = {"F": "F", "M": "M",
                          None: "U", "": "U", "U": "U", "?": "U", "P": "U"}[sex]

    # _id field is internal to mongo
    body.pop('_id', None)
    # id field has special meaning in elasticsearch
    body.pop('id', None)

    thumbnail = body.get("thumbnail")
    if isinstance(thumbnail, dict):
        # no need to have thumbnail data in elasticsearch
        # TODO: ensure we only store and use thumbnail from filesystem
        thumbnail.pop("data", None)

    # persons collection gets a fake header to support searching
    if is_person:
        name = " ".join(body["name"]) if isinstance(body["name"], list) else body["name"]
        body["Header"] = {"En": name, "He": name}

    # elasticsearch uses the header for completion field
    # this field does not support empty values, so we put a "_" placeholder here
    # this is most likely wrong, but works for now
    # TODO: figure out how to handle it properly, maybe items without header are invalid?
    if "Header" in body:
        for lang in ("He", "En"):
            if body["Header"].get(lang) is None:
                body["Header"][lang] = '_'

    if is_person:
        doc_id = "{}_{}_{}".format(body["tree_num"], body["tree_version"], body["person_id"])
    else:
        doc_id = get_doc_id(collection_name, body)

    # UUID values cannot be serialized by the client on either code path
    uuids_to_str(body)
    if is_new:
        es.index(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
        return True, "indexed successfully (inserted)"
    # the update API expects the partial document wrapped in a "doc" key
    es.update(index=es_index_name, doc_type=collection_name, id=doc_id, body={"doc": body})
    return True, "indexed successfully (updated)"
def es_mlt_search(index_name, doc, doc_fields, target_collection, limit):
    '''Build an mlt (more like this) query from ``doc`` and execute it.

    :param index_name: index to search; falls back to ``data_db.name`` when empty.
    :param doc: example document; a shallow copy without ``_id`` is sent.
    :param doc_fields: fields the mlt query compares.
    :param target_collection: doc_type to search in.
    :param limit: maximum number of hits requested.
    :returns: list of slugs (via ``get_item_slug``) of the hits, or None when
              there are no hits. Hits without a slug are logged and skipped.
    :raises elasticsearch.exceptions.ConnectionError: re-raised after logging.
    '''
    clean_doc = doc.copy()
    # the example document may come without an _id
    clean_doc.pop('_id', None)
    query = {
        'query': {
            'mlt': {
                'fields': doc_fields,
                'docs': [{'doc': clean_doc}],
            }
        }
    }
    # honour the index requested by the caller
    index = index_name or data_db.name

    try:
        results = es.search(index=index, doc_type=target_collection,
                            body=query, size=limit)
    except elasticsearch.exceptions.SerializationError:
        # UUID fields are causing es to crash, turn them to strings
        # (query references clean_doc, so the retry sees the converted values)
        uuids_to_str(clean_doc)
        results = es.search(index=index, doc_type=target_collection,
                            body=query, size=limit)
    except elasticsearch.exceptions.ConnectionError as e:
        logger.error('Error connecting to Elasticsearch: {}'.format(e.error))
        raise

    hits = results['hits']['hits']
    if not hits:
        return None

    ret = []
    for h in hits:
        source = h.get('_source', {})
        try:
            slug = get_item_slug(source)
        except KeyError:
            # use .get so that reporting the problem cannot itself raise
            logger.error("couldn't find slug for {},{}".format(
                source.get('_id', h.get('_id')), source.get('UnitType')))
            continue
        ret.append(slug)
    return ret
def es_mlt_search(index_name, doc, doc_fields, target_collection, limit):
    '''Build an mlt (more like this) query from ``doc`` and execute it.

    :param index_name: index to search; falls back to ``data_db.name`` when empty.
    :param doc: example document; a shallow copy without ``_id`` is sent.
    :param doc_fields: fields the mlt query compares.
    :param target_collection: doc_type to search in.
    :param limit: maximum number of hits requested.
    :returns: list of slugs (via ``get_item_slug``) of the hits, or None when
              there are no hits. Hits without a slug are logged and skipped.
    :raises elasticsearch.exceptions.ConnectionError: re-raised after logging.
    '''
    clean_doc = doc.copy()
    # the example document may come without an _id
    clean_doc.pop('_id', None)
    query = {'query': {'mlt': {'fields': doc_fields,
                               'docs': [{'doc': clean_doc}]}}}
    # honour the index requested by the caller
    index = index_name or data_db.name

    try:
        results = es.search(index=index, doc_type=target_collection,
                            body=query, size=limit)
    except elasticsearch.exceptions.SerializationError:
        # UUID fields are causing es to crash, turn them to strings
        # (query references clean_doc, so the retry sees the converted values)
        uuids_to_str(clean_doc)
        results = es.search(index=index, doc_type=target_collection,
                            body=query, size=limit)
    except elasticsearch.exceptions.ConnectionError as e:
        logger.error('Error connecting to Elasticsearch: {}'.format(e.error))
        raise

    hits = results['hits']['hits']
    if not hits:
        return None

    ret = []
    for h in hits:
        source = h.get('_source', {})
        try:
            slug = get_item_slug(source)
        except KeyError:
            # use .get so that reporting the problem cannot itself raise
            logger.error("couldn't find slug for {},{}".format(
                source.get('_id', h.get('_id')), source.get('UnitType')))
            continue
        ret.append(slug)
    return ret
def update_es(collection, doc, id):
    """Index ``doc`` into elasticsearch under the given collection and id.

    Does nothing unless ``MIGRATE_ES`` equals ``'1'``. A shallow copy of the
    document without its ``_id`` field is indexed into the index named after
    ``current_app.data_db``. If the client cannot serialize the body, it is
    passed through ``uuids_to_str`` and indexing is retried once; a second
    serialization failure is logged, not raised.

    :param collection: collection name, used as the elasticsearch doc_type.
    :param doc: the document to index.
    :param id: the elasticsearch document id.
    """
    if MIGRATE_ES != '1':
        return

    index_name = current_app.data_db.name
    body = doc.copy()
    body.pop('_id', None)

    try:
        current_app.es.index(index=index_name, doc_type=collection, id=id,
                             body=body)
    except elasticsearch.exceptions.SerializationError:
        # UUID fields are causing es to crash, turn them to strings.
        # Convert and retry with the cleaned body, not the original doc,
        # so the mongo _id is never sent to elasticsearch.
        uuids_to_str(body)
        try:
            current_app.es.index(index=index_name, doc_type=collection,
                                 id=id, body=body)
        except elasticsearch.exceptions.SerializationError as e:
            current_app.logger.error(
                "Elastic search index failed for {}:{} with {}".format(
                    collection, id, e))
del doc['UnitHeaderDMSoundex'] # un null the fields that are used for completion if collection in ('places', 'familyNames'): add_phonetics(doc) # fill empty headers as es completion fails on null values header = doc['Header'] for lang in ('En', 'He'): if not header[lang]: header[lang] = '1234567890' try: res = app.es.index(index=index_name, doc_type=collection, id=_id, body=doc) except elasticsearch.exceptions.SerializationError: # UUID fields are causing es to crash, turn them to strings uuids_to_str(doc) try: res = app.es.index(index=index_name, doc_type=collection, id=_id, body=doc) except elasticsearch.exceptions.SerializationError as e: import pdb pdb.set_trace() except elasticsearch.exceptions.RequestError as e: import pdb pdb.set_trace() finished = datetime.datetime.now() print 'Collection {} took {}'.format(collection, finished - started)