def _get_elasticsearch_item_key_query(self, collection_name, item_key):
    """Build the Elasticsearch query clause matching a single item key.

    For the "persons" collection ``item_key`` is unpacked into
    ``(tree_num, tree_version, person_id)`` and the result is a ``bool``
    query whose ``must`` list holds one ``term`` clause per component.
    For every other collection the result is one ``term`` clause on the
    field named by ``get_collection_id_field(collection_name)``.
    """
    if collection_name != "persons":
        id_field = get_collection_id_field(collection_name)
        return {"term": {id_field: item_key}}

    tree_num, tree_version, person_id = item_key
    key_parts = (
        ("tree_num", tree_num),
        ("tree_version", tree_version),
        ("person_id", person_id),
    )
    must_clauses = [{"term": {field: value}} for field, value in key_parts]
    return {"bool": {"must": must_clauses}}
def parse_n_update(row, collection_name):
    """Parse ``row`` into a document, log it and queue its update.

    The row is converted with ``parse_doc``; the value of the collection's
    id field is logged, and the document is handed to ``update_row.delay``.

    Returns the parsed document.
    """
    parsed = parse_doc(row, collection_name)
    key_name = get_collection_id_field(collection_name)
    message = '{}:Updating {}: {}'.format(
        collection_name, key_name, parsed[key_name])
    logger.info(message)
    update_row.delay(parsed, collection_name)
    return parsed
def _get_elasticsearch_item_key_query(self, collection_name, item_key):
    """Return the Elasticsearch query that selects the item with ``item_key``.

    "persons" items are keyed by a 3-item sequence
    ``(tree_num, tree_version, person_id)``; the query requires all three
    fields to match via ``term`` clauses inside ``bool``/``must``.
    Any other collection is matched with a single ``term`` clause on the
    field returned by ``get_collection_id_field(collection_name)``.
    """
    if collection_name == "persons":
        tree_num, tree_version, person_id = item_key
        field_names = ("tree_num", "tree_version", "person_id")
        field_values = (tree_num, tree_version, person_id)
        must = []
        for name, value in zip(field_names, field_values):
            must.append({"term": {name: value}})
        return {"bool": {"must": must}}

    return {"term": {get_collection_id_field(collection_name): item_key}}
def parse_n_update(row, collection_name, dryrun=False):
    """Parse ``row`` into a document, log it and (unless dry-running) queue it.

    The log line carries the collection name, the id field name and value,
    and the document's ``UpdateDate`` (``'?'`` when the key is absent).
    When ``dryrun`` is true the document is only parsed and logged;
    otherwise it is also passed to ``update_row.delay``.

    Returns the parsed document in both cases.
    """
    parsed = parse_doc(row, collection_name)
    key_name = get_collection_id_field(collection_name)
    update_date = parsed.get('UpdateDate', '?')
    message = '{}:Updating {}: {}, updated {}'.format(
        collection_name, key_name, parsed[key_name], update_date)
    logger.info(message)
    if dryrun:
        return parsed
    update_row.delay(parsed, collection_name)
    return parsed
def _get_mongo_items(self, collection_name, key):
    """Fetch the Mongo items of ``collection_name``, optionally by key.

    With a truthy ``key`` only documents whose id field (as named by
    ``get_collection_id_field``) equals ``key`` are requested; otherwise the
    whole collection is requested. The result is passed through
    ``self._limit`` before being returned.

    Raises NotImplementedError when a key is given for "persons".
    """
    if key and collection_name == "persons":
        raise NotImplementedError("persons does not support updating by key yet")

    mongo_collection = self.app.data_db[collection_name]
    if key:
        cursor = mongo_collection.find({get_collection_id_field(collection_name): key})
    else:
        cursor = mongo_collection.find()
    return self._limit(cursor)
def update_doc(collection, document):
    """Store ``document`` in ``collection`` and mirror it to Elasticsearch.

    ``document`` is modified in place before it is stored:

    * for the 'places' collection a ``geometry`` key is set from
      ``get_place_geo(document)``;
    * for the 'persons' collection ``tree_version`` and ``Slug`` are set.
      The version comes from ``find_version`` applied to the tree's versions
      list, which is read from the redis key ``tree_vers_<tree_num>`` or,
      when that is empty, from the ``trees`` collection (and then cached).
      Version 0 is used when the tree cannot be found.

    The document is then passed to ``update_collection`` together with a
    query identifying it, and, when ``MIGRATE_ES == '1'``, to ``update_es``;
    a failure reported by ``update_es`` is logged as an error.

    For non-person collections nothing is stored when ``get_doc_id`` returns
    a falsy id; an error is logged instead.

    Returns None.
    """
    # update place items with geojson
    if collection.name == 'places':
        document['geometry'] = get_place_geo(document)

    # family trees get special treatment
    if collection.name == 'persons':
        tree_num = document['tree_num']
        person_id = document['id']
        tree_key = 'tree_vers_' + str(tree_num)
        query = {'tree_num': tree_num, 'id': person_id}
        cached_vers = current_app.redis.get(tree_key)
        if cached_vers:
            tree_vers = json.loads(cached_vers)
            version = find_version(tree_vers, document['tree_file_id'])
        else:
            tree = current_app.data_db['trees'].find_one({'num': tree_num})
            if tree:
                tree_vers = tree['versions']
                # NOTE(review): the third argument is presumably an expiry
                # in seconds - confirm against the redis client in use.
                current_app.redis.set(tree_key, json.dumps(tree_vers), 300)
                version = find_version(tree_vers, document['tree_file_id'])
            else:
                current_app.logger.info(
                    "didn't find tree number {} using version 0 for {}"
                    .format(tree_num, person_id))
                version = 0
        document['tree_version'] = version
        query['tree_version'] = version
        # we have to create it here as at the moment create_slug function
        # requires Header to create slug
        # TODO: move this logic to create_slug function
        document['Slug'] = {'En': 'person_{};{}.{}'.format(
            tree_num, version, person_id)}
        created = update_collection(collection, query, document)
        if MIGRATE_ES == '1':
            is_ok, msg = update_es(collection.name, document, created)
            if not is_ok:
                current_app.logger.error(msg)
        current_app.logger.info('Updated person: {}.{}'
                                .format(tree_num, person_id))
        return

    doc_id = get_doc_id(collection.name, document)
    if not doc_id:
        current_app.logger.error(
            'update failed because of id {}'.format(collection.name))
        return

    # Look the id field up by collection *name*, the same way get_doc_id and
    # update_es are called here; the collection object itself is not a name.
    query = {get_collection_id_field(collection.name): doc_id}
    created = update_collection(collection, query, document)
    if MIGRATE_ES == '1':
        is_ok, msg = update_es(collection.name, document, created)
        if not is_ok:
            current_app.logger.error(msg)
    slug = document.get("Slug", {}).get("En")
    current_app.logger.info('Updated {} {}, Slug: {}'.format(
        collection.name, doc_id, slug))
def update_doc(collection, document):
    """Store ``document`` in ``collection`` and mirror it to Elasticsearch.

    ``document`` is modified in place before it is stored:

    * 'places' documents get a ``geometry`` key from ``get_place_geo``;
    * 'persons' documents get ``tree_version`` and ``Slug``. The version is
      ``find_version(versions, document['tree_file_id'])`` where the versions
      list is read from the redis key ``tree_vers_<tree_num>`` or, failing
      that, from the ``trees`` collection (and then written back to redis).
      When the tree is not found, version 0 is used and this is logged.

    The document and a query identifying it are passed to
    ``update_collection``; when ``MIGRATE_ES == '1'`` the document is also
    passed to ``update_es`` and any failure message it reports is logged.

    Non-person documents for which ``get_doc_id`` returns a falsy value are
    not stored; an error is logged instead.

    Returns None.
    """
    # update place items with geojson
    if collection.name == 'places':
        document['geometry'] = get_place_geo(document)

    # family trees get special treatment
    if collection.name == 'persons':
        tree_num = document['tree_num']
        person_id = document['id']
        tree_key = 'tree_vers_' + str(tree_num)
        query = {'tree_num': tree_num, 'id': person_id}
        tree_vers = current_app.redis.get(tree_key)
        if tree_vers:
            tree_vers = json.loads(tree_vers)
            version = find_version(tree_vers, document['tree_file_id'])
        else:
            tree = current_app.data_db['trees'].find_one({'num': tree_num})
            if tree:
                tree_vers = tree['versions']
                # NOTE(review): 300 looks like a cache expiry in seconds -
                # confirm against the redis client's ``set`` signature.
                current_app.redis.set(tree_key, json.dumps(tree_vers), 300)
                version = find_version(tree_vers, document['tree_file_id'])
            else:
                current_app.logger.info(
                    "didn't find tree number {} using version 0 for {}".format(
                        tree_num, person_id))
                version = 0
        document['tree_version'] = version
        query['tree_version'] = version
        # we have to create it here as at the moment create_slug function
        # requires Header to create slug
        # TODO: move this logic to create_slug function
        document['Slug'] = {
            'En': 'person_{};{}.{}'.format(tree_num, version, person_id)
        }
        created = update_collection(collection, query, document)
        if MIGRATE_ES == '1':
            is_ok, msg = update_es(collection.name, document, created)
            if not is_ok:
                current_app.logger.error(msg)
        current_app.logger.info(
            'Updated person: {}.{}'.format(tree_num, person_id))
    else:
        doc_id = get_doc_id(collection.name, document)
        if doc_id:
            # The id-field lookup takes the collection *name*, like the
            # get_doc_id and update_es calls in this function do.
            query = {get_collection_id_field(collection.name): doc_id}
            created = update_collection(collection, query, document)
            if MIGRATE_ES == '1':
                is_ok, msg = update_es(collection.name, document, created)
                if not is_ok:
                    current_app.logger.error(msg)
            slug = document.get("Slug", {}).get("En")
            current_app.logger.info('Updated {} {}, Slug: {}'.format(
                collection.name, doc_id, slug))
        else:
            current_app.logger.error('update failed because of id {}'.format(
                collection.name))
def _get_mongo_items(self, collection_name, key):
    """Return the (limited) Mongo items of ``collection_name``.

    A truthy ``key`` restricts the lookup to documents whose id field, as
    named by ``get_collection_id_field``, equals ``key``; without a key the
    whole collection is requested. The result goes through ``self._limit``.

    Raises NotImplementedError for a keyed lookup on "persons".
    """
    if not key:
        found = self.app.data_db[collection_name].find()
    elif collection_name == "persons":
        raise NotImplementedError(
            "persons does not support updating by key yet")
    else:
        found = self.app.data_db[collection_name].find(
            {get_collection_id_field(collection_name): key})
    return self._limit(found)
def _get_elasticsearch_item_key(self, collection_name, es_item):
    """Extract the item key from an Elasticsearch document.

    For "persons" the key is the tuple
    ``(int(tree_num), int(tree_version), str(person_id))``, or None when any
    of the three fields is missing or None. For other collections the key is
    the value of the collection's id field (None when absent).
    """
    if collection_name != "persons":
        id_field = get_collection_id_field(collection_name)
        return es_item.get(id_field, None)

    person_id = es_item.get("person_id", None)
    tree_num = es_item.get("tree_num", None)
    tree_version = es_item.get("tree_version", None)
    if person_id is None or tree_num is None or tree_version is None:
        return None
    return int(tree_num), int(tree_version), str(person_id)
def _get_mongo_item_key(self, collection_name, mongo_item):
    """Extract the item key from a Mongo document.

    For "persons" the key is the tuple
    ``(int(tree_num), int(tree_version), str(person_id))``. The person id is
    read from ``id``; when ``self.args.legacy`` is set and that value is
    falsy, ``ID`` is used instead. The key is None when any of the three
    components is None. For other collections the key is the value of the
    collection's id field (None when absent).
    """
    if collection_name != "persons":
        id_field = get_collection_id_field(collection_name)
        return mongo_item.get(id_field, None)

    person_id = mongo_item.get("id", None)
    if self.args.legacy and not person_id:
        # legacy documents may carry the id under the upper-case key
        person_id = mongo_item.get("ID", None)
    tree_num = mongo_item.get("tree_num", None)
    tree_version = mongo_item.get("tree_version", None)

    if person_id is None or tree_num is None or tree_version is None:
        return None
    return int(tree_num), int(tree_version), str(person_id)
def _get_index_body(self):
    """Build the index-creation body holding the per-collection mappings.

    Every collection in ``SEARCHABLE_COLLECTIONS`` gets a ``Header`` property
    mapped with ``self.header_mapping``. ``familyNames`` additionally gets a
    ``dm_soundex`` completion field. "persons" gets its dedicated search
    fields; every other collection gets its id field mapped as ``keyword``.
    """
    mappings = {}
    for collection in SEARCHABLE_COLLECTIONS:
        mappings[collection] = {
            "properties": {"Header": self.header_mapping},
        }
    body = {"mappings": mappings}

    soundex_context = {
        "name": "collection",
        "type": "CATEGORY",
        "path": "_type",
    }
    body["mappings"]["familyNames"]["properties"]["dm_soundex"] = {
        "type": "completion",
        "max_input_length": 20,
        "contexts": [soundex_context],
    }

    # persons specific mappings
    # ensure all fields relevant for search are properly indexed
    person_properties = {
        "tree_num": {"type": "integer"},
        "tree_version": {"type": "integer"},
        "person_id": {"type": "keyword"},
        "birth_year": {"type": "integer"},
        "death_year": {"type": "integer"},
        "marriage_years": {"type": "integer"},
        # these are updated in bhs_api.item.update_es functions
        "first_name_lc": {"type": "text"},
        "last_name_lc": {"type": "text"},
        "BIRT_PLAC_lc": {"type": "text"},
        "MARR_PLAC_lc": {"type": "text"},
        "DEAT_PLAC_lc": {"type": "text"},
        "gender": {"type": "keyword"},
    }
    for collection_name, mapping in body["mappings"].items():
        properties = mapping["properties"]
        if collection_name == "persons":
            properties.update(person_properties)
            continue
        properties[get_collection_id_field(collection_name)] = {"type": "keyword"}
    return body
def _get_item_log_identifier(self, item_key, collection_name):
    """Format ``item_key`` as a short identifier for log messages.

    "persons" keys are expanded positionally into
    ``(tree_num,version,id=<a>,<b>,<c>)``; keys of other collections are
    rendered as ``<id field>=<item_key>``.
    """
    if collection_name != "persons":
        id_field = get_collection_id_field(collection_name)
        return "{}={}".format(id_field, item_key)
    template = "(tree_num,version,id={},{},{})"
    return template.format(*item_key)
for c_name in SEARCHABLE_COLLECTIONS: if c_name != "persons": # TODO: add support for persons, at the moment it's not working due to the persons not having a single unique id field print("starting work on " + c_name) # in the process we might create duplicate index so remove them for now try: todb[c_name].drop_index('Slug.He_1') except pymongo.errors.OperationFailure: pass try: todb[c_name].drop_index('Slug.En_1') except pymongo.errors.OperationFailure: pass id_field = get_collection_id_field(c_name) # loop on all docs with a slug for from_doc in fromdb[c_name].find({'Slug': {'$exists': True, '$ne': {}}}): to_doc = app.data_db[c_name].find_one( {id_field: from_doc[id_field]}) if not to_doc: print("missing {}".format(get_item_slug(from_doc))) continue if from_doc['Slug'] != to_doc['Slug']: try: todb[c_name].update_one({'_id': to_doc['_id']}, {'$set': {'Slug': from_doc['Slug']} }) except pymongo.errors.DuplicateKeyError as e:
def reslugify(collection, document):
    """Append the document id to every non-empty slug to ensure uniqueness.

    ``document['Slug']`` is modified in place: each truthy value gets
    ``'-' + str(<document id>)`` appended, where the id is read from the
    field named by ``get_collection_id_field(collection.name)``. Empty or
    None slugs are left untouched.
    """
    slugs = document['Slug']
    for lang, slug in slugs.items():
        if not slug:
            continue
        id_field = get_collection_id_field(collection.name)
        suffix = '-' + str(document[id_field])
        slugs[lang] += suffix
app, conf = create_app() fromdb = app.client_data_db[args.fromdb] todb = app.data_db for c_name in SEARCHABLE_COLLECTIONS: print("starting work on " + c_name) # in the process we might create duplicate index so remove them for now try: todb[c_name].drop_index('Slug.He_1') except pymongo.errors.OperationFailure: pass try: todb[c_name].drop_index('Slug.En_1') except pymongo.errors.OperationFailure: pass id_field = get_collection_id_field(c_name) # loop on all docs with a slug for from_doc in fromdb[c_name].find( {'Slug': { '$exists': True, '$ne': {} }}): to_doc = app.data_db[c_name].find_one( {id_field: from_doc[id_field]}) if not to_doc: print("missing {}".format(get_item_slug(from_doc))) continue if from_doc['Slug'] != to_doc['Slug']: try: todb[c_name].update_one( {'_id': to_doc['_id']},
def update_doc(collection, document):
    """Store ``document`` in ``collection``, adding derived fields first.

    ``document`` is modified in place before it is stored:

    * 'places' documents get a ``geometry`` key from ``get_place_geo``;
    * 'persons' documents get ``tree_version`` and ``Slug``. The version is
      ``find_version(versions, document['tree_file_id'])`` where the versions
      list comes from the redis key ``tree_vers_<tree_num>`` or, failing
      that, from the ``trees`` collection (and is then written to redis).
      Version 0 is used when the tree is not found;
    * documents of any other collection get ``_id`` set to the value of the
      collection's id field and are also passed to ``update_es``.

    For non-person collections nothing is stored when the id field is
    missing (an error is logged) or holds a falsy value.

    Returns None.
    """
    # update place items with geojson
    if collection.name == 'places':
        document['geometry'] = get_place_geo(document)

    # family trees get special treatment
    if collection.name == 'persons':
        tree_num = document['tree_num']
        person_id = document['id']
        tree_key = 'tree_vers_' + str(tree_num)
        query = {'tree_num': tree_num, 'id': person_id}
        tree_vers = current_app.redis.get(tree_key)
        if tree_vers:
            tree_vers = json.loads(tree_vers)
            version = find_version(tree_vers, document['tree_file_id'])
        else:
            tree = current_app.data_db['trees'].find_one({'num': tree_num})
            if tree:
                tree_vers = tree['versions']
                # NOTE(review): 300 is presumably a cache expiry in seconds -
                # confirm against the redis client's ``set`` signature.
                current_app.redis.set(tree_key, json.dumps(tree_vers), 300)
                version = find_version(tree_vers, document['tree_file_id'])
            else:
                current_app.logger.info(
                    "didn't find tree number {} using version 0 for {}"
                    .format(tree_num, person_id))
                version = 0
        document['tree_version'] = version
        query['tree_version'] = version
        document['Slug'] = {'En': 'person_{};{}.{}'.format(
            tree_num, version, person_id)}
        update_collection(collection, query, document)
        current_app.logger.info('Updated person: {}.{}'
                                .format(tree_num, person_id))
        return

    # post parsing: add _id and Slug
    doc_id_field = get_collection_id_field(collection.name)
    try:
        doc_id = document[doc_id_field]
    except KeyError:
        current_app.logger.error('update failed because of id {} {}'
                                 .format(collection.name, doc_id_field))
        # Without an id there is nothing to update; returning here avoids
        # reading the unassigned ``doc_id`` below.
        return
    if not doc_id:
        return

    document['_id'] = doc_id
    query = {doc_id_field: doc_id}
    update_collection(collection, query, document)
    update_es(collection.name, document, doc_id)
    try:
        slug = document['Slug']['En']
    except KeyError:
        slug = 'None'
    current_app.logger.info('Updated {} {}: {}, Slug: {}'.format(
        collection.name, doc_id_field, doc_id, slug))