Example #1
0
 def _process_doc(self, collection, doc):
     """Prepare a single document and index it into Elasticsearch.

     The document is modified in place: ``_id`` is removed (and used as the
     Elasticsearch document id), ``UnitHeaderDMSoundex`` is dropped when
     present, empty ``Header`` values are replaced by a placeholder and
     lower-cased ``<lang>_lc`` variants are added to the header.

     :param collection: collection name, used as the Elasticsearch doc type.
     :param doc: the document (dict) to index; must contain ``_id`` and
         ``Header``.
     :returns: the response of ``app.es.index``, or ``None`` when indexing
         failed with a serialization error (after the retry) or a request
         error. Failures are logged rather than raised.
     :raises KeyError: if ``doc`` has no ``_id`` or no ``Header``.
     """
     import logging
     log = logging.getLogger(__name__)

     _id = doc.pop('_id')
     # not every document carries the soundex field, so don't fail on it
     doc.pop('UnitHeaderDMSoundex', None)
     # un null the fields that are used for completion
     if collection in ('places', 'familyNames'):
         self._add_phonetics(doc)
     # fill empty headers as es completion fails on null values
     header = doc['Header']
     for lang in ('En', 'He'):
         if not header.get(lang):
             header[lang] = '1234567890'
         header["{}_lc".format(lang)] = header[lang].lower()
     res = None
     try:
         res = app.es.index(index=self.es_index_name,
                            doc_type=collection,
                            id=_id,
                            body=doc)
     except elasticsearch.exceptions.SerializationError:
         # UUID fields are causing es to crash, turn them to strings
         uuids_to_str(doc)
         try:
             res = app.es.index(index=self.es_index_name,
                                doc_type=collection,
                                id=_id,
                                body=doc)
         except elasticsearch.exceptions.SerializationError:
             # log instead of dropping into a debugger, which would hang
             # any non-interactive run
             log.exception("Elasticsearch serialization failed for %s:%s",
                           collection, _id)
     except elasticsearch.exceptions.RequestError:
         log.exception("Elasticsearch rejected document %s:%s",
                       collection, _id)
     return res
Example #2
0
 def _process_doc(self, collection, doc):
     """Prepare a single document and index it into Elasticsearch.

     The document is modified in place: ``_id`` is removed (and used as the
     Elasticsearch document id), ``UnitHeaderDMSoundex`` is dropped when
     present, empty ``Header`` values are replaced by a placeholder and
     lower-cased ``<lang>_lc`` variants are added to the header.

     :param collection: collection name, used as the Elasticsearch doc type.
     :param doc: the document (dict) to index; must contain ``_id`` and
         ``Header``.
     :returns: the response of ``app.es.index``, or ``None`` when indexing
         failed with a serialization error (after the retry) or a request
         error. Failures are logged rather than raised.
     :raises KeyError: if ``doc`` has no ``_id`` or no ``Header``.
     """
     import logging
     log = logging.getLogger(__name__)

     _id = doc.pop('_id')
     # not every document carries the soundex field, so don't fail on it
     doc.pop('UnitHeaderDMSoundex', None)
     # un null the fields that are used for completion
     if collection in ('places', 'familyNames'):
         self._add_phonetics(doc)
     # fill empty headers as es completion fails on null values
     header = doc['Header']
     for lang in ('En', 'He'):
         if not header.get(lang):
             header[lang] = '1234567890'
         header["{}_lc".format(lang)] = header[lang].lower()
     res = None
     try:
         res = app.es.index(index=self.es_index_name, doc_type=collection, id=_id, body=doc)
     except elasticsearch.exceptions.SerializationError:
         # UUID fields are causing es to crash, turn them to strings
         uuids_to_str(doc)
         try:
             res = app.es.index(index=self.es_index_name, doc_type=collection, id=_id, body=doc)
         except elasticsearch.exceptions.SerializationError:
             # log instead of dropping into a debugger, which would hang
             # any non-interactive run
             log.exception("Elasticsearch serialization failed for %s:%s", collection, _id)
     except elasticsearch.exceptions.RequestError:
         log.exception("Elasticsearch rejected document %s:%s", collection, _id)
     return res
Example #3
0
def update_es(collection_name, doc, is_new, es_index_name=None, es=None, app=None):
    """Index or update a document in Elasticsearch if it may be shown publicly.

    A deep copy of ``doc`` is adjusted for Elasticsearch (the caller's
    document is left untouched) and then sent with ``es.index`` when
    ``is_new`` is true, or ``es.update`` otherwise.

    :param collection_name: collection name, used as the Elasticsearch doc type.
    :param doc: the source document (dict).
    :param is_new: true to index a new document, false to update an existing one.
    :param es_index_name: index name; defaults to ``app.es_data_db_index_name``.
    :param es: Elasticsearch client; defaults to ``app.es``.
    :param app: application object; defaults to ``current_app``.
    :returns: a ``(True, message)`` tuple describing what was done.
    :raises KeyError: for a persons document whose stripped ``SEX`` value is
        not one of the known values.
    """
    app = app if app else current_app
    es_index_name = es_index_name if es_index_name else app.es_data_db_index_name
    es = es if es else app.es
    # index only the docs that are publicly available
    if not doc_show_filter(collection_name, doc):
        return True, "item should not be shown - so not indexed"
    is_person = collection_name == "persons"
    body = deepcopy(doc)
    # adjust attributes for elasticsearch
    if is_person:
        body["person_id"] = body.get("id", body.get("ID"))
        body["first_name_lc"] = body["name_lc"][0]
        body["last_name_lc"] = body["name_lc"][1]
        # maps all known SEX values to normalized gender value;
        # a missing or None SEX is treated like an empty string (unknown)
        sex = (body.get("SEX") or "").strip()
        body["gender"] = {"F": "F", "M": "M",
                          "": "U", "U": "U", "?": "U", "P": "U"}[sex]
    # _id field is internal to mongo
    body.pop('_id', None)
    # id field has special meaning in elasticsearch
    body.pop('id', None)
    if "thumbnail" in body and "data" in body["thumbnail"]:
        # no need to have thumbnail data in elasticsearch
        # TODO: ensure we only store and use thumbnail from filesystem
        del body["thumbnail"]["data"]
    # persons collection gets a fake header to support searching
    if is_person:
        name = " ".join(body["name"]) if isinstance(body["name"], list) else body["name"]
        body["Header"] = {"En": name, "He": name}
    # elasticsearch uses the header for completion field
    # this field does not support empty values, so missing, None and empty
    # headers are replaced with a placeholder string
    # this is most likely wrong, but works for now
    # TODO: figure out how to handle it properly, maybe items without header are invalid?
    if "Header" in body:
        for lang in ("He", "En"):
            if not body["Header"].get(lang):
                body["Header"][lang] = '_'
    if is_person:
        doc_id = "{}_{}_{}".format(body["tree_num"], body["tree_version"], body["person_id"])
    else:
        doc_id = get_doc_id(collection_name, body)
    # UUID values break serialization on both the insert and the update path
    uuids_to_str(body)
    if is_new:
        es.index(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
        return True, "indexed successfully (inserted)"
    # NOTE(review): the Elasticsearch update API normally expects the partial
    # document wrapped as {"doc": ...} - confirm the raw body is intended
    es.update(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
    return True, "indexed successfully (updated)"
Example #4
0
def update_es(collection_name, doc, is_new, es_index_name=None, es=None, app=None):
    """Index or update a document in Elasticsearch if it may be shown publicly.

    A deep copy of ``doc`` is adjusted for Elasticsearch (the caller's
    document is left untouched) and then sent with ``es.index`` when
    ``is_new`` is true, or ``es.update`` otherwise.

    :param collection_name: collection name, used as the Elasticsearch doc type.
    :param doc: the source document (dict).
    :param is_new: true to index a new document, false to update an existing one.
    :param es_index_name: index name; defaults to ``app.es_data_db_index_name``.
    :param es: Elasticsearch client; defaults to ``app.es``.
    :param app: application object; defaults to ``current_app``.
    :returns: a ``(True, message)`` tuple describing what was done.
    :raises KeyError: for a persons document whose stripped ``SEX`` value is
        not one of the known values.
    """
    app = app if app else current_app
    es_index_name = es_index_name if es_index_name else app.es_data_db_index_name
    es = es if es else app.es
    # index only the docs that are publicly available
    if not doc_show_filter(collection_name, doc):
        return True, "item should not be shown - so not indexed"
    is_person = collection_name == "persons"
    body = deepcopy(doc)
    # adjust attributes for elasticsearch
    if is_person:
        body["person_id"] = body.get("id", body.get("ID"))
        body["first_name_lc"] = body["name_lc"][0]
        body["last_name_lc"] = body["name_lc"][1]
        # maps all known SEX values to normalized gender value;
        # a missing or None SEX is treated like an empty string (unknown)
        sex = (body.get("SEX") or "").strip()
        body["gender"] = {"F": "F", "M": "M",
                          "": "U", "U": "U", "?": "U", "P": "U"}[sex]
    # _id field is internal to mongo
    body.pop('_id', None)
    # id field has special meaning in elasticsearch
    body.pop('id', None)
    if "thumbnail" in body and "data" in body["thumbnail"]:
        # no need to have thumbnail data in elasticsearch
        # TODO: ensure we only store and use thumbnail from filesystem
        del body["thumbnail"]["data"]
    # persons collection gets a fake header to support searching
    if is_person:
        name = " ".join(body["name"]) if isinstance(body["name"], list) else body["name"]
        body["Header"] = {"En": name, "He": name}
    # elasticsearch uses the header for completion field
    # this field does not support empty values, so missing, None and empty
    # headers are replaced with a placeholder string
    # this is most likely wrong, but works for now
    # TODO: figure out how to handle it properly, maybe items without header are invalid?
    if "Header" in body:
        for lang in ("He", "En"):
            if not body["Header"].get(lang):
                body["Header"][lang] = '_'
    if is_person:
        doc_id = "{}_{}_{}".format(body["tree_num"], body["tree_version"], body["person_id"])
    else:
        doc_id = get_doc_id(collection_name, body)
    # UUID values break serialization on both the insert and the update path
    uuids_to_str(body)
    if is_new:
        es.index(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
        return True, "indexed successfully (inserted)"
    # NOTE(review): the Elasticsearch update API normally expects the partial
    # document wrapped as {"doc": ...} - confirm the raw body is intended
    es.update(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
    return True, "indexed successfully (updated)"
Example #5
0
def es_mlt_search(index_name, doc, doc_fields, target_collection, limit):
    '''Build an mlt query for doc, execute it and return the slugs of the hits.

    :param index_name: currently unused; the search runs against
        ``data_db.name``. NOTE(review): confirm whether this should be the
        searched index.
    :param doc: the document (dict) to find similar items for; a shallow
        copy without ``_id`` is embedded in the query.
    :param doc_fields: the fields the mlt query compares.
    :param target_collection: the Elasticsearch doc type to search in.
    :param limit: maximum number of hits to request.
    :returns: a list with the slug of every hit that has one, or ``None``
        when the search returned no hits.
    :raises elasticsearch.exceptions.ConnectionError: re-raised after
        logging when Elasticsearch cannot be reached.
    '''

    clean_doc = doc.copy()
    # the document may come without an _id, don't fail on that
    clean_doc.pop('_id', None)
    query = {
        'query': {
            'mlt': {
                'fields': doc_fields,
                'docs': [{
                    'doc': clean_doc
                }],
            }
        }
    }

    def run_search():
        # query refers to clean_doc, so a retry sees the converted UUIDs
        return es.search(index=data_db.name,
                         doc_type=target_collection,
                         body=query,
                         size=limit)

    try:
        results = run_search()
    except elasticsearch.exceptions.SerializationError:
        # UUID fields are causing es to crash, turn them to strings
        uuids_to_str(clean_doc)
        results = run_search()
    except elasticsearch.exceptions.ConnectionError as e:
        logger.error('Error connecting to Elasticsearch: {}'.format(e.error))
        # bare raise keeps the original traceback
        raise

    hits = results['hits']['hits']
    if not hits:
        return None
    ret = []
    for h in hits:
        source = h['_source']
        try:
            slug = get_item_slug(source)
        except KeyError:
            # use .get() so that reporting the problem can't raise a new
            # KeyError when the source lacks _id or UnitType
            logger.error("couldn't find slug for {},{}".format(
                source.get('_id', h.get('_id')), source.get('UnitType')))
            continue
        ret.append(slug)
    return ret
Example #6
0
def es_mlt_search(index_name, doc, doc_fields, target_collection, limit):
    '''Build an mlt query for doc, execute it and return the slugs of the hits.

    :param index_name: currently unused; the search runs against
        ``data_db.name``. NOTE(review): confirm whether this should be the
        searched index.
    :param doc: the document (dict) to find similar items for; a shallow
        copy without ``_id`` is embedded in the query.
    :param doc_fields: the fields the mlt query compares.
    :param target_collection: the Elasticsearch doc type to search in.
    :param limit: maximum number of hits to request.
    :returns: a list with the slug of every hit that has one, or ``None``
        when the search returned no hits.
    :raises elasticsearch.exceptions.ConnectionError: re-raised after
        logging when Elasticsearch cannot be reached.
    '''

    clean_doc = doc.copy()
    # the document may come without an _id, don't fail on that
    clean_doc.pop('_id', None)
    query = {
        'query': {
            'mlt': {
                'fields': doc_fields,
                'docs': [{'doc': clean_doc}],
            }
        }
    }

    def run_search():
        # query refers to clean_doc, so a retry sees the converted UUIDs
        return es.search(index=data_db.name, doc_type=target_collection, body=query, size=limit)

    try:
        results = run_search()
    except elasticsearch.exceptions.SerializationError:
        # UUID fields are causing es to crash, turn them to strings
        uuids_to_str(clean_doc)
        results = run_search()
    except elasticsearch.exceptions.ConnectionError as e:
        logger.error('Error connecting to Elasticsearch: {}'.format(e.error))
        # bare raise keeps the original traceback
        raise

    hits = results['hits']['hits']
    if not hits:
        return None
    ret = []
    for h in hits:
        source = h['_source']
        try:
            slug = get_item_slug(source)
        except KeyError:
            # use .get() so that reporting the problem can't raise a new
            # KeyError when the source lacks _id or UnitType
            logger.error("couldn't find slug for {},{}".format(source.get('_id', h.get('_id')),
                                                            source.get('UnitType')))
            continue
        ret.append(slug)
    return ret
Example #7
0
def update_es(collection, doc, id):
    """Index a copy of doc (without its ``_id``) into Elasticsearch.

    Does nothing unless ``MIGRATE_ES`` equals ``'1'``. The index name is
    taken from ``current_app.data_db.name``.

    :param collection: collection name, used as the Elasticsearch doc type.
    :param doc: the document (dict) to index; a shallow copy is sent, so
        top-level keys of the caller's dict are not changed.
    :param id: the Elasticsearch document id.
    :returns: ``None``. A serialization failure that persists after the
        UUID conversion is logged, not raised.
    """
    if MIGRATE_ES != '1':
        return

    index_name = current_app.data_db.name
    body = doc.copy()
    body.pop('_id', None)
    try:
        current_app.es.index(index=index_name,
                             doc_type=collection,
                             id=id,
                             body=body)
    except elasticsearch.exceptions.SerializationError:
        # UUID fields are causing es to crash, turn them to strings.
        # Convert and resend the cleaned copy, not the original doc, so the
        # retry does not put the mongo _id back into the indexed body.
        uuids_to_str(body)
        try:
            current_app.es.index(index=index_name,
                                 doc_type=collection,
                                 id=id,
                                 body=body)
        except elasticsearch.exceptions.SerializationError as e:
            current_app.logger.error("Elastic search index failed for {}:{} with {}"
                                     .format(collection, id, e))
Example #8
0
            del doc['UnitHeaderDMSoundex']
            # un null the fields that are used for completion
            if collection in ('places', 'familyNames'):
                add_phonetics(doc)
            # fill empty headers as es completion fails on null values
            header = doc['Header']
            for lang in ('En', 'He'):
                if not header[lang]:
                    header[lang] = '1234567890'
            try:
                res = app.es.index(index=index_name,
                                   doc_type=collection,
                                   id=_id,
                                   body=doc)
            except elasticsearch.exceptions.SerializationError:
                # UUID fields are causing es to crash, turn them to strings
                uuids_to_str(doc)
                try:
                    res = app.es.index(index=index_name,
                                       doc_type=collection,
                                       id=_id,
                                       body=doc)
                except elasticsearch.exceptions.SerializationError as e:
                    import pdb
                    pdb.set_trace()
            except elasticsearch.exceptions.RequestError as e:
                import pdb
                pdb.set_trace()
        finished = datetime.datetime.now()
        print 'Collection {} took {}'.format(collection, finished - started)