def get_bad_headers(collection_obj):
    """Return the headers of docs whose English header contains Hebrew,
    or whose Hebrew header contains no Hebrew."""
    rv = []
    for doc in collection_obj.find():
        if doc['Header']['En'] and doc['Header']['He'] and (
                is_hebrew(doc['Header']['En']) or
                not is_hebrew(doc['Header']['He'])):
            rv.append({doc['UnitId']: doc['Header']})
    return rv
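# Note: is_hebrew() is used above (and as phonetic.is_hebrew() below) but is not
# defined in this section.  The following is only a minimal sketch of what such a
# check could look like, assuming it tests for characters in the Hebrew Unicode
# block (U+0590-U+05FF); the real helper may differ.
import re

HEBREW_CHAR_RE = re.compile(u'[\u0590-\u05FF]')

def is_hebrew_sketch(s):
    """Hypothetical stand-in for is_hebrew(): True if `s` contains any Hebrew letter."""
    return bool(HEBREW_CHAR_RE.search(s))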
def get_completion(collection, string, size=7):
    '''Search the Elasticsearch index for completion options.

    Returns a tuple of (text_completion_results, phonetic_results),
    where each list contains up to `size` results.
    '''
    # Currently we only do a simple starts-with search, without contains or phonetics.
    # TODO: fix the phonetics search, some work was done for that, see
    # https://github.com/Beit-Hatfutsot/dbs-back/blob/2e79c363e40472f28fd07f8a344fe55ab77198ee/bhs_api/v1_endpoints.py#L189
    lang = "He" if phonetic.is_hebrew(string) else "En"
    q = {
        "_source": ["Slug", "Header"],
        "suggest": {
            "header": {
                "prefix": string,
                "completion": {
                    "field": "Header.{}.suggest".format(lang),
                    "size": size,
                    "contexts": {
                        "collection": collection,
                    }
                }
            },
        }
    }
    results = current_app.es.search(index=current_app.es_data_db_index_name,
                                    body=q, size=0)
    try:
        header_options = results['suggest']['header'][0]['options']
    except KeyError:
        header_options = []
    try:
        phonetic_options = results['suggest']['phonetic'][0]['options']
    except KeyError:
        phonetic_options = []
    return ([i['_source']['Header'][lang] for i in header_options],
            [i['_source']['Header'][lang] for i in phonetic_options])
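# Hypothetical usage sketch for get_completion(), assuming a Flask application
# context with an initialized current_app.es client; the collection name
# "places" and the prefix are purely illustrative.
def complete_places_example(prefix):
    # returns (text_completions, phonetic_completions); with the current query
    # the phonetic list is always empty, since only the "header" suggester is sent
    titles, phonetic_titles = get_completion("places", prefix, size=5)
    return titles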
def search_by_header(string, collection, starts_with=True, db=None):
    if not db:
        db = current_app.data_db
    if not string:  # support empty strings
        return {}
    if phonetic.is_hebrew(string):
        lang = 'He'
    else:
        lang = 'En'
    string_re = re.escape(string)
    if starts_with:
        header_regex = re.compile(u'^' + string_re, re.IGNORECASE)
    else:
        header_regex = re.compile(u'^{}$'.format(string_re), re.IGNORECASE)
    lang_header = 'Header.{}'.format(lang)
    unit_text = 'UnitText1.{}'.format(lang)
    # Search only for non-empty docs with the right status
    show_filter = SHOW_FILTER.copy()
    show_filter[unit_text] = {"$nin": [None, '']}
    header_search_ex = {lang_header: header_regex}
    header_search_ex.update(show_filter)
    item = db[collection].find_one(header_search_ex)
    if item:
        item = enrich_item(item, db)
        return _make_serializable(item)
    else:
        return {}
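# Hypothetical usage sketch for search_by_header(); the collection name and the
# title are illustrative only, and a Flask application context (for
# current_app.data_db) is assumed.
def find_place_by_title_example(title):
    # exact (case-insensitive) header match:
    exact = search_by_header(title, "places", starts_with=False)
    # prefix match, e.g. behind an autocomplete box:
    prefix = search_by_header(title, "places", starts_with=True)
    return exact or prefix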
def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False, **kwargs):
    if collection:
        # if the user requested specific collections - we don't filter out persons
        # (that's what the user asked for!)
        collections = collection.split(",")
    else:
        # otherwise, with_persons decides whether to include the persons collection or not
        collections = [c for c in SEARCHABLE_COLLECTIONS
                       if with_persons or c != "persons"]
    fields = ["Header.En^2", "Header.He^2", "UnitText1.En", "UnitText1.He"]
    default_query = {
        "query_string": {
            "fields": fields,
            "query": q,
            "default_operator": "and"
        }
    }
    for k, v in PERSONS_SEARCH_TEXT_PARAMS_LOWERCASE:
        if not isinstance(v, list):
            fields.append(v)
    must_queries = []
    if q:
        must_queries.append(default_query)
    for year_param, year_attr in PERSONS_SEARCH_YEAR_PARAMS:
        if kwargs[year_param]:
            try:
                year_value = int(kwargs[year_param])
            except Exception as e:
                raise Exception("invalid value for {} ({}): {}".format(
                    year_param, year_attr, kwargs[year_param]))
            year_type_param = "{}_t".format(year_param)
            year_type = kwargs[year_type_param]
            if year_type == "pmyears":
                year_type_value_param = "{}_v".format(year_param)
                try:
                    year_type_value = int(kwargs[year_type_value_param])
                except Exception as e:
                    raise Exception("invalid value for {} ({}): {}".format(
                        year_type_value_param, year_attr, kwargs[year_type_value_param]))
                must_queries.append({"range": {year_attr: {
                    "gte": year_value - year_type_value,
                    "lte": year_value + year_type_value,
                }}})
            elif year_type == "exact":
                must_queries.append({"term": {year_attr: year_value}})
            else:
                raise Exception("invalid value for {} ({}): {}".format(
                    year_type_param, year_attr, year_type))
    for text_param, text_attr in PERSONS_SEARCH_TEXT_PARAMS_LOWERCASE:
        if kwargs[text_param]:
            text_value = kwargs[text_param].lower()
            text_type_param = "{}_t".format(text_param)
            text_type = kwargs[text_type_param]
            must_queries.append(get_person_elastic_search_text_param_query(
                text_type, text_attr, text_value))
    for exact_param, exact_attr in PERSONS_SEARCH_EXACT_PARAMS:
        if kwargs[exact_param]:
            exact_value = kwargs[exact_param]
            if exact_param == "sex":
                exact_value = exact_value.upper()
                if exact_value not in ["F", "M", "U"]:
                    raise Exception("invalid value for {} ({}): {}".format(
                        exact_param, exact_attr, exact_value))
            elif exact_param == "treenum":
                try:
                    exact_value = int(exact_value)
                except Exception as e:
                    raise Exception("invalid value for {} ({}): {}".format(
                        exact_param, exact_attr, exact_value))
            must_queries.append({"term": {exact_attr: exact_value}})
    collection_boosts = {"places": 5}
    # use a separate loop variable so we don't shadow the `collection` parameter,
    # which is still needed for the year sort below
    for coll in SEARCHABLE_COLLECTIONS:
        if coll not in collection_boosts:
            collection_boosts[coll] = 1
    must_queries.append({"bool": {
        "should": [{"match": {"_type": {"query": coll, "boost": boost}}}
                   for coll, boost in collection_boosts.items()]
    }})
    body = {
        "query": {
            "bool": {
                "must": must_queries
            }
        }
    }
    if sort == "abc":
        if phonetic.is_hebrew(q.strip()):
            # Hebrew alphabetical sort
            body["sort"] = [{"Header.He_lc": "asc"}, "_score"]
        else:
            # English alphabetical sort
            body["sort"] = [{"Header.En_lc": "asc"}, "_score"]
    elif sort == "rel":
        # relevance sort
        body["sort"] = ["_score"]
    elif sort == "year" and collection == "photoUnits":
        body["sort"] = [{"UnitPeriod.PeriodStartDate.keyword": "asc"}, "_score"]
    try:
        current_app.logger.debug("es.search index={}, doc_type={} body={}".format(
            current_app.es_data_db_index_name, collections, json.dumps(body)))
        results = current_app.es.search(index=current_app.es_data_db_index_name,
                                        body=body, doc_type=collections,
                                        size=size, from_=from_)
    except elasticsearch.exceptions.ConnectionError as e:
        current_app.logger.error('Error connecting to Elasticsearch: {}'.format(e))
        raise Exception("Error connecting to Elasticsearch: {}".format(e))
    except Exception as e:
        raise Exception("Elasticsearch error: {}".format(e))
    return results
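# get_person_elastic_search_text_param_query() is called above but not shown in
# this section.  A minimal sketch under the assumption that text_type selects
# between an exact term, a prefix match and a fuzzy/"like" match; the accepted
# type names here are assumptions, not taken from the real helper.
def get_person_text_param_query_sketch(text_type, text_attr, text_value):
    if text_type == "exact":
        return {"term": {text_attr: text_value}}
    elif text_type == "starts":
        return {"prefix": {text_attr: text_value}}
    elif text_type == "like":
        return {"match": {text_attr: {"query": text_value, "fuzziness": "AUTO"}}}
    raise Exception("invalid text search type: {}".format(text_type))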
def get_completion(collection, string, size=7):
    '''Search the Elasticsearch index for completion options.

    Returns two lists, each with up to `size` results: the first contains
    the text completions and the second the phonetic suggestions.
    '''
    if phonetic.is_hebrew(string):
        lang = 'He'
    else:
        lang = 'En'
    '''
    TODO: fix the phonetics search or remove this code

    dms_soundex = phonetic.get_dms(string)
    if " " in dms_soundex:
        dms_completion = {"regex": "[{}]".format(dms_soundex.replace(' ', '|'))}
        dms_completion = {"prefix": dms_soundex.split(' ')[0]}
    else:
        dms_completion = {"prefix": dms_soundex}
    q = {
        "_source": ["Slug", "Header"],
        "suggest": {
            "header": {
                "prefix": string,
                "completion": {
                    "field": "Header.{}.suggest".format(lang),
                    "size": size,
                    "contexts": {
                        "collection": collection,
                    }
                }
            },
            "phonetic": {
                "completion": {
                    "field": "dm_soundex",
                    "size": size,
                    "contexts": {
                        "collection": collection,
                    }
                }
            }
        }
    }
    q["suggest"]["phonetic"].update(dms_completion)
    '''
    # no phonetics query
    q = {
        "_source": ["Slug", "Header"],
        "suggest": {
            "header": {
                "prefix": string,
                "completion": {
                    "field": "Header.{}.suggest".format(lang),
                    "size": size,
                    "contexts": {
                        "collection": collection,
                    }
                }
            },
        }
    }
    results = current_app.es.search(index=current_app.data_db.name, body=q, size=0)
    try:
        header_options = results['suggest']['header'][0]['options']
    except KeyError:
        header_options = []
    try:
        phonetic_options = results['suggest']['phonetic'][0]['options']
    except KeyError:
        phonetic_options = []
    return [i['_source']['Header'][lang] for i in header_options],\
           [i['_source']['Header'][lang] for i in phonetic_options]
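# Both get_completion() variants assume the index maps Header.<lang>.suggest as
# an Elasticsearch completion field with a "collection" category context.  The
# snippet below is only a sketch of what that part of the mapping might look
# like, inferred from the queries above; the actual index definition may differ.
HEADER_SUGGEST_MAPPING_SKETCH = {
    "Header": {
        "properties": {
            "He": {
                "type": "text",
                "fields": {
                    "suggest": {
                        "type": "completion",
                        "contexts": [
                            {"name": "collection", "type": "category"}
                        ]
                    }
                }
            },
            # "En" would be mapped the same way
        }
    }
}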