コード例 #1
0
def get_bad_headers(collection_obj):
    """Collect headers whose text looks like it is in the wrong language slot.

    Every document returned by ``collection_obj.find()`` is inspected.
    Documents with an empty English or Hebrew header are skipped.  A
    document is reported when ``is_hebrew`` is truthy for its English
    header, or compares equal to ``False`` for its Hebrew header.

    Returns a list of single-entry dicts mapping the document's ``UnitId``
    to its ``Header`` value.
    """
    mismatched = []
    for record in collection_obj.find():
        english = record['Header']['En']
        if not english:
            continue
        hebrew = record['Header']['He']
        if not hebrew:
            continue
        # English slot holds Hebrew text, or Hebrew slot holds non-Hebrew text.
        if is_hebrew(english) or is_hebrew(hebrew) == False:
            mismatched.append({record['UnitId']: record['Header']})
    return mismatched
コード例 #2
0
def get_completion(collection, string, size=7):
    '''Search in the elastic search index for completion options.

    Only a simple "starts with" completion on the header is sent; no
    phonetic suggester is part of the query, so the second list is empty
    unless the response carries a ``phonetic`` suggestion anyway.

    Returns tuple of (text_completion_results, phonetic_results)
    Where each list contains up to `size` header strings, taken from the
    language (``He`` or ``En``) detected for `string`.
    '''
    # TODO: fix phonetics search, some work was done for that
    # see https://github.com/Beit-Hatfutsot/dbs-back/blob/2e79c363e40472f28fd07f8a344fe55ab77198ee/bhs_api/v1_endpoints.py#L189
    lang = "He" if phonetic.is_hebrew(string) else "En"
    q = {
        "_source": ["Slug", "Header"],
        "suggest": {
            "header": {
                "prefix": string,
                "completion": {
                    "field": "Header.{}.suggest".format(lang),
                    "size": size,
                    # restrict the suggestions to the requested collection
                    "contexts": {
                        "collection": collection,
                    }
                }
            },
        }
    }
    results = current_app.es.search(index=current_app.es_data_db_index_name, body=q, size=0)

    def _options(suggester):
        # A missing suggester (KeyError) or an empty entry list (IndexError)
        # both simply mean "no options".
        try:
            return results['suggest'][suggester][0]['options']
        except (KeyError, IndexError):
            return []

    header_options = _options('header')
    phonetic_options = _options('phonetic')
    return ([i['_source']['Header'][lang] for i in header_options],
            [i['_source']['Header'][lang] for i in phonetic_options])
コード例 #3
0
ファイル: item.py プロジェクト: Libisch/dbs-back
def search_by_header(string, collection, starts_with=True, db=None):
    """Find one displayable item in `collection` by its header text.

    The header language (``He`` or ``En``) is chosen with
    ``phonetic.is_hebrew(string)``.  The match is case-insensitive and is
    a prefix match when `starts_with` is true, otherwise a whole-header
    match.  Only documents that satisfy ``SHOW_FILTER`` and have a
    non-empty ``UnitText1`` in the same language are considered.

    :param string: header text to look for; an empty value returns ``{}``.
    :param collection: name of the collection to query on `db`.
    :param starts_with: prefix match when true, exact match otherwise.
    :param db: database handle; defaults to ``current_app.data_db``.
    :return: the item after ``enrich_item`` and ``_make_serializable``,
        or ``{}`` when nothing matches.
    """
    if not string:  # Support empty strings
        return {}
    # Compare against None explicitly: truth-testing a database handle is
    # not reliable (PyMongo 4 raises NotImplementedError for it).
    if db is None:
        db = current_app.data_db

    lang = 'He' if phonetic.is_hebrew(string) else 'En'
    string_re = re.escape(string)
    if starts_with:
        pattern = u'^' + string_re
    else:
        pattern = u'^{}$'.format(string_re)
    header_regex = re.compile(pattern, re.IGNORECASE)

    lang_header = 'Header.{}'.format(lang)
    unit_text = 'UnitText1.{}'.format(lang)
    # Search only for non empty docs with right status
    show_filter = SHOW_FILTER.copy()
    show_filter[unit_text] = {"$nin": [None, '']}
    header_search_ex = {lang_header: header_regex}
    header_search_ex.update(show_filter)
    item = db[collection].find_one(header_search_ex)

    if not item:
        return {}
    return _make_serializable(enrich_item(item, db))
コード例 #4
0
ファイル: item.py プロジェクト: OriHoch/dbs-back
def search_by_header(string, collection, starts_with=True, db=None):
    """Look up a single displayable item in `collection` by header text.

    ``phonetic.is_hebrew(string)`` selects the header language (``He`` or
    ``En``).  Matching ignores case; it is anchored at the start of the
    header when `starts_with` is true and must cover the whole header
    otherwise.  Candidates must also satisfy ``SHOW_FILTER`` and carry a
    non-empty ``UnitText1`` in the selected language.

    :param string: header text to search for; empty input yields ``{}``.
    :param collection: collection name looked up on `db`.
    :param starts_with: true for a prefix match, false for an exact match.
    :param db: database handle; ``current_app.data_db`` when omitted.
    :return: the matched item passed through ``enrich_item`` and
        ``_make_serializable``, or ``{}`` if no document matches.
    """
    if not string:  # Support empty strings
        return {}
    # Use an identity check: database handles should not be truth-tested
    # (PyMongo 4 raises NotImplementedError when that is attempted).
    if db is None:
        db = current_app.data_db

    lang = 'He' if phonetic.is_hebrew(string) else 'En'
    string_re = re.escape(string)
    if starts_with:
        pattern = u'^' + string_re
    else:
        pattern = u'^{}$'.format(string_re)
    header_regex = re.compile(pattern, re.IGNORECASE)

    lang_header = 'Header.{}'.format(lang)
    unit_text = 'UnitText1.{}'.format(lang)
    # Search only for non empty docs with right status
    show_filter = SHOW_FILTER.copy()
    show_filter[unit_text] = {"$nin": [None, '']}
    header_search_ex = {lang_header: header_regex}
    header_search_ex.update(show_filter)
    item = db[collection].find_one(header_search_ex)

    if not item:
        return {}
    return _make_serializable(enrich_item(item, db))
コード例 #5
0
def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False, **kwargs):
    """Build and run a search against the Elasticsearch data index.

    :param q: free-text query for a ``query_string`` clause over the header
        and unit-text fields (plus the non-list persons text attributes);
        when falsy no text clause is added.
    :param size: passed to ``es.search`` as ``size``.
    :param collection: optional comma-separated collection names used as
        ``doc_type``.  When omitted, ``SEARCHABLE_COLLECTIONS`` is used,
        leaving out ``"persons"`` unless `with_persons` is true.
    :param from_: passed to ``es.search`` as ``from_``.
    :param sort: ``"abc"`` (alphabetical on the Hebrew or English header,
        depending on `q`), ``"rel"`` (score), or ``"year"`` (only applied
        when `collection` is exactly ``"photoUnits"``).
    :param with_persons: include the persons collection when no explicit
        `collection` was requested.
    :param kwargs: persons search parameters.  Every name listed in
        ``PERSONS_SEARCH_YEAR_PARAMS``, ``PERSONS_SEARCH_TEXT_PARAMS_LOWERCASE``
        and ``PERSONS_SEARCH_EXACT_PARAMS`` is read with ``kwargs[name]``
        and so must be present (a falsy value means "not set").  Year and
        text parameters that are set also read ``<name>_t``; a year type
        of ``"pmyears"`` additionally reads ``<name>_v``.
    :return: the raw result of ``current_app.es.search``.
    :raises Exception: on an invalid parameter value or when the
        Elasticsearch call fails.
    """
    if collection:
        # if user requested specific collections - we don't filter for persons (that's what user asked for!)
        collections = collection.split(",")
    else:
        # we consider the with_persons to decide whether to include persons collection or not
        # NOTE: loop variables below are deliberately not called
        # `collection`, so the parameter is still intact for the sort check.
        collections = [name for name in SEARCHABLE_COLLECTIONS
                       if with_persons or name != "persons"]

    fields = ["Header.En^2", "Header.He^2", "UnitText1.En", "UnitText1.He"]
    for _param, attr in PERSONS_SEARCH_TEXT_PARAMS_LOWERCASE:
        if not isinstance(attr, list):
            fields.append(attr)
    default_query = {
        "query_string": {
            "fields": fields,
            "query": q,
            "default_operator": "and"
        }
    }

    must_queries = []
    if q:
        must_queries.append(default_query)

    for year_param, year_attr in PERSONS_SEARCH_YEAR_PARAMS:
        if kwargs[year_param]:
            try:
                year_value = int(kwargs[year_param])
            except Exception:
                raise Exception("invalid value for {} ({}): {}".format(year_param, year_attr, kwargs[year_param]))
            year_type_param = "{}_t".format(year_param)
            year_type = kwargs[year_type_param]
            if year_type == "pmyears":
                # "plus/minus years": match a range around the given year
                year_type_value_param = "{}_v".format(year_param)
                try:
                    year_type_value = int(kwargs[year_type_value_param])
                except Exception:
                    raise Exception("invalid value for {} ({}): {}".format(year_type_value_param, year_attr, kwargs[year_type_value_param]))
                must_queries.append({"range": {year_attr: {"gte": year_value - year_type_value, "lte": year_value + year_type_value,}}})
            elif year_type == "exact":
                must_queries.append({"term": {year_attr: year_value}})
            else:
                raise Exception("invalid value for {} ({}): {}".format(year_type_param, year_attr, year_type))

    for text_param, text_attr in PERSONS_SEARCH_TEXT_PARAMS_LOWERCASE:
        if kwargs[text_param]:
            text_value = kwargs[text_param].lower()
            text_type_param = "{}_t".format(text_param)
            text_type = kwargs[text_type_param]
            must_queries.append(get_person_elastic_search_text_param_query(text_type, text_attr, text_value))

    for exact_param, exact_attr in PERSONS_SEARCH_EXACT_PARAMS:
        if kwargs[exact_param]:
            exact_value = kwargs[exact_param]
            if exact_param == "sex":
                exact_value = exact_value.upper()
                if exact_value not in ["F", "M", "U"]:
                    raise Exception("invalid value for {} ({}): {}".format(exact_param, exact_attr, exact_value))
            elif exact_param == "treenum":
                try:
                    exact_value = int(exact_value)
                except Exception:
                    raise Exception("invalid value for {} ({}): {}".format(exact_param, exact_attr, exact_value))
            must_queries.append({"term": {exact_attr: exact_value}})

    # Every searchable collection gets a boost of 1 unless listed here.
    collection_boosts = {
        "places": 5
    }
    for name in SEARCHABLE_COLLECTIONS:
        collection_boosts.setdefault(name, 1)
    must_queries.append({"bool": {
        "should": [{"match": {"_type": {"query": name, "boost": boost}}}
                   for name, boost in collection_boosts.items()]
    }})

    body = {
        "query": {
            "bool": {
                "must": must_queries
            }
        }
    }
    if sort == "abc":
        # q may be falsy (see the `if q` guard above), so do not call
        # .strip() on it directly.
        if phonetic.is_hebrew((q or "").strip()):
            # hebrew alphabetical sort
            body["sort"] = [{"Header.He_lc": "asc"}, "_score"]
        else:
            # english alphabetical sort
            body["sort"] = [{"Header.En_lc": "asc"}, "_score"]
    elif sort == "rel":
        # relevance sort
        body["sort"] = ["_score"]
    elif sort == "year" and collection == "photoUnits":
        body["sort"] = [{"UnitPeriod.PeriodStartDate.keyword": "asc"}, "_score"]

    try:
        current_app.logger.debug("es.search index={}, doc_type={} body={}".format(current_app.es_data_db_index_name, collections, json.dumps(body)))
        results = current_app.es.search(index=current_app.es_data_db_index_name, body=body, doc_type=collections, size=size, from_=from_)
    except elasticsearch.exceptions.ConnectionError as e:
        current_app.logger.error('Error connecting to Elasticsearch: {}'.format(e))
        raise Exception("Error connecting to Elasticsearch: {}".format(e))
    except Exception as e:
        raise Exception("Elasticsearch error: {}".format(e))
    return results
コード例 #6
0
def get_completion(collection, string, size=7):
    '''Search in the elastic search index for completion options.

    Returns two lists, each with up to `size` results. The first list
    contains the text completions and the second the phonetic suggestions.
    No phonetic suggester is currently sent, so the second list is empty
    unless the response carries a ``phonetic`` suggestion anyway.
    '''
    lang = 'He' if phonetic.is_hebrew(string) else 'En'

    # TODO: fix phonetics search or remove it.  The disabled draft added a
    # second "phonetic" completion suggester on the "dm_soundex" field
    # (same size and collection context), keyed by a prefix taken from
    # phonetic.get_dms(string) (its first space-separated code).

    # no phonetics query
    q = {
        "_source": ["Slug", "Header"],
        "suggest": {
            "header": {
                "prefix": string,
                "completion": {
                    "field": "Header.{}.suggest".format(lang),
                    "size": size,
                    # restrict the suggestions to the requested collection
                    "contexts": {
                        "collection": collection,
                    }
                }
            },
        }
    }

    results = current_app.es.search(index=current_app.data_db.name,
                                    body=q,
                                    size=0)

    def _options(suggester):
        # A missing suggester (KeyError) or an empty entry list (IndexError)
        # both simply mean "no options".
        try:
            return results['suggest'][suggester][0]['options']
        except (KeyError, IndexError):
            return []

    header_options = _options('header')
    phonetic_options = _options('phonetic')
    return ([i['_source']['Header'][lang] for i in header_options],
            [i['_source']['Header'][lang] for i in phonetic_options])
コード例 #7
0
ファイル: data_quality.py プロジェクト: OriHoch/dbs-back
def get_bad_headers(collection_obj):
    """Return headers that appear to sit in the wrong language field.

    Walks all documents from ``collection_obj.find()``, ignoring those
    whose English or Hebrew header is empty.  A document is flagged when
    ``is_hebrew`` is truthy for the English header or compares equal to
    ``False`` for the Hebrew header.

    The result is a list of one-item dicts of the form
    ``{UnitId: Header}``.
    """
    flagged = []
    for entry in collection_obj.find():
        en_text = entry['Header']['En']
        if not en_text:
            continue
        he_text = entry['Header']['He']
        if not he_text:
            continue
        en_is_hebrew = is_hebrew(en_text)
        # Only inspect the Hebrew header when the English one looked fine.
        if en_is_hebrew or is_hebrew(he_text) == False:
            flagged.append({entry['UnitId']: entry['Header']})
    return flagged