Example 1
def norm_get_name(database, key, collection=None):
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on name as default
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.DbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    # just grab the first one (sorry, this is a bit opaque)
    if data is not None:
        value = data[0][0][1]
    else:
        value = None

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    json_dic = {
        'database': database,
        'key': key,
        'value': value
    }
    return json_dic
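
A minimal call sketch for norm_get_name, assuming the surrounding module context (normdb, Messager, the REPORT_LOOKUP_TIMINGS flag) is available; the 'uniprot' database name and the key are invented for illustration.

# Hypothetical usage: database name and key are illustrative only.
response = norm_get_name('uniprot', 'P04637')
print(response['value'])  # first name found for the key, or None on failure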
Example 2
def ssdb_build(strs,
               dbname,
               ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''
    Given a list of strings, a DB name, and simstring options, builds
    a simstring DB for the strings.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert not include_marks, "Error: begin/end marks not supported"
        db = simstring.writer(dbfn)
        for s in strs:
            db.insert(s)
        db.close()
    except Exception:
        print("Error building simstring DB", file=sys.stderr)
        raise

    return dbfn
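
A usage sketch for ssdb_build, assuming the simstring module is installed; the string list and the 'GeneNames' DB name are invented for illustration.

# Hypothetical usage: build a simstring DB from a handful of names.
names = ['interleukin-2', 'interleukin-6', 'tumor necrosis factor']
dbfn = ssdb_build(names, 'GeneNames')
print('simstring DB written to', dbfn)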
Example 3
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    '''
    Given a string s and a DB name, returns whether at least one
    string in the associated simstring DB likely contains s as an
    (approximate) substring.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        db = ssdb_open(dbname.encode('UTF-8'))

        __set_db_measure(db, 'overlap')
        db.threshold = threshold

        result = db.retrieve(s)
        db.close()

        # assume simstring DBs always contain UTF-8-encoded strings
        result = [r.decode('UTF-8') for r in result]

        for r in result:
            if s in r:
                return True
        return False
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
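
A sketch of the threshold=1.0 fast path above, reusing the hypothetical 'GeneNames' DB from the previous example; at this threshold the function effectively tests literal substring containment against the stored strings.

# Hypothetical usage: does any stored string contain the query as a substring?
if ssdb_supstring_exists('interleukin', 'GeneNames', threshold=1.0):
    print('at least one stored name contains "interleukin"')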
Example 4
def norm_get_data(database, key, collection=None):
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on name as default
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.DbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    if data is None:
        Messager.warning("Failed to get data for " + database + ":" + key)

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    json_dic = {
        'database': database,
        'key': key,
        'value': data
    }
    return json_dic
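
The calling convention mirrors norm_get_name above, but the full data rows are returned instead of a single name; the database name and key below are invented.

# Hypothetical usage: fetch everything stored for an ID.
response = norm_get_data('uniprot', 'P04637')
print(response['value'])  # raw rows from normdb, or None if the lookup failed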
Example 5
def _report_timings(dbname, start, msg=None):
    """
    Debug facility: reports the total time spent processing the queries.
    """
    delta = datetime.now() - start
    strdelta = str(delta).replace('0:00:0', '')  # take out zero min & hour
    queries = normdb.get_query_count(dbname)
    normdb.reset_query_count(dbname)
    Messager.info("Processed " + str(queries) + " queries in " + strdelta +
                  (msg if msg is not None else ""))
Example 6
def norm_search(database, name, collection=None, exactmatch=False):
    try:
        return _norm_search_impl(database, name, collection, exactmatch)
    except simstringdb.ssdbNotFoundError as e:
        Messager.warning(str(e))
        return {
            'database': database,
            'query': name,
            'header': [],
            'items': []
        }
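
A call sketch for norm_search; the wrapper turns a missing simstring DB into an empty result rather than an error, so callers can handle both cases the same way. The database name and query are invented.

# Hypothetical usage: approximate name search against a normalization DB.
result = norm_search('uniprot', 'tumor protein p53')
print(result['header'])      # column headers (empty if the simstring DB is missing)
print(len(result['items']))  # number of candidate entries returned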
Example 7
def __set_db_measure(db, measure):
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    ss_measure_by_str = {
        'cosine': simstring.cosine,
        'overlap': simstring.overlap,
    }
    db.measure = ss_measure_by_str[measure]
Example 8
def ssdb_supstring_lookup(s,
                          dbname,
                          threshold=DEFAULT_THRESHOLD,
                          with_score=False):
    '''
    Given a string s and a DB name, returns the strings in the
    associated simstring DB that likely contain s as an (approximate)
    substring. If with_score is True, returns pairs of (str,score)
    where score is the fraction of n-grams in s that are also found in
    the matched string.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    db = ssdb_open(dbname.encode('UTF-8'))

    __set_db_measure(db, 'overlap')
    db.threshold = threshold

    result = db.retrieve(s)
    db.close()

    # assume simstring DBs always contain UTF-8-encoded strings
    result = [r.decode('UTF-8') for r in result]

    # The simstring overlap measure is symmetric and thus does not
    # differentiate between substring and superstring matches.
    # Replicate a small bit of the simstring functionality (mostly the
    # ngrams() function) to filter to substrings only.
    s_ngrams = ngrams(s)
    filtered = []
    for r in result:
        if s in r:
            # avoid calculation: simple containment => score=1
            if with_score:
                filtered.append((r, 1.0))
            else:
                filtered.append(r)
        else:
            r_ngrams = ngrams(r)
            overlap = s_ngrams & r_ngrams
            if len(overlap) >= len(s_ngrams) * threshold:
                if with_score:
                    filtered.append((r, 1.0 * len(overlap) / len(s_ngrams)))
                else:
                    filtered.append(r)

    return filtered
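
A sketch of consuming the scored results, again using the hypothetical 'GeneNames' DB; with_score=True yields (string, score) pairs as described in the docstring.

# Hypothetical usage: rank stored strings that likely contain the query.
matches = ssdb_supstring_lookup(
    'interleukin', 'GeneNames', threshold=0.7, with_score=True)
for match, score in matches:
    print('%.2f\t%s' % (score, match))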
Example 9
def ssdb_open(dbname):
    '''
    Given a DB name, opens it as a simstring DB and returns the handle.
    The caller is responsible for invoking close() on the handle.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    try:
        return simstring.reader(__ssdb_path(dbname))
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
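
A minimal open/retrieve/close sketch; as in the lookup functions above, the DB name is passed UTF-8 encoded and the caller closes the handle. The DB name and query are invented.

# Hypothetical usage: the caller is responsible for closing the reader.
db = ssdb_open('GeneNames'.encode('UTF-8'))
try:
    hits = db.retrieve('interleukin-2')
finally:
    db.close()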
Example 10
def _get_db_path(database, collection):
    if collection is None:
        # TODO: default to WORK_DIR config?
        return None
    else:
        try:
            conf_dir = real_directory(collection)
            projectconf = ProjectConfiguration(conf_dir)
            norm_conf = projectconf.get_normalization_config()
            for entry in norm_conf:
                dbname, dbpath = entry[0], entry[3]
                if dbname == database:
                    return dbpath
            # not found in config.
            Messager.warning('DB ' + database + ' not defined in config for ' +
                             collection + ', falling back on default.')
            return None
        except Exception:
            # whatever goes wrong, just warn and fall back on the default.
            Messager.warning('Failed to get DB path from config for ' +
                             collection + ', falling back on default.')
            return None
Example 11
def _norm_search_name_attr(database, name, attr,
                           matched, score_by_id, score_by_str,
                           best_score=0, exactmatch=False,
                           threshold=simstringdb.DEFAULT_THRESHOLD):
    # helper for norm_search, searches for matches where given name
    # appears either in full or as an approximate substring of a full
    # name (if exactmatch is False) in given DB. If attr is not None,
    # requires its value to appear as an attribute of the entry with
    # the matched name. Updates matched, score_by_id, and
    # score_by_str, returns best_score.

    # If there are no strict substring matches for a given attribute
    # in the simstring DB, we can be sure that no query can succeed,
    # and can fail early.
    # TODO: this would be more effective (as would some other things)
    # if the attributes were in a separate simstring DB from the
    # names.
    if attr is not None:
        utfattr = attr.encode('UTF-8')
        normattr = string_norm_form(utfattr)
        if not simstringdb.ssdb_supstring_exists(normattr, database, 1.0):
            # debugging
            #Messager.info('Early norm search fail on "%s"' % attr)
            return best_score

    if exactmatch:
        # only candidate string is given name
        strs = [name]
        ss_norm_score = {string_norm_form(name): 1.0}
    else:
        # expand to substrings using simstring
        # simstring requires UTF-8
        utfname = name.encode('UTF-8')
        normname = string_norm_form(utfname)
        str_scores = simstringdb.ssdb_supstring_lookup(normname, database,
                                                       threshold, True)
        strs = [s[0] for s in str_scores]
        ss_norm_score = dict(str_scores)

        # TODO: recreate this older filter; watch out for which name to use!
#         # filter to strings not already considered
#         strs = [s for s in strs if (normname, s) not in score_by_str]

    # look up IDs
    if attr is None:
        id_names = normdb.ids_by_names(database, strs, False, True)
    else:
        id_names = normdb.ids_by_names_attr(database, strs, attr, False, True)

    # sort by simstring (n-gram overlap) score to prioritize likely
    # good hits.
    # TODO: this doesn't seem to be having a very significant effect.
    # consider removing as unnecessary complication (ss_norm_score also).
    id_name_scores = [(i, n, ss_norm_score[string_norm_form(n)])
                      for i, n in id_names]
    if _PYTHON3:
        id_name_scores.sort(key=lambda a: a[2], reverse=True)
    else:
        id_name_scores.sort(lambda a, b: cmp(  # pylint: disable=undefined-variable
            b[2], a[2]))
    id_names = [(i, n) for i, n, s in id_name_scores]

    # update matches and scores
    for i, n in id_names:
        if n not in matched:
            matched[n] = set()
        matched[n].add(i)

        max_cost = MAX_SCORE - best_score + MAX_DIFF_TO_BEST_SCORE + 1
        if (name, n) not in score_by_str:
            # TODO: decide whether to use normalized or unnormalized strings
            # for scoring here.
            #score_by_str[(name, n)] = _norm_score(name, n, max_cost)
            score_by_str[(name, n)] = _norm_score(
                string_norm_form(name), string_norm_form(n), max_cost)
        score = score_by_str[(name, n)]
        best_score = max(score, best_score)

        score_by_id[i] = max(score_by_id.get(i, -1),
                             score_by_str[(name, n)])

        # stop if max count reached
        if len(score_by_id) > MAX_SEARCH_RESULT_NUMBER:
            Messager.info(
                'Note: more than %d search results, only retrieving top '
                'matches' % MAX_SEARCH_RESULT_NUMBER)
            break

    return best_score
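
A call sketch showing how the accumulator arguments are typically wired up: the caller owns the three dictionaries, and the helper updates them in place while returning the running best score. The database name and query are invented.

# Hypothetical call: the dicts are mutated in place and can be reused across calls.
matched = {}        # matched name -> set of entry IDs
score_by_id = {}    # entry ID -> best score seen so far
score_by_str = {}   # (query name, matched name) -> similarity score
best = _norm_search_name_attr(
    'uniprot', 'p53', None, matched, score_by_id, score_by_str)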