def norm_get_name(database, key, collection=None):
    """Look up the primary name stored for *key* in the given normalization DB.

    Returns a JSON-serializable dict echoing the request ('database', 'key')
    together with the found 'value' (None when the lookup fails).
    """
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    # Resolve the configured DB path; fall back on the bare name when the
    # full path is not configured.
    dbpath = _get_db_path(database, collection)
    if dbpath is None:
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.DbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    # just grab the first one (sorry, this is a bit opaque)
    value = data[0][0][1] if data is not None else None

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    return {
        'database': database,
        'key': key,
        'value': value,
    }
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''
    Given a list of strings, a DB name, and simstring options, builds
    a simstring DB for the strings.

    Returns the filename of the created DB. Raises NoSimStringError
    when the simstring library is unavailable; any error during the
    build is logged to stderr and re-raised.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert not include_marks, "Error: begin/end marks not supported"
        db = simstring.writer(dbfn)
        try:
            for s in strs:
                db.insert(s)
        finally:
            # close the writer even if an insert fails; previously a
            # failed insert leaked the open writer handle
            db.close()
    except Exception:
        # narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # are no longer intercepted just to print a message
        print("Error building simstring DB", file=sys.stderr)
        raise

    return dbfn
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    '''
    Given a string s and a DB name, returns whether at least one
    string in the associated simstring DB likely contains s as an
    (approximate) substring.

    Raises NoSimStringError when the simstring library is unavailable.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        db = ssdb_open(dbname.encode('UTF-8'))
        try:
            __set_db_measure(db, 'overlap')
            db.threshold = threshold
            result = db.retrieve(s)
        finally:
            # close the reader even when retrieve() raises; previously
            # an exception here leaked the open DB handle
            db.close()
        # assume simstring DBs always contain UTF-8 - encoded strings
        return any(s in r.decode('UTF-8') for r in result)
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
def norm_get_data(database, key, collection=None):
    """Fetch the full stored normalization data for *key* in *database*.

    Returns a JSON-serializable dict echoing the request ('database',
    'key') with the retrieved data under 'value' (None on failure).
    """
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    # Resolve the configured DB path; when the full path is not
    # configured, fall back on the name as default.
    dbpath = _get_db_path(database, collection)
    if dbpath is None:
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.DbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    if data is None:
        Messager.warning("Failed to get data for " + database + ":" + key)

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    return {
        'database': database,
        'key': key,
        'value': data,
    }
def _report_timings(dbname, start, msg=None):
    """Debug facility: report total time spent processing the queries.

    Resets the per-DB query counter as a side effect; *msg*, when given,
    is appended verbatim to the reported message.
    """
    delta = datetime.now() - start
    # take out zero min & hour
    strdelta = str(delta).replace('0:00:0', '')
    queries = normdb.get_query_count(dbname)
    normdb.reset_query_count(dbname)
    suffix = msg if msg is not None else ""
    Messager.info("Processed " + str(queries) + " queries in " + strdelta + suffix)
def norm_search(database, name, collection=None, exactmatch=False):
    """Search *database* for *name*, delegating to _norm_search_impl.

    A missing simstring DB is reported as a warning and mapped to an
    empty result set rather than propagated to the caller.
    """
    try:
        return _norm_search_impl(database, name, collection, exactmatch)
    except simstringdb.ssdbNotFoundError as e:
        Messager.warning(str(e))
        empty_result = {
            'database': database,
            'query': name,
            'header': [],
            'items': [],
        }
        return empty_result
def __set_db_measure(db, measure):
    """Set the similarity measure on an open simstring DB handle.

    *measure* must be 'cosine' or 'overlap'; any other value raises
    KeyError. Raises NoSimStringError if simstring is unavailable.
    """
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    # map the string name onto the simstring module's measure constant
    db.measure = {
        'cosine': simstring.cosine,
        'overlap': simstring.overlap,
    }[measure]
def ssdb_supstring_lookup(s, dbname, threshold=DEFAULT_THRESHOLD,
                          with_score=False):
    '''
    Given a string s and a DB name, returns the strings in the
    associated simstring DB that likely contain s as an (approximate)
    substring.
    If with_score is True, returns pairs of (str,score) where score is
    the fraction of n-grams in s that are also found in the matched
    string.

    Raises NoSimStringError when the simstring library is unavailable.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    db = ssdb_open(dbname.encode('UTF-8'))
    try:
        __set_db_measure(db, 'overlap')
        db.threshold = threshold
        result = db.retrieve(s)
    finally:
        # close the reader even when retrieve() raises; previously an
        # exception here leaked the open DB handle
        db.close()

    # assume simstring DBs always contain UTF-8 - encoded strings
    result = [r.decode('UTF-8') for r in result]

    # The simstring overlap measure is symmetric and thus does not
    # differentiate between substring and superstring matches.
    # Replicate a small bit of the simstring functionality (mostly the
    # ngrams() function) to filter to substrings only.
    s_ngrams = ngrams(s)
    filtered = []
    for r in result:
        if s in r:
            # avoid calculation: simple containment => score=1
            if with_score:
                filtered.append((r, 1.0))
            else:
                filtered.append(r)
        else:
            r_ngrams = ngrams(r)
            overlap = s_ngrams & r_ngrams
            if len(overlap) >= len(s_ngrams) * threshold:
                if with_score:
                    filtered.append((r, 1.0 * len(overlap) / len(s_ngrams)))
                else:
                    filtered.append(r)

    return filtered
def ssdb_open(dbname):
    '''
    Given a DB name, opens it as a simstring DB and returns the handle.
    The caller is responsible for invoking close() on the handle.

    Raises NoSimStringError when simstring is unavailable and
    ssdbNotFoundError when the DB file cannot be opened.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    dbpath = __ssdb_path(dbname)
    try:
        return simstring.reader(dbpath)
    except IOError:
        Messager.error('Failed to open simstring DB %s' % dbname)
        raise ssdbNotFoundError(dbname)
def _get_db_path(database, collection):
    """Resolve the configured filesystem path for *database* from
    *collection*'s normalization config.

    Returns None (caller falls back on a default) when the collection is
    not given, the DB is not configured, or config lookup fails.
    """
    if collection is None:
        # TODO: default to WORK_DIR config?
        return None
    try:
        conf_dir = real_directory(collection)
        norm_conf = ProjectConfiguration(conf_dir).get_normalization_config()
        for entry in norm_conf:
            # entry layout: name at [0], filesystem path at [3]
            if entry[0] == database:
                return entry[3]
        # not found in config.
        Messager.warning('DB '+database+' not defined in config for ' +
                         collection+', falling back on default.')
        return None
    except Exception:
        # whatever goes wrong, just warn and fall back on the default.
        Messager.warning('Failed to get DB path from config for ' +
                         collection+', falling back on default.')
        return None
def _norm_search_name_attr(database, name, attr,
                           matched, score_by_id, score_by_str,
                           best_score=0, exactmatch=False,
                           threshold=simstringdb.DEFAULT_THRESHOLD):
    # helper for norm_search, searches for matches where given name
    # appears either in full or as an approximate substring of a full
    # name (if exactmatch is False) in given DB. If attr is not None,
    # requires its value to appear as an attribute of the entry with
    # the matched name. Updates matched, score_by_id, and
    # score_by_str, returns best_score.

    # If there are no strict substring matches for a given attribute
    # in the simstring DB, we can be sure that no query can succeed,
    # and can fail early.
    # TODO: this would be more effective (as would some other things)
    # if the attributes were in a separate simstring DB from the
    # names.
    if attr is not None:
        utfattr = attr.encode('UTF-8')
        normattr = string_norm_form(utfattr)
        if not simstringdb.ssdb_supstring_exists(normattr, database, 1.0):
            # debugging
            #Messager.info('Early norm search fail on "%s"' % attr)
            return best_score

    if exactmatch:
        # only candidate string is given name
        strs = [name]
        ss_norm_score = {string_norm_form(name): 1.0}
    else:
        # expand to substrings using simstring
        # simstring requires UTF-8
        utfname = name.encode('UTF-8')
        normname = string_norm_form(utfname)
        str_scores = simstringdb.ssdb_supstring_lookup(normname, database,
                                                       threshold, True)
        strs = [s[0] for s in str_scores]
        ss_norm_score = dict(str_scores)

        # TODO: recreate this older filter; watch out for which name to use!
        # # filter to strings not already considered
        # strs = [s for s in strs if (normname, s) not in score_by_str]

    # look up IDs
    if attr is None:
        id_names = normdb.ids_by_names(database, strs, False, True)
    else:
        id_names = normdb.ids_by_names_attr(database, strs, attr, False, True)

    # sort by simstring (n-gram overlap) score to prioritize likely
    # good hits.
    # TODO: this doesn't seem to be having a very significant effect.
    # consider removing as unnecessary complication (ss_norm_score also).
    id_name_scores = [(i, n, ss_norm_score[string_norm_form(n)])
                      for i, n in id_names]
    # BUG FIX: list.sort() accepts key only as a keyword argument in
    # Python 3, so the previous positional call
    # id_name_scores.sort(lambda a: a[2], reverse=True) raised
    # TypeError. The key= form below works on both Python 2 and 3,
    # making the _PYTHON3/cmp version branch unnecessary.
    id_name_scores.sort(key=lambda ins: ins[2], reverse=True)
    id_names = [(i, n) for i, n, s in id_name_scores]

    # update matches and scores
    for i, n in id_names:
        if n not in matched:
            matched[n] = set()
        matched[n].add(i)

        max_cost = MAX_SCORE - best_score + MAX_DIFF_TO_BEST_SCORE + 1
        if (name, n) not in score_by_str:
            # TODO: decide whether to use normalized or unnormalized strings
            # for scoring here.
            #score_by_str[(name, n)] = _norm_score(name, n, max_cost)
            score_by_str[(name, n)] = _norm_score(
                string_norm_form(name), string_norm_form(n), max_cost)
        score = score_by_str[(name, n)]
        best_score = max(score, best_score)

        score_by_id[i] = max(score_by_id.get(i, -1), score_by_str[(name, n)])

        # stop if max count reached
        if len(score_by_id) > MAX_SEARCH_RESULT_NUMBER:
            Messager.info(
                'Note: more than %d search results, only retrieving top matches' %
                MAX_SEARCH_RESULT_NUMBER)
            break

    return best_score