def getTagMatchFunc(matchtype, tags=None, layerkey=None):
    """Returns a function that takes (queryterm, tag) and returns a score.

    The different matchtypes are:
        'exact': term exactly matches a tag
        'hasword': term is contained within a tag (respecting word boundaries)
        'contains': term is contained within a tag, or vice versa
        'unordered': term matches tag (words in any order)
        'starts': term starts the tag
        'ends': term ends the tag
        'typo': term is a misspelled version of a tag
        'path': term is close to a tag (by path similarity in wordnet)
        'synonym': term is a synonym of a tag
        'constant': always returns 1

    You can optionally pass in a 'layerkey' as a unique id. This can be
    used for caches, etc.
    """
    from nkpylib.nknlp import matchstrings
    # hasWord, strsim, and wordnetsim are assumed to be defined elsewhere
    # in this module
    if matchtype == 'constant':
        ret = lambda term, tag: 1
    elif matchtype == 'exact':
        ret = lambda term, tag: 1 if term == tag else 0
    elif matchtype == 'hasword':
        ret = hasWord
    elif matchtype == 'contains':
        ret = lambda term, tag: 1 if term and (term in tag or tag in term) else 0
    elif matchtype == 'unordered':
        def ret(term, tag):
            matches, s = matchstrings(term, tag)
            return s if s > 0.3 else 0
    elif matchtype == 'starts':
        ret = lambda term, tag: 1 if tag.startswith(term) else 0
    elif matchtype == 'ends':
        ret = lambda term, tag: 1 if tag.endswith(term) else 0
    elif matchtype == 'typo':
        # compute the similarity once instead of twice
        def ret(term, tag):
            s = strsim(term, tag)
            return s if s > 0.7 else 0
    elif matchtype == 'path':
        def ret(term, tag):
            s = wordnetsim(term, tag)
            return s if s > 0.3 else 0
    elif matchtype == 'synonym':
        ret = lambda term, tag: 0 #TODO fix
    else:
        raise NotImplementedError()
    return ret
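# A minimal usage sketch for getTagMatchFunc. The tag list and term choices
# here are purely illustrative, and the 'contains' behavior assumed in the
# comments follows the lambda above.
def _demo_tag_match():
    """Shows how scores from different matchtypes compare on toy tags."""
    tags = ['red car', 'blue car', 'sunset']
    exact = getTagMatchFunc('exact')
    contains = getTagMatchFunc('contains')
    for term in ['car', 'red car']:
        for tag in tags:
            print('%s vs %s: exact=%s contains=%s' % (
                term, tag, exact(term, tag), contains(term, tag)))
    # 'exact' fires only on identical strings ('red car' vs 'red car' -> 1),
    # while 'contains' also matches substrings in either direction
    # ('car' vs 'red car' -> 1).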
def matchstrings(a, b):
    """Matches strings in a very loose way.

    Returns (matches, score), where matches is a list of pairs of matching
    words from a and b, and score is the final normalized similarity score
    (higher is better).
    """
    import re
    from nkutils import utf, strsim
    import numpy as np

    def norm(s):
        """Normalizes a string"""
        return utf(s).strip().lower()

    def split(s):
        """Splits a string into components, quite aggressively"""
        # STOP_WORDS is assumed to be defined at module level
        els = [norm(e) for e in re.split(r'\W+', s)]
        return [e for e in els if e and e not in STOP_WORDS]

    els1 = split(norm(a))
    els2 = split(norm(b))
    matches = []
    best = 0.0
    if not els1 or not els2:
        return (matches, best)
    # build the pairwise word-similarity matrix between a's and b's words
    m = np.zeros((len(els1), len(els2)))
    for i, e1 in enumerate(els1):
        for j, e2 in enumerate(els2):
            m[i, j] = strsim(e1, e2)
    # greedily take the best remaining pair, then knock out its row and column
    while True:
        i, j = np.unravel_index(np.argmax(m), m.shape)
        s = m[i, j]
        if s <= 0:
            break
        matches.append((els1[i], els2[j]))
        best += s
        m[i, :] = -1
        m[:, j] = -1
    # normalize the total by the number of matched pairs
    if matches:
        best /= float(len(matches))
    return (matches, best)
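# A small illustrative check of matchstrings' greedy word pairing. Exact
# scores depend on strsim and on what STOP_WORDS contains, so the comments
# below only describe the expected shape of the output.
def _demo_matchstrings():
    """Shows the (matches, score) output on loosely-related strings."""
    matches, score = matchstrings('The Quick Brown Fox', 'fox, quick & brown!')
    # The shared words should be paired up regardless of order, case, or
    # punctuation, e.g. [('quick', 'quick'), ('brown', 'brown'),
    # ('fox', 'fox')], with a score near 1.0.
    print(matches, score)
    matches, score = matchstrings('apples', 'oranges')
    # Dissimilar words give weak pairs (or none) and a low score.
    print(matches, score)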