def testNormalizeAndValidateTokens(self):
    '''
    Check vocab_lookup.normalizeAndValidateTokens against known phrases.

    Each case pairs an input phrase with the tokens expected back. The
    comparison is made on sets, so token order and duplicates are ignored.
    '''
    cases = [
        ('Sciences de l’homme, anthropologie, ethnologie',
         ['sciences', 'homme', 'anthropologie', 'ethnologie']),
    ]
    for phrase, expected in cases:
        logging.debug('Checking normalization of phrase: %s' % phrase)
        found = vocab_lookup.normalizeAndValidateTokens(phrase)
        # Any token present on one side only is a tokenization error.
        mismatch = set(found).symmetric_difference(set(expected))
        failureMessage = 'Invalid tokenization: %s --> %s' % (phrase, '|'.join(found))
        self.assertFalse(mismatch, failureMessage)
def matchScore(self, phrase, acronyms = None):
    '''
    Main matching entry point for this referential (generator).

    Looks the phrase up through self.vocabLookup.countUidMatches. When that
    gives no match and acronyms is truthy, the lookup is retried on every
    acronym-expanded form of the phrase's tokens and the results are merged.

    Yields (m, score) tuples: for every matched variant and every m listed
    in its entry, score is the entry's hit count multiplied by
    matchLengthFactor(variant, m, acronyms).
    '''
    hits = self.vocabLookup.countUidMatches(
        phrase, maxTokens = self.maxLookupTokens, minCount = 1)
    # hits maps variant -> list(hitCount, mainVariant)
    if acronyms and len(hits) == 0:
        baseTokens = vocab_lookup.normalizeAndValidateTokens(phrase)
        for expanded in self.acronymExpansions(baseTokens, acronyms):
            expandedPhrase = ' '.join(expanded)
            # Widen the token budget by the number of tokens the expansion added.
            tokenBudget = self.maxLookupTokens + len(expanded) - len(baseTokens)
            extraHits = self.vocabLookup.countUidMatches(
                expandedPhrase, maxTokens = tokenBudget, minCount = 1)
            hits.update(extraHits)
    for (variant, entry) in hits.iteritems():
        for mainVariant in entry[1]:
            score = entry[0] * matchLengthFactor(variant, mainVariant, acronyms)
            yield (mainVariant, score)
def matchLengthFactor(t1, t2, keepAcronyms, closest=True):
    '''
    Weighting factor derived from the token counts of two phrases.

    Both phrases are tokenized with vocab_lookup.normalizeAndValidateTokens
    (keepAcronyms is forwarded as its second argument) and only the number
    of tokens is used.

    t1, t2       -- the phrases to compare
    keepAcronyms -- forwarded unchanged to the tokenizer
    closest      -- when true (the default), favour phrases of similar
                    length: 1000000 / (1 + |l1 - l2|), which is maximal when
                    both token counts are equal. When false, the factor
                    depends on t1 alone: TOKEN_LENGTH_FACTOR ** l1.

    Returns a number; the `/` operator is kept as written, so the numeric
    type of the result follows the interpreter's division semantics.
    '''
    l1 = len(vocab_lookup.normalizeAndValidateTokens(t1, keepAcronyms))
    l2 = len(vocab_lookup.normalizeAndValidateTokens(t2, keepAcronyms))
    if closest:
        return 1000000 / (1 + abs(l1 - l2))
    # l1 is already a token count (an int); the previous len(l1) raised
    # TypeError every time this branch was taken.
    return pow(TOKEN_LENGTH_FACTOR, l1)
def tokenLength(p):
    '''
    Return the number of tokens that
    vocab_lookup.normalizeAndValidateTokens produces for phrase p.
    '''
    tokens = vocab_lookup.normalizeAndValidateTokens(p)
    return len(tokens)
def acronymizePhrase(self, phrase):
    '''
    Tokenize phrase and hand the tokens to self.acronymizeTokens.

    The tokenizer is called with True as its second (keepAcronyms)
    argument. Returns whatever self.acronymizeTokens returns.
    '''
    phraseTokens = vocab_lookup.normalizeAndValidateTokens(phrase, True)
    return self.acronymizeTokens(phraseTokens)
def acronymizePhrase(phrase, minAcroSize, maxAcroSize):
    '''
    Tokenize phrase and hand the tokens to acronymizeTokens.

    The tokenizer is called with True as its second (keepAcronyms)
    argument; minAcroSize and maxAcroSize are forwarded unchanged.
    Returns whatever acronymizeTokens returns.
    '''
    phraseTokens = vocab_lookup.normalizeAndValidateTokens(phrase, True)
    return acronymizeTokens(phraseTokens, minAcroSize, maxAcroSize)
# Return the number of tokens that vocab_lookup.normalizeAndValidateTokens
# returns for phrase p.
# NOTE(review): the trailing section comment and the `def fileValueiterator`
# header look like they were collapsed onto this line -- confirm the line
# breaks against the original file.
def tokenLength(p): return len(vocab_lookup.normalizeAndValidateTokens(p)) # Iteration of CSV and text-line files def fileValueiterator(fileName):