Exemple #1
0
 def testNormalizeAndValidateTokens(self):
     """Check phrase tokenization against hand-verified expected token sets.

     Each pair maps an input phrase to the token list expected back from
     vocab_lookup.normalizeAndValidateTokens (stop words such as 'de'
     and 'l'' are absent from the expected output).
     """
     pairs = [('Sciences de l’homme, anthropologie, ethnologie',
               ['sciences', 'homme', 'anthropologie', 'ethnologie'])]
     for (phrase, expected) in pairs:
         # Lazy %-args: message is only formatted when DEBUG is enabled.
         logging.debug('Checking normalization of phrase: %s', phrase)
         found = vocab_lookup.normalizeAndValidateTokens(phrase)
         # Symmetric difference is empty iff both sides hold the same tokens
         # (order-insensitive comparison by design).
         self.assertFalse(
             set(found) ^ set(expected),
             'Invalid tokenization: %s --> %s' % (phrase, '|'.join(found)))
Exemple #2
0
	def matchScore(self, phrase, acronyms = None):
		'''Yield (match, score) pairs for *phrase* against this referential.

		Looks the phrase up directly first; when nothing matches and
		*acronyms* is provided, retries with each acronym expansion of
		the phrase's tokens. NOTE(review): exact score semantics depend
		on countUidMatches and matchLengthFactor -- confirm against
		their definitions.
		'''
		matches = self.vocabLookup.countUidMatches(phrase, maxTokens = self.maxLookupTokens, minCount = 1)
		# Variable matches holds a dict(variant -> list(hitCount, mainVariant))
		if acronyms and len(matches) < 1:
			# No direct hit: retry with acronym-expanded variants of the phrase.
			tokens = vocab_lookup.normalizeAndValidateTokens(phrase)
			for acroExpanded in self.acronymExpansions(tokens, acronyms):
				# Widen maxTokens by however many extra tokens the expansion added.
				matches.update(self.vocabLookup.countUidMatches(' '.join(acroExpanded), 
					maxTokens = self.maxLookupTokens + len(acroExpanded) - len(tokens), 
					minCount = 1))
		for (t, p) in matches.iteritems():  # Python 2 dict iteration
			for m in p[1]:
				# Weight the hit count p[0] by token-length proximity of t and m.
				yield (m, p[0] * matchLengthFactor(t, m, acronyms))
Exemple #3
0
def matchLengthFactor(t1, t2, keepAcronyms, closest=True):
    """Score how close two phrases are in normalized token length.

    Args:
        t1, t2: the two phrases to compare.
        keepAcronyms: forwarded to normalizeAndValidateTokens.
        closest: if True (default), return a factor that decays with the
            absolute difference in token counts; otherwise return a power
            of TOKEN_LENGTH_FACTOR based on t1's token count.
    """
    l1 = len(vocab_lookup.normalizeAndValidateTokens(t1, keepAcronyms))
    l2 = len(vocab_lookup.normalizeAndValidateTokens(t2, keepAcronyms))
    if closest:
        return 1000000 / (1 + abs(l1 - l2))
    # BUG FIX: l1 is already an int, so len(l1) raised TypeError here.
    return pow(TOKEN_LENGTH_FACTOR, l1)
Exemple #4
0
def tokenLength(p):
    """Return the number of validated tokens in phrase *p*."""
    tokens = vocab_lookup.normalizeAndValidateTokens(p)
    return len(tokens)
Exemple #5
0
 def acronymizePhrase(self, phrase):
     """Tokenize *phrase* with acronyms preserved, then acronymize the tokens."""
     normalized = vocab_lookup.normalizeAndValidateTokens(phrase, True)
     return self.acronymizeTokens(normalized)
Exemple #6
0
def acronymizePhrase(phrase, minAcroSize, maxAcroSize):
	"""Tokenize *phrase* with acronyms preserved and delegate to
	acronymizeTokens with the given size bounds."""
	normalized = vocab_lookup.normalizeAndValidateTokens(phrase, True)
	return acronymizeTokens(normalized, minAcroSize, maxAcroSize)
Exemple #7
0
def tokenLength(p):
    """Return how many validated tokens phrase *p* normalizes to."""
    tokens = vocab_lookup.normalizeAndValidateTokens(p)
    return len(tokens)

# Iteration of CSV and text-line files

def fileValueiterator(fileName):