import math

import numpy as np
import fuzzy  # phonetic algorithms: DMetaphone and Soundex
# normalized_damerau_levenshtein_distance_withNPArray comes from older
# pyxdameraulevenshtein releases; normalized_keyboard_word_distance_withNPArray
# is a project-specific helper that is not shown in this excerpt.


def compare(input_list, keywords_dictionary, word_weights):
    # Load phonetics functions
    dmeta = fuzzy.DMetaphone()
    metaphone = lambda x: dmeta(x)[0]
    soundex = fuzzy.Soundex(4)
    phonetics_methods = [metaphone, soundex]

    # Initialize an empty dictionary for scores
    scores = {}

    # Iterate through methods for solving, then iterate through words in
    # scrubbed user input. For each word, compare phonetics to all keywords
    # and add score to the scores dictionary. After, do normal QWERTY and LD
    # analyses
    for method, keywords in keywords_dictionary.items():
        scores[method] = 0
        # print(method)
        # Phonetic Scoring methods
        for phonetic in phonetics_methods:
            formatted_array = np.asarray([phonetic(k) for k in keywords])

            for word in input_list:
                formatted_word = phonetic(word)
                dist_array = normalized_damerau_levenshtein_distance_withNPArray(
                    formatted_word, formatted_array)
                dist = min(dist_array)

                # Apply the word's weight (set elsewhere based on its position
                # relative to "not"), defaulting to 1, and add the weighted
                # distance to this method's score.
                weight = word_weights.get(word) or 1
                scores[method] += weight * math.sqrt(dist)

        # For QWERTY and Damerau-Levenshtein distances, calculate the differences
        for word in input_list:
            # Do QWERTY Keyboard analysis
            dist_array = normalized_keyboard_word_distance_withNPArray(
                word, keywords)
            dist = min(dist_array)

            # handle weighting for position from "not"
            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

            # Do normal LD analysis
            dist_array = normalized_damerau_levenshtein_distance_withNPArray(
                word, np.asarray(keywords))
            dist = min(dist_array)

            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

    return scores
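A minimal usage sketch (the keyword sets, weights, and input below are illustrative placeholders rather than values from the source project; it assumes compare() and its distance helpers are importable):

keywords_dictionary = {
    'greeting': ['hello', 'hi', 'hey'],
    'farewell': ['bye', 'goodbye', 'see you'],
}
word_weights = {}                 # no "not"-based re-weighting for this toy input
input_list = ['helo']             # pre-scrubbed user input

scores = compare(input_list, keywords_dictionary, word_weights)
# Scores accumulate distances, so the smallest total marks the closest keyword set.
best_method = min(scores, key=scores.get)
print(best_method, scores)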
Example #3
existing_series_compare = np.array([compare_strip(o) for o in existing_series],
                                   dtype='S')

for file in video_files:
    try:
        base_filename = os.path.basename(file)
        video_info = guessit(base_filename)
        if 'title' not in video_info:
            logging.warning(
                'Unable to parse series name from: {}'.format(file))
            continue
        source_file = os.path.join(config_data['directories']['seeding'], file)
        series = titlecase(video_info['title'])

        # Check if there is a similar name we should use instead.
        distances = normalized_damerau_levenshtein_distance_withNPArray(
            compare_strip(series), existing_series_compare)
        #print(distances)
        min_distance = 1.0
        min_series = series
        for i in range(len(existing_series)):
            if distances[i] < min_distance:
                min_distance = distances[i]
                min_series = existing_series[i]
        logging.debug('Closest match({}): {} '.format(min_distance,
                                                      min_series))
        if min_distance < 0.125:
            series = min_series

        # Check if there are overrides.
        for override in overrides.keys():
            if re.match(override, base_filename, re.IGNORECASE):
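The closest-match loop above can be collapsed with NumPy's argmin; a sketch under the same assumptions (a non-empty existing_series/existing_series_compare pair and the compare_strip helper from the surrounding script):

distances = normalized_damerau_levenshtein_distance_withNPArray(
    compare_strip(series), existing_series_compare)
best = int(np.argmin(distances))      # index of the closest existing series name
if distances[best] < 0.125:           # same similarity threshold as above
    series = existing_series[best]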
Example #4
myArray = np.array([generateWord() for i in range(l_arrayLength)], dtype='S')
myRef = generateWord()
startV = time.time()
myRes = damerau_levenshtein_distance_withNPArray(myRef, myArray)
endV = time.time()
startR = time.time()
myExpected = [damerau_levenshtein_distance(myRef, w) for w in myArray]
endR = time.time()
assert(len(myRes) == l_arrayLength)
assert((myRes == myExpected).all())
print("Source \"%s\" against Array[%d]" % (myRef, len(myArray)))
print("Array calculus took %f s against %f s" % (endV - startV, endR - startR))
#
print("")
print('#normalized distance from a reference to an array:')
myRes = normalized_damerau_levenshtein_distance_withNPArray(myRef, myArray)
myExpected = [normalized_damerau_levenshtein_distance(myRef, w) for w in myArray]
assert(len(myRes) == l_arrayLength)
assert((myRes == myExpected).all())
print("Source \"%s\" against Array[%d]" % (myRef, len(myArray)))


print("")
print('#performance testing:')

# random words will be comprised of ascii letters, numbers, and spaces
chars = string.ascii_letters + string.digits + ' '
word1 = ''.join([random.choice(chars) for i in range(30)])  # generate a random string of characters of length 30
word2 = ''.join([random.choice(chars) for i in range(30)])  # and another
print("""timeit.timeit("damerau_levenshtein_distance('%s', '%s')", 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000) = %f seconds""" %
      (word1, word2, timeit.timeit("damerau_levenshtein_distance('%s', '%s')" % (word1, word2), 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000)))
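Example #4 leans on helpers the excerpt does not show (generateWord and l_arrayLength). A self-contained sketch of the same array-versus-loop consistency check, with those pieces filled in as hypothetical stand-ins and assuming an older pyxdameraulevenshtein release that still exposes the _withNPArray variants:

import random
import string

import numpy as np
from pyxdameraulevenshtein import (damerau_levenshtein_distance,
                                   damerau_levenshtein_distance_withNPArray)

l_arrayLength = 1000  # hypothetical array size


def generateWord(length=10):
    # hypothetical stand-in: a random lowercase word of the given length
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))


myArray = np.array([generateWord() for _ in range(l_arrayLength)], dtype='S')
myRef = generateWord()

# The array call and the element-by-element loop must agree.
myRes = damerau_levenshtein_distance_withNPArray(myRef, myArray)
myExpected = [damerau_levenshtein_distance(myRef, w) for w in myArray]
assert len(myRes) == l_arrayLength
assert (myRes == myExpected).all()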