Example #1
def get_close_matches(word,
                      possibilities,
                      n=3,
                      cutoff=None,
                      case_sensitive=False):
    """
        Given a string and a list of strings to lookup, returns a list of
        pairs of the n-closest strings (according to the edit-distance),
        and their respective distances from 'word'.
        If cutoff is given, the returned list will contain only the strings
        that are closer than the cutoff.
    """
    try:
        from nltk.metrics import edit_distance
    except ImportError:
        from Levenshtein import distance as edit_distance

    hits = []
    for possibility in possibilities:
        if case_sensitive:
            d = edit_distance(word, possibility)
        else:
            d = edit_distance(word.lower(), possibility.lower())
        if cutoff is None or d < cutoff:
            hits.append((possibility, d))

    return sorted(hits, key=lambda x: x[1])[:n]
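A minimal usage sketch of the function above (hypothetical inputs; assumes either nltk or python-Levenshtein is installed):

matches = get_close_matches('appel', ['apple', 'ape', 'apply', 'banana'], cutoff=3)
print(matches)  # pairs sorted by distance, e.g. [('apple', 2), ('ape', 2), ('apply', 2)]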
Example #2
    def linguistic_distance(self):
        """
		Compare the two languages word by word for all meanings in the meaning_list.
		:return: the linguistic similarity between the two languages
		"""
        eds = []
        norms = []
        for meaning in self.meaning_list:
            word1 = self.language1_dict.get(meaning)
            word2 = self.language2_dict.get(meaning)
            if word1 is not None and word2 is not None:
                word1 = word1.words[0]
                word2 = word2.words[0]
                LDN = edit_distance(word1, word2) / float(
                    max(len(word1), len(word2)))
                eds += [LDN]

        for meaning in self.meaning_list:
            for meaning2 in self.meaning_list:
                if meaning != meaning2:
                    word1 = self.language1_dict.get(meaning)
                    word2 = self.language2_dict.get(meaning2)
                    if word1 is not None and word2 is not None:
                        word1 = word1.words[0]
                        word2 = word2.words[0]
                        LDN = edit_distance(word1, word2) / float(
                            max(len(word1), len(word2)))
                        norms += [LDN]

        average = sum(norms) / float(len(norms))
        LDND = [ed / average for ed in eds]
        return sum(LDND) / len(LDND)
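A standalone illustration of the LDN normalization used above, assuming nltk's edit_distance (hypothetical word pair):

from nltk.metrics import edit_distance

w1, w2 = 'night', 'nacht'
ldn = edit_distance(w1, w2) / max(len(w1), len(w2))
print(ldn)  # 0.4 -> 0.0 means identical, 1.0 means maximally dissimilar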
Example #3
def cer_from_transcripts(transcripts, ys, log_path=None, truncate=True):
    '''
    Args:
        transcripts: list of predicted strings
        ys: list of reference strings

    Return:
        norm_dists: list of CER values
        dists: list of edit distances
    '''
    norm_dists = []
    dists = []
    for i, t in enumerate(transcripts):
        curr_t = t
        curr_y = ys[i]
        if len(curr_y) == 0:
            print('%d is 0' % i)
        curr_t_nos = curr_t.replace(' ', '')
        curr_y_nos = curr_y.replace(' ', '')
        if truncate:
            curr_t = curr_t[:len(curr_y)]
            curr_t_nos = curr_t_nos[:len(curr_y_nos)]
        dist = edit_distance(curr_t, curr_y)
        norm_dist = dist / len(curr_y)
        dist_nos = edit_distance(curr_t_nos, curr_y_nos)
        norm_dist_nos = dist_nos / len(curr_y_nos)
        best_dist = min(dist, dist_nos)
        best_norm = min(norm_dist, norm_dist_nos)
        if log_path is not None:
            with open(log_path, 'a') as ouf:
                ouf.write('dist: %.2f, norm_dist: %.2f\n' %
                          (best_dist, best_norm))
        norm_dists.append(best_norm)
        dists.append(best_dist)
    return norm_dists, dists
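A minimal character-error-rate (CER) illustration in the spirit of the function above, assuming nltk's edit_distance (hypothetical strings, not the original pipeline):

from nltk.metrics import edit_distance

hyp, ref = 'helo world', 'hello world'
print(edit_distance(hyp, ref) / len(ref))  # 1/11, roughly 0.09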
Example #4
def cer_from_transcripts(transcripts,
                         ys,
                         log_path,
                         truncate=True,
                         spaces='best'):
    '''
    Args:
        spaces: 'no', 'yes', or 'best' (to account for incongruity in raw data spacing)

    Return:
        norm_dists: list of CER values
        dists: list of edit distances
    '''
    norm_dists = []
    dists = []
    for i, t in enumerate(transcripts):
        curr_t = t
        curr_y = ys[i]
        if len(curr_y) == 0:
            print('%d is 0' % i)
        curr_t_nos = curr_t.replace(' ', '')
        curr_y_nos = curr_y.replace(' ', '')
        if truncate:
            curr_t = curr_t[:len(curr_y)]
            curr_t_nos = curr_t_nos[:len(curr_y_nos)]
        dist = edit_distance(curr_t, curr_y)
        norm_dist = dist / len(curr_y)
        dist_nos = edit_distance(curr_t_nos, curr_y_nos)
        norm_dist_nos = dist_nos / len(curr_y_nos)
        best_dist = min(dist, dist_nos)
        best_norm = min(norm_dist, norm_dist_nos)
        with open(log_path, 'a') as ouf:
            ouf.write('dist: %.2f, norm_dist: %.2f\n' % (best_dist, best_norm))
        norm_dists.append(best_norm)
        dists.append(best_dist)
    return norm_dists, dists
Example #5
def get_related_evidence(title):
	print '>>>>>>>>>>>>>>>>>>>>>>>>>>'
	try:
		print 'given title: ' + title
	# TODO: fix this...
	except UnicodeEncodeError:
		print 'title cannot be printed - containing unicode encode error'
		return [], {}, 0
	fetch = metapub.PubMedFetcher()
	pmids = fetch.pmids_for_query(title)
	if len(pmids) == 1:
		article = fetch.article_by_pmid(pmids[0])
		if edit_distance(article.title, title) <= len(title) * 0.1:
			print 'matched title: ' + article.title.encode('utf-8')
			related_pmids = fetch.related_pmids(pmids[0])
			return _merge_related_pmids(pmids[0], related_pmids, fetch)
	elif len(pmids) > 1:
		for i in range(min(20, len(pmids))):
			article = fetch.article_by_pmid(pmids[i])
			if edit_distance(article.title, title) <= len(title) * 0.1:
				print 'matched title: ' + article.title.encode('utf-8')
				related_pmids = fetch.related_pmids(pmids[i])
				return _merge_related_pmids(pmids[i], related_pmids, fetch)

	print 'no match found'
	return [], {}, 0
Example #6
def find_eng_neighbour(inpword, c, data, lsh, soundex_eng):
    minhash = MinHash(num_perm=32)
    word = soundex_eng.soundex(inpword)
    test_i = word
    for d in word:
        d = d.encode("utf-8")
        minhash.update(d)

    results = lsh.query(minhash)
    min_res = 999
    indx = -1
    for i in results:
        res = edit_distance(data[i][0], test_i)
        if min_res > res:
            indx = i
            min_res = res
        if min_res == 0:
            break
    if indx == -1:
        return "nomatch"
    word = data[indx][0]
    #count = c.execute("select max(count) from dev_table where soundex = '"+word+"'")
    #a = list(count)
    # Parameterized query (assumes a DB-API driver with '?' placeholders, e.g. sqlite3)
    dev_word = c.execute("select eng from eng_table where soundex = ?", (word,))
    dev_word = list(dev_word)
    min_res = 999
    final_word = inpword  # fallback if the query returns no rows
    for word in dev_word:
        word = word[0]
        res = edit_distance(word, inpword)
        if min_res > res:
            final_word = word
            min_res = res
    return final_word
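The lsh index consumed by find_eng_neighbour above is not shown; a hedged sketch of how it might be built with datasketch (the names and data here are assumptions, not the original index-building code):

from datasketch import MinHash, MinHashLSH

# hypothetical rows of (soundex_code,) tuples, mirroring data[i][0] above
data = [('S530',), ('A140',), ('K200',)]
lsh = MinHashLSH(threshold=0.5, num_perm=32)
for i, (code,) in enumerate(data):
    m = MinHash(num_perm=32)
    for ch in code:
        m.update(ch.encode('utf-8'))
    lsh.insert(i, m)  # keys are list indices, as find_eng_neighbour expects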
def checkSentenceTri(sentence, size, ref, ref_tot):
    trueWord = ref
    totalNumberOfWords = ref_tot

    for a in range(0, size - 2):
        bestSuite = {}
        checkWord1 = sentence.split(' ')[a]
        checkWord2 = sentence.split(' ')[a + 1]
        checkWord3 = sentence.split(' ')[a + 2]
        checkWord = checkWord1 + ' ' + checkWord2 + ' ' + checkWord3
        bestSuite[checkWord] = []

        for word in trueWord:
            if checkWord in trueWord:
                bestSuite.pop(checkWord)
                break
            dist = edit_distance(word, checkWord)
            if 2 < dist <= 5:
                bestSuite[checkWord].append({word: int(trueWord[word])})

        for key in bestSuite:
            if (key == ''):
                break
            for dic_in_list in bestSuite[key]:
                for kay in dic_in_list:
                    print(
                        str(key) + " ---> " + str(kay) + " " +
                        str(dic_in_list[kay] / totalNumberOfWords))
    return
Example #8
def get_abstract_by_title(title):
	print '>>>>>>>>>>>>>>>>>>>>>>>>>>'
	print 'searching entry with title: ' + title
	fetch = metapub.PubMedFetcher()
	pmids = fetch.pmids_for_query(title)
	if (len(pmids) == 0):
		print 'warning: no entry retrieved for given title'
		return None, ''
	elif (len(pmids) == 1):
		article = fetch.article_by_pmid(pmids[0])
		if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract != None:
			print 'successfully matched title: ' + article.title
			return article.title, article.abstract
		else:
			print 'warning: found one entry but not a match'		
			return None, ''
	else:
		print 'warning: retrieved more than one entry for given title'
		for i in range(min(20, len(pmids))):
			article = fetch.article_by_pmid(pmids[i])
			if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract != None:
				print 'successfully matched title: ' + article.title
				return article.title, article.abstract
		print 'warning: no entry is a match'
		return None, ''
    def matches_author(self, string, fuzzy=False, distance_threshold=3):
        """
        This function retrieves from the KnowledgeBase possible authors that match the search string.
        None is returned if no matches are found.

        :param string: the string to be matched

        :param fuzzy: whether exact or fuzzy string matching should be applied

        :param distance_threshold: the maximum edit distance threshold (ignored if `fuzzy==False`)

        :return: a list of tuples, ordered by distance between the search and the matching string, where:
                tuple[0] contains the id (i.e. CTS URN) of the matching author
                tuple[1] contains a label of the matching author
                tuple[2] is the distance, measured in characters, between the search string and the matching string
                or None if no match is found.
        """
        #string = string.lower()
        author_matches, abbr_matches = [],[]

        if(not fuzzy):

            author_matches = [(id.split("$$")[0]
                            , self._author_names[id]
                            , len(self._author_names[id])-len(string))
                             for id in self._author_idx.searchAllWords(string)]

            abbr_matches = [(id.split("$$")[0]
                            , self._author_abbreviations[id]
                            , len(self._author_abbreviations[id])-len(string))
                            for id in self._author_abbr_idx.searchAllWords(string)]
        else:
            abbr_matches = [(id.split("$$")[0]
                            , self._author_abbreviations[id]
                            , edit_distance(string,self._author_abbreviations[id]))
                            for id in self._author_abbreviations
                            if edit_distance(string,self._author_abbreviations[id]) <= distance_threshold]

            abbr_matches = sorted(abbr_matches, key =itemgetter(2))
            author_matches = []

            for id in self._author_names:
                if(string.endswith(".")):
                    if string.replace(".","") in self._author_names[id]:
                        if(len(string) > (len(self._author_names[id]) / 2)):
                            try:
                                assert abbr_matches[0][2] == 0
                                distance = len(self._author_names[id]) - len(string)
                                if distance < 0:
                                    distance = 1
                                author_matches.append((id.split("$$")[0], self._author_names[id],distance))
                            except Exception:
                                author_matches.append((id.split("$$")[0], self._author_names[id], 0))
                        else:
                            if(edit_distance(string,self._author_names[id]) <= distance_threshold):
                                author_matches.append((id.split("$$")[0], self._author_names[id], edit_distance(string,self._author_names[id])))
                else:
                    if(edit_distance(string,self._author_names[id]) <= distance_threshold):
                        author_matches.append((id.split("$$")[0], self._author_names[id], edit_distance(string,self._author_names[id])))
def spell_correction(document, vocab):  ######################with suggestions
    with open(vocab, 'rb') as f:
        vocab = pickle.load(f)

    with open(document, 'rb') as f:
        rawtext = pickle.load(f)

    tokens = nltk.WordPunctTokenizer().tokenize(rawtext)
    tokens = [x for x in tokens if x]
    #print (tokens)
    #print(vocab)

    wrongwords = [word for word in tokens if word not in vocab]
    #print (wrongwords)
    error_location = [i for i, tok in enumerate(tokens) if tok in wrongwords]
    #print(wrongwords)

    suggestions = {}
    mindistances_sugg = {}

    bestmatch = {}
    for word in wrongwords:
        mindistances = []
        mindistances_word = []
        for v in vocab:

            if edit_distance(word, v) <= 4:
                mindistances.append(v)
                mindistances_word.append(edit_distance(word, v))

        suggestions[word] = mindistances
        try:
            mindistances_sugg[word] = min(mindistances_word)
        except ValueError:  # no candidate within edit distance 4
            pass
    #print(mindistances_word)

    for word in (wrongwords):
        dist = mindistances_sugg[word]
        key = [x for x in suggestions[word] if edit_distance(word, x) == dist]
        bestmatch[word] = key

        #print(bestmatch)

    big_table = list(zip(wrongwords, error_location, bestmatch.values()))

    df = pd.DataFrame(big_table)
    df.columns = ["wrongspellings", "location", "correction"]
    #print (error_location)
    #print(big_table)

    for word in wrongwords:
        print(word + "....?", "did you mean .........", suggestions[word])
        print(
            "best match ......", bestmatch[word],
            "please do humanity a favor and go to school boy ....................."
        )

    return df
Example #11
def match():
    second = session.attributes['lyric line']
    next_line, artist_song = read_songs()
    query = second.lower()
    q = min(artist_song.keys(), key=lambda x: edit_distance(x, query))
    dist = edit_distance(q, query)
    print(dist)
    if dist > 10:
        pass
    return question(str(artist_song[q][0]) + ': ' + str(artist_song[q][1]))
Example #12
def fixstring(string):
    # no string, return
    if len(string) == 0:
        return string
    # split words in string
    else:
        stringlist = string.split()

    # add stuff to new string
    newstring = ''
    old_st = ''
    for st in stringlist:
        # drop repeats
        if st == old_st: continue
        old_st = st
        # determine if real word
        if wordnet.synsets(st):
            newstring += st + ' '
            continue
        # determine if almost a real word
        try:
            fixword = spell(st)
            if wordnet.synsets(fixword):  # word is now real
                # only keep words that are 3 or less edits apart from real
                if edit_distance(st, fixword) < 4:
                    newstring += fixword + ' '
                continue
        except:
            pass
        # determine if number
        try:
            float(st)
            # only keep smaller numbers
            if len(st) < 4:
                newstring += st + ' '
            continue
        except:
            pass
        # determine if money or percent
        if '$' in st or '%' in st:
            money = st.replace('$', '').replace('%', '')
            try:
                float(money)
                newstring += st + ' '
                continue
            except:
                pass
        # if proper noun, keep
        if edit_distance(st, st.lower()) == 1:
            if all(char.isalpha() for char in st):
                newstring += st + ' '
            continue
    # return fixed string
    return newstring
Example #13
def next_line():
    second = session.attributes['lyric line']
    next_line, artist_song = read_songs()
    query = second.lower()
    q = min(artist_song.keys(), key=lambda x: edit_distance(x, query))
    dist = edit_distance(q, query)
    print(dist)
    if dist > 10:
        pass
    msg = next_line[q]
    return question(msg)
Example #14
def merge(raw_dbr, mention_1, mention_2):
    dbr = raw_dbr.lower()
    if mention_1 is None or type(mention_1) is not str:
        return mention_2
    if mention_2 is None or type(mention_2) is not str:
        return mention_1
    ed_1 = edit_distance(mention_1.lower(), dbr)
    ed_2 = edit_distance(mention_2.lower(), dbr)
    ret = mention_2
    if ed_1 < ed_2:
        ret = mention_1
    return ret
Example #15
def closest_word(word, vocab, threshold=5, sub_thres=2):
    '''Finds closest word in the vocabulary (w.r.t. edit distance)

    Returns 2 words if no closest word found
    '''
    best_word = word
    best_dist = float("inf")
    prefix_len_best = float("inf")
    for vocab_word in vocab:
        curr_dist = edit_distance(word, vocab_word)
        if curr_dist < best_dist:
            best_dist = curr_dist
            best_word = vocab_word
            prefix_len_best = len(os.path.commonprefix([word, vocab_word]))
        elif curr_dist == best_dist and abs(len(best_word) - len(word)) > abs(
                len(vocab_word) - len(word)):
            prefix_len_vocab = len(os.path.commonprefix([word, vocab_word]))
            if prefix_len_best < prefix_len_vocab:
                best_word = vocab_word
                prefix_len_best = prefix_len_vocab
    if best_dist > threshold:  # margin of error is sub_thres for each subword
        for i in range(len(word) - 1):
            word1 = word[:i + 1]
            word2 = word[i + 1:]
            curr_dist = float("inf")
            vocab_word1 = word1
            for vocab_word in vocab:
                if word1 == vocab_word:
                    vocab_word1 = vocab_word
                    curr_dist = 0
                    break
                dist1 = edit_distance(word1, vocab_word)
                if dist1 < curr_dist:
                    vocab_word1 = vocab_word
                    curr_dist = dist1
            vocab_word2 = word2
            if curr_dist <= sub_thres:
                curr_dist2 = float("inf")
                for vocab_word in vocab:
                    if word2 == vocab_word:
                        vocab_word2 = vocab_word
                        curr_dist2 = 0
                        break
                    dist2 = edit_distance(word2, vocab_word)
                    if dist2 < curr_dist2:
                        vocab_word2 = vocab_word
                        curr_dist2 = dist2
                curr_dist += curr_dist2
                if curr_dist < best_dist:
                    best_word = vocab_word1 + ' ' + vocab_word2
                    best_dist = curr_dist
    return best_word
Example #16
class SpellingReplacer(object):
    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
    def replace(self, word):
        if self.spell_dict.check(word):
            return word

        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return [sugst for sugst in suggestions if edit_distance(word,sugst) <= self.max_dist]
        else:
            return word
Example #17
def asciiSpell(word):
    spell_dict = enchant.Dict('en_US')
    max_dist = 2
    if spell_dict.check(word):
        return word
    suggestions = sorted(spell_dict.suggest(word),
                         key=lambda sugg: edit_distance(sugg, word) * 0
                         if sameletters(word, sugg) else 1)
    try:
        if edit_distance(suggestions[0], word) <= max_dist:
            return suggestions[0]
    except IndexError:  # no suggestions returned
        pass
    return word
Example #18
def searchEvidenceByTitle(request):
    if request.method == 'POST':
        data = json.loads(request.body)
        collection_id = data['collection_id']
        title = data['title']
        result_limit = data['result_limit']
        include_personal = data['include_personal']
        user_id = data['user_id']
        # DONE: we can alternatively change this to treat given title as a series of separated terms
        title_terms = title.split(' ')
        print title_terms
        evidence = Evidence.objects.filter(Q(created_by=collection_id)&reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms]))
        if include_personal:
            personal_evidence = Evidence.objects.filter(Q(created_by=user_id)&reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms]))
            evidence = chain(evidence, personal_evidence)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        pprint.pprint(evidence)
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        print 'result limit'
        print result_limit
        evidence = sorted(evidence, key=lambda e:e['dist'])[:result_limit]
        for e in evidence:
            e['topic'] = -1
            try:
                e['topic'] = EvidenceTopic.objects.get(evidence=e['id']).primary_topic
            except ObjectDoesNotExist:
                if len(e['abstract']) > 50:
                    name = Collection.objects.get(collection_id=collection_id).collection_name
                    topic_dist, primary_topic_terms = TopicModeler.get_document_topics(e['abstract'], name)
                    primary_topic_tuple = max(topic_dist, key=lambda x:x[1])
                    e['topic'] = primary_topic_tuple[0]
                else:
                    print 'warning: evidence with no topic'
        return HttpResponse(json.dumps(evidence), status=status.HTTP_200_OK)

    elif request.method == 'GET':
        collection_id = 13
        title = 'UpSet: Visualization of Intersecting Sets'
        evidence = Evidence.objects.filter(created_by=collection_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        evidence = sorted(evidence, key=lambda e:e['dist'])
        return HttpResponse(json.dumps(evidence[:20]), status=status.HTTP_200_OK)
Example #19
def fun_1_5_1():
    # Edit-distance implementation in pure Python
    def _edit_dist_init(len1, len2):
        lev = []
        for i in range(len1):
            lev.append([0] * len2)  # initialize 2D array to zero
        for i in range(len1):
            lev[i][0] = i  # column 0:0,1,2,3,4,......
        for j in range(len2):
            lev[0][j] = j  # row 0:0,1,2,3,4,......
        return lev

    def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
        c1 = s1[i - 1]
        c2 = s2[j - 1]
        # skipping a character in s1
        a = lev[i - 1][j] + 1
        # skipping a character in s2
        b = lev[i][j - 1] + 1
        # substitution
        c = lev[i - 1][j - 1] + (c1 != c2)
        # transposition
        d = c + 1  # never picked by default
        if transpositions and i > 1 and j > 1:
            if s1[i - 2] == c2 and s2[j - 2] == c1:
                d = lev[i - 2][j - 2] + 1
        # pick the cheapest
        lev[i][j] = min(a, b, c, d)

    def edit_distance(s1, s2, transpositions=False):
        # set-up a 2-D array
        len1 = len(s1)
        len2 = len(s2)
        lev = _edit_dist_init(len1 + 1, len2 + 1)
        # iterate over the array
        for i in range(len1):
            for j in range(len2):
                _edit_dist_step(lev,
                                i + 1,
                                j + 1,
                                s1,
                                s2,
                                transpositions=transpositions)
        return lev[len1][len2]

    import nltk
    from nltk.metrics import edit_distance
    print edit_distance('relate', 'relation')
    print edit_distance("suggestion", "calculation")
Example #20
    def __match_distance__(self):
        '''
        Match node lemma with lemmas in text of shortest string distance
        :return:
        '''
        fnodes = filter(lambda node: node not in self.solved, self.nodes)
        for node in fnodes:
            lemma = self.nodes[node]['lemma']

            # get the position of the nodes already solved
            order_ids = map(lambda node: self.nodes[node]['order_id'], self.solved)
            candidates = filter(lambda x: x.i not in order_ids, self.doc)
            candidates = map(lambda x: (x, edit_distance(x.lemma_, lemma)), candidates)
            candidates.sort(key=lambda x: x[1])

            if len(candidates) > 0:
                order_id, realization = candidates[0][0].i, unicode(candidates[0][0])
                self.nodes[node]['order_id'] = order_id
                self.nodes[node]['realization'] = realization #+ u'_dist'

                # add in lexicon
                # self.__add_lexicon__(node, realization)
            else:
                self.nodes[node]['order_id'] = -1
                self.nodes[node]['realization'] = lemma #+ u'_dist'

            self.solved.append(node)
Example #21
    def replace(self, word):
        if self.spell_dict.check(word):
            return word

        suggestions = []
        suggestions = self.spell_dict.suggest(word)

        distance = []
        print(distance)
        print(suggestions)

        retVal = ""
        for suggestedWord in suggestions:
            distance.append(edit_distance(word, suggestedWord))

        print(distance)
        lengthMatched = False

        if distance and min(distance) <= self.max_dist:
            retVal = suggestions[distance.index(min(distance))]

            i = 0
            for ed in distance:
                if ed == min(distance):
                    if len(word) == len(suggestions[i]) and not lengthMatched:
                        retVal = suggestions[i]
                        lengthMatched = True
                i += 1
        else:
            retVal = word

        return retVal
    def _GetScore(self, query, match):
        """Custom edit-distance based scoring."""
        str_query = str(query)
        str_candidate = str(match.key)
        dist = float(edit_distance(str_query, str_candidate))
        max_len = float(max(len(str_query), len(str_candidate)))
        return (max_len - dist) / max_len
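A worked illustration of the normalized score computed by _GetScore above, assuming nltk's edit_distance (hypothetical strings):

from nltk.metrics import edit_distance

q, cand = 'color', 'colour'
dist = float(edit_distance(q, cand))        # 1.0
max_len = float(max(len(q), len(cand)))     # 6.0
print((max_len - dist) / max_len)           # about 0.83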
Example #23
def string_matching(label1, label2): #by Maedchen and Staab
    """ (string, string) -> float
    
Return the coefficient of similarity between two sequence of strings based on
the Levenshtein distance (edit distance). It equates 1 for exact match and
0 to no similarity.
>>> string_matching('power','power')
1.0
>>> string_matching('power','abba')
0.0
"""
    sm = float(
        min(len(label1),len(label2)) - 
        edit_distance(label1, label2)
        ) / min(len(label1),len(label2)
        )    
    try:
        if sm < 0:
            return 0.0
        else:
            return sm
    except:
        print "Error found:"
        traceback.print_exc(file=sys.stdout)
        return 0
Example #24
    def process(self, statement):

        ed = 9999

        from chatterbot.conversation import Statement

        res = str(statement.text).split()
        options = []
        strr = ""

        with open('dict.csv', 'rb') as csvvfile:
            csvreader = csv.reader(csvvfile, delimiter=str(','))
            print "Word to be corrected is " + res[0]
            for row in csvreader:
                for col in row:
                    k = edit_distance(res[0], col)
                    #print "Word compared is "+str(col)
                    #print "Edit distance is "+str(k)
                    if (k < 3):
                        options.append(col)
                        aux = strr.split()
                        if col not in aux:
                            strr = strr + " " + col
                            print col

        response = Statement(
            "The query you entered seems wrong. Try possible options like " +
            strr)
        response.remove_response(
            "The query you entered seems wrong. Try possible options like " +
            strr)
        response.confidence = 1
        return response
Example #25
def fuzzy_comparison(tokens_1,tokens_2,max_dist=1):
  """ compares the tokens based on fuzzy match """
  matched = 0
  # init_term_1 / init_term_2 are assumed to be module-level counts of the
  # original token-list lengths (not shown in this snippet)
  matched_len_1 = init_term_1 - len(tokens_1)
  matched_len_2 = init_term_2 - len(tokens_2)

  for token in reversed(tokens_1):
    if len(token)<=2:
      tokens_1.remove(token)
      continue
    for tkn in reversed(tokens_2):
      if len(tkn)<=2:
        tokens_2.remove(tkn)
        continue
      if metrics.edit_distance(token, tkn) <= max_dist:
        matched = matched + 1
        logging.debug("Match found for:"+token+" - "+tkn)
        tokens_2.remove(tkn)
        tokens_1.remove(token)
        break

  logging.info("Fuzzy match count:"+str(matched))
  score_1 = (matched_len_1 + matched)/float(init_term_1)
  score_2 = (matched_len_2 + matched)/float(init_term_2)
  return score_1,score_2
    def process_spell_errors(self, query):
        """
    Process the query string and replace spell errors with words from the
    corpus / english dictionary.

    query: A query string.

    """
        if config_params['spell_check']:
            split_query = query.split()
            result = []
            words_list = set(words.words()).union(data_dict['word_corpus'])

            for word in split_query:
                if word not in words_list and '*' not in word:
                    print(colorize.magenta("%s is not in dict" % word))
                    #process
                    words_distance = zip(
                        words_list,
                        map(lambda x: edit_distance(word, x), words_list))
                    best_word = reduce(lambda x, y: x if x[1] <= y[1] else y,
                                       words_distance)[0]
                    word = best_word
                    print(colorize.green("replaced with %s" % word))
                result.append(word)
            query = result
            return " ".join(query)
        return query
Example #27
def spellChecker(sentences, file_name_s):
    dict_name = 'en_GB'
    spell_dict = enchant.Dict(dict_name)
    max_dist = 3
    corrected = []
    csv_writer = csv.writer(open(file_name_s, 'wb'))
    #csv_writer.writerow(HEADER2)
    for sentence in sentences:
        corrected_sent = ''
        sentence = str(sentence)
        sc = set(["[", "]", "'", '"'])
        words = ''.join([c for c in sentence if c not in sc])
        words = words.split()
        #print words
        for word in words:
            print word
            suggestions = spell_dict.suggest(word)
            #print suggestions[0]
            #print edit_distance(word, suggestions[0])
            if suggestions and edit_distance(word, suggestions[0]) <= max_dist:
                #print word
                corrected_sent = corrected_sent + " " + suggestions[0]
            else:
                corrected_sent = corrected_sent + " " + word
                corrected_sent.replace("[", "")
                corrected_sent.replace("]", "")
                corrected_sent.replace("'", "")
            #print corrected_sent
        corrected.append(corrected_sent)
        csv_writer.writerow([corrected_sent])
    print corrected
    def _GetScore(self, query, match):
        """Custom edit-distance based scoring."""
        str_query = str(query)
        str_candidate = str(match.key)
        dist = float(edit_distance(str_query, str_candidate))
        max_len = float(max(len(str_query), len(str_candidate)))
        return (max_len - dist) / max_len
Example #29
def get_hosts_helper(tweets):
    host_re = re.compile('host [A-Z][a-z]* [A-Z][a-z]*')
    all_hosts = dict()
    tweets = tweets.__dict__
    for key, tweetObj in tweets.items():
        # nltk.download('punkt')
        # words = nltk.word_tokenize(tweet)
        # host_index = words.index("host")
        # return words[host_index + 1] + " " + words[host_index + 2]
        tweet = ' '.join(tweetObj.words)
        possible_host_match = host_re.search(tweet)
        possible_host = ''
        if possible_host_match:
            possible_host = tweet[possible_host_match.start() + 5 : possible_host_match.end()]
            if possible_host in all_hosts:
                all_hosts[possible_host] = all_hosts[possible_host] + 1
            else:
                all_hosts[possible_host] = 1
            gg_reactions.extract_reaction('hosts', 'host', ' '.join(tweetObj.words))

    top_hosts = (sorted(all_hosts.items(), key=lambda x: x[1], reverse=True))[:2]
    most_likely_host = [top_hosts[0][0]]

    dist = edit_distance(most_likely_host[0].lower(), top_hosts[1][0].lower())
    relative_mention_amount = top_hosts[1][1] / top_hosts[0][1]

    if dist >= 5 and relative_mention_amount > 0.60:
        most_likely_host.append(top_hosts[1][0])

    return most_likely_host
Example #30
    def replace(self, word):
        if self.spell_dict.check(word):
            return word

        suggestions = []
        suggestions = self.spell_dict.suggest(word)

        distance = []
        print(distance)
        print(suggestions)

        retVal = ""
        for suggestedWord in suggestions:
            distance.append(edit_distance(word, suggestedWord))

        print(distance)
        lengthMatched = False

        if distance and min(distance) <= self.max_dist:
            retVal = suggestions[distance.index(min(distance))]

            i = 0
            for ed in distance:
                if ed == min(distance):
                    if len(word) == len(
                            suggestions[i]) and not lengthMatched:
                        retVal = suggestions[i]
                        lengthMatched = True
                i += 1
        else:
            retVal = word

        return retVal
Example #31
def correctSpell(word):
    suggestions = cf.hobj.suggest(word)
    if len(suggestions) != 0:
        distance = [edit_distance(word, s) for s in suggestions]
        return suggestions[distance.index(min(distance))]
    else:
        return word
Example #32
def spellChecker(sentences, file_name_s):
   dict_name  = 'en_GB'
   spell_dict = enchant.Dict(dict_name)
   max_dist   = 3
   corrected  = []
   csv_writer = csv.writer(open(file_name_s, 'wb'))
   #csv_writer.writerow(HEADER2)
   for sentence in sentences:
      corrected_sent = ''
      sentence = str(sentence)
      sc = set(["[", "]", "'", '"'])
      words = ''.join([c for c in sentence if c not in sc])
      words = words.split()
      #print words
      for word in words:
         print word
         suggestions = spell_dict.suggest(word)
         #print suggestions[0]
         #print edit_distance(word, suggestions[0])
         if suggestions and edit_distance(word, suggestions[0]) <= max_dist:
            #print word
            corrected_sent = corrected_sent + " " + suggestions[0]
         else:
            corrected_sent = corrected_sent + " " + word
            corrected_sent.replace("[","")
            corrected_sent.replace("]","")
            corrected_sent.replace("'","")
         #print corrected_sent
      corrected.append(corrected_sent)
      csv_writer.writerow([corrected_sent])
   print corrected
Example #33
    def replace(self, word):
        if self.spell_dict.check(word):
            return word

        distance = []
        suggestions = []
        suggestions = self.spell_dict.suggest(word)

        retVal = ""
        for suggestedWord in suggestions:
            distance.append(edit_distance(word, suggestedWord))

        if distance and min(distance) <= self.max_dist:
            retVal = suggestions[distance.index(min(distance))]

            i = 0
            for ed in distance:
                if ed == min(distance):
                    if len(word) == len(suggestions[i]):
                        retVal = suggestions[i]
                        break
                i += 1
        else:
            retVal = word

        return retVal
Example #34
def get_top_from_edit_distance(word, suggested_words):
    """
    Based on edit distance in counts the best candidates to replace
    :param word:
    :param suggested_words:
    :return: top-10 closest to original word
    """
    top = {}

    for suggested_word in suggested_words:
        value = edit_distance(word, suggested_word)
        if value not in top:
            top[value] = [suggested_word]
        else:
            if suggested_word not in top[value]:
                top[value] += [suggested_word]

    sorted_top = dict(sorted(top.items(), key=lambda x: x[0]))

    for k, v in sorted_top.items():
        sorted_top[k] = sorted(v, key=lambda x: abs(len(word) - len(x)))

    result = []

    for elements_array in sorted_top.values():
        result += elements_array

    return result[:10]
    def replace(self, word):
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word,
                                         suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word
def typo(addr, tar_str):
    tStart = time.time()
    file_js = dict()
    with open(addr, 'r') as f_stream:
        f_str = f_stream.read()
        if is_json(f_str):
            file_js = json.loads(f_str)

    inv_idx = file_js['tokens_o']
    tar_str = normalize_word(tar_str)
    result_li = []
    for it in inv_idx:
        result_li.append((edit_distance(tar_str, normalize_word(it)), it))
    result_li = sorted(result_li)

    return_li = []
    num_res = len(result_li)
    lim = min(5, num_res)

    for idx in range(lim):
        return_li.append({'dist': result_li[idx][0], 'str': result_li[idx][1]})

    tEnd = time.time()
    print("\n typo \nIt cost %f sec" % (tEnd - tStart))
    return return_li
Example #37
def edit_dis(sent_list):
    """Retrieves the most similar question in the test data for each user question"""

    # retrive dataframe
    df = generate_pairs()

    # convert to list for modeling
    x = df.Question.tolist()
    y = df.Answer.tolist()

    predicted = []

    for u_sent in sent_list:

        x_index = -1  # initialize index for tracking index of similar question
        ini_val = 1000  # initialize edit distance for similar question
        
        for i in range(len(x)):
            # calculate edit distance
            val_dis = edit_distance(u_sent[0].split(), x[i].split())
            if(val_dis < ini_val):
                ini_val = val_dis
                x_index = i
        predicted.append(y[x_index])
    
    return predicted
Example #38
    def service_tag(self, text, print_word=False):
        '''
        text: string input
        output: 0 or 1
        '''
        if self.tagger is None:
            self.tagger = []
            try:
                with open(
                        os.path.abspath('utils') +
                        '/DictionaryUtils/service_tagger.txt', 'r') as fp:
                    data = fp.read().lower()
                self.tagger = set(data.split('\n'))
            except:
                print('Warning: Service_tagger.txt not read')
                pass
            self.tagger = set(self.tagger)

            # drop empty entries left by blank lines in the dictionary file
            self.tagger.discard('')
            self.tagger.discard(' ')

        k = text.split()
        for w in k:
            for wrd in self.tagger:
                x = edit_distance(w.lower(), wrd)
                if x <= 1:
                    if print_word:
                        print(wrd)
                    return 1
        return 0
Example #39
def pun(sent, cat):
    """""
        THIS IS THE FUNCTION YOU HAVE TO WRITE
        It takes an expression and a category as input,
        Chooses a word in the expression,
        Find a word related to the category that sounds similar,
        replace the word chosen by this similar sounding word,
        to build a new expression
    """ ""
    #first slice the string and choose a word in it, maybe you need to clean it from punctuation?
    sent = re.sub(r'[^\w\s]', '',
                  sent)  #everything that is not a word - replace with nothing
    sent = sent.split()  #split into words
    word_of_interest = sent[2]

    #second, load the category as a list- drink or food
    cat_list = category(cat)
    print(cat_list)

    #careful, not all words are in the spelling dictionary, make sure to only keep those that are
    if word_of_interest not in arpabet:
        print("word not found in dictionary")
        return False

    #third, translate the list of words into a list of their phonetic representation
    translated_category = [pronounce(word) for word in cat_list]

    #fourth, create a list of distances (use the edit_distance function from nltk)
    distance_list = [
        edit_distance(pronounce(word_of_interest), phones)
        for phones in translated_category
    ]
    print(distance_list)
Example #40
    def replace_word(self, word):
        if self.dictionary.check(word):
            return word

        suggestions = self.dictionary.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word
Example #42
def check_replace_word(word):
    if spell_dict.check(word):
        return word
    suggestions = spell_dict.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) < 2:
        return suggestions[0]
    else:
        return word
def spell_check(r, a, s, scores, weight=1):
    change = weight*(1-(edit_distance(r, a)/float(max(len(r), len(a)))))
    if s in scores:
        # penalty for returning multiple of the same result when
        # one instance is incorrectly spelled
        return (scores[s] + change)/2.0
    else:
        return change
    def ordered_content_distance(self, sentence, normalized=True):
        """Normalized levenshtein distance on (ordered) content words
        between `self` and `sentence`."""

        self_content_words = self.content_words
        sentence_content_words = sentence.content_words
        distance = edit_distance(self_content_words, sentence_content_words)
        norm = max(len(self_content_words), len(sentence_content_words))
        return distance / norm if normalized else distance
    def raw_distance(self, sentence, normalized=True):
        """Normalized levenshtein distance between `self.text` and
        `sentence.text`."""

        self_text = self.text
        sentence_text = sentence.text
        distance = edit_distance(self_text, sentence_text)
        norm = max(len(self_text), len(sentence_text))
        return distance / norm if normalized else distance
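A standalone sketch of the normalized distance on word lists, as in ordered_content_distance above (hypothetical token lists; nltk's edit_distance accepts any sequences):

from nltk.metrics import edit_distance

a = ['the', 'cat', 'sat']
b = ['the', 'dog', 'sat', 'down']
print(edit_distance(a, b) / max(len(a), len(b)))  # 2/4 = 0.5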
def get_string_similarity(p_token, h_token):
    distance = edit_distance(h_token, p_token)
    max_length = max(len(h_token), len(p_token))
    score = 0
    if max_length > 2:
        score = 1 - (distance / (max_length - 1.99999999999999))
    #if score > 1:
        #logging.warning('score > 1 for %s, %s' % (p_token, h_token))
    return max(0, score)
	def similar(self, word):
		names = self.table_names() + self.column_names() + self.row_names()
		best = 100
		best_word = None
		for name in names:
			dist = edit_distance(name, word)
			if dist <= best:
				best,best_word = dist,name
				#print "Best word: " + best_word + " for " + word + ". Distance: " + str(dist)
		return best_word
Example #48
def spell_correct(unigrams, Dict):
    for i, raword in enumerate(unigrams):
        if not (raword == "" or raword[0] == '@' or raword[0] == '#'):

            #Type error
            suggestions = Dict.suggest(raword)
            if suggestions and not Dict.check(raword):
                if edit_distance(suggestions[0], raword) < 2:
                    unigrams[i] = suggestions[0]
    return unigrams
Example #49
def chug():
    for title in dir_ocr:
        with open(ocr + title, "r") as o_open:
            with open(lines, "r") as l_open:
                # lists of lines for each doc.
                o_open_r = o_open.readlines()
                l_open_r = l_open.readlines()
                tot_o_line = len(o_open_r)
                tot_l_line = len(l_open_r)
                o_line = 0
                for o in o_open_r:
                    # strip ocr lines of punctuation/whitespace
                    d = {}
                    o_1 = p.depunc(o.decode("utf-8"))
                    l_line = 0
                    o_line += 1
                    for l in l_open_r:
                        # strip 'known' lines of punctuation/whitespace
                        l_1 = p.depunc(l.decode("utf-8"))
                        # ignore ocr lines with few characters, but still count the line
                        if len(o_1) < 4:
                            l_line += 1
                        # skip ocr lines shorter than half or longer than 1.5 times the reference 'known' line (does this improve performance?)
                        elif len(o_1) < 0.5 * len(l_1) or len(o_1) > 1.5 * len(l_1):
                            l_line += 1
                        # compare ocr and known lines, get a similarity value between 0(not similar) and 1 (exact match), insert line pairs into dictionary
                        else:
                            l_line += 1
                            x = len(o_1) + len(l_1)
                            dist = (x - metrics.edit_distance(o_1, l_1)) / float(x)
                            d[
                                '"'
                                + str(
                                    title
                                    + "| "
                                    + str(o_line)
                                    + '","'
                                    + o.rstrip("\n")
                                    + '","'
                                    + "line: "
                                    + str(l_line)
                                    + '","'
                                    + l.rstrip("\n")
                                    + '"'
                                )
                            ] = dist
                            # keep the top score in the dictionary for each ocr line. Append to file.
                    if len(d) > 0 and (max(d.values())) > 0.85:
                        m = d.keys()[d.values().index(max(d.values()))]
                        f = open(output, "a")
                        f.write(str(m) + "," + str((max(d.values()))) + "\n")
                        print str(m).decode("utf-8") + ",", (max(d.values()))
            l_open.close()
        o_open.close()
    f.close()
Example #50
    def between(a, b):
        """Returns the edit distance between two strings.

        >>> EditDistance.between('abc', 'abc')
        0
        >>> EditDistance.between('abc', 'def')
        3
        >>> EditDistance.between('abcd', 'abef')
        2
        """
        return edit_distance(a, b)
Example #51
def main(argv):
  inputfile = ''
  inputheader = ''
  try:
     opts, args = getopt.getopt(argv,"hi:d:",["ifile=","iheader="])
  except getopt.GetoptError:
     print 'test.py -i <inputfile> -d <inputheader>'
     sys.exit(2)
  for opt, arg in opts:
     if opt == '-h':
        print 'test.py -i <inputfile> -d <inputheader>'
        sys.exit()
     elif opt in ("-i", "--ifile"):
        inputfile = arg
     elif opt in ("-d", "--iheader"):
        inputheader = arg
  print inputfile
  header = '##engine '
  body = ''
  tree = ET.parse(inputfile)
  root = tree.getroot()
  
  for headernode in root.findall('header'):
    for tool in headernode.iter('tool'):
      body = body + tool.attrib["engine"] + ' '
    #body = body + tool.attrib["engine"] + ' '
  
  for word in root.iter('word'):
    original = word.find('original').text
    header = header + original + ' '
    status = word.find('status').text
    if status == 'SplErr':
      expected = word.find('expected').text
      # a is the misspelled word
      a = original
      # c is the reference word
      c = expected
      count = 0
      total = 0
      for suggestion in word.iter('suggestion'):
        # b is one of the suggestions offered 
        # by the spell checker engine
        b = suggestion.text
        count = count + 1
        total = total + round(edit_distance(a, b) / float(len(c)), 2)
      result = total / count
      body = body + str(round(result,3)) + ' '
    else:
      body = body + '0 '
  
  f = open('test.dat', 'a')
  if (inputheader == 'true'):
    f.write(header+'\n')
  f.write(body+'\n')
Example #52
def str_common_word(str1, str2):
    str1, str2 = str1.lower(), str2.lower()
    words, cnt, words2 = str1.split(), 0, str2.split()
    for word in words:
        if len(words2) < 10 and len(words) < 4:
            for word2 in words2:
                if edit_distance(word, word2, transpositions=False) <= 1:
                    cnt += 1
        else:
            if str2.find(word) >= 0:
                cnt += 1
    return cnt
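A short illustration of the transpositions flag used above (nltk's edit_distance; hypothetical words):

from nltk.metrics import edit_distance

print(edit_distance('form', 'from'))                        # 2: two substitutions
print(edit_distance('form', 'from', transpositions=True))   # 1: one adjacent swap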
def spellcheck(wordtoken):
  if wordtoken == "":
    return wordtoken

  if DICT.check(wordtoken) == False:
    suggestions = DICT.suggest(wordtoken)
    if suggestions:
      for suggestion in suggestions:
        if edit_distance(wordtoken, suggestion) <= 2:
          return suggestion

  return wordtoken
Example #54
def str_common_word(str1, str2):
    words, cnt = str1.split(), 0
    for word in words:
        if str2.find(word)>=0:
            cnt+=1
        # new for edit distance
        if cnt == 0 and len(word)>3:
            s1 = [z for z in list(set(str2.split(" "))) if abs(len(z)-len(word))<2]
            t1 = sum([1 for z in s1 if edit_distance(z, word)<2])
            if t1 > 1:
                cnt+=0.5
    return cnt
Example #55
def getOrthographicVariants(word, limit=20):
    """Use flookup and the orthographicvariation FST to get possible alternate
    spellings/transcriptions of the word.  Return these ranked by their minimum
    edit distance from the word.

    """

    print '\n\n\nTRYING TO GET VARIANTS FOR: %s\n\n\n' % word

    # Check to see if we have the orthographic variation FST file
    if orthographicVariationBinaryFileName not in os.listdir(parserDataDir):
        return []

    # Check to see if the nltk module is installed
    try:
        from nltk.metrics import edit_distance
    except ImportError:
        return []

    # Get variants from flookup
    word = u'#%s#' % word
    orthographicVariationBinaryFilePath = os.path.join(
        parserDataDir, orthographicVariationBinaryFileName)
    process = subprocess.Popen(
        ['flookup', '-x', '-i', orthographicVariationBinaryFilePath],
        shell=False,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.stdin.write(word.encode('utf-8'))
    result = unicode(process.communicate()[0], 'utf-8').split('\n')
    #print 'Number of results from flookup: %d' % len(result)

    # Remove results that are too long or too short
    margin = 2
    if len(result) > 1000:
        margin = 1
    result = [x for x in result
              if len(x) < len(word) + margin and len(x) > len(word) - margin]
    #print 'Number of results needing edit distancing: %d' % len(result)

    # Sort variants by minimum edit distance
    result = [(x, edit_distance(word, x)) for x in result]
    result.sort(key=lambda x: x[1])

    # Take only the top <limit> # of results
    result = result[:limit]

    # Remove the first result if it has a MED of 0
    if result[0][1] == 0:
        result = result[1:]
    result = [x[0][1:-1] for x in result if x] # Remove hash symbols
    return result
Example #56
def get_sim(analys_dict):
    
    sim_dict = {}
    name_list = analys_dict.keys()
    
    for name1 in name_list:
        for name2 in name_list:
            dist = edit_distance(name1,name2)
            if(dist < 3 and dist > 0):
                sim_dict[analys_dict[name1]]= analys_dict[name2]
                name_list.remove(name2)
                print "%d %s : %d %s" % (analys_dict[name1], name1, analys_dict[name2], name2)
    return sim_dict
Example #57
def min_distance_one(original, find):
    #check the colour and name
    if original is None:
        return None
    new = None
    min_dist = 100
    for j in find:
        distance = edit_distance(original, j, transpositions=False)
        if distance < min_dist:
            min_dist = distance
            new = j
    #print "Name we are looking for: %s. It is similar to: %s"%(new, original)
    return  new
Example #58
def get_close_matches(word, possibilities, n=3, cutoff=None, case_sensitive=False):
    """
        Given a string and a list of strings to lookup, returns a list of
        pairs of the n-closest strings (according to the edit-distance),
        and their respective distances from 'word'.
        If cutoff is given, the returned list will contain only the strings
        that are closer than the cutoff.
    """
    try:
        from nltk.metrics import edit_distance
    except ImportError:
        from Levenshtein import distance as edit_distance
    
    hits = []
    for possibility in possibilities:
        if case_sensitive:
            d = edit_distance(word, possibility)
        else:
            d = edit_distance(word.lower(), possibility.lower())
        if cutoff is None or d < cutoff:
            hits.append((possibility, d))

    return sorted(hits, key=lambda x: x[1])[:n]
Example #59
    def misspell_distance(self, word):
        if self.spell_dict.check(word):
            return 0
        if word.isdigit():
            return 0
        suggestions = self.spell_dict.suggest(word)

        if suggestions:
            # print >> sys.stderr, "%r => %r" % (word, suggestions[0])
            return edit_distance(word, suggestions[0])
        else:
            return self.max_dist