def get_del_ins_num(errs, corrs):
    """Count deletions and insertions by walking back through the DP matrix."""
    i, j, dp = levenshtein_distance(errs, corrs)
    del_num, ins_num = 0, 0
    while i > 0 or j > 0:
        # At a border of the matrix only one move is possible.
        if i == 0:
            min_idx = 2  # only insertions remain
        elif j == 0:
            min_idx = 1  # only deletions remain
        else:
            dp_val = [dp[i - 1][j - 1], dp[i - 1][j], dp[i][j - 1]]
            min_idx = dp_val.index(min(dp_val))

        if min_idx == 0 and dp[i][j] == dp[i - 1][j - 1]:
            # Match: the characters agree, no edit.
            i -= 1
            j -= 1
        elif min_idx == 0:
            # Substitution: counted as one deletion plus one insertion.
            del_num += 1
            ins_num += 1
            i -= 1
            j -= 1
        elif min_idx == 1:
            # Deletion from errs.
            del_num += 1
            i -= 1
        else:
            # Insertion from corrs.
            ins_num += 1
            j -= 1
    return del_num, ins_num
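# A minimal sketch (assumption): get_del_ins_num() above and align_err_cor()
# below expect a levenshtein_distance() that returns the final indices together
# with the full DP matrix, i.e. (len(errs), len(corrs), dp).  The implementation
# actually used in these examples is not shown; this stand-in only illustrates
# the expected shape, with unit costs for insert/delete/substitute.
def levenshtein_distance_sketch(errs, corrs):
    n, m = len(errs), len(corrs)
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if errs[i - 1] == corrs[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # match / substitution
    return n, m, dp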
Example #2
def average_levenshtein(predicted_plain: List[str],
                        correct_plain: List[str]) -> float:
    total_distance = 0

    for predicted, correct in zip(predicted_plain, correct_plain):
        total_distance += levenshtein_distance(correct, predicted)
    return total_distance / len(correct_plain)
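# A quick usage sketch (assumption: this variant of levenshtein_distance
# returns a plain integer distance rather than the (i, j, dp) tuple):
# average_levenshtein(["helo wrld", "ther"], ["hello world", "there"])
# -> (2 + 1) / 2 == 1.5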
def align_err_cor(errs, corrs):
    """Align the two sequences; gaps are padded with None on either side."""
    i, j, dp = levenshtein_distance(errs, corrs)

    err_vals, corr_vals = [], []
    while i > 0 or j > 0:
        # At a border of the matrix only one move is possible.
        if i == 0:
            min_idx = 2
        elif j == 0:
            min_idx = 1
        else:
            dp_val = [dp[i - 1][j - 1], dp[i - 1][j], dp[i][j - 1]]
            min_idx = dp_val.index(min(dp_val))

        if min_idx == 0:
            # Match or substitution: align the two elements with each other.
            i -= 1
            j -= 1
            corr_vals.append(corrs[j])
            err_vals.append(errs[i])
        elif min_idx == 1:
            # Deletion: the errs element has no counterpart.
            i -= 1
            corr_vals.append(None)
            err_vals.append(errs[i])
        else:
            # Insertion: the corrs element has no counterpart.
            j -= 1
            corr_vals.append(corrs[j])
            err_vals.append(None)
    corr_vals.reverse()
    err_vals.reverse()
    return err_vals, corr_vals
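# A usage sketch (assumption: character-level alignment of two short strings,
# using the unit-cost DP sketched earlier):
# align_err_cor(list("かさ"), list("かさを"))
# -> (['か', 'さ', None], ['か', 'さ', 'を'])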
def process(text):
    err_corr = text.split("\t")
    if len(err_corr) == 2:
        err = mojimoji.zen_to_han(err_corr[0].rstrip('\n'), kana=False)
        err = mojimoji.han_to_zen(err, ascii=False, digit=False)
        corr = mojimoji.zen_to_han(err_corr[1].rstrip('\n'), kana=False)
        corr = mojimoji.han_to_zen(corr, ascii=False, digit=False)
        err_lang = utils.lang_check(err, lang)
        corr_lang = utils.lang_check(corr, lang)

        if err_lang and corr_lang:

            errs = list(err)
            corrs = list(corr)
            del_num, ins_num = ld.levenshtein_distance(errs, corrs)
            del_portion = del_num / len(errs)
            ins_portion = ins_num / len(corrs)

            if (del_num < d_num and ins_num < i_num and del_portion < 0.4 and ins_portion < 0.4) \
                    and (corrs[-1] == '。' or corrs[-1] == '?' or corrs[-1] == '!') \
                    and (corrs[-2] not in numlist) and ('__' not in corr) and (len(corr) > 6):
                # cleaning the dataset like: 1)
                err = re.sub(r"\d+\)\s+", "", err)
                corr = re.sub(r"\d+\)\s+", "", corr)
                err = re.sub(r"\(\s", "", err)
                corr = re.sub(r"\(\s", "", corr)
                err = re.sub(r"\s\)", "", err)
                corr = re.sub(r"\s\)", "", corr)
                # cleaning the string like: 1.)
                err = re.sub(r"\d+\.\)\s*", "", err)
                corr = re.sub(r"\d+\.\)\s*", "", corr)
                # cleaning the string like: 1.
                err = re.sub(r"\d+\.\s*", "", err)
                corr = re.sub(r"\d+\.\s*", "", corr)
                # cleaning the strings beginning with ・
                err = re.sub(r"・\s+", "", err)
                corr = re.sub(r"・\s+", "", corr)
                # cleaning the strings beginning with *
                err = re.sub(r"\*\s+", "", err)
                corr = re.sub(r"\*\s+", "", corr)
                # cleaning the strings beginning with **
                err = re.sub(r"\*\*\s+", "", err)
                corr = re.sub(r"\*\*\s+", "", corr)
                # cleaning the strings beginning with -
                err = re.sub(r"-\s+", "", err)
                corr = re.sub(r"-\s+", "", corr)
                # cleaning the A: tag for conversation
                err = re.sub(r"A:\s*", "", err)
                corr = re.sub(r"A:\s*", "", corr)
                # cleaning the B: tag for conversation
                err = re.sub(r"B:\s*", "", err)
                corr = re.sub(r"B:\s*", "", corr)

                return err, corr
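# A small standalone illustration (assumption: not part of the original script)
# of what the numbering / bullet cleanup above does to a line:
# re.sub(r"\d+\)\s+", "", "1) 今日は天気がいいです。")  -> "今日は天気がいいです。"
# re.sub(r"・\s+", "", "・ 明日は雨です。")              -> "明日は雨です。"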
def process(text):
    err_corr = text.split("\t")
    if len(err_corr) == 2:
        err = mojimoji.zen_to_han(err_corr[0].rstrip('\n'), kana=False)
        corr = mojimoji.zen_to_han(err_corr[1].rstrip('\n'), kana=False)
        err_lang = utils.lang_check(err, lang) if check_ascii(err) else False
        corr_lang = utils.lang_check(corr,
                                     lang) if check_ascii(corr) else False

        if err_lang and corr_lang:
            errs = tokenize.word_tokenize(err)
            corrs = tokenize.word_tokenize(corr)
            del_num, ins_num = ld.levenshtein_distance(errs, corrs)
            del_portion = del_num / len(errs)
            ins_portion = ins_num / len(corrs)
            if del_num < d_num and ins_num < i_num and del_portion < 0.33 and ins_portion < 0.33:
                # errs/corrs are token lists, so join them before printing.
                print(" ".join(errs) + "\t" + " ".join(corrs))
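# check_ascii() used above is not shown in these excerpts; a plausible
# stand-in (assumption) simply tests whether the whole string is ASCII before
# the English-language check is applied:
def check_ascii(s):
    try:
        s.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False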
def process(text):
    err_corr = text.split("\t")
    if len(err_corr) == 2:
        err = mojimoji.zen_to_han(err_corr[0].rstrip('\n'), kana=False)
        err = mojimoji.han_to_zen(err, ascii=False, digit=False)
        corr = mojimoji.zen_to_han(err_corr[1].rstrip('\n'), kana=False)
        corr = mojimoji.han_to_zen(corr, ascii=False, digit=False)
        err_lang = utils.lang_check(err, lang)
        corr_lang = utils.lang_check(corr, lang)

        if err_lang and corr_lang:
            errs = list(err)
            corrs = list(corr)
            del_num, ins_num = ld.levenshtein_distance(errs, corrs)
            del_portion = del_num / len(errs)
            ins_portion = ins_num / len(corrs)
            if del_num < d_num and ins_num < i_num and del_portion < 0.4 and ins_portion < 0.4:
                print(err + "\t" + corr)
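# Note (assumption): unlike the (i, j, dp) variant sketched earlier,
# ld.levenshtein_distance in the process() examples above is expected to return
# the (del_num, ins_num) counts directly -- the same pair that
# get_del_ins_num() from the first example produces with unit-cost DP, e.g.:
# get_del_ins_num(list("私は学生d"), list("私は学生です"))  -> (1, 2)
# (d_num, i_num, numlist and lang are module-level settings not shown here.)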
 def f(a, b):
     print(a, b)
     return levenshtein_distance(a, b)
def extractMostCommonGroup(url, html=None):
## REQUIRING A HOSTNAME MATCH DOESN'T WORK FOR RELATIVE URLS..
#    match = url_hostname_re.match(url)
#    if match is not None:
#        hostname = match.group(2)
#    else:
#        raise Exception('Failed to extract hostname from the supplied url?')

#    a=array([[1,2,9,10,99,100], [3,4,10,11,99,150], [99, 100, 10, 13, 400, -3]])
#    mask=array([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]])
#    clusterid, error, nfound = kcluster(a,
#        nclusters=2, mask=mask, weight=array([1, 1, 1, 1, 1, 1]),
#        transpose=0, npass=1000, method='a', dist='e')
#    print clusterid
#    print error
#    print nfound
#    return

    if html is None:
        html = wget(url)

    bodyOnlyMatch = bodyOnlyRe.match(html)
    if bodyOnlyMatch:
        d = pq(bodyOnlyMatch.group(1))
    else:
        print('HTML: %s' % html)
        print('Warning: <body> tag region could not be extracted - this is bad')
        d = pq(html)

    def _print(x):
        print(x)
        return True

    #print [urlparse.urljoin(url, a.attrib['href']) for a in d('a')]
    lst = list(excludeDuplicates([
        urlparse.urljoin(url, a.attrib['href']) for a in d('a') if (
            #_print(pq(a).children('img')) and
            len(pq(a).children('img')) > 0 and
            'href' in a.attrib and
            containsNumberRe.search(a.attrib['href'])
        )
    ]))

    #print lst


    domain = urlparse.urlsplit(url).netloc

    diffs = {
        item: levenshtein_distance(item.replace('http://', ''), domain)
        for item in lst
    }

#     avg = mean(diffs)
#     print avg
#     print min(diffs)
#     print max(diffs)
#     pprint (lst)

    def f(a, b):
        return abs(a - b)

    numGroups = len(diffs) // 2
    print('Num groups:', numGroups)

    if len(diffs) == 0:
        print('LargestImage extract.. error: no diffs!')
        return []

    groups = kmeans(diffs, numGroups, f)

    # Select and return the largest group of similar links.
    maxIdx = -1
    conflictedIdx = -1
    maxSz = 0
    for idx, group in groups.items():
        l = len(group)
        print(l)
        if l > maxSz:
            maxIdx = idx
            maxSz = l
            # Any previous conflict is no longer relevant.
            conflictedIdx = -1
        elif l == maxSz:
            # Mark conflicted state.
            conflictedIdx = idx

    print('groups = %s' % groups)
    # Make sure we got a result.
    if maxIdx == -1:
        raise Exception('No groups were found?  Very odd.. groups = %s' % groups)

    # Check to see if the largest group had conflicts.
    if conflictedIdx != -1:
        print('WARNING: There was a group of equal size which was not selected.')

    imageLinks = False

    for link in groups[maxIdx]:
        if imageRe.match(link):
            imageLinks = True
            break

    if not imageLinks:
        print('no image links were found.. for url=%s' % url)
        out = []
        for link in groups[maxIdx]:
            out.append(extractLargestImageUrlFromUrl(link))
        return out

    #print 'imageLinks = %s' % imageLinks

    return groups[maxIdx]
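# A minimal sketch (assumption): the kmeans() used above is not shown.  Based
# on the call site, it appears to take a {item: value} dict, a group count and
# a distance function, and to return {cluster_index: [item, ...]}.  This
# stand-in only illustrates that shape; the real implementation may differ.
import random

def kmeans_sketch(data, k, dist, iterations=100):
    values = list(data.values())
    k = max(1, min(k, len(values)))
    centroids = random.sample(values, k)
    groups = {}
    for _ in range(iterations):
        # Assign every item to its nearest centroid.
        groups = {idx: [] for idx in range(k)}
        for item, value in data.items():
            nearest = min(range(k), key=lambda c: dist(value, centroids[c]))
            groups[nearest].append(item)
        # Recompute centroids as the mean of each group.
        new_centroids = [
            sum(data[item] for item in groups[idx]) / float(len(groups[idx]))
            if groups[idx] else centroids[idx]
            for idx in range(k)
        ]
        if new_centroids == centroids:
            break
        centroids = new_centroids
    return groups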
Example #9
        #find incorrect words
        if token not in word2index:
            if verbose == 1:
> incorrect word:", token">
                print(">> incorrect word:", token)

            # generate context (escape the token so regex metacharacters stay literal)
            context = re.sub(re.escape(token), '[]', tweet)
            context_proposition = explore_context2vec(context, w, word2index,
                                                      index2word, model,
                                                      target_exp, n_results)

            # find the closest word in context
            min_dist = np.inf
            for proposition in context_proposition:
                dist = levenshtein_distance(token, proposition)
                if dist < min_dist:
                    min_dist = dist
                    correct_word = proposition[0]
            if verbose == 1:
                print(">> correction:", correct_word)
                print('>> Levenshtein distance:', min_dist)
            correct_tweet = re.sub(re.escape(token), correct_word, tweet)
        else:
            correct_tweet = tweet
    if verbose == 1:
        print(correct_tweet, '\n')
    normalised_rtweet_list.append(correct_tweet)

print('Writing file...')
normalised_corpus = open('normalised_' + corpus_file, 'w')
    def test_generic(self):
        for idx, ans in enumerate(self.distance):
            self.assertEqual(
                ans,
                levenshtein_distance(self.string1s[idx], self.string2s[idx]))
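    # A matching fixture sketch (assumption: the test class's setUp is not
    # shown; these strings and expected distances are illustrative only).
    def setUp(self):
        self.string1s = ['kitten', 'flaw', '']
        self.string2s = ['sitting', 'lawn', 'abc']
        self.distance = [3, 2, 3]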