Code Example #1
import sys

import pandas as pd
from similarity.normalized_levenshtein import NormalizedLevenshtein
from sklearn.cluster import AgglomerativeClustering


def main():
    if len(sys.argv) != 2:
        print("Usage: python ML-Agglomerative.py path-name")
        sys.exit()
    path = sys.argv[1]  # "../ames/test.csv"

    df = pd.read_csv(path)
    arr1 = df.columns.values
    arr2 = df.columns.values

    NL = NormalizedLevenshtein()
    vectors = pd.DataFrame([[NL.distance(i, j) for j in arr2] for i in arr1])

    clusters = 5
    # ward linkage with the default euclidean metric; the deprecated
    # affinity/pooling_func arguments are dropped for current scikit-learn
    agg = AgglomerativeClustering(n_clusters=clusters,
                                  linkage='ward').fit(vectors)

    for i in range(clusters):
        print("Cluster " + str(i) + ":")
        print(pd.Series(arr1[agg.labels_ == i]))
        print('\n')
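The excerpt defines main() but omits the entry-point call; if the script is meant to be run directly (as the usage string suggests), the usual guard would be:

if __name__ == '__main__':
    main()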
Code Example #2
def simDif(w1, w2):
  normalized_levenshtein = NormalizedLevenshtein()
  dis = normalized_levenshtein.distance(w1, w2)
  sim = normalized_levenshtein.similarity(w1, w2)
  print('distance: ' + str(dis) + ' similarity: ' + str(sim))
  # separator printed after comparing a segment of the fixed text
  # against each segment of the moving text
  print('----------------------------')
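A minimal usage sketch for simDif (assuming NormalizedLevenshtein is imported from similarity.normalized_levenshtein): for this measure distance and similarity are complements, so the two printed numbers always sum to 1.

simDif('kitten', 'sitting')
# distance: 0.428571... similarity: 0.571428...  (3 edits over the longer length 7)
# ----------------------------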
Code Example #3
def get_norm_levenshtein(prediction, groundtruth):
    # normalize_answer is a text-cleaning helper defined elsewhere in the project
    normalized_levenshtein = NormalizedLevenshtein()
    if isinstance(groundtruth, list):
        if len(groundtruth) == 0:
            return 0
        return np.max([get_norm_levenshtein(prediction, gt) for gt in groundtruth])
    return normalized_levenshtein.similarity(normalize_answer(prediction), normalize_answer(groundtruth))
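A quick illustration of the list branch (hypothetical values, assuming normalize_answer lower-cases its input): when several reference answers are given, the best-matching one determines the score.

score = get_norm_levenshtein('new york', ['New York', 'NYC'])
# maximum similarity over the two references; 1.0 here if normalize_answer lower-cases both strings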
Code Example #4
def searchlike(query_str, s1):
    # literal (surface) similarity
    #print(difflib.SequenceMatcher(None, query_str, s1).quick_ratio())
    # edit distance
    normalized_levenshtein = NormalizedLevenshtein()
    #print(normalized_levenshtein.similarity(query_str, s1))
    # Jaccard similarity (Jaccrad is a helper defined elsewhere in the project)
    jaccard_coefficient = Jaccrad(query_str, s1)
    #print(jaccard_coefficient)
    return difflib.SequenceMatcher(None, query_str, s1).quick_ratio(), \
           normalized_levenshtein.similarity(query_str, s1), \
           jaccard_coefficient
Code Example #5
def similarity(outputs_batch, labels_batch, dic):
    # average normalized Levenshtein similarity over the batch:
    # accumulate distances, then convert via 1 - mean distance
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
Code Example #6
def similarity_plus(outputs_batch, labels_batch, dic):
    d = enchant.Dict("en_US")
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)
        if not d.check(pred):
            # d.suggest returns a list of candidates; keep the top suggestion
            suggestions = d.suggest(pred)
            if suggestions:
                pred = suggestions[0]
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
Code Example #7
    def __init__(self, word2vec: Word2Vec, fasttext: FastText,
                 lexicon: set) -> None:
        '''Creates a new PostOCRTextCorrection object
        Args:
            word2vec (Word2Vec): Word2Vec pre-trained model: a gensim model trained on dirty data (text without cleaning)
            fasttext (FastText): FastText pre-trained model: a gensim model trained on dirty data (text without cleaning)
            lexicon (set): complete list of words belonging to the language of the corpus documents (including conjugations, ...)
        '''

        self.__n_levenshtein = NormalizedLevenshtein()

        # loads pretrained models
        self.word2vec = word2vec
        self.word2vec_vocab = set(
            [word for word in self.word2vec.wv.index_to_key if len(word) > 3])
        self.fasttext = fasttext

        self.lexicon = set(lexicon)
Code Example #8
File: main.py  Project: mr-CLK/QA-Generator
    def __init__(self):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = spacy.load('tr')
        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)
Code Example #9
def data_classifier(arr, sources):
    NL = NormalizedLevenshtein()
    vectors = pd.DataFrame([[NL.distance(i, j) for j in arr] for i in arr])

    clusters = int(len(arr)**.5)
    if clusters <= 1:
        clusters = 2
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(vectors)

    field_ids = ["field-" + str(uuid.uuid4()) for field in arr]
    uncert = pd.DataFrame()
    for i in range(clusters):
        uncert[i] = vectors.apply(dist,
                                  args=(kmeans.cluster_centers_[i], ),
                                  axis=0)
    uncert.index = field_ids

    classifications_obj = {}
    fields_obj = {}
    for i in range(clusters):
        cluster = {}
        fields = pd.Series(field_ids)[kmeans.labels_ == i]
        for field_id in fields:
            cluster[field_id] = uncert.loc[field_id][i]
            idx = field_ids.index(field_id)
            fields_obj[field_id] = {"name": arr[idx], "source": sources[idx]}
        cid_obj = {}
        cid_obj["name"] = "classification" + str(i)
        cid_obj["metadata"] = None
        cid_obj["values"] = cluster
        cid_obj["distribution"] = np.array(list(cluster.values())).mean()
        classifications_obj["classification-" + str(uuid.uuid4())] = cid_obj

    data = {}
    data["Classifications"] = classifications_obj
    data["Fields"] = fields_obj
    return (json.dumps(data, sort_keys=True, indent=4))
Code Example #10
File: questgen.py  Project: ramsrig/Questgen.ai
    def __init__(self):
        model_file_1 = "../input/s2v-old/s2v_old"

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = spacy.load('en_core_web_sm')

        self.s2v = Sense2Vec().from_disk(model_file_1)

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)
Code Example #11
File: Loader.py  Project: Amr-Aboshama/XGeN
    def __init__(self,
                 s2v_model_path='s2v_old',
                 qg_model_path='Parth/result',
                 bq_model_path='ramsrigouthamg/t5_boolean_questions',
                 ap_model_path='Parth/boolean',
                 t5_tokenizer_path='t5-base'):

        self.tokenizer = T5Tokenizer.from_pretrained(t5_tokenizer_path)
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.rand = random.Random(datetime.now())

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.qg_model = T5ForConditionalGeneration.from_pretrained(
            qg_model_path).to(self.device)
        self.bq_model = T5ForConditionalGeneration.from_pretrained(
            bq_model_path).to(self.device)
        self.ap_model = T5ForConditionalGeneration.from_pretrained(
            ap_model_path).to(self.device)
Code Example #12
    def __init__(self, lang_code='en', max_questions=20):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = self.try_load_spacy_model(lang_code)
        self.max_questions = int(max_questions)

        self.s2v = Sense2Vec().from_disk(
            '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old'
        )

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)
Code Example #13
File: evaluate.py  Project: hamsterhooey/textract
import argparse, os, cv2
from similarity.normalized_levenshtein import NormalizedLevenshtein
from textract.extractor import TextExtractor

# parse argument
parser = argparse.ArgumentParser()
parser.add_argument('--img_dir', type=str, help='image folder')
parser.add_argument('--gd_dir', type=str, help='ground truth folder')
parser.add_argument('--out_dir', type=str, help='ocr text output folder')
args = parser.parse_args()

if not os.path.exists(args.out_dir):
    os.mkdir(args.out_dir)

normalized_levenshtein = NormalizedLevenshtein()
similarities = {}

# use textract
extractor = TextExtractor()

folder = os.listdir(args.img_dir)
for article in folder:
    article_path = os.path.join(args.img_dir, article)
    if not os.path.isdir(article_path):
        continue

    groundtruth_path = os.path.join(args.gd_dir, article + '.txt')

    if not os.path.exists(groundtruth_path):
        continue
Code Example #14
# Test 4 - machine learning, I think
from similarity.normalized_levenshtein import NormalizedLevenshtein
normalized_levenshtein = NormalizedLevenshtein()

##perguntas_respostas = {'olá' : 'Como posso ajuda-lo?',
##                                            'bom dia' : 'Bom Dia!!',
##                                            'qual é seu filme preferido' :' Ex Machina!',
##                                             ' ' : ' ',
##                                             ' ' : ' ',
##                                             ' ' : ' ',
##                                             ' ' : ' ',
##                                             ' ' : ' ',
##                                             ' ' : ' ',
##                                             ' ' : ' ',
##                                             ' ' : ' '
##
##
##                           }

from similarity.normalized_levenshtein import NormalizedLevenshtein
normalized_levenshtein = NormalizedLevenshtein()

import os

pr_file = './perguntas_respostas.p'
import pickle
if os.path.isfile(pr_file):
    with open(pr_file, 'rb') as p:
        perguntas_respostas = pickle.load(p)

Code Example #15
import re
import numpy as np
from similarity.normalized_levenshtein import NormalizedLevenshtein

NORMALIZED_LEVENSHTEIN = NormalizedLevenshtein()

IS_TRUE = re.compile('true', re.IGNORECASE)

def by_name(data_frame, name):
    "produces a normalized similarity score between 0 and 1"
    def make_score(_name):
        "make_score takes 1 minus the distance to produce the similarity"
        return 1 - NORMALIZED_LEVENSHTEIN.distance(_name, name)
    name_scores = data_frame['name'].map(make_score)
    return name_scores

def by_experience(data_frame, experience):
    "converts the experienced column into a score of 0 or 1"
    params_exper = re.match(IS_TRUE, experience) is not None
    def make_score(_exp):
        "returns 1 if the bool from the query string matches the Series experienced value"
        result = params_exper == _exp
        return 1 if result else 0
    exp_score = data_frame['experienced'].map(make_score)
    return exp_score

def by_distance(data_frame, location_type, value):
    "to normalize for all distances 1 - distance / max distance"
    distances = np.square(data_frame[location_type] - value)
    max_distance = np.max(distances)
    normalized = 1 - distances / max_distance
Code Example #16
def main():

    v1 = 'text'
    v2 = 'text'

    # -----------------------------------------------Edit based ------------------------------------------------------
    print(
        "-------------------------------- Edit based ----------------------------------"
    )
    print("------- HAMMING ---------")
    ed = Hamming()
    #The return value is a float between 0 and 1, where 0 means totally different, and 1 equal.
    print("Hamming Similarity: ", ed.normalized_similarity(v1, v2))

    print("\n-------- MLIPNS --------")
    ed = MLIPNS()
    print("MLIPNS similarity: ", ed.similarity(v1, v2))

    print("\n-------- JaroWinkler --------")
    ed = JaroWinkler()
    print("JaroWinkler similarity: ", ed.similarity(v1, v2))

    print("\n-------- Jaro --------")
    ed = Jaro()
    print("Jaro similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Token based ------------------------------------------------------

    print(
        "-------------------------------- Token based ----------------------------------"
    )
    print("\n-------- JACCARD --------")
    ed = Jaccard()
    print("JACCARD similarity: ", ed.similarity(v1, v2))
    # considers the number of letters

    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))

    print("\n-------- Tversky --------")
    ed = Tversky()
    print("Tversky similarity: ", ed.similarity(v1, v2))

    print("\n-------- Overlap --------")
    ed = Overlap()
    print("Overlap similarity: ", ed.similarity(v1, v2))

    print("\n-------- Cosine --------")
    ed = Cosine()
    print("Cosine similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Sequence based ------------------------------------------------------
    print(
        "-------------------------------- Sequence based ----------------------------------"
    )

    print("\n-------- RatcliffObershelp --------")
    ed = RatcliffObershelp()
    print("RatcliffObershelp similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Compression based ------------------------------------------------------
    print(
        "-------------------------------- Compression based ----------------------------------"
    )

    print("\n-------- EntropyNCD --------")
    ed = EntropyNCD()
    print("EntropyNCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- BZ2NCD --------")
    ed = BZ2NCD()
    print("BZ2NCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- LZMANCD --------")
    ed = LZMANCD()
    print("LZMANCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- ZLIBNCD --------")
    ed = ZLIBNCD()
    print("ZLIBNCD similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Simple based ------------------------------------------------------
    print(
        "-------------------------------- Simple based ----------------------------------"
    )

    print("\n-------- Prefix --------")
    ed = Prefix()
    print("Prefix similarity: ", ed.similarity(v1, v2))

    print("\n-------- Postfix --------")
    ed = Postfix()
    print("Postfix similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- strsim function ------------------------------------------------------
    print(
        "-------------------------------- strsim function ----------------------------------"
    )

    print("\n-------- Normalized Levenshtein --------")
    ed = NormalizedLevenshtein()
    print("Normalized Levenshtein similarity: ", ed.similarity(v1, v2))

    print("\n-------- MetricLCS --------")
    ed = MetricLCS()
    print("MetricLCS similarity: ", ed.distance(v1, v2))

    print("\n-------- NGram --------")
    ed = NGram()
    print("NGram similarity: ", ed.distance(v1, v2))

    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))
Code Example #17
def similaridade(function_name, string_1, string_2):

    if function_name == 'Hamming':
        ed = Hamming()
        return ed.normalized_similarity(string_1, string_2)

    elif function_name == 'MLIPNS':
        ed = MLIPNS()
        return ed.similarity(string_1, string_2)

    elif function_name == 'JaroWinkler':
        ed = JaroWinkler()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Jaro':
        ed = Jaro()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Jaccard':
        ed = Jaccard()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Sorensen':
        ed = Sorensen()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Tversky':
        ed = Tversky()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Overlap':
        ed = Overlap()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Cosine':
        ed = Cosine()
        return ed.similarity(string_1, string_2)

    elif function_name == 'RatcliffObershelp':
        ed = RatcliffObershelp()
        return ed.similarity(string_1, string_2)

    elif function_name == 'EntropyNCD':
        ed = EntropyNCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'BZ2NCD':
        ed = BZ2NCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'LZMANCD':
        ed = LZMANCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'ZLIBNCD':
        ed = ZLIBNCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Prefix':
        ed = Prefix()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Postfix':
        ed = Postfix()
        return ed.similarity(string_1, string_2)

    elif function_name == 'NormalizedLevenshtein':
        ed = NormalizedLevenshtein()
        return ed.similarity(string_1, string_2)

    elif function_name == 'MetricLCS':
        ed = MetricLCS()
        return ed.distance(string_1, string_2)

    elif function_name == 'NGram':
        ed = NGram()
        return ed.distance(string_1, string_2)

    elif function_name == 'StrCmp95':
        ed = StrCmp95()
        return ed.distance(string_1, string_2)
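For reference, a compact way to call the dispatcher above (a sketch; the accepted names mirror the elif branches, and the metric classes must already be imported):

for name in ('Jaro', 'Cosine', 'NormalizedLevenshtein'):
    print(name, similaridade(name, 'night', 'nacht'))

Note that the 'MetricLCS', 'NGram' and 'StrCmp95' branches return a distance rather than a similarity, so their values are not directly comparable with the others.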
Code Example #18
File: Cog_find.py  Project: YashasviMantha/NLP
import os
from similarity.normalized_levenshtein import NormalizedLevenshtein
from tqdm import tqdm

normalized_levenshtein = NormalizedLevenshtein()

file_list = os.listdir("../../Final_data")
for i in file_list:
    if (not i.endswith(".csv")):
        file_list.remove(i)

files_increment = 0
for file_i in file_list:
    for file_j in file_list:
        if (file_i != file_j):

            files_increment = files_increment + 1

            file_1 = open("../../Final_data/" + file_i, 'r', encoding='utf8')
            file_2 = open("../../Final_data/" + file_j, 'r', encoding='utf8')

            data_1 = file_1.readlines()
            data_2 = file_2.readlines()

            for i in range(60000):
                data_1[i] = data_1[i].replace("\n", '')
                data_2[i] = data_2[i].replace("\n", '')

                data_1[i] = data_1[i].split(";")
                data_2[i] = data_2[i].split(";")
Code Example #19
print(len(clean_asr))


# Function to compute a similarity ratio between two sentences (difflib SequenceMatcher)
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


# Definition of the 5 different types to classify the sentences
equal = 0
close = 0
medium = 0
low = 0
different = 0

normalized_levenshtein = NormalizedLevenshtein()
norm_dist = []

# Loop to analyze each pair of sentences and count the number of occurences of each type
for sent_nb in range(len(clean)):
    Ratio = normalized_levenshtein.similarity(clean[sent_nb],
                                              clean_asr[sent_nb])
    norm_dist.append(Ratio)
    if (Ratio == 1):
        equal += 1
    if (Ratio > 0.9):
        close += 1
    if (Ratio <= 0.9 and Ratio > 0.7):
        medium += 1
    if (Ratio <= 0.7 and Ratio > 0.5):
        low += 1
Code Example #20
class PostOCRTextCorrection:
    '''
    This class helps to correct spelling errors introduced into texts by OCR.
    To correct errors it combines FastText, Word2vec and normalized Levenshtein distance.
    '''
    def __init__(self, word2vec: Word2Vec, fasttext: FastText,
                 lexicon: set) -> None:
        '''Creates a new PostOCRTextCorrection object
        Args:
            word2vec (Word2Vec): Word2Vec pre-trained model: a gensim model trained on dirty data (text without cleaning)
            fasttext (FastText): FastText pre-trained model: a gensim model trained on dirty data (text without cleaning)
            lexicon (set): complete list of words belonging to the language of the corpus documents (including conjugations, ...)
        '''

        self.__n_levenshtein = NormalizedLevenshtein()

        # loads pretrained models
        self.word2vec = word2vec
        self.word2vec_vocab = set(
            [word for word in self.word2vec.wv.index_to_key if len(word) > 3])
        self.fasttext = fasttext

        self.lexicon = set(lexicon)

    def __word_in_word2vec__(self, word: str) -> bool:
        '''Returns True if there is a 'word'-vector in Word2vec Space, otherwise False'''
        return word in self.word2vec_vocab

    def __word_in_lexicon__(self, word: str) -> bool:
        '''Returns True if 'word' belongs to the language of the corpus documents'''
        return word in self.lexicon

    def normalized_levhenstein_similarity(self, word1: str,
                                          word2: str) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2' '''
        return self.__n_levenshtein.similarity(word1, word2)

    def weighted_normalized_levhenstein_similarity(self, word1: str,
                                                   word2: str,
                                                   weight: float) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2' weighted
        by 'weight' '''

        return weight * self.normalized_levhenstein_similarity(word1, word2)

    def ocr_correction(self,
                       word: str,
                       topn_we: int = 10,
                       topn_nwe=150,
                       weight_we=0.3,
                       weight_nwe=1.0,
                       topn_co: int = 1) -> list:
        '''Retrieves the most suitable words to correct the wrong one in input
        Args:
            word (str): a wrong word to correct
            topn_we (int, default=10): top N most similar words if 'word' is model-embedded
            topn_nwe (int, default=150): top N most similar words if 'word' is not model-embedded
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is model-embedded
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not model-embedded
            topn_co (int, default=1): top N most suitable words to correct the wrong one in input
        Returns:
            list: top N most suitable words to correct the wrong one in input
        Examples:
          >>> ocr = PostOCRTextCorrection(...)
          >>> word = 'niziitlniente'
          >>> ocr.ocr_correction(word, topn_co=1)
          >>> [('nullatenente', 0.99)] #output
        '''

        word = word.lower()

        # {corrected word: score}
        is_word2vec_embedded = self.__word_in_word2vec__(word)
        corrections_score = defaultdict(int)

        if is_word2vec_embedded:
            for similar_word, cosine_simalirity in self.word2vec.wv.most_similar(
                    word, topn=topn_we):
                if not self.__word_in_lexicon__(similar_word): continue
                corrections_score[similar_word] = \
                    cosine_simalirity + self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_we)
        else:
            # if the word is not word2vec-embedded,
            # we check a larger neighborhood in FastText
            topn_we = topn_nwe

        for similar_word, cosine_simalirity in self.fasttext.wv.most_similar(
                word, topn=topn_we):
            if not self.__word_in_word2vec__(similar_word): continue

            candidate = list()
            candidate.append(similar_word)

            if is_word2vec_embedded:
                candidate.extend(
                    list(
                        dict(
                            self.word2vec.wv.most_similar(similar_word,
                                                          topn=topn_we))))
            else:
                pass

            for similar_word in candidate:
                if not self.__word_in_lexicon__(similar_word): continue

                if is_word2vec_embedded:
                    sim = self.word2vec.wv.similarity(word, similar_word) + \
                          self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_we)
                else:
                    sim = cosine_simalirity + \
                          self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_nwe)

                if sim > corrections_score[similar_word]:
                    corrections_score[similar_word] = sim

        return sorted(corrections_score.items(),
                      key=lambda x: x[1],
                      reverse=True)[:topn_co]

    def ocr_correction_corpus(
            self,
            data: List[dict],
            key: str = 'text',
            processing_text: callable = lambda x: x,
            processing_correction: callable = lambda x: x.lower(),
            min_len: int = 5,
            topn_we: int = 10,
            topn_nwe=150,
            weight_we=0.3,
            weight_nwe=1.0,
            paragraphs_level: bool = False) -> Tuple[List[dict], dict]:
        '''
        Given a corpus, retrieves ocr errors, estimates corrections for each error, corrects ocr errors with the estimated corrections.
        Args:
            data (List[dict]): a list of json documents
            key (str, default=text): key to the text field in each document
            processing_text (callable, default=lambda x:x): function to process texts
            processing_correction (callable, default=lambda x:x.lower()): function to process corrections
            min_len (int, default=5): words shorter than 'min_len' characters are not corrected
            topn_we (int, default=10): top N most similar words if 'word' is model-embedded
            topn_nwe (int, default=150): top N most similar words if 'word' is not model-embedded
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is model-embedded
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not model-embedded
            paragraphs_level (bool, default=False): if True, correct the texts under each document's 'paragraphs' entries instead of the top-level field
        Returns:
            Tuple[List[dict], dict]: a list of json documents corrected; corrections
        Examples:
          >>> ocr = PostOCRTextCorrection(...)
          >>> key='text'
          >>> data = [{'text': '[...] niziitlniente [...]'}, ..., {'text': '[...] preliininari [...]'}]
          >>> ocr.ocr_correction_corpus(data, key)
          >>> [{'text': '[...] nullatenente [...]'}, ..., {'text': '[...] preliminari [...]'}]#output
        '''
        ocr_errors = set()

        for doc in tqdm(data,
                        position=0,
                        leave=True,
                        desc='Retrieving ocr errors'):
            if paragraphs_level:
                for para_json in doc['paragraphs']:
                    words = set(
                        word
                        for word in processing_text(para_json[key]).split()
                        if len(word) >= min_len and not word.isdigit()
                        and not word[0].isupper())
                    ocr_errors.update(words.difference(self.lexicon))
            else:
                words = set(word for word in processing_text(doc[key]).split()
                            if len(word) >= min_len and not word.isdigit()
                            and not word[0].isupper())
                ocr_errors.update(words.difference(self.lexicon))

        ocr_corrections = dict()
        for ocr_error in tqdm(list(ocr_errors),
                              position=0,
                              leave=True,
                              desc='Retrieving ocr corrections'):
            ocr_correction = self.ocr_correction(ocr_error,
                                                 topn_we=topn_we,
                                                 topn_nwe=topn_nwe,
                                                 weight_we=weight_we,
                                                 weight_nwe=weight_nwe,
                                                 topn_co=1)

            if len(ocr_correction) == 0: continue

            ocr_correction = [
                processing_correction(ocr_corr[0])
                for ocr_corr in ocr_correction
            ]
            ocr_corrections[ocr_error] = ocr_correction[0]

        new_data = list()
        for doc in tqdm(data,
                        position=0,
                        leave=True,
                        desc='Correcting ocr errors'):
            if '_id' in doc:
                del doc["_id"]
            if paragraphs_level:
                new_para_json = list()
                for para_json in doc['paragraphs']:
                    para_json[key] = " " + processing_text(
                        para_json[key]) + " "
                    for token in para_json[key].split():
                        if token.lower() in ocr_corrections:
                            para_json[key] = para_json[key].replace(
                                " " + token.lower() + " ",
                                " " + ocr_corrections[token.lower()] + " ")
                    new_para_json.append(para_json)
                doc['paragraphs'] = new_para_json
            else:
                doc[key] = " " + processing_text(doc[key]) + " "
                for token in doc[key].split():
                    if token.lower() in ocr_corrections:
                        doc[key] = doc[key].replace(
                            " " + token.lower() + " ",
                            " " + ocr_corrections[token.lower()] + " ")
            new_data.append(doc)

        return new_data, ocr_corrections
Code Example #21
p = []

filter_thresh_45 = []

for i in range(len(temp_article)):

    jarowinkler = JaroWinkler()

    sim = jarowinkler.similarity(my_string, temp_article[i])

    if sim > 0.45:

        filter_thresh_45.append(data[i])

normalized_levenshtein = NormalizedLevenshtein()

filter_normalized_levenshtein = []

for i in range(len(filter_thresh_45)):

    # keep entries whose normalized Levenshtein distance from my_string is at least 0.7
    dist = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])

    if dist >= 0.7:

        filter_normalized_levenshtein.append(filter_thresh_45[i])

with open('filtered_levenshtein_human_mobility.txt',
          'w',
          encoding="ISO-8859-1") as outfile:
    json.dump(filter_normalized_levenshtein, outfile)
Code Example #22
File: 文本分析.py  Project: cht123456abc/Bug
    def similarity(self, question, answer):

        stopword = self.read_from(folder_path + '上证专用停用词.txt')
        stopwords = []
        for sw in stopword:
            sw = sw.strip('\n')
            sw = sw.strip(' ')
            stopwords.append(sw)
        # print(stopwords)

        meaningful_words1 = []
        meaningful_words2 = []

        words2 = jieba.cut(str(question))
        words3 = jieba.cut(str(answer))
        for word in words2:
            if word not in stopwords:
                meaningful_words1.append(word)
        for word in words3:
            if word not in stopwords:
                meaningful_words2.append(word)
        s2 = ''.join(meaningful_words1)
        # print(s2)
        s3 = ''.join(meaningful_words2)
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        line_sim = []

        cos_s = a1.similarity(s2, s3)
        line_sim.append(cos_s)
        cos_d = a1.distance(s2, s3)
        line_sim.append(cos_d)
        dam = b1.distance(s2, s3)
        line_sim.append(dam)
        jac_d = c1.distance(s2, s3)
        line_sim.append(jac_d)
        jac_s = c1.similarity(s2, s3)
        line_sim.append(jac_s)
        jar_d = d1.distance(s2, s3)
        line_sim.append(jar_d)
        jar_s = d1.similarity(s2, s3)
        line_sim.append(jar_s)
        lev = e1.distance(s2, s3)
        line_sim.append(lev)
        lon = f1.distance(s2, s3)
        line_sim.append(lon)
        met = g1.distance(s2, s3)
        line_sim.append(met)
        ngr = h1.distance(s2, s3)
        line_sim.append(ngr)
        nor_d = i1.distance(s2, s3)
        line_sim.append(nor_d)
        nor_s = i1.similarity(s2, s3)
        line_sim.append(nor_s)
        opt = j1.distance(s2, s3)
        line_sim.append(opt)
        qgr = k1.distance(s2, s3)
        line_sim.append(qgr)
        sor_d = l1.distance(s2, s3)
        line_sim.append(sor_d)
        sor_s = l1.similarity(s2, s3)
        line_sim.append(sor_s)
        wei = m1.distance(s2, s3)
        line_sim.append(wei)

        return line_sim
Code Example #23
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine
lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)
str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

print('Levenshtein distance:', lev.distance(str1, str2))
print('Normalized Levenshtein similarity:', nolev.similarity(str1, str2))
print('Cosine similarity:', cosine.similarity(str1, str2))
Code Example #24
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.jaccard import Jaccard

s1 = '中华人民共和国'
s2 = '中国'

normalized_levenshtein = NormalizedLevenshtein()
print('Normalized Levenshtein distance: ', normalized_levenshtein.distance(s1, s2))

jaccard_distance = Jaccard(1)
print('Jaccard distance: ', jaccard_distance.distance(s1, s2))

# print(jaccard_similarity_score(list(s1), list(s2)))
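For this pair the two measures happen to give roughly the same value: the normalized Levenshtein distance is 5/7 ≈ 0.714 (5 edits over the longer length 7), and the character-level Jaccard distance is 1 - 2/7 ≈ 0.714 (2 shared characters out of 7 distinct ones).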
Code Example #25
def diff_lev(source, source_asr, target, source_pos, result_source,
             result_source_asr, result_target, result_source_pos, workdir,
             **kwargs):
    with open(source, 'r', encoding='utf-8') as source, \
            open(source_asr, 'r', encoding='utf-8') as source_asr, \
            open(source_pos, 'r', encoding='utf-8') as source_pos, \
            open(target, 'r', encoding='utf-8') as target, \
            open(result_source, 'w', encoding='utf-8') as result_source, \
            open(result_source_asr, 'w', encoding='utf-8') as result_source_asr, \
            open(result_source_pos, 'w', encoding='utf-8') as result_source_pos, \
            open(result_target, 'w', encoding='utf-8') as result_target, \
            open(workdir / 'distances.txt', 'w', encoding='utf-8') as distances:

        source = source.readlines()
        source_asr = source_asr.readlines()
        target = target.readlines()
        source_pos = source_pos.readlines()

        # different types to classify the sentences
        counter = Counter()

        normalized_levenshtein = NormalizedLevenshtein()
        norm_dist = []

        # Loop to analyze each pair of sentences and count the number of occurrences of each type
        for source_sent, source_asr_sent in tqdm.tqdm(zip(source, source_asr)):
            ratio = normalized_levenshtein.similarity(source_sent,
                                                      source_asr_sent)
            norm_dist.append(ratio)
            if ratio == 1:
                counter["equal"] += 1
            if ratio > 0.9:
                counter["close"] += 1
            if 0.9 >= ratio > 0.7:
                counter["medium"] += 1
            if 0.7 >= ratio > 0.5:
                counter["low"] += 1
            if 0.5 >= ratio:
                counter["different"] += 1

        # Write the results of the comparisons in output file
        for dist in norm_dist:
            distances.write(f"{dist}\n")

        # print Statistics
        print(
            f"Equal count:{counter['equal']}, ratio: {counter['equal'] / len(source)}"
        )
        print(f"{counter['close']} {counter['close'] / len(source)}")
        print(f"{counter['medium']} {counter['medium'] / len(source)}")
        print(f"{counter['low']} {counter['low'] / len(source)}")
        print(f"{counter['different']} {counter['different'] / len(source)}")

        # Loop to identify similar sentences and write in output files
        # Counting sentences with same number of tokens comparing clean and asr
        for ratio, source_sent, source_asr_sent, target_sent, source_pos_sent in tqdm.tqdm(
                zip(norm_dist, source, source_asr, target, source_pos)):
            if float(ratio) >= 0.9:
                result_source.write(source_sent)
                result_source_asr.write(source_asr_sent)
                result_target.write(target_sent)
                result_source_pos.write(source_pos_sent)
                counter["after_filter"] += 1
                if len(source_sent.split(" ")) == len(
                        source_asr_sent.split(" ")):
                    counter["equal_nb_tokens"] += 1

        print(f"Sentences after cleaning {counter['after_filter']}")
        print(
            f"Sentences with equal number of tokens: {counter['equal_nb_tokens']}"
        )
Code Example #26
    def get_replacement(self, distance='lsh', threshold=.8):
        if distance == 'edit_distance':
            distance = Levenshtein()
        elif distance == 'normalized_edit_distance':
            distance = NormalizedLevenshtein()

        # for each token, get its bin
        # for each bin, iterate each element and get the groups of satisfied tokens such as
        # [white] = [whit, whie, whit]
        # [whie] = [whine,white]

        replacement = {}
        s = self.uniq_values

        while len(s) > 0:
            token = rd.sample(s, 1)[0]
            s.remove(token)
            m = self._generate_hash(token)
            similarities = self.lsh.query(m)
            similarities = [
                _ for _ in similarities if _ not in replacement.values()
                and _ not in replacement.keys()
            ]
            if len(similarities) > 1:
                scores = {}
                bin_replacement = {}
                if distance != 'lsh':
                    for idx, item in enumerate(similarities):
                        count = 0
                        candidates = []
                        for idx_compared in range(idx + 1, len(similarities)):
                            candidate = similarities[idx_compared]
                            if item != candidate and distance.distance(
                                    item, candidate) < threshold:
                                if idx not in bin_replacement:
                                    bin_replacement[idx] = [idx_compared]
                                else:
                                    bin_replacement[idx].append(idx_compared)
                                if idx_compared not in bin_replacement:
                                    bin_replacement[idx_compared] = [idx]
                                else:
                                    bin_replacement[idx_compared].append(idx)

                    for idx_item, candidates in sorted(
                            bin_replacement.items(), key=lambda x: -len(x[1])):
                        item = similarities[idx_item]
                        if item in replacement.keys():
                            item = replacement[item]
                        for idx_candidate in candidates:
                            candidate = similarities[idx_candidate]
                            if candidate != item and candidate not in replacement.keys(
                            ):
                                if item not in replacement.keys():
                                    replacement[candidate] = item
                                elif replacement[item] != candidate:
                                    replacement[candidate] = replacement[item]
                else:
                    for candidate in similarities:
                        if candidate != token:
                            replacement[candidate] = token

        return replacement
Code Example #27
import tensorflow as tf
import numpy as np
from similarity.normalized_levenshtein import NormalizedLevenshtein
from tensorflow.keras.metrics import Metric

labels = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
norm_lev = NormalizedLevenshtein()


class LevenshteinMetric(Metric):
    def __init__(self, batch_size, **kwargs):
        super().__init__(**kwargs)
        # levenshtein_distance_fn is expected to be defined at module level;
        # it should map a batch of (y_true, y_pred) to per-sample distances
        self.levenshtein_distance_fn = levenshtein_distance_fn
        self.batch_size = batch_size
        self.total = self.add_weight("total", initializer="zeros")
        self.count = self.add_weight("count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        metric = self.levenshtein_distance_fn(y_true, y_pred)
        self.total.assign_add(tf.reduce_sum(metric))
        #self.count.assign_add(tf.cast(self.batch_size, tf.float32))
        self.count.assign_add(tf.cast(len(y_true), tf.float32))

    def result(self):
        return self.total / self.count

    def get_config(self):
        base_config = super().get_config()
        return base_config
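The class above references a levenshtein_distance_fn that is not shown. A hypothetical sketch of such a helper (assuming y_true and y_pred are eager batches of integer label sequences indexing into the labels alphabet; the real project may decode differently):

def levenshtein_distance_fn(y_true, y_pred):
    # decode an integer sequence into a string via the labels alphabet,
    # skipping out-of-range indices (e.g. padding or CTC blanks)
    def decode(seq):
        return "".join(labels[int(i)] for i in seq if 0 <= int(i) < len(labels))

    distances = [norm_lev.distance(decode(t), decode(p))
                 for t, p in zip(np.asarray(y_true), np.asarray(y_pred))]
    return tf.constant(distances, dtype=tf.float32)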