Esempio n. 1
0
 def distance(self, s1: Sentence, s2: Sentence) -> float:
     """Return the longest-common-subsequence length between the
     lowercased token sequences of *s1* and *s2*.

     Bug fix: the previous version wrapped the result in a one-element
     list, contradicting the declared ``-> float`` return type.
     """
     return lcs.llcs(
         s1.lowercase_tokens(),
         s2.lowercase_tokens(),
     )
Esempio n. 2
0
    def _pre_func_medical_money(self, item_name: str,
                                passed_nodes: Dict[str, TpNodeItem],
                                node_items: Dict[str, TpNodeItem],
                                img: np.ndarray):
        """Re-recognize money-like node texts with the CRNN number model.

        For every passed node whose text contains a run of money characters,
        the node's bbox region of *img* is re-read by
        ``crnn_util.run_number_amount``; when the CRNN result overlaps the
        original text strongly enough (normalized LCS > 0.6), the node's text
        and scores are replaced, then processing is delegated to
        ``_pre_func_money``.

        :param item_name: template item name, used only for debug logging.
        :param passed_nodes: nodes that passed filtering; mutated in place.
        :param node_items: full node map, forwarded to ``_pre_func_money``.
        :param img: source image the bboxes index into.
        """
        def has3number(text):
            """Return True if *text* contains a run of >= 3 money characters
            (digits, ASCII/full-width commas, or dots)."""
            if not text:
                return False
            res = re.search('[0-9,,,\.]{3,}', text)
            if res:
                return True
            else:
                return False

        # TODO: data of the "refund cash ... refund cheque ..." form needs
        # special handling — see sample 0001648365.
        for node in passed_nodes.values():
            if has3number(node.text):
                roi = node.bbox
                crnn_res, crnn_scores = crnn_util.run_number_amount(img, roi)
                # Bug fix: an empty CRNN result previously raised
                # ZeroDivisionError in the ratio below; skip such nodes.
                if not crnn_res:
                    continue
                dis = lcs.llcs(crnn_res, node.text)
                if dis / len(crnn_res) > 0.6:
                    # crnn_res is considered valid here.
                    if str_util.only_keep_money_char(crnn_res.replace(',', '')) != \
                            str_util.only_keep_money_char(node.text.replace(',', '')):
                        logger.debug('re recog {} to crnn , from {} to {}'.format(item_name, node.text, crnn_res))
                    node.text = crnn_res
                    node.scores = crnn_scores

        return self._pre_func_money(item_name, passed_nodes, node_items, img)
Esempio n. 3
0
def search_by_lcs(text, probable_list):
    """Return ``(candidates, score)``: every entry of *probable_list* whose
    LCS length with *text* equals the maximum seen, plus that maximum.

    Candidates tied at the running maximum are all kept, in input order.
    """
    best_len = 0
    best = []
    for candidate in probable_list:
        score = lcs.llcs(text, candidate)
        if score > best_len:
            best_len = score
            best = [candidate]
        elif score == best_len:
            best.append(candidate)
    return best, best_len
Esempio n. 4
0
File: core.py Progetto: imfifc/myocr
    def search_one(self, text: str, search_dist=2, search_norm_dist=None, min_len=5):
        """Return the candidate word with the smallest edit distance; among
        candidates, the one sharing the longest common subsequence with
        *text* wins (earliest candidate on ties).

        :param text: query string; ``None`` is passed straight through.
        :param search_dist: only matches within this edit distance are returned.
        :param search_norm_dist: if not None, ``search_dist`` is recomputed as
            ``int(search_norm_dist * len(text))`` (must be in (0, 1)).
        :param min_len: queries shorter than this skip the search entirely.
        :return: str or None
        """
        if text is None:
            return text

        if len(text) < min_len:
            return None

        if search_norm_dist is not None:
            assert 0 < search_norm_dist < 1
            search_dist = int(search_norm_dist * len(text))

        candidates = self.search(text, search_dist, True)
        if not candidates:
            return None

        # max() returns the first maximal element, matching the original
        # strict-">" scan's tie behavior (earliest candidate wins).
        best = max(candidates, key=lambda it: lcs.llcs(text, it['word']))
        return best['word']
Esempio n. 5
0
def extract_paradigms(surface_sim, out_model):
    """Write, for each vocabulary word of the FastText model at *out_model*,
    its top-100 nearest neighbors whose normalized LCS with the word is at
    least *surface_sim*, one tab-separated line per word, to a
    ``..._topn_100_lcs_<surface_sim>.words.clusters.txt`` file in the
    current directory.

    :param surface_sim: minimum normalized LCS (0..1) for a neighbor to count.
    :param out_model: path to the saved FastText model to load.
    """
    print("Processing ", out_model)

    model_bible = FastText.load(out_model)
    out_path = (out_model.split("/")[-1].replace(".model", "") + "_topn_100_" +
                "lcs_" + str(surface_sim) + ".words.clusters.txt")

    # Bug fix: the handle was opened and closed manually, so it leaked if an
    # exception fired mid-iteration; `with` guarantees closure.
    with open(out_path, "w") as path_clusters:
        for word in model_bible.wv.vocab:
            sim_words = []
            for x in model_bible.wv.most_similar(word, topn=100):
                # Normalize the LCS length by the longer of the two words.
                llcs = lcs.llcs(word, x[0]) / max(len(x[0]), len(word))
                if llcs >= surface_sim:
                    sim_words.append(x[0])
            if len(sim_words) > 0:
                print(word, ",".join(sim_words), sep="\t", file=path_clusters)
Esempio n. 6
0
from distance_metrics import lcs

# Only the plain seq2seq behaves normally; the other variants each have
# problems of their own.  (translated from the original pinyin note)

trg = pd.read_csv("seq2seq_trg.csv", sep=",", header=None)
trg = trg.T

trg = trg[0].str.replace(" ", "")
trg = trg[~trg.astype(float).isnull()]
trg = [s[1:] for s in trg]

output = pd.read_csv("seq2seq_output.csv", sep=",", header=None)
output = output.T
output = output[0].str.replace(" ", "")
output = output[~output.astype(float).isnull()]
output = [s[1:] for s in output]

# Per-pair longest-common-subsequence length between target and prediction.
LCS = [lcs.llcs(v, u) for v, u in zip(trg, output)]

print("debug ahoy!")

# Count exact-match failures. Bug fix: the original index loop used a no-op
# `None` statement for matches and would IndexError if the lists differed in
# length; zip compares only the paired rows.
loss = sum(1 for t, o in zip(trg, output) if o != t)

# NOTE(review): despite its name, this is the MISMATCH rate (failures / total),
# not the match rate — name kept for compatibility; confirm with downstream use.
match = loss / len(trg)
Esempio n. 7
0
 def LCS(self, x, y):
     """Return the negated longest-common-subsequence length of *x* and *y*.

     NOTE(review): negated presumably so a longer common subsequence yields
     a smaller value for distance-minimizing callers — confirm against usage.
     """
     return -lcs.llcs(x, y)
Esempio n. 8
0
                               min_count=2,
                               workers=4,
                               iter=10)
        model_bible.save(out_model)

        model_bible = FastText.load(out_model)
        path_clusters = open(
            out_model.split("/")[-1].replace(".model", "") +
            ".words.clusters.txt", "w")

        for word in model_bible.wv.vocab:
            #    print(word, model_bible.wv.most_similar(word))
            sim_words = []
            for x in model_bible.wv.most_similar(word):
                if x[1] < semantic_sim: continue
                llcs = lcs.llcs(word, x[0]) / max(len(x[0]), len(word))
                #                prefix = utils.prefix(word, x[0])
                #        print(word, x[0], llcs)
                if llcs >= surface_sim:  # and prefix > 0:
                    sim_words.append(x[0])
            if len(sim_words) > 0:
                print(word, ",".join(sim_words), sep="\t", file=path_clusters)

        path_clusters.close()

#if "inflections" in sys.argv:

#    job_list = []

#    for surface_sim in np.arange(0.5, 1.0, 0.05):
##        print(surface_sim)