Beispiel #1
0
def syn(x):
    """Resolve user input to a canonical evidence name.

    The lowercased input is first compared against every entry of the
    module-level ``evidence`` collection, then against a fixed table of
    aliases.  ``lev`` is used as the match predicate: any truthy result
    counts as a match (its exact semantics are defined outside this block).

    Args:
        x: The text to resolve.  Inputs of two characters or fewer are
            never matched.

    Returns:
        The matching ``evidence`` entry, or the canonical name an alias maps
        to, or ``False`` when nothing matches.
    """
    # Too short to be matched reliably; bail out before any comparison.
    if len(x) <= 2:
        return False

    # The lowercase form is the same for every comparison, so compute it once.
    needle = x.lower()

    # String (including literal) match against the known evidence names.
    for candidate in evidence:
        if lev(needle, candidate):
            return candidate

    # Alias match.  The first matching alias wins, so the order of this
    # table is significant (this relies on dicts preserving insertion order).
    aliases = {
        'spirit': 'box',
        'radio': 'box',
        'finger': 'prints',
        'book': 'writing',
        'temp': 'freeze',
        'freezing': 'freeze',
        'projector': 'dots',
    }
    for alias, canonical in aliases.items():
        if lev(needle, alias):
            return canonical

    # If there's no match we return False.
    return False
Beispiel #2
0
import levenshtein


# Demo script: prints three edit-distance variants from the project-local
# ``levenshtein`` module for one pair of strings.

# Long sample inputs.  NOTE: both names are reassigned below before any
# distance is computed, so these two values are never actually used.
str1 = "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV"
str2 = "EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG"

# Alternative sample inputs, currently disabled:
# str1='sdfjsa'
# str2='sfffdsd'

# The inputs that are actually compared.
# NOTE(review): presumably chosen because a transposition-aware distance can
# differ between its restricted (OSA) and unrestricted forms on this pair --
# confirm against the levenshtein module.
str1 = "CA"
str2 = "ABC"


# Result of levenshtein.lev for the pair.
levd = levenshtein.lev(str1, str2)
print("Levenshtein Distance:", levd)

# Result of levenshtein.osa for the pair (labelled "OSA" in the output).
osad = levenshtein.osa(str1, str2)
print("Damerau-levenshtein distance(OSA):", osad)

# Result of levenshtein.dalev for the pair.
dalevd = levenshtein.dalev(str1, str2)
print("Damerau-levenshtein distance:", dalevd)
    def expand_corpus(self):
        """Expand the selected corpus lines into alternative word sequences.

        Reads ``self._config.eswd_text`` line by line, numbering lines from 1
        (the sentence id, ``sid``).  Lines whose sid is below
        ``eswd_sid_start`` are skipped, and reading stops at the first sid
        greater than ``eswd_sid_end`` (the upper bound only applies when
        ``eswd_sid_end`` is greater than -1).

        For each selected line the words are turned into phone sequences by
        ``self._word2lex`` and every phone sequence is turned back into word
        sequences by ``self._lex2word``.  A candidate is kept when it differs
        from the stripped original line and both its word-level ``lev``
        result and that result divided by the original word count reach the
        configured minimums.  At most ``eswd_max_expanded_sentence``
        candidates are kept per line.

        Output:
            ``<eswd_text>.<start>to<end>.exp`` receives one ``"<sid> <text>"``
            line per kept candidate.  When ``eswd_save_sid_corpus`` is set,
            ``<eswd_text>.<start>to<end>.sid`` receives the selected original
            lines in the same format.  All files are closed before returning,
            including when an error interrupts processing.

        Returns:
            None.  Line counts are reported through ``self._logger``.
        """
        cfg = self._config
        name_parts = (cfg.eswd_text, cfg.eswd_sid_start, cfg.eswd_sid_end)

        expd_line_n = 0
        zero_line_n = 0
        line_n = 0

        f_text = open(cfg.eswd_text)
        w_text = None
        sf_text = None
        try:
            w_text = open("%s.%dto%d.exp" % name_parts, "w")
            if cfg.eswd_save_sid_corpus:
                sf_text = open("%s.%dto%d.sid" % name_parts, "w")

            for sid, line in enumerate(f_text, start=1):
                # Chained comparison: the upper bound is only enforced when
                # eswd_sid_end itself is greater than -1.
                if sid > cfg.eswd_sid_end > -1:
                    break
                if sid < cfg.eswd_sid_start:
                    continue
                line_n += 1

                text = line.strip()
                words = line.split()
                # NOTE(review): word_n is 0 for a blank line, which would make
                # the rate computation below divide by zero if such a line
                # ever yields a candidate -- confirm whether that can happen.
                word_n = len(words)
                pseq_list = []
                wseq_set = set()
                self._logger.debug("An original word sequence: %s" % text)

                if sf_text is not None:
                    sf_text.write("%d %s\n" % (sid, text))

                # Word to Pron :
                # -> input : line
                # -> output: pseq_list
                # A fresh list is passed because how the helper treats its
                # argument is not visible from this method.
                self._word2lex("", line.split(), 0, pseq_list)

                self._logger.debug("Phone sequences:")
                for idx, pline in enumerate(pseq_list, start=1):
                    self._logger.debug("%d-%d %s" % (sid, idx, pline.strip()))

                # Pron to Word
                # -> input : pseq_list
                # -> output: wseq_set
                for pline in pseq_list:
                    wseqs = set()
                    # The last character of the phone line is dropped before
                    # splitting (presumably a trailing delimiter -- verify
                    # against _word2lex's output format).
                    self._lex2word("", pline[:-1].strip().split("' "), 0, wseqs)
                    for wseq in wseqs:
                        if len(wseq_set) >= cfg.eswd_max_expanded_sentence:
                            break
                        # The unchanged original sentence is not an expansion.
                        if wseq == text:
                            continue
                        lev_d = lev(wseq.split(), words)
                        lev_r = lev_d / word_n
                        if (cfg.eswd_min_sentence_levenshtein_distance <= lev_d
                                and cfg.eswd_min_sentence_levenshtein_distance_rate <= lev_r):
                            wseq_set.add(wseq)

                if not wseq_set:
                    self._logger.info("Zero line(sid: %d): %s" % (sid, text))
                    zero_line_n += 1
                else:
                    self._logger.debug("Expanded word sequences:")
                    for idx, wline in enumerate(wseq_set, start=1):
                        lev_d = lev(wline.split(), words)
                        self._logger.debug("%d-%d %s, lev = %d" % (sid, idx, wline.strip(), lev_d))
                        w_text.write("%d %s\n" % (sid, wline.strip()))
                        expd_line_n += 1
        finally:
            # Close whatever was opened so buffered output is flushed and no
            # descriptors leak, even if processing failed part-way through.
            for handle in (sf_text, w_text, f_text):
                if handle is not None:
                    handle.close()

        self._logger.info("Input lines: %d, expanded lines: %d, zero lines: %d" % (line_n, expd_line_n, zero_line_n))
Beispiel #4
0
def romance():

    # Levenshtein scores
    scores_es_pt = []
    scores_es_it = []
    scores_es_fr = []
    scores_pt_it = []
    scores_pt_fr = []
    scores_it_fr = []

    # Score summation
    sum_es_pt = 0
    sum_es_it = 0
    sum_es_fr = 0
    sum_pt_it = 0
    sum_pt_fr = 0
    sum_it_fr = 0

    # Read in corpuses
    with open("corpus/es") as f:
        es = f.read().decode('utf-8').splitlines()
    with open("corpus/pt") as f:
        pt = f.read().decode('utf-8').splitlines()
    with open("corpus/it") as f:
        it = f.read().decode('utf-8').splitlines()
    with open("corpus/fr") as f:
        fr = f.read().decode('utf-8').splitlines()

    # Generate Levenshtein scores
    for x in range(0, 100):
        scores_es_pt.append(lev(es[x], pt[x]))
        scores_es_it.append(lev(es[x], it[x]))
        scores_es_fr.append(lev(es[x], fr[x]))
        scores_pt_it.append(lev(pt[x], it[x]))
        scores_pt_fr.append(lev(pt[x], fr[x]))
        scores_it_fr.append(lev(it[x], fr[x]))

    # Score sums
    for x in range(0, 100):
        sum_es_pt += scores_es_pt[x]
        sum_es_it += scores_es_it[x]
        sum_es_fr += scores_es_fr[x]
        sum_pt_it += scores_pt_it[x]
        sum_pt_fr += scores_pt_fr[x]
        sum_it_fr += scores_it_fr[x]

    # Score means
    mean_es_pt = sum_es_pt / 100.0
    mean_es_it = sum_es_it / 100.0
    mean_es_fr = sum_es_fr / 100.0
    mean_pt_it = sum_pt_it / 100.0
    mean_pt_fr = sum_pt_fr / 100.0
    mean_it_fr = sum_it_fr / 100.0

    # Testing
    print "[ES, PT]", "\t", sum_es_pt, "\t", mean_es_pt
    print "[ES, IT]", "\t", sum_es_it, "\t", mean_es_it
    print "[ES, FR]", "\t", sum_es_fr, "\t", mean_es_fr
    print "[PT, IT]", "\t", sum_pt_it, "\t", mean_pt_it
    print "[PT, FR]", "\t", sum_pt_fr, "\t", mean_pt_fr
    print "[IT, FR]", "\t", sum_it_fr, "\t", mean_it_fr

    # Return the mean of the means
    mean_rom = (mean_es_pt + mean_es_it + mean_es_fr + mean_pt_it
        + mean_pt_fr + mean_it_fr) / 6
    return mean_rom
Beispiel #5
0
def germanic():

    # Levenshtein scores
    scores_en_de = []
    scores_en_nl = []
    scores_en_af = []
    scores_de_nl = []
    scores_de_af = []
    scores_nl_af = []

    # Score summation
    sum_en_de = 0
    sum_en_nl = 0
    sum_en_af = 0
    sum_de_nl = 0
    sum_de_af = 0
    sum_nl_af = 0

    # Read in corpuses
    with open("corpus/en") as f:
        en = f.read().decode('utf-8').splitlines()
    with open("corpus/de") as f:
        de = f.read().decode('utf-8').splitlines()
    with open("corpus/nl") as f:
        nl = f.read().decode('utf-8').splitlines()
    with open("corpus/af") as f:
        af = f.read().decode('utf-8').splitlines()

    # Generate Levenshtein scores
    for x in range(0, 100):
        scores_en_de.append(lev(en[x], de[x]))
        scores_en_nl.append(lev(en[x], nl[x]))
        scores_en_af.append(lev(en[x], af[x]))
        scores_de_nl.append(lev(de[x], nl[x]))
        scores_de_af.append(lev(de[x], af[x]))
        scores_nl_af.append(lev(nl[x], af[x]))

    # Score sums
    for x in range(0, 100):
        sum_en_de += scores_en_de[x]
        sum_en_nl += scores_en_nl[x]
        sum_en_af += scores_en_af[x]
        sum_de_nl += scores_de_nl[x]
        sum_de_af += scores_de_af[x]
        sum_nl_af += scores_nl_af[x]

    # Score means
    mean_en_de = sum_en_de / 100.0
    mean_en_nl = sum_en_nl / 100.0
    mean_en_af = sum_en_af / 100.0
    mean_de_nl = sum_de_nl / 100.0
    mean_de_af = sum_de_af / 100.0
    mean_nl_af = sum_nl_af / 100.0

    # Testing
    print "[EN, DE]", "\t", sum_en_de, "\t", mean_en_de
    print "[EN, NL]", "\t", sum_en_nl, "\t", mean_en_nl
    print "[EN, AF]", "\t", sum_en_af, "\t", mean_en_af
    print "[DE, NL]", "\t", sum_de_nl, "\t", mean_de_nl
    print "[DE, AF]", "\t", sum_de_af, "\t", mean_de_af
    print "[NL, AF]", "\t", sum_nl_af, "\t", mean_nl_af

    # Return the mean of the means
    mean_ger = (mean_en_de + mean_en_nl + mean_en_af + mean_de_nl
        + mean_de_af + mean_nl_af) / 6
    return mean_ger
Beispiel #6
0
def syn_ghost(x):
    """Resolve user input to a known ghost name.

    Each entry of the module-level ``ghosts`` collection is tested in order
    with ``lev(x.lower(), entry)``; any truthy result counts as a match.

    Returns:
        The first matching entry, or ``False`` when there is no match.
    """
    return next(
        (candidate for candidate in ghosts if lev(x.lower(), candidate)),
        False,
    )