Beispiel #1
0
def build_line(line, word_dict):
    """Rebuild a recognized line, replacing each word with its closest
    dictionary candidate.

    The line is flattened via ``merge_stems_diacritics`` and split on
    whitespace. Each word is looked up in *word_dict* by its canonical
    visenc form; when candidates exist, the one with the smallest
    Levenshtein distance to the original word is chosen, otherwise the
    word is kept unchanged.

    :param line: line structure accepted by ``merge_stems_diacritics``
        (assumed to be a (something, items) pair — TODO confirm; the
        original also read ``line[1]`` into an unused variable, removed here)
    :param word_dict: dict mapping canonical visenc -> collection of
        candidate words
    :return: the corrected line as a single space-joined string
    """
    merged = merge_stems_diacritics(line)
    collect_words = []
    for sw in merged.split():
        cw = huc.canonical_visenc(sw)
        if cw in word_dict:
            ss = word_dict[cw]
            # Pick the candidate most similar to the recognized word.
            # An empty candidate set yields "" — same as the original
            # loop's default.
            best = min(ss, key=lambda s: hsc.levenshtein_distance(s, sw)) if ss else ""
            collect_words.append(best)
        else:
            # No dictionary entry: keep the recognized word as-is.
            collect_words.append(sw)

    LOG.debug("=== collect_words ===")
    LOG.debug(collect_words)

    return " ".join(collect_words)
Beispiel #2
0
def calculate_word_score(word_dict_set, line_variation):
    """
    The input is single output of variations, namely,

    [(label1, dist), (label2, dist2), ...]

    If label ends with +, it means it should be merged with the next label for word. 
    """
    LOG.debug("=== line_variation ===")
    LOG.debug(line_variation)
    LOG.debug("=== line_variation[0] ===")
    LOG.debug(line_variation[0])

    # Merge the variation into a single line and score each word that has
    # a dictionary hit; longer matched words contribute more.
    score = 0
    for word in merge_stems_diacritics(line_variation).split():
        canonical = huc.canonical_visenc(word)
        if canonical in word_dict_set:
            LOG.debug("=== Found: (w, cw) ===")
            LOG.debug((word, canonical))
            score += len(word)

    return score
Beispiel #3
0
def visenc_additions(word_dict_set):
    """Augment *word_dict_set* with manual additions from the
    ``VISENC_ADDITIONS`` file.

    Each non-empty line of the file is expected to look like
    ``key: value``; the key is canonicalized with
    ``huc.canonical_visenc`` and the value is added to the set stored
    under that key.

    :param word_dict_set: dict mapping canonical visenc -> set of words;
        mutated in place.
    :return: the same (updated) *word_dict_set*, for chaining.
    """
    # Context manager guarantees the file handle is closed even on error
    # (the original opened the file and never closed it).
    with open(VISENC_ADDITIONS) as f:
        for line in f:
            if len(line.strip()) > 0:
                items = line.split(":")
                k = huc.canonical_visenc(items[0].strip())
                # setdefault keeps the original "create set on first
                # sight" behavior in one step.
                word_dict_set.setdefault(k, set()).add(items[1].strip())

    return word_dict_set
Beispiel #4
0
def greedy_component_line_to_text(word_dict_set, component_list): 
    """Greedily convert a line's component list into canonical visenc text.

    Components are sorted right-to-left and labeled; the label list is then
    scanned with a greedy longest-match-first strategy: at each position, try
    merging up to 10 consecutive labels and accept the longest merge whose
    canonical visenc appears in *word_dict_set*. When no window matches, the
    current label is kept marked with a trailing '!' and subsequent non-stem
    labels are appended to it (dash-separated) until the next stem.

    :param word_dict_set: dict whose keys are canonical visenc forms
        (membership test only; values unused here)
    :param component_list: components of one line, consumable by
        sort_from_right_to_left / greedy_label_list
    :return: list of canonical visenc elements ('!'-marked ones are
        unrecognized runs)
    """
    sorted_components = sort_from_right_to_left(component_list)
    label_list = greedy_label_list(sorted_components)
    canon_elements = []

    begin = 0
    while begin < len(label_list): 
        found = False
        # Longest-first: window sizes 10, 9, ..., 1 starting at `begin`.
        for end in range(min(begin + 10, len(label_list)), begin, -1):
            sublist = label_list[begin:end]
            merged = "".join(sublist)
            LOG.debug("=== merged ===")
            LOG.debug(merged)
            canon_vis = huc.canonical_visenc(merged)
#            print(canon_vis)
            LOG.debug("=== canon_vis ===")
            LOG.debug(canon_vis)
            if canon_vis in word_dict_set:
                LOG.debug("=== Found ===")
                canon_elements.append(canon_vis)
                # LOG.debug("=== label_list[end-1] ===")
                # LOG.debug(label_list[end-1])
                # LOG.debug("=== label_list[end] ===")
                # LOG.debug(label_list[end])
                # LOG.debug("=== label_list[end+1] ===")
                # LOG.debug(label_list[end+1])
                # Resume scanning right after the accepted window.
                begin = end
                if begin < len(label_list): 
                    LOG.debug("=== new label_list[begin] ===")
                    LOG.debug(label_list[begin])
                found = True
                break
            else:
                LOG.debug("=== Not Found ===")

        if not found:
            # skip until the next stem
            # '!' marks the element as unrecognized by the dictionary.
            current_element = label_list[begin] + '!'
            begin += 1
            while begin < len(label_list) and not is_stem(label_list[begin]): 
            # LOG.debug("=== label_list[begin] ===")
            # LOG.debug(label_list[begin])
                current_element += "-" + label_list[begin]
                begin += 1
            # LOG.debug("=== current_element ===")
            # LOG.debug(current_element)
            canon_elements.append(current_element)

    # print(canon_elements)
    return canon_elements
Beispiel #5
0
def generate_word_dict_set(arabic_text): 
    """Generates a word list from a piece of text by delimiting from the whitespace. 

    It returns a dictionary. The keys of the dictionary are visenc without
    diacritics and values of the dictionary are sets of words. So we can check
    them quickly.
    """
    words = {}
    # Single pass: convert each whitespace-delimited token to visenc and
    # bucket it under its canonical (diacritic-free) form.
    for token in arabic_text.split():
        visenc = huc.arabic_to_visenc(token.strip())
        canonical = huc.canonical_visenc(visenc)
        words.setdefault(canonical, set()).add(visenc)
    LOG.debug("=== words[:10] ===")
    LOG.debug(list(words.keys())[:10])
    return words