def process_rel_candidate_for_drop_led(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_rel):
    simple_sentence = " ".join(simple_sentences)
    
    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())        
    
    temp_nodeset, temp_filtered_mod_pos = boxer_graph.drop_relation(nodeset, relnode_candidate, filtered_mod_pos)
    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, temp_filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
    
    isDrop = compare_edit_distance(opr_drop_rel, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
def process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_ood):
    simple_sentence = " ".join(simple_sentences)
    
    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
    
    temp_nodeset = nodeset[:]
    temp_nodeset.remove(oodnode_candidate)
    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())

    isDrop = compare_edit_distance(opr_drop_ood, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
Example #3
def make_compatible(input_str):
    # Note: this mutates the global rer_out in place; input_str is unused here.
    for i in range(len(rer_out['taglist'])):
        if rer_out['taglist'][i] == "Org":
            for j in allprods:
                if dist.edit_distance(rer_out['wordlist'][i], j) < 2:
                    rer_out['wordlist'][i] = j
                    break
        if rer_out['taglist'][i] == "Family":
            matched = False
            for j in allprods:
                for k in allprods[j]:
                    if dist.edit_distance(rer_out['wordlist'][i], k) < 4:
                        rer_out['wordlist'][i] = k
                        matched = True
                        break
                if matched:  # stop scanning other families once matched
                    break
Example #4
def process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_mod):
    simple_sentence = " ".join(simple_sentences)
    
    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
    
    modcand_position_to_process = modcand_to_process[0]
    temp_filtered_mod_pos = filtered_mod_pos[:]+[modcand_position_to_process]
    sentence_after_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, temp_filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
    
    isDrop = compare_edit_distance(opr_drop_mod, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
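All three drop helpers above defer to a compare_edit_distance function that is not shown in these snippets. A minimal sketch of what it could look like, assuming the operation label only matters for logging or tie-breaking (hypothetical, not the original implementation):

def compare_edit_distance(operation, edit_dist_after, edit_dist_before):
    # Hypothetical sketch: apply the drop when it does not move the main
    # sentence further from the simple sentence. The real helper may treat
    # the "drop-rel", "drop-mod" and "drop-ood" operations differently.
    return edit_dist_after <= edit_dist_before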
Example #5
 def one2ArrEditDistance(self,sen,arr):
     score = []
     for l in arr:
         score.append(edit_distance(sen,l))
     if len(score) != 0:
         return sum(score)*1.0/len(score)
     return 0
Example #6
    def levenshtein_sort(self, keyword, domains):
        """
        Sort domains by Levenshtein edit-distance

        :param keyword: str input keyword
        :param domains: domains list
        :rtype: list
        :return: sorted names list
        """
        # distance counter
        # transpositions - ab == ba
        distance = lambda s, d: edit_distance(s, d, transpositions=True)
        # remove zone
        get_str = lambda domain: re.sub('([.][a-z]{2,4})+$', '', domain)
        domains = list(map(get_str, domains))  # list() so the bubble sort below can index (Python 3)

        # Sorter
        for i in range(len(domains)):
            for j in range(len(domains) - 1):
                if (distance(keyword, get_str(domains[j])) >
                        distance(keyword, get_str(domains[j + 1]))):
                    tmp = copy(domains[j + 1])
                    domains[j + 1] = domains[j]
                    domains[j] = tmp

        return domains
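The bubble sort above recomputes edit distances on every comparison. A sketch of an equivalent, more idiomatic version that sorts once on precomputed distances, assuming re and edit_distance are imported as above:

def levenshtein_sort(self, keyword, domains):
    # strip the zone, then sort by transposition-aware edit distance to keyword
    get_str = lambda domain: re.sub('([.][a-z]{2,4})+$', '', domain)
    return sorted((get_str(d) for d in domains),
                  key=lambda d: edit_distance(keyword, d, transpositions=True))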
Example #7
def mean_char_edit_distance(candidates, references):
    total_distance = 0
    total_target_length = 0
    for y, t in zip(candidates, references):
        total_distance += edit_distance(y, t)
        total_target_length += len(t)
    return total_distance/total_target_length
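mean_char_edit_distance is the character error rate (CER): total edits divided by total reference length. A quick sanity check, assuming the function above and NLTK's edit_distance are in scope:

print(mean_char_edit_distance(["kitten"], ["sitting"]))  # 3 edits / 7 reference chars ≈ 0.429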
Example #8
 def __init__(self):
     self.stemmer = LancasterStemmer()
     self.stem_mapping = {}
     self.stemmed_trie = TrieNode()
     self.trie = TrieNode()
     self.singles_lst = []
     self.black_listed_stems = set([])
     loaded = cPickle.load(open(DICTIONARY, 'r'))
     print len(loaded)
     loaded += CUSTOM
     loaded = set(loaded)
     most_common = cPickle.load(open(MOST_COMMON, 'r'))
     for word in most_common:
         self.black_listed_stems.add(self.stem(word))
     #print self.black_listed_stems
     for word in loaded:
         word = word.lower()
         if word not in most_common[:TOP_K_FILTER]:
             self.trie.insert(word)
             stemmed_word = self.stem(word)
             if stemmed_word in self.stem_mapping: 
                 previous = self.stem_mapping[stemmed_word]
                 edist = distance.edit_distance(word, previous)
                 if edist > 2:
                     pass
                 #print 'warning: %s dropped in favor of %s' % (word, previous)
             else:
                 if stemmed_word not in self.black_listed_stems:
                     self.stem_mapping[stemmed_word] = word
                     self.stemmed_trie.insert(stemmed_word)
Example #9
def select_anagrams(token, structures):
    """Select possible anagrams for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    anagrams = {}
    focus_alphabet = generate_alphabet_from_word(token[1])
    token_hash = anagram_hash(token)

    hash_list = []
    for c in structures["alphabet"]:
        for f in focus_alphabet:
            hash_list.append(token_hash + c - f)

    hash_counter = Counter(hash_list)  # Counting retrieval occurrences

    for h in set(hash_counter.keys()).intersection(set(structures["anagrams"].keys())):
        count = hash_counter[h]
        anag_list = [anag for anag in structures["anagrams"][h] if edit_distance(anag, token) <= 3]

        for anag in anag_list:
            anag_score = rate_anagram(structures["occurence_map"], token, anag, count)

            if anag_score > 0:
                anagrams[anag] = anag_score

    return anagrams
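select_anagrams relies on helpers (anagram_hash, generate_alphabet_from_word, rate_anagram) that are not shown. A plausible sketch of the hashing idea only, stated as an assumption rather than the original code: an additive, order-independent hash, so anagrams collide, and token_hash + c - f simulates replacing one character's contribution with another's.

def anagram_hash(word):
    # Hypothetical: addition commutes, so permutations of the same letters
    # produce the same hash; the power spreads values to reduce collisions.
    return sum(ord(ch) ** 5 for ch in word)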
Example #10
    def get_candidates(self, word, D=1):
        """If word is in lexicon returns [(word, 1.0)].
        Otherwise returns a list with all the words in lexicon that has
        a distance equal or less than to D (D is the Levenshtein edit-distance)
        If there is no such word, returns [(word, 0.0)]
        """
        word = word.lower()

        if word in self.fdist:
            return [(word, 1.0)]

        candidates = []
        counts = []
        for w, c in self.fdist.iteritems():
            if edit_distance(w, word) <= D:
                candidates.append(w)
                counts.append(c)

        if len(candidates) == 0:
            candidates.append(word)
            counts.append(0)

        probs = [float(c) / self.wcount for c in counts]

        return sorted(zip(candidates, probs), key=lambda x: x[1], reverse=True)
Example #11
def get_geonames_code(m):
    lat = session.scalar(m._geo_ponto.y)
    lon = session.scalar(m._geo_ponto.x)
    places = geonames_reverse(lat, lon)
    for place in places:
        nome1 = m.nome.strip().lower()
        nome2 = place[u'name'].strip().lower()
        if edit_distance(nome1, nome2) < 2:
            return int(place[u'geonameId'])
Example #12
    def replace(self, word):
        suggestions = self.spell_dict.suggest(word)

        if suggestions:
            for suggestion in suggestions:
                if edit_distance(word, suggestion) <= self.max_dist:
                    return suggestion  # return the matching suggestion, not always the first

        return word
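This replace method comes from a spelling-replacer pattern; the surrounding class is not shown. A minimal sketch of the missing context, assuming pyenchant provides the dictionary (the class name and defaults are assumptions):

import enchant

class SpellingReplacer:
    def __init__(self, dict_name='en_US', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)  # suggestion source
        self.max_dist = max_dist                   # maximum allowed edit distance

With this context, replace('cookbok') would typically return 'cookbook', while a word with no close suggestion comes back unchanged.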
Example #13
def strip_synonyms(output_set, exclude_set):
    # Remove synonyms that have Levenshtein distance of 1, AFTER removing plurals.
    for word in output_set:
        for synset in wn.synsets(word):
            for synonym in synset.lemma_names():
                if edit_distance(word,synonym) == 1:
                    exclude_set.add(synonym)

    output_set.difference_update(exclude_set)
    return output_set, exclude_set
Example #14
def close_enough_buckets(first_bucket, second_bucket, dist):
    
    if first_bucket == second_bucket:
        return False
    
    elif edit_distance(first_bucket, second_bucket) <= dist:
        return True
    
    else:
        return False
Example #15
def eval(references):
    string_distances = {'siddharthan':[], 'bayes_no_variation':[], 'bayes_variation':[]}
    jaccard_distances = {'siddharthan':[], 'bayes_no_variation':[], 'bayes_variation':[]}

    for reference in references:
        print reference
        string_distances['siddharthan'].append(edit_distance(reference['original'], reference['siddharthan']))
        string_distances['bayes_no_variation'].append(edit_distance(reference['original'], reference['bayes_no_variation']))
        string_distances['bayes_variation'].append(edit_distance(reference['original'], reference['bayes_variation']))

        # jaccard_distances['siddharthan'].append(jaccard_distance(reference['original'], reference['siddharthan']))
        # jaccard_distances['bayes_no_variation'].append(jaccard_distance(reference['original'], reference['bayes_no_variation']))
        # jaccard_distances['bayes_variation'].append(jaccard_distance(reference['original'], reference['bayes_variation']))

    print 'String distances: '
    print 'siddharthan: ', mean_confidence_interval(string_distances['siddharthan'])
    print 'bayes_no_variation: ', mean_confidence_interval(string_distances['bayes_no_variation'])
    print 'bayes_variation: ', mean_confidence_interval(string_distances['bayes_variation'])
    print 10 * '-'
Example #16
def model_evaluate(model, d, gt):
    model.add_data(d, trunc=25)  # was model_h, an undefined global; use the argument
    inf = model.states_list[0].stateseq
    
    inf = list(inf)    
    dist = edit_distance(gt, inf)
    
    s_gt, s_inf = set(gt), set(inf)
    iou = len(s_gt.intersection(s_inf)) / len(s_gt.union(s_inf))

    return dist, iou
Example #17
def get_X(lines, features, cache):
    if cache is None:
        cache = {}
    tokenizer = RegexpTokenizer(r'[a-z]+')
    X = []
    for line1 in lines:
        vector = []
        for line2 in lines:
            vector.append(edit_distance(line1,line2)/max(len(line1),len(line2)))
        max_v = max(vector)
        for i in range(len(vector)):
            vector[i] = vector[i] / max_v 
        syn_dist = {}
        for word in features:
            syn_dist[word] = 1

        for word in set(tokenizer.tokenize(line1.lower())):
            if word in stopwords.words('english'):
                continue

            for word2 in features:
                if (len(wn.synsets(word)) == 0 or len(wn.synsets(word2)) == 0):
                    continue
                else:
                    if (word not in cache):
                        cache[word] = {}
                    if (word2 not in cache[word]):
                        similarity = [w1.wup_similarity(w2)
                                      for w1 in wn.synsets(word, pos=wn.NOUN) 
                                      + wn.synsets(word, pos=wn.VERB)
                                      for w2 in wn.synsets(word2, pos=wn.NOUN)
                                      + wn.synsets(word2, pos=wn.VERB)]
                        similarity = [s for s in similarity if s]

                        if (len(similarity) != 0):
                            cache[word][word2] = max(similarity)
                        else:
                            cache[word][word2] = None

                        #cache[word][word2] = wn.synsets(word)[0].path_similarity(wn.synsets(word2)[0])
                        #cache[word][word2] = wn.synsets(word)[0].wup_similarity(wn.synsets(word2)[0])

                    if (not cache[word][word2]):
                        continue

                    dist = 1 - cache[word][word2]
                    if (dist < syn_dist[word2]):
                        syn_dist[word2] = dist 
                        
        for word in features:
            vector.append(syn_dist[word])

        X.append(vector)
    return X, cache
Example #18
def _match_by_edit_distance(full_text, text_to_match):
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)

    try:
        end_point = (text_to_match.index(" ") if " " in text_to_match else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0:end_point]), full_text, re.U | re.I)]
    except:
        import sys

        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)
        
    if len(potential_matches) == 0:
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    if len(potential_matches) == 0:
        text_to_match = text_to_match.replace("(", "[")
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]

    potential_matches = [(p[0:p.rindex(text_to_match[-1])+1] 
                          if text_to_match[-1] in p and len(p) > len(text_to_match)
                          else p)
                         for p in potential_matches]

    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match

    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match

    result = match_with_lowest_edit_distance.strip()
    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)][-1]

    return result
Example #19
            def probs_metric(inverse=False):
                rand_p = Vec2(random()*table.width+table.min_point.x, random()*table.height+table.min_point.y)
                try:
                    bestmeaning, bestsentence = generate_sentence(rand_p, False, scene, speaker, usebest=True, golden=inverse, printing=printing)
                    sampled_landmark, sampled_relation = bestmeaning.args[0], bestmeaning.args[3]
                    golden_posteriors = get_all_sentence_posteriors(bestsentence, meanings, golden=(not inverse), printing=printing)

                    # lmk_prior = speaker.get_landmark_probability(sampled_landmark, landmarks, PointRepresentation(rand_p))[0]
                    all_lmk_probs = speaker.all_landmark_probs(landmarks, Landmark(None, PointRepresentation(rand_p), None))
                    all_lmk_probs = dict(zip(landmarks, all_lmk_probs))

                    lmk_prior = all_lmk_probs[sampled_landmark]
                    head_on = speaker.get_head_on_viewpoint(sampled_landmark)
                    rel_prior = speaker.get_probabilities_points( np.array([rand_p]), sampled_relation, head_on, sampled_landmark)
                    lmk_post = golden_posteriors[sampled_landmark]
                    rel_post = golden_posteriors[sampled_relation]

                    ps = np.array([golden_posteriors[lmk]*golden_posteriors[rel] for lmk, rel in meanings])
                    rank = None
                    for i,p in enumerate(ps):
                        lmk,rel = meanings[i]
                        # logger( '%f, %s' % (p, m2s(lmk,rel)))
                        head_on = speaker.get_head_on_viewpoint(lmk)
                        # ps[i] *= speaker.get_landmark_probability(lmk, landmarks, PointRepresentation(rand_p))[0]
                        ps[i] *= all_lmk_probs[lmk]
                        ps[i] *= speaker.get_probabilities_points( np.array([rand_p]), rel, head_on, lmk)
                        if lmk == sampled_landmark and rel == sampled_relation:
                            idx = i

                    ps += epsilon
                    ps = ps/ps.sum()
                    prob = ps[idx]
                    rank = sorted(ps, reverse=True).index(prob)
                    entropy = entropy_of_probs(ps)
                except (ParseError,RuntimeError) as e:
                    logger( e )
                    lmk_prior = 0
                    rel_prior = 0
                    lmk_post = 0
                    rel_post = 0
                    prob = 0
                    rank = len(meanings)-1
                    entropy = 0
                    distances = [[None]]

                head_on = speaker.get_head_on_viewpoint(sampled_landmark)
                all_descs = speaker.get_all_meaning_descriptions(trajector, scene, sampled_landmark, sampled_relation, head_on, 1)
                distances = []
                for desc in all_descs:
                    distances.append([edit_distance( bestsentence, desc ), desc])
                distances.sort()
                return lmk_prior,rel_prior,lmk_post,rel_post,\
                       prob,entropy,rank,distances[0][0],type(sampled_relation)
Example #20
def validate_password_dictionary(value):
    """
    Insures that the password is not too similar to a defined set of dictionary words
    """
    password_max_edit_distance = getattr(settings, "PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD", None)
    password_dictionary = getattr(settings, "PASSWORD_DICTIONARY", None)

    if password_max_edit_distance and password_dictionary:
        for word in password_dictionary:
            distance = edit_distance(value, word)
            if distance <= password_max_edit_distance:
                raise ValidationError(_("Too similar to a restricted dictionary word."), code="dictionary_word")
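A sketch of the Django settings this validator reads (the names come from the getattr calls above; the values are illustrative):

# settings.py
PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD = 2
PASSWORD_DICTIONARY = ["password", "qwerty", "letmein"]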
Example #21
    def areExpansionsSimilar(expansion_1, expansion_2):
        expansion_1 = expansion_1.lower().replace(u"-", u" ")
        expansion_2 = expansion_2.lower().replace(u"-", u" ")
        #numActualWords = len(expansion_1)
        #numPredictedWords = len(expansion_2)

        if(expansion_1 == expansion_2
           or AcronymExpansion.startsSameWay(expansion_1, expansion_2)
           or edit_distance(expansion_1, expansion_2) <= 2):  # max(numActualWords, numPredictedWords)):
            return True

        return False
Example #22
 def run(self, token):
     try:
         spellchk = self.correct(token.lower(), self.lWords)
         if spellchk == token and spellchk not in self.lWords[token[0]]:
             return "#" + token
         else:
             if edit_distance(token, spellchk) <= 2:
                 return spellchk
             else:
                 return "#" + token
     except:
         return "#" + token
Example #23
def attributeMatches(val1, val2, mode="exact", threshold=0):
  if mode == "ignore":
    return True
  if mode == "exact":
    return val1 is not None and val2 is not None and\
        val1 == val2
  if mode == "fuzzy":
    return val1 is not None and val2 is not None and\
        abs(len(val1) - len(val2)) <= threshold and\
        edit_distance(val1,val2) <= threshold
  if mode == "do not differ":
    return val1 is None or val2 is None or val1 == val2
  return False
Example #24
def matchWord(tokens, words):
	# built-in set replaces the removed sets.Set; items() replaces iteritems()
	s = set(tokens).intersection(set(words))
	if len(s) > 0:
		return [(w, 1.0, w) for w in s]
	else:
		from nltk.metrics import distance
		import operator
		result = []
		for token in set(tokens):
			vals = {w: (1.0 - float(distance.edit_distance(token, w)) / float(max(len(token), len(w)))) for w in words}
			sortedvals = sorted(vals.items(), key=operator.itemgetter(1), reverse=True)
			result.append((token, sortedvals[0][1], sortedvals[0][0]))
		return sorted(result, key=lambda tup: tup[1], reverse=True)
Example #25
def update_summary(qa_pairs, baidu_data):
    assert(qa_pairs != None)
    assert(baidu_data != None)

    # load qa pairs
    reader = open(qa_pairs, 'r')
    pairs = []
    for line in reader:
        # print line
        words = line.split(' ')
        first = words[0]
        second = words[-1]
        pairs.append([first, second])

    print 'number of pairs loaded:', len(pairs)
    s1 = u'辛弃疾的名作《永遇乐.京口北固亭怀古》中”凭谁问,廉颇老矣“的下一句是什么?'
    s2 = u'辛弃疾的名作《永遇乐.京口北固亭怀古》中”凭谁问,廉颇老矣“的下一句是什么?'
    print edit_distance(s1, s2)
    # print pairs[0][0]

    # parse tree and update its summary
    tree = ET.parse(baidu_data)
    root = tree.getroot()
    for question in root.getchildren():
        q_text = question.findall('q')[0].text.strip()
        cnt = 0
        for p in pairs:
            print edit_distance(q_text, p[0])
            if edit_distance(q_text, p[0]) < 3:
                for old in question.findall('summary'):
                    question.remove(old)  # Element.remove() takes an element, not a tag name
                ans = ET.SubElement(question, 'summary')
                ans.text = p[1]
                break
        break

    xml_string = ET.tostring(root, encoding = 'utf-8')
    result = xml.dom.minidom.parseString(xml_string)
    # print result.toprettyxml()
    return
Example #26
def average_edit_distance(n):
    import nltk.metrics.distance as d
    edits = [ e["noteText"] for e in activity_logs_for_note(n) if e["action"] == "note-save"]  ##//ActivityLog.objects.filter(noteid=n["id"],action="note-edit").order_by("when").values_list("noteText")
    distances = []
    if len(edits) > 1:
        #print "edits: %s " % repr(edits)
        for i in range(0,len(edits)-1):
            if edits[i] is None or edits[i+1] is None:
                continue
            distances.append( d.edit_distance( edits[i], edits[i+1] ) )
        if len(distances) > 0:
            return make_feature("edit_distance",median(distances))
    return make_feature('edit_distance',MISSING)
Example #27
def group_worlds(tags: List[str], tokens: List[str]) -> Dict[str, List[str]]:
    spans = from_bio(tags, 'world')
    with_strings = [(" ".join(tokens[i:j]), i, j) for i, j in spans]
    with_strings.sort(key=lambda x: len(x[0]), reverse=True)
    substring_groups: List[List[Tuple[str, int, int]]] = []
    ambiguous = []
    for string, i, j in with_strings:
        found = None
        for group_index, group in enumerate(substring_groups):
            for string_g, _, _ in group:
                if string in string_g:
                    if found is None:
                        found = group_index
                    elif found != group_index:
                        found = -1  # Found multiple times
        if found is None:
            substring_groups.append([(string, i, j)])
        elif found >= 0:
            substring_groups[found].append((string, i, j))
        else:
            ambiguous.append((string, i, j))
    nofit = []
    if len(substring_groups) > 2:
        substring_groups.sort(key=len, reverse=True)
        for extra in substring_groups[2:]:
            best_distance = 999
            best_index = None
            string = extra[0][0]  # Use the longest string
            for index, group in enumerate(substring_groups[:2]):
                for string_g, _, _ in group:
                    distance = edit_distance(string_g, string)
                    if distance < best_distance:
                        best_distance = distance
                        best_index = index
            # Heuristics for "close enough"
            if best_index is not None and best_distance < len(string) - 1:
                substring_groups[best_index] += extra
            else:
                nofit.append(extra)
    else:
        substring_groups += [[("N/A", 999, 999)]] * 2   # padding
    substring_groups = substring_groups[:2]
    # Sort by first occurrence
    substring_groups.sort(key=lambda x: min([y[1] for y in x]))
    world_dict = {}
    for index, group in enumerate(substring_groups):
        world_strings = delete_duplicates([x[0] for x in group])
        world_dict['world'+str(index+1)] = world_strings

    return world_dict
Example #28
def select_lower_edit_distance(ref_word, word_list):
    """Get the word(s) with the lowest edit distance

    Parameters:
        ref_word (:func:`str`): Word to correct
        word_list (list): List of proposals

    Returns:
        list - Words tied for the minimum edit distance
    """
    word_dict = {word: edit_distance(ref_word, word) for word in word_list}
    min_dist = min(word_dict.values())

    return [word for word, dist in word_dict.items() if dist == min_dist]
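Because ties are kept, the function returns a list of every word at the minimum distance; a quick check, assuming the function and edit_distance are in scope:

print(select_lower_edit_distance("cat", ["bat", "hat", "cargo", "dog"]))
# ['bat', 'hat']  (both at distance 1)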
Example #29
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2
    ocr_sims = {}

    word_hash = ocr_key_hash(token)

    sim_hash_list = {}  # Using a dictionary avoids duplicate entries if a key is retrieved twice
    key_index = -1

    for key, value in word_hash:
        key_index += 1
        sim_hash = deepcopy(word_hash)

        for d in range(-delta, delta+1):
            if d != 0:
                card = max(int(value)+d, 1)

                sim_hash[key_index] = (key, card)

                # Rebuild OCR key string
                sim_hash_str = ""
                for k, v in sim_hash:
                    sim_hash_str += k + str(v)

                if sim_hash_str in structures["ocrkeys"]:
                    card_diff = abs(int(value)-card)

                    sim_hash_list[sim_hash_str] = [(sim_word, card_diff)
                                                   for sim_word in structures["ocrkeys"][sim_hash_str]
                                                   if edit_distance(sim_word, token) <= 2]

    for sim_hash_str, sim_list in sim_hash_list.items():
        for sim_word, card_diff in sim_list:
            sim_score = rate_ocr_key(structures["occurence_map"], token, sim_word, card_diff)

            if sim_score > 0:
                ocr_sims[sim_word] = sim_score

    return ocr_sims
Example #30
def strip_words(text, strip_this):
    """ if text starts with something that looks like stip_this then strip it """
    letters = functools.partial(re.sub, '[^a-z]+', '')
    ltext = letters(text.lower())
    lstrip = letters(strip_this.lower())

    best = (4, 0, '')
    for i in range(1, 1 + min(50, len(ltext), len(lstrip))):
        a, b = ltext[:i], lstrip[:i]
        score = (float(edit_distance(a, b)) / len(b), -len(b), b)
        if score < best:
            best = score
    density, length, letters = best
    if best[0] > 0.1 or abs(length) < min(len(lstrip), 10):
        return text
    return strip_letters(text, letters).lstrip(' .?;:()-\t\n')
Example #31
from nltk.metrics.distance import edit_distance


def my_edit_distance(str1, str2):
    m = len(str1) + 1
    n = len(str2) + 1

    table = {}
    for i in range(m):
        table[i, 0] = i
    for j in range(n):
        table[0, j] = j

    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            table[i, j] = min(table[i, j - 1] + 1, table[i - 1, j] + 1,
                              table[i - 1, j - 1] + cost)

    return table[m - 1, n - 1]  # bottom-right cell of the DP table


print("Our Algorithm :", my_edit_distance("hand", "and"))
print("NLTK Algorithm :", edit_distance("hand", "and"))
Example #32
    def _get_features(self, s_ent, t_ent):
        """
        compute all LR model features
        :param s_ent:
        :param t_ent:
        :return:
        """
        s_name_tokens, s_stem_tokens, s_lemm_tokens, s_char_tokens, s_alias_tokens, s_def_tokens = self._compute_tokens(
            s_ent)
        t_name_tokens, t_stem_tokens, t_lemm_tokens, t_char_tokens, t_alias_tokens, t_def_tokens = self._compute_tokens(
            t_ent)

        has_same_canonical_name = (s_name_tokens == t_name_tokens)
        has_same_stemmed_name = (s_stem_tokens == t_stem_tokens)
        has_same_lemmatized_name = (s_lemm_tokens == t_lemm_tokens)
        has_same_char_tokens = (s_char_tokens == t_char_tokens)
        has_alias_in_common = (len(
            set(s_alias_tokens).intersection(set(t_alias_tokens))) > 0)

        # initialize similarity features
        name_token_jaccard_similarity = 1.0
        inverse_name_token_edit_distance = 1.0
        name_stem_jaccard_similarity = 1.0
        inverse_name_stem_edit_distance = 1.0
        name_lemm_jaccard_similarity = 1.0
        inverse_name_lemm_edit_distance = 1.0
        name_char_jaccard_similarity = 1.0
        inverse_name_char_edit_distance = 1.0

        # jaccard similarity and token edit distance
        max_changes = len(s_name_tokens) + len(t_name_tokens)
        max_char_changes = len(s_char_tokens) + len(t_char_tokens)

        if not has_same_canonical_name:
            name_token_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_name_tokens), set(t_name_tokens))
            inverse_name_token_edit_distance = 1.0 - edit_distance(
                s_name_tokens, t_name_tokens) / max_changes

        if not has_same_stemmed_name:
            name_stem_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_stem_tokens), set(t_stem_tokens))
            inverse_name_stem_edit_distance = 1.0 - edit_distance(
                s_stem_tokens, t_stem_tokens) / max_changes

        if not has_same_lemmatized_name:
            name_lemm_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_lemm_tokens), set(t_lemm_tokens))
            inverse_name_lemm_edit_distance = 1.0 - edit_distance(
                s_lemm_tokens, t_lemm_tokens) / max_changes

        if not has_same_char_tokens:
            name_char_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_char_tokens), set(t_char_tokens))
            inverse_name_char_edit_distance = 1 - edit_distance(
                s_char_tokens, t_char_tokens) / max_char_changes

        max_alias_token_jaccard = 0.0
        min_alias_edit_distance = 1.0
        best_s_alias = s_ent['aliases'][0]
        best_t_alias = t_ent['aliases'][0]

        if not has_alias_in_common:
            for s_ind, s_a_tokens in enumerate(s_alias_tokens):
                for t_ind, t_a_tokens in enumerate(t_alias_tokens):
                    if s_a_tokens and t_a_tokens:
                        j_ind = string_utils.get_jaccard_similarity(
                            set(s_a_tokens), set(t_a_tokens))
                        if j_ind > max_alias_token_jaccard:
                            max_alias_token_jaccard = j_ind
                            best_s_alias = s_ent['aliases'][s_ind]
                            best_t_alias = t_ent['aliases'][t_ind]
                        e_dist = edit_distance(s_a_tokens, t_a_tokens) / (
                            len(s_a_tokens) + len(t_a_tokens))
                        if e_dist < min_alias_edit_distance:
                            min_alias_edit_distance = e_dist

        # has any relationships
        has_parents = (len(s_ent['par_relations']) > 0
                       and len(t_ent['par_relations']) > 0)
        has_children = (len(s_ent['chd_relations']) > 0
                        and len(t_ent['chd_relations']) > 0)

        percent_parents_in_common = 0.0
        percent_children_in_common = 0.0

        # any relationships in common
        if has_parents:
            max_parents_in_common = (len(s_ent['par_relations']) +
                                     len(t_ent['par_relations'])) / 2
            percent_parents_in_common = len(
                s_ent['par_relations'].intersection(
                    t_ent['par_relations'])) / max_parents_in_common

        if has_children:
            max_children_in_common = (len(s_ent['chd_relations']) +
                                      len(t_ent['chd_relations'])) / 2
            percent_children_in_common = len(
                s_ent['chd_relations'].intersection(
                    t_ent['chd_relations'])) / max_children_in_common

        # tuples (not generator objects) so the set intersection below compares contents
        s_acronyms = [tuple(i[0] for i in a) for a in s_alias_tokens]
        t_acronyms = [tuple(i[0] for i in a) for a in t_alias_tokens]
        has_same_acronym = (len(set(s_acronyms).intersection(set(t_acronyms)))
                            > 0)

        s_name_root, s_name_heads = self._dependency_parse(
            s_ent['canonical_name'])
        t_name_root, t_name_heads = self._dependency_parse(
            t_ent['canonical_name'])

        has_same_name_root_word = (s_name_root == t_name_root)
        has_same_name_chunk_heads = (s_name_heads == t_name_heads)
        name_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
            s_name_heads, t_name_heads)

        s_alias_root, s_alias_heads = self._dependency_parse(best_s_alias)
        t_alias_root, t_alias_heads = self._dependency_parse(best_t_alias)

        has_same_alias_root_word = (s_alias_root == t_alias_root)
        has_same_alias_chunk_heads = (s_alias_heads == t_alias_heads)
        alias_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
            s_alias_heads, t_alias_heads)

        def_jaccard_similarity = string_utils.get_jaccard_similarity(
            set(s_def_tokens), set(t_def_tokens))

        # form feature vector
        feature_vec = [
            FloatField(float(has_same_canonical_name)),
            FloatField(float(has_same_stemmed_name)),
            FloatField(float(has_same_lemmatized_name)),
            FloatField(float(has_same_char_tokens)),
            FloatField(float(has_alias_in_common)),
            FloatField(name_token_jaccard_similarity),
            FloatField(inverse_name_token_edit_distance),
            FloatField(name_stem_jaccard_similarity),
            FloatField(inverse_name_stem_edit_distance),
            FloatField(name_lemm_jaccard_similarity),
            FloatField(inverse_name_lemm_edit_distance),
            FloatField(name_char_jaccard_similarity),
            FloatField(inverse_name_char_edit_distance),
            FloatField(max_alias_token_jaccard),
            FloatField(1.0 - min_alias_edit_distance),
            FloatField(percent_parents_in_common),
            FloatField(percent_children_in_common),
            FloatField(float(has_same_acronym)),
            FloatField(float(has_same_name_root_word)),
            FloatField(float(has_same_name_chunk_heads)),
            FloatField(name_chunk_heads_jaccard_similarity),
            FloatField(float(has_same_alias_root_word)),
            FloatField(float(has_same_alias_chunk_heads)),
            FloatField(alias_chunk_heads_jaccard_similarity),
            FloatField(def_jaccard_similarity)
        ]

        return feature_vec
Example #33
def Greedy_Decode_Eval(Net, datasets, args):
    # TestNet = Net.eval()
    epoch_size = len(datasets) // args.test_batch_size
    batch_iterator = iter(
        DataLoader(datasets,
                   args.test_batch_size,
                   shuffle=True,
                   num_workers=args.num_workers,
                   collate_fn=collate_fn))
    c_matrix = np.zeros((len(CHARS) - 1, len(CHARS) - 1))

    Tp = 0
    Tn_1 = 0
    Tn_2 = 0
    t_chars = 0
    T_c = 0
    T_f = 0
    T_fc = 0
    t_fchars = 0
    norm_ed = 0
    res_chars = np.zeros(len(CHARS))
    t1 = time.time()
    count = 0
    for i in tqdm(range(epoch_size)):
        # load train data
        images, labels, lengths, filenames = next(batch_iterator)
        start = 0
        targets = []
        for length in lengths:
            label = labels[start:start + length]
            targets.append(label)
            start += length
        targets = np.array([el.numpy() for el in targets])
        imgs = images.numpy().copy()

        if args.cuda:
            images = Variable(images.cuda())
        else:
            images = Variable(images)

        # forward
        prebs = Net(images)
        # greedy decode
        prebs = prebs.cpu().detach().numpy()
        preb_labels = list()
        for i in range(prebs.shape[0]):
            preb = prebs[i, :, :]
            preb_label = list()
            for j in range(preb.shape[1]):
                preb_label.append(np.argmax(preb[:, j], axis=0))
            no_repeat_blank_label = list()
            pre_c = preb_label[0]
            if pre_c != len(CHARS) - 1:
                no_repeat_blank_label.append(pre_c)
            for c in preb_label:  # drop repeated labels and the blank label
                if (pre_c == c) or (c == len(CHARS) - 1):
                    if c == len(CHARS) - 1:
                        pre_c = c
                    continue
                no_repeat_blank_label.append(c)
                pre_c = c
            preb_labels.append(no_repeat_blank_label)
        #print(len(preb_labels))

        for i, label in enumerate(preb_labels):
            if args.postprocess:
                label = postprocess(label)
            correct = False
            X = i
            lb = ""
            tg = ""
            for j in targets[i]:
                x = int(j)
                tg += CHARS[x]
            for j in label:
                lb += CHARS[j]
            norm_ed_img = 0
            # if len(lb)==0 or len(tg)==0 or tg == '0':
            #     norm_ed_img = -1

            if len(tg) > len(lb):
                norm_ed_img = 1 - edit_distance(lb, tg) / len(tg)
                norm_ed += norm_ed_img
            else:
                norm_ed_img = 1 - edit_distance(lb, tg) / len(lb)
                norm_ed += norm_ed_img

            # show image and its predict label
            t_chars += len(targets[i])
            for j in range(len(label)):
                if j >= len(targets[i]):
                    continue
                if label[j] == targets[i][j]:
                    res_chars[label[j]] += 1
                    T_c += 1
            if args.show:
                show(imgs[i], label, targets[i])
            if len(label) != len(targets[i]):
                #print(abs(len(label)-len(targets[i])))
                Tn_1 += 1

            else:
                c_matrix = cmatrix(c_matrix, label, targets[i])
                t_fchars += len(targets[i])
                for j in range(len(label)):
                    if j >= len(targets[i]):
                        continue
                    if label[j] == targets[i][j]:
                        res_chars[label[j]] += 1
                        T_fc += 1
                fuzzy = 0
                for x in range(len(label)):
                    if targets[i][x] == label[x]:
                        fuzzy += 1
                if fuzzy / len(label) >= 0.75:
                    T_f += 1
                if (np.asarray(targets[i]) == np.asarray(label)).all():
                    Tp += 1
                    correct = True
                else:
                    Tn_2 += 1
            # print(lb,tg)
            if args.save_pred_results:
                if not os.path.isdir('./testpreds'):
                    os.makedirs('./testpreds')
                with open("./testpreds/preds.csv", 'a+', newline='') as f:
                    l1 = [filenames[X].split('\\')[1], tg, lb, norm_ed_img]
                    csv_writer = writer(f)
                    csv_writer.writerow(l1)  # the with-block closes the file
            if args.save_pred_images:
                if not os.path.isdir('./testpreds'):
                    os.makedirs('./testpreds')
                if not os.path.isdir('./testpreds/images'):
                    os.makedirs('./testpreds/images')
                basename = os.path.basename(filenames[X])
                #newname = 'testpreds/images/new_'+basename.split('.')[0]+"__"+lb+'.png'
                newname = 'testpreds/images/' + f'{count}__' + lb + '.png'
                count += 1
                #if not correct:
                shutil.copy(filenames[X], newname)
    if args.evaluate:
        evaluate_and_save(c_matrix)

    Acc = Tp * 1.0 / (Tp + Tn_1 + Tn_2)
    print("[Info] Test Accuracy: {} [{}:{}:{}:{}]".format(
        Acc, Tp, Tn_1, Tn_2, (Tp + Tn_1 + Tn_2)))
    print(
        f"[Info] 75%+ Accuracy: {T_f/(Tp+Tn_1+Tn_2)} [{T_f}/{(Tp+Tn_1+Tn_2)}]")
    t2 = time.time()
    print(f'[Info] Global Char Accuracy:{T_c/t_chars} [{T_c}/{t_chars}] ')
    print(
        f'[Info] Char Accuracy on full length match:{T_fc/t_fchars} [{T_fc}/{t_fchars}] '
    )
    print(f"[Info] Length accuracy: {(Tp+Tn_2)/(Tp+Tn_1+Tn_2)}")
    print(f"[Info] Norm_ed: {norm_ed/(Tp+Tn_1+Tn_2)}")
    # print('Per char: ')
    # for i in range(10):
    #     print(i,": ",res_chars[i]/T_c)
    # for i in range(10,len(CHARS)-1):
    #     print(chr(55+i),': ',res_chars[i]/T_c)
    print("[Info] Test Speed: {}s 1/{}]".format((t2 - t1) / len(datasets),
                                                len(datasets)))
Example #34
__author__ = 'user'

from nltk.metrics import distance as dist

#  the transpositions flag allows transposition edits (e.g., "ab" -> "ba"),

s1 = 'dr mark keane'
s2 = 'mr mark bean'

s3 = 'rain'
s4 = 'shine'

s5 = 'mr rowan atkinson'
s6 = 'mr bean'

ans = dist.edit_distance(s1, s2, transpositions=False)
print(ans)

ans = dist.edit_distance(s3, s4, transpositions=False)
print(ans)

ans = dist.edit_distance(s5, s6, transpositions=False)
print(ans)

# nltk.metrics.distance has no levenshtein() function; edit_distance()
# *is* the Levenshtein distance, so call it directly.
ans = dist.edit_distance(s1, s2)
print(ans)

ans = dist.edit_distance(s3, s4)
print(ans)

ans = dist.edit_distance(s5, s6)
print(ans)
Example #35
def validation(model, criterion, evaluation_loader, converter, opt):
    """ Validation or Evaluation """
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max length prediction
        length_for_pred = torch.IntTensor([opt.batch_max_length] *
                                          batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length +
                                         1).fill_(0).to(device)

        text_for_loss, length_for_loss = converter.encode(
            labels, batch_max_length=opt.batch_max_length)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            # permute 'preds' to use CTCloss format
            if opt.baiduCTC:
                cost = criterion(preds.permute(1, 0, 2), text_for_loss,
                                 preds_size, length_for_loss) / batch_size
            else:
                cost = criterion(
                    preds.log_softmax(2).permute(1, 0, 2), text_for_loss,
                    preds_size, length_for_loss)

            # Select max probability (greedy decoding), then decode index to character
            if opt.baiduCTC:
                _, preds_index = preds.max(2)
                preds_index = preds_index.view(-1)
            else:
                _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index.data, preds_size.data)

        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding), then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy & confidence score
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, pred, pred_max_prob in zip(labels, preds_str, preds_max_prob):
            if 'Attn' in opt.Prediction:
                gt = gt[:gt.find('[s]')]
                pred_EOS = pred.find('[s]')
                pred = pred[:
                            pred_EOS]  # prune after "end of sentence" token ([s])
                pred_max_prob = pred_max_prob[:pred_EOS]

            # To evaluate a 'case sensitive model' in an alphanumeric, case-insensitive setting.
            if opt.sensitive and opt.data_filtering_off:
                pred = pred.lower()
                gt = gt.lower()
                alphanumeric_case_insensitve = '0123456789abcdefghijklmnopqrstuvwxyz'
                out_of_alphanumeric_case_insensitve = f'[^{alphanumeric_case_insensitve}]'
                pred = re.sub(out_of_alphanumeric_case_insensitve, '', pred)
                gt = re.sub(out_of_alphanumeric_case_insensitve, '', gt)

            if pred == gt:
                n_correct += 1
            '''
            (old version) ICDAR2017 DOST Normalized Edit Distance https://rrc.cvc.uab.es/?ch=7&com=tasks
            "For each word we calculate the normalized edit distance to the length of the ground truth transcription."
            if len(gt) == 0:
                norm_ED += 1
            else:
                norm_ED += edit_distance(pred, gt) / len(gt)
            '''

            # ICDAR2019 Normalized Edit Distance
            if len(gt) == 0 or len(pred) == 0:
                norm_ED += 0
            elif len(gt) > len(pred):
                norm_ED += 1 - edit_distance(pred, gt) / len(gt)
            else:
                norm_ED += 1 - edit_distance(pred, gt) / len(pred)

            # calculate confidence score (= product of pred_max_prob)
            try:
                confidence_score = pred_max_prob.cumprod(dim=0)[-1]
            except:
                confidence_score = 0  # for empty pred case, when prune after "end of sentence" token ([s])
            confidence_score_list.append(confidence_score)
            # print(pred, gt, pred==gt, confidence_score)

    accuracy = n_correct / float(length_of_data) * 100
    norm_ED = norm_ED / float(
        length_of_data)  # ICDAR2019 Normalized Edit Distance

    return valid_loss_avg.val(
    ), accuracy, norm_ED, preds_str, confidence_score_list, labels, infer_time, length_of_data
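The per-sample ICDAR2019 normalized edit distance used above can be factored into a small helper; a sketch equivalent to the branches inside the loop:

from nltk.metrics.distance import edit_distance

def icdar2019_norm_ed(pred, gt):
    # 1 - ED / len(longer string); 0 when either string is empty
    if len(gt) == 0 or len(pred) == 0:
        return 0.0
    return 1 - edit_distance(pred, gt) / max(len(gt), len(pred))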
Example #36
def validation(model, criterion, evaluation_loader, converter, opt):
    """ validation or evaluation """
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max length prediction
        length_for_pred = torch.IntTensor([opt.batch_max_length] *
                                          batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length +
                                         1).fill_(0).to(device)

        text_for_loss, length_for_loss = converter.encode(
            labels, batch_max_length=opt.batch_max_length)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred).log_softmax(2)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            # permute 'preds' to use CTCloss format
            torch.backends.cudnn.enabled = False
            cost = criterion(
                preds.permute(1, 0, 2).to(device), text_for_loss.to(device),
                preds_size.to(device), length_for_loss.to(device))
            torch.backends.cudnn.enabled = True

            # Select max probability (greedy decoding), then decode index to character
            _, preds_index = preds.max(2)
            preds_index = preds_index.view(-1)
            preds_str = converter.decode(preds_index.data, preds_size.data)

        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding), then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy & confidence score
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, pred, pred_max_prob in zip(labels, preds_str, preds_max_prob):
            if 'Attn' in opt.Prediction:
                gt = gt[:gt.find('[s]')]
                pred_EOS = pred.find('[s]')
                pred = pred[:
                            pred_EOS]  # prune after "end of sentence" token ([s])
                pred_max_prob = pred_max_prob[:pred_EOS]

            if pred == gt:
                n_correct += 1
            if len(gt) == 0:
                norm_ED += 1
            else:
                norm_ED += edit_distance(pred, gt) / len(gt)

            # calculate confidence score (= product of pred_max_prob)
            try:
                confidence_score = pred_max_prob.cumprod(dim=0)[-1]
            except:
                confidence_score = 0  # for empty pred case, when prune after "end of sentence" token ([s])
            confidence_score_list.append(confidence_score)
            # print(pred, gt, pred==gt, confidence_score)

    accuracy = n_correct / float(length_of_data) * 100

    return valid_loss_avg.val(
    ), accuracy, norm_ED, preds_str, confidence_score_list, labels, infer_time, length_of_data
Example #37
def calEditDistance(barK, truP):
    dist = edit_distance(barK, truP)
    return dist
Example #38
def validation(model, criterion, eval_loader, converter, opt, tqdm_position=1):
    """validation or evaluation"""
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in tqdm(
            enumerate(eval_loader),
            total=len(eval_loader),
            position=tqdm_position,
            leave=False,
    ):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max length prediction
        labels_index, labels_length = converter.encode(
            labels, batch_max_length=opt.batch_max_length)

        if "CTC" in opt.Prediction:
            start_time = time.time()
            preds = model(image)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            # permute 'preds' to use CTCloss format
            cost = criterion(
                preds.log_softmax(2).permute(1, 0, 2),
                labels_index,
                preds_size,
                labels_length,
            )

        else:
            text_for_pred = (torch.LongTensor(batch_size).fill_(
                converter.dict["[SOS]"]).to(device))

            start_time = time.time()
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            target = labels_index[:, 1:]  # without [SOS] Symbol
            cost = criterion(
                preds.contiguous().view(-1, preds.shape[-1]),
                target.contiguous().view(-1),
            )

        # select max probability (greedy decoding), then decode index to character
        _, preds_index = preds.max(2)
        preds_size = torch.IntTensor([preds.size(1)] *
                                     preds_index.size(0)).to(device)
        preds_str = converter.decode(preds_index, preds_size)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy & confidence score
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, prd, prd_max_prob in zip(labels, preds_str, preds_max_prob):
            if "Attn" in opt.Prediction:
                prd_EOS = prd.find("[EOS]")
                prd = prd[:
                          prd_EOS]  # prune after "end of sentence" token ([EOS])
                prd_max_prob = prd_max_prob[:prd_EOS]
            """
            In our experiment, if the model predicts at least one [UNK] token, we count the word prediction as incorrect.
            To not take account of [UNK] token, use the below line.
            prd = prd.replace('[UNK]', '') 
            """

            # To evaluate a 'case sensitive model' in an alphanumeric, case-insensitive setting (same as ASTER).
            gt = gt.lower()
            prd = prd.lower()
            alphanumeric_case_insensitve = "0123456789abcdefghijklmnopqrstuvwxyz"
            out_of_alphanumeric_case_insensitve = f"[^{alphanumeric_case_insensitve}]"
            gt = re.sub(out_of_alphanumeric_case_insensitve, "", gt)
            prd = re.sub(out_of_alphanumeric_case_insensitve, "", prd)

            if opt.NED:
                # ICDAR2019 Normalized Edit Distance
                if len(gt) == 0 or len(prd) == 0:
                    norm_ED += 0
                elif len(gt) > len(prd):
                    norm_ED += 1 - edit_distance(prd, gt) / len(gt)
                else:
                    norm_ED += 1 - edit_distance(prd, gt) / len(prd)

            else:
                if prd == gt:
                    n_correct += 1

            # calculate confidence score (= product of prd_max_prob)
            try:
                confidence_score = prd_max_prob.cumprod(dim=0)[-1]
            except:
                confidence_score = 0  # for empty pred case, when prune after "end of sentence" token ([EOS])
            confidence_score_list.append(confidence_score)

    if opt.NED:
        # ICDAR2019 Normalized Edit Distance. In web page, they report % of norm_ED (= norm_ED * 100).
        score = norm_ED / float(length_of_data) * 100
    else:
        score = n_correct / float(length_of_data) * 100  # accuracy

    return (
        valid_loss_avg.val(),
        score,
        preds_str,
        confidence_score_list,
        labels,
        infer_time,
        length_of_data,
    )
Example #39
# rosalind_ba5g
# edit distance
import numpy as np

f = open('rosalind_ba5g.txt')
a = f.readline().rstrip()
b = f.readline().rstrip()

from nltk.metrics import distance
print(distance.edit_distance(a, b))


# Wagner-Fischer algorithm
def edit_distance(s, t):
    m, n = len(s), len(t)
    d = np.zeros((m + 1, n + 1), dtype=int)
    d[:, 0] = np.arange(m + 1)
    d[0, :] = np.arange(n + 1)

    for j in np.arange(n):
        for i in np.arange(m):
            if s[i] == t[j]:
                d[i + 1, j + 1] = d[i, j]
            else:
                d[i + 1, j + 1] = min(d[i, j + 1] + 1, d[i + 1, j] + 1,
                                      d[i, j] + 1)
    return d[m, n]


print(edit_distance(a, b))
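
# Quick sanity check: the Wagner-Fischer implementation above should agree
# with nltk's edit_distance on any pair of strings (default unit costs,
# no transposition).
for s, t in [("kitten", "sitting"), ("PLEASANTLY", "MEANLY"), ("", "abc")]:
    assert edit_distance(s, t) == distance.edit_distance(s, t)
print("Wagner-Fischer matches nltk on the sample pairs")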
Ejemplo n.º 40
0
from nltk.metrics.distance import edit_distance
import codecs

# List of questions
questions = []

# Read the cybora file
cybora_aiml = codecs.open("./core/base/cybora.aiml", "r", encoding="utf-8")

# Test phrase
phrase = 'meu nome é rodrigo e o seu?'

# Smallest distance found so far
distance_less = len(phrase)

# Initialize variables used by the algorithm
delta = 0
trigger = ''

# Run the test
for question in questions:
    # Compute distance
    delta = edit_distance(phrase, question)
    # Check distance
    if delta < distance_less:
        # Update results
        distance_less = delta
        trigger = question

print(trigger, distance_less)
Ejemplo n.º 41
0
    def get_features(self, lhs, rhs, alignment=['0-0']):
        """calculate and return features for a rule"""
        features = {}
        # no changes
        if lhs == rhs:
            return features

        # indicate if tokens are aligned
        rfound = [(rtok[0] == '[') for rtok in rhs]
        lfound = [(ltok[0] == '[') for ltok in lhs]

        # iterate through aligned tokens--assume no NTs
        for pair in alignment:
            lind, rind = [int(i) for i in pair.split('-')]
            lfound[lind] = True
            rfound[rind] = True
            if lhs[lind] == rhs[rind]:
                continue

            # calculate features for substitution
            self.increment(features, 'substituted')
            self.increment(features, 'char-ld',
                           edit_distance(lhs[lind], rhs[rind]))
            if rhs[rind] in self.dictionary.suggest(lhs[lind]):
                if self.dictionary.check(lhs[lind]):
                    self.increment(features,
                                   'alternate-spelling',
                                   v=self.get_weight(rhs[rind]))
                else:
                    self.increment(features,
                                   'mispelled',
                                   v=self.get_weight(rhs[rind]))
            # if the tokens aren't the same, compare them
            ltok, lpos = self.get_pos(lhs[lind])
            rtok, rpos = self.get_pos(rhs[rind])
            self.increment(features, '%s-%s' % (lpos, rpos))
            if rpos[0] == 'W' or rpos[0] == 'C':
                self.increment(features, '%s-error' % (rpos[:2]))
            else:
                self.increment(features, '%s-error' % (rpos[0]))
            # compare lemmas/morphology
            if self.do_morph:
                try:
                    lmorph = self.get_morphology(ltok)
                    rmorph = self.get_morphology(rtok)

                    if lmorph[0] != rmorph[0]:
                        if len(lmorph) + len(rmorph) > 2:
                            self.increment(features, 'diff-lemma-diff-morph')
                        else:
                            self.increment(features, 'diff-lemma-same-morph')
                    else:
                        self.increment(features, 'same-lemma-diff-morph')
                    if len(lmorph) + len(rmorph) > 2:
                        self.increment(
                            features, 'morph-%s-%s' %
                            ('+'.join(lmorph[1:]), '+'.join(rmorph[1:])))
                except Exception:
                    sys.stderr.write('Error handling morphology in rule\n')

        # calculate features for deletion
        for deltok in self.analyze_unaligned(lfound, lhs):
            self.increment(features, 'deleted')
            self.increment(features, deltok + '-')
        # calculate features for insertion
        for instok in self.analyze_unaligned(rfound, rhs):
            self.increment(features, 'inserted')
            self.increment(features, '-' + instok)

        self.increment(features, 'tok-ld', edit_distance(lhs, rhs))
        return features
Ejemplo n.º 42
0
def lev_no_case_sens(a, b):
    a = a.lower()
    b = b.lower()
    dist = edit_distance(a, b)
    return dist
Ejemplo n.º 43
0
def did_you_mean(keyword, keyword_pool):
    candidates = list(keyword_pool)
    closest_match_idx = np.argmin(
        [edit_distance(keyword, candidate) for candidate in candidates])
    return candidates[closest_match_idx]
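
# Example usage with a hypothetical keyword pool (the snippet assumes numpy
# is imported as np); ties resolve to whichever candidate argmin sees first.
print(did_you_mean("imoprt", {"import", "export", "print"}))  # -> import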
Ejemplo n.º 44
0
 def lev_dist(first, second):
     return edit_distance(first, second)
Ejemplo n.º 45
0
    def getFeatureVector(self, mention, entity):
        features = []

        page_title = self._db.getPageTitle(entity)
        page_title = utils.text.normalize_unicode(
            page_title) if page_title is not None else None
        mention_text = utils.text.normalize_unicode(mention.mention_text())

        for feature in self.feature_names:

            # Count features
            if feature == 'prior':
                features.append(self._stats.getCandidatePrior(entity))
            elif feature == 'prior_yamada':
                features.append(
                    self._stats.getCandidatePriorYamadaStyle(entity))
            elif feature == 'normalized_prior':
                features.append(
                    self._stats.getCandidatePrior(entity, normalized=True))
            elif feature == 'normalized_log_prior':
                features.append(
                    self._stats.getCandidatePrior(entity,
                                                  normalized=True,
                                                  log=True))
            elif feature == 'relative_prior':
                if entity in mention.candidates:
                    count = 0
                    for cand in mention.candidates:
                        count += self._stats.getCandidatePrior(cand)
                    if count == 0:
                        features.append(float(0))
                    else:
                        features.append(
                            float(self._stats.getCandidatePrior(entity)) /
                            count)
                else:
                    features.append(float(0))
            elif feature == 'cond_prior':
                features.append(
                    self._stats.getCandidateConditionalPrior(entity, mention))
            elif feature == 'n_of_candidates':
                features.append(len(mention.candidates))
            elif feature == 'max_prior':
                max_prior = self._stats.getCandidateConditionalPrior(
                    entity, mention)
                for m in mention.document().mentions:
                    if entity in m.candidates and self._stats.getCandidateConditionalPrior(
                            entity, m) > max_prior:
                        max_prior = self._stats.getCandidateConditionalPrior(
                            entity, m)
                features.append(max_prior)

            # string similarity features
            elif feature == 'entity_title_starts_or_ends_with_mention':
                x = 1 if page_title is not None and (
                    page_title.lower().startswith(mention_text.lower()) or
                    page_title.lower().endswith(mention_text.lower())) else 0
                features.append(x)
            elif feature == 'mention_text_starts_or_ends_with_entity':
                x = 1 if page_title is not None and (
                    mention_text.lower().startswith(page_title.lower()) or
                    mention_text.lower().endswith(page_title.lower())) else 0
                features.append(x)
            elif feature == 'edit_distance':
                features.append(
                    edit_distance(page_title.lower(), mention_text.lower()
                                  ) if page_title is not None else 0)

            # context similarity features
            elif feature == 'yamada_context_similarity':
                if not hasattr(mention.document(), 'yamada_context_nouns'):
                    mention.document().yamada_context_nouns = \
                        self._opennlp.list_nouns(mention.document().sentences)

                if not hasattr(mention.document(), 'yamada_context_embd'):
                    mention.document().yamada_context_embd = dict()
                if mention_text not in mention.document().yamada_context_embd:
                    context_embd = self.yamada_txt_to_embd.text_to_embedding(
                        mention.document().yamada_context_nouns, mention_text)
                    mention.document(
                    ).yamada_context_embd[mention_text] = context_embd
                context_embd = mention.document(
                ).yamada_context_embd[mention_text]
                entity_embd = self.yamada_txt_to_embd.from_the_cache(entity)

                self.n += 1
                if entity_embd is not None:
                    s = self.yamada_txt_to_embd.similarity(
                        context_embd, entity_embd)
                    #                    print self.yamada_txt_to_embd.similarity(context_embd, entity_embd)
                    features.append(s)
                    if s > 0:
                        self.nn += 1
                else:
                    #print 0
                    features.append(0.0)

                if self.n % 100 == 0:
                    print "yamada got sim", self.nn / float(self.n)

            elif feature == 'our_context_similarity':
                if not hasattr(mention.document(), 'our_context_nouns'):
                    mention.document().our_context_nouns = \
                        self._w2v.get_nouns(mention.document().sentences)

                if not hasattr(mention.document(), 'our_context_embd'):
                    mention.document().our_context_embd = dict()
                if mention_text not in mention.document().our_context_embd:
                    context_embd = self._w2v.text_to_embedding(
                        mention.document().our_context_nouns, mention_text)
                    mention.document(
                    ).our_context_embd[mention_text] = context_embd
                context_embd = mention.document(
                ).our_context_embd[mention_text]
                entity_embd = self._w2v.get_entity_vec(entity)
                if entity_embd is not None:
                    print self._w2v.similarity(context_embd, entity_embd)
                    features.append(
                        self._w2v.similarity(context_embd, entity_embd))
                else:
                    print 0
                    features.append(0.0)
            elif feature.startswith('model_'):
                x = self.models_as_features_predictors[
                    feature[6:]].predict_prob(mention, entity)
                features.append(x)
            else:
                raise "feature undefined"

        return features
Ejemplo n.º 46
0
def validation(model, criterion, evaluation_loader, converter, opt):
    """ validation or evaluation """
    for p in model.parameters():
        p.requires_grad = False

    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        with torch.no_grad():
            image = image_tensors.cuda()
            # For max length prediction
            length_for_pred = torch.cuda.IntTensor([opt.batch_max_length] *
                                                   batch_size)
            text_for_pred = torch.cuda.LongTensor(
                batch_size, opt.batch_max_length + 1).fill_(0)

            text_for_loss, length_for_loss = converter.encode(labels)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred).log_softmax(2)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for the CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCloss format
            cost = criterion(preds, text_for_loss, preds_size, length_for_loss)

            # Select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_index = preds_index.transpose(1, 0).contiguous().view(-1)
            preds_str = converter.decode(preds_index.data, preds_size.data)

        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy.
        for pred, gt in zip(preds_str, labels):
            if 'Attn' in opt.Prediction:
                pred = pred[:pred.find('[s]')]  # prune after "end of sentence" token ([s])
                gt = gt[:gt.find('[s]')]

            if pred == gt:
                n_correct += 1
            if len(gt) == 0:
                norm_ED += 1
            else:
                norm_ED += edit_distance(pred, gt) / len(gt)

    accuracy = n_correct / float(length_of_data) * 100

    return (valid_loss_avg.val(), accuracy, norm_ED, preds_str, labels,
            infer_time, length_of_data)
Ejemplo n.º 47
0
 def levenshtein(self, a, b):
     return distance.edit_distance(a, b)
Ejemplo n.º 48
0
def evaluate(sentx, senty):
    sent_max_len = max(len(list(sentx)), len(list(senty)))
    if sent_max_len == 0:
        return 0

    return edit_distance(sentx, senty) / sent_max_len
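
# Quick illustration: the normalized distance is 0.0 for identical inputs,
# 1.0 for completely different strings of equal length, and the empty case
# is guarded explicitly.
print(evaluate("abc", "abc"))  # 0.0
print(evaluate("abc", "xyz"))  # 1.0
print(evaluate("", ""))        # 0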
Ejemplo n.º 49
0
def distance(name, query):
    return edit_distance(name, query)
Ejemplo n.º 50
0
 def cmp_text_edit_distance(self, annotation, candidate, entire_annotation):
     """
     Return the raw edit distance between the annotation and the candidate.
     """
     result = edit_distance(annotation, candidate)
     return result
Ejemplo n.º 51
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_text', '-i', required=True, type=str)
    parser.add_argument('--output_json', '-o', required=True, type=str)
    parser.add_argument('--tmp_dir', '-t', default='results/tmp', type=str)
    parser.add_argument('--classifier_type',
                        '-ct',
                        default='RelationClassification',
                        type=str,
                        choices=['RelationClassification', 'fasttext'])
    parser.add_argument('--classifier_model', '-c', nargs='*', type=str)
    parser.add_argument('--classifier_preprocessor',
                        '-cp',
                        nargs='*',
                        type=str)
    parser.add_argument('--use_amr', '-uamr', action='store_true')
    parser.add_argument('--amrs_from', type=str)
    parser.add_argument('--tokenize', action='store_true')
    parser.add_argument('--use_sdg', '-usdg', action='store_true')
    parser.add_argument('--sdg_model',
                        '-sdg',
                        default='stanford',
                        type=str,
                        choices=['stanford', 'spacy'])
    parser.add_argument('--entity_recognizer',
                        '-er',
                        default='None',
                        type=str,
                        choices=['None', 'tagNERv2', 'tagNERv3', 'byteNER'])
    parser.add_argument('--entities_from', type=str)
    parser.add_argument('--anonymize', '-a', action='store_true')
    parser.add_argument('--add_symmetric_pairs', '-sym', action='store_true')
    parser.add_argument('--ensembling_mode',
                        '-ens',
                        type=str,
                        default='average',
                        choices=['average', 'majority_vote'])
    args = parser.parse_args()

    basename = os.path.basename(args.input_text)
    args.tmp_dir = os.path.abspath(args.tmp_dir)
    print("Using tmp dir: {}".format(args.tmp_dir))

    ensure_dir(args.tmp_dir)  # not very necessary

    if args.entity_recognizer == 'None':
        if args.entities_from:
            print("Getting ground truth entities and pairs...")

            with io.open(args.entities_from, encoding='utf-8') as fr:
                with io.open(args.output_json, 'w', encoding='utf-8') as fw:
                    ground_truth = fr.read()
                    fw.write(ground_truth)

            print('Done\n')

        else:
            raise Exception("--entities_from is not specified")

    elif args.entity_recognizer == 'tagNERv2':
        tokenized_input = os.path.join(args.tmp_dir,
                                       '{}.tokenized.txt'.format(basename))
        entities_output = os.path.join(args.tmp_dir,
                                       '{}.tokenized.txt.IOB'.format(basename))
        # candidate_tuples_json = os.path.join(args.tmp_dir, '{}.candidates.json'.format(basename))

        print("Tokenizing...")
        # print("Adding spaces around -")
        with io.open(args.input_text, encoding='utf-8') as fr:
            with io.open(tokenized_input, 'w', encoding='utf-8') as fw:
                for line in fr.readlines():
                    id, sentence = line[:-1].split('\t')  # \n symbol
                    #sentence = sentence.replace('-',' - ')
                    sentence = ' '.join(sentence.split())
                    fw.write("{}\t{}\n".format(id, sentence))

        print('Running tagNERv2...')
        check_call(['bash', 'tag_NER.sh', '-i', tokenized_input, '-f', 'IOB'],
                   cwd='submodules/tag_NER_v2')
        print('Done\n')
        # the output is entities_output
        print('Building interaction tuples with unknown labels...')
        check_call([
            'python', 'iob_to_bind_json.py', '--input_text', args.input_text,
            '--input_iob2', entities_output, '--output_json', args.output_json
        ])  #candidate_tuples_json])
        print('Done\n')

    elif args.entity_recognizer == 'tagNERv3':

        tokenized_input = os.path.join(args.tmp_dir,
                                       '{}.tokenized.txt'.format(basename))
        entities_output = os.path.join(args.tmp_dir,
                                       '{}.tokenized.txt.IOB'.format(basename))
        # candidate_tuples_json = os.path.join(args.tmp_dir, '{}.candidates.json'.format(basename))

        print("Tokenizing...")
        # print("Adding spaces around -")
        with io.open(args.input_text, encoding='utf-8') as fr:
            with io.open(tokenized_input, 'w', encoding='utf-8') as fw:
                for line in fr.readlines():
                    id, sentence = line[:-1].split('\t')  # \n symbol
                    #sentence = sentence.replace('-',' - ')
                    sentence = ' '.join(sentence.split())
                    fw.write("{}\t{}\n".format(id, sentence))

        print('Running tagNERv3 inside Docker...')

        with open(tokenized_input, 'r') as f_in, open(entities_output,
                                                      'w') as f_out:
            p = run([
                'nvidia-docker', 'run', '-i', '--rm', 'yerevann/tag-ner-v3',
                '-i', '/dev/stdin', '-f', 'IOB'
            ],
                    stdin=f_in,
                    stdout=f_out)

        print('Done\n')
        # the output is entities_output
        print('Building interaction tuples with unknown labels...')
        check_call([
            'python', 'iob_to_bind_json.py', '--input_text', args.input_text,
            '--input_iob2', entities_output, '--output_json', args.output_json
        ])  #candidate_tuples_json])
        print('Done\n')

    elif args.entity_recognizer == 'byteNER':
        input_without_ids = os.path.join(args.tmp_dir,
                                         '{}.noids.txt'.format(basename))
        entities_output = os.path.join(args.tmp_dir, '{}.IOB'.format(basename))
        entities_output_chr = os.path.join(args.tmp_dir,
                                           '{}.IOB.chr'.format(basename))

        print('Removing IDs from input for byteNER')
        with io.open(args.input_text, encoding='utf-8') as fr:
            with io.open(input_without_ids, 'w', encoding='utf-8') as fw:
                for line in fr.readlines():
                    id, sentence = line[:-1].split('\t')  # \n symbol
                    fw.write("{}\n".format(sentence))
        print("Done")

        print('Running byteNER...')
        # requires Keras 2.0.6 on python2!
        env = os.environ.copy()
        env['KERAS_BACKEND'] = 'theano'
        env['THEANO_FLAGS'] = 'dnn.enabled=False'
        check_call([
            'python2', 'tagger.py', '-m',
            'models/20CNN,dropout0.5,bytedrop0.3,lr0.0001,bytes,bpe,blstm,crf,biocreative.model',
            '-i', input_without_ids, '-o', entities_output, '--output_format',
            'iob'
        ],
                   cwd='submodules/byteNER',
                   env=env)
        print('Done\n')

        print('Building interaction tuples with unknown labels...')
        check_call([
            'python', 'iob_to_bind_json.py', '--character_level',
            '--input_text', args.input_text, '--input_iob2',
            entities_output_chr, '--output_json', args.output_json
        ])  #candidate_tuples_json])
        print('Done\n')

    pretokenized_input = os.path.join(args.tmp_dir,
                                      '{}.pretokenized.txt'.format(basename))
    if args.tokenize:
        with open(args.input_text, 'r', encoding='utf-8') as fr:
            with open(pretokenized_input, 'w', encoding='utf-8') as fw:
                for line in fr:
                    id, sentence = line[:-1].split('\t')
                    sentence = double_normalize_text(sentence, lower=False)
                    # although fasttext vectors require lower(), RelClass handles it internally
                    fw.write("{}\t{}\n".format(id, sentence))
    else:
        with open(args.input_text, 'r', encoding='utf-8') as fr:
            with open(pretokenized_input, 'w', encoding='utf-8') as fw:
                fw.write(fr.read())

    if args.add_symmetric_pairs:
        # useful for symmetric interactions like `bind`
        with io.open(args.output_json, 'r', encoding='utf-8') as f:
            dense = json.load(f)
            print("Adding symmetric pairs...")
            for sentence in dense:
                sym = []
                for i, pair in enumerate(sentence['extracted_information']):
                    reverse_pair = pair.copy()
                    reverse_pair['participant_a'] = pair['participant_b']
                    reverse_pair['participant_b'] = pair['participant_a']
                    reverse_pair['_sym_of'] = i
                    sym.append(reverse_pair)
                sentence['extracted_information'] += sym
        with io.open(args.output_json, 'w', encoding='utf-8') as f:
            json.dump(dense, f)

    if args.use_amr:
        print('Adding AMRs...')
        if args.amrs_from:
            with open(args.amrs_from, 'r', encoding='utf-8') as f:
                amrs = json.load(f)
            amr_dict = {}
            for sample in amrs:
                amr_dict[sample['id']] = sample['amr']
            with open(args.output_json, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for sentence in data:
                sentence['amr'] = amr_dict[sentence['id']]
            with open(args.output_json, 'w', encoding='utf-8') as f:
                json.dump(data, f)
        else:
            check_call([
                'python3',
                'add_amr.py',
                '--input_text',
                pretokenized_input,
                '--input_json',
                args.output_json,
                '--model',
                'amr2_bio7_best_after_2_fscore_0.6118.m',
                #'--model', 'bio_model_best.m',
                '--output_json',
                args.output_json,
                '--tmp_dir',
                args.tmp_dir
            ])
        print('Done\n')

        print('Extracting AMR paths...')
        check_call([
            'python3', 'append_amr_paths.py', '--input_json', args.output_json,
            '--output_json', args.output_json, '--tmp_dir', args.tmp_dir
        ])
        print('Done\n')

        print('Appending Amr Soft-Matching Statistics...')
        with open(args.output_json, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for sentence in data:
            for info in sentence['extracted_information']:
                participant_a = info['participant_a']
                participant_b = info['participant_b']
                if not info['amr_path']:
                    info['amr_path'] = '{} _nopath_ {}'.format(
                        participant_a, participant_b)
                    info['amr_soft_match_distance_a'] = -1
                    info['amr_soft_match_distance_b'] = -1
                else:
                    amr_match_a = info['amr_path'].split()[0]
                    amr_match_b = info['amr_path'].split()[-1]
                    info['amr_soft_match_distance_a'] = edit_distance(
                        participant_a, amr_match_a)
                    info['amr_soft_match_distance_b'] = edit_distance(
                        participant_b, amr_match_b)
        with open(args.output_json, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        print('Done')

    if args.use_sdg:
        print('Adding Stanford Dependency Graphs...')
        check_call([
            'python', 'add_sdg.py', '--input_text', pretokenized_input,
            '--input_json', args.output_json, '--output_json',
            args.output_json, '--model', args.sdg_model, '--tmp_dir',
            args.tmp_dir
        ])
        print('Done\n')

        print('Extracting SDG paths...')
        check_call([
            'python', 'append_sdg_paths.py', '--input_json', args.output_json,
            '--output_json', args.output_json
        ])
        print('Done\n')

        print('Appending SDG Soft-Matching Statistics...')
        with open(args.output_json, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for sentence in data:
            for info in sentence['extracted_information']:
                participant_a = info['participant_a']
                participant_b = info['participant_b']
                if not info['sdg_path']:
                    info['sdg_path'] = '{} _nopath_ {}'.format(
                        participant_a, participant_b)
                    info['sdg_soft_match_distance_a'] = -1
                    info['sdg_soft_match_distance_b'] = -1
                else:
                    sdg_match_a = info['sdg_path'].split()[0]
                    sdg_match_b = info['sdg_path'].split()[-1]
                    info['sdg_soft_match_distance_a'] = edit_distance(
                        participant_a, sdg_match_a)
                    info['sdg_soft_match_distance_b'] = edit_distance(
                        participant_b, sdg_match_b)
        with open(args.output_json, 'w', encoding='utf-8') as f:
            json.dump(data, f)

        print('Done')

    # raise Exception("Classifier is not ready!")

    before_classifier = os.path.join(
        args.tmp_dir, '{}.before-classifier.json'.format(basename))
    after_classifier = os.path.join(
        args.tmp_dir, '{}.after-classifier.0.json'.format(basename))
    after_classifier_format_string = os.path.join(
        args.tmp_dir, '{}.after-classifier.{}.json'.format(basename, "{}"))

    print(
        "Converting dense JSON to flat JSON: {} ...".format(before_classifier))
    with io.open(args.output_json, encoding='utf-8') as fr:
        dense = json.load(fr)
        flat = {}
        if args.anonymize:
            print("Anonymizing...")
        for sentence in dense:
            for i, pair in enumerate(sentence['extracted_information']):
                id = "{}|{}".format(sentence['id'], i)
                sentence['extracted_information'][i]['id'] = id
                flat[id] = {
                    'text':
                    sentence['text'],
                    'interaction_tuple': [
                        pair['interaction_type'], '', pair['participant_a'],
                        pair['participant_b']
                    ],
                    'label':
                    1 if pair['label'] != 0 else 0  # TODO: -1s
                }
                if '_sym_of' in pair:
                    flat[id]['_sym_of'] = "{}|{}".format(
                        sentence['id'], pair['_sym_of'])

                tokenized_text = None
                if args.anonymize:
                    placeholder_a = '__participant_a__'
                    placeholder_b = '__participant_b__'

                    flat[id]['interaction_tuple'][2] = placeholder_a
                    flat[id]['interaction_tuple'][3] = placeholder_b
                    if not args.use_sdg and not args.use_amr:
                        raise NotImplementedError(
                            'Anonymization for this setting is not implemented')
                    if args.use_sdg:
                        sdg_match_a = pair['sdg_path'].split()[0]
                        sdg_match_b = pair['sdg_path'].split()[-1]
                        pair['sdg_path'] = pair['sdg_path'].replace(
                            sdg_match_a, placeholder_a)
                        pair['sdg_path'] = pair['sdg_path'].replace(
                            sdg_match_b, placeholder_b)
                        tokenized_text = sentence['tokenized_text']
                        tokenized_text = [
                            placeholder_a if word == sdg_match_a else word
                            for word in tokenized_text
                        ]
                        tokenized_text = [
                            placeholder_b if word == sdg_match_b else word
                            for word in tokenized_text
                        ]

                        # sdg = sentence['sdg'].replace(sdg_match_a,
                        #                               placeholder_a)
                        # sdg = sdg.replace(sdg_match_b,
                        #                   placeholder_b)

                    if args.use_amr:
                        amr_match_a = pair['amr_path'].split()[0]
                        amr_match_b = pair['amr_path'].split()[-1]
                        pair['amr_path'] = pair['amr_path'].replace(
                            amr_match_a, placeholder_a)
                        pair['amr_path'] = pair['amr_path'].replace(
                            amr_match_b, placeholder_b)

                    if args.use_sdg:
                        participant_a = sdg_match_a
                        participant_b = sdg_match_b
                    else:
                        participant_a = amr_match_a
                        participant_b = amr_match_b
                    text = flat[id]['text']
                    text = text.replace(participant_a, placeholder_a)
                    text = text.replace(participant_b, placeholder_b)
                    flat[id]['text'] = text

                if 'amr_path' in pair:
                    flat[id]['amr_path'] = pair['amr_path']
                if 'sdg_path' in pair:
                    flat[id]['sdg_path'] = pair['sdg_path']
                if 'tokenized_text' in sentence:
                    if tokenized_text is not None:
                        # custom, anonymized version
                        flat[id]['tokenized_text'] = tokenized_text
                    else:
                        # general version
                        flat[id]['tokenized_text'] = sentence['tokenized_text']
                if 'pos_tags' in sentence:
                    flat[id]['pos_tags'] = sentence['pos_tags']

    flat_json_string = json.dumps(flat, indent=True)

    with io.open(before_classifier, 'w', encoding='utf-8') as fw:
        fw.write(flat_json_string)
    print("Done!")

    print('Detecting true interactions using {} ...'.format(
        args.classifier_type))
    if args.classifier_type == "RelationClassification":
        for i, (model, processor) in enumerate(
                zip(args.classifier_model, args.classifier_preprocessor)):
            print('Running model number {}'.format(i))
            print('Model filepath: {}'.format(model))
            check_call([
                'python2',
                'predict.py',
                '--input_path',
                before_classifier,
                '--output_path',
                after_classifier_format_string.format(i),
                '--processor_path',
                processor,
                '--model_path',
                model,
            ],
                       cwd='submodules/RelationClassification/')
    elif args.classifier_type == "fasttext":
        # TODO: this does not support multiple models!
        # TODO: this is pretty ugly.
        # Preprocessing and postprocessing for fasttext and RelClass should be at the same level
        before_fasttext = os.path.join(
            args.tmp_dir, '{}.before-fasttext.txt'.format(basename))
        fasttext_keys = []

        with io.open(before_fasttext, 'w', encoding='utf-8') as fw:
            for k, v in flat.items():
                fw.write("{}\n".format(v['text']))
                fasttext_keys.append(k)

        fasttext_output = check_output([
            'fasttext',
            'predict',
            args.classifier_model,
            before_fasttext,
            #after_classifier
        ])
        fasttext_labels = fasttext_output.decode('utf-8').split('\n')

        for i, k in enumerate(fasttext_keys):
            label_string = fasttext_labels[i]
            if not label_string.startswith("__label__"):
                print("Error: invalid label: {}".format(label_string))
            else:
                flat[k]['prediction'] = int(label_string[9:])

        flat_json_string = json.dumps(flat, indent=True)
        with io.open(after_classifier, 'w', encoding='utf-8') as fw:
            fw.write(flat_json_string)

    for after_classifier in sorted(
            glob(after_classifier_format_string.format('*'))):
        print("Reading classifier output from flat JSON: {} ...".format(
            after_classifier))
        with io.open(after_classifier, encoding='utf-8') as fr:
            flat = json.load(fr)
            found = 0
            missing = 0
            for sentence in dense:
                for pair in sentence['extracted_information']:
                    if pair['id'] in flat:
                        if 'predictions' not in pair:
                            pair['predictions'] = []
                        if 'probabilities' not in pair:
                            pair['probabilities'] = []
                        pair['predictions'].append(
                            flat[pair['id']]['prediction'])
                        if 'probabilities' in flat[pair['id']]:
                            pair['probabilities'].append(
                                flat[pair['id']]['probabilities'])
                        found += 1
                    else:
                        missing += 1
            print("{}/{} items did not have predictions in {}".format(
                missing, missing + found, after_classifier))

    # Perform ensembling across the models' predictions

    for sentence in dense:
        for pair in sentence['extracted_information']:
            if args.ensembling_mode == 'majority_vote':
                if sum(pair['predictions']) / len(pair['predictions']) < 0.5:
                    pair['label'] = 0
                else:
                    pair['label'] = 1
            else:  # args.ensembling_mode == 'average'
                prob = np.array(pair['probabilities']).mean(axis=0)
                pair['label'] = int(prob.argmax())

    with io.open(args.output_json, 'w', encoding='utf-8') as fw:
        dense_json_string = json.dumps(dense, indent=True)
        fw.write(dense_json_string)
    print("Done!")
Ejemplo n.º 52
0
def MatchURLTraces(d, m):
    dtraces = sorted(d.traces.iteritems())
    mtraces = sorted(m.traces.iteritems())

    dTrStr = utils.getURLFeatures(dtraces, d)
    mTrStr = utils.getURLFeatures(mtraces, m)

    print "*"*100
    print "Desktop features:", len(dTrStr)
    for tr in dTrStr:
        print '\t',tr[0]
    print "Mobile features:", len(mTrStr)
    for tr in mTrStr:
        print '\t',tr[0]
    print "*"*100

    x,y = len(dTrStr), len(mTrStr)
    data = numpy.zeros(shape=(x,y))

    for i in range(x):
        for j in range(y):
            df, dStr = dTrStr[i]
            mf, mStr = mTrStr[j]
            dist = distance.edit_distance(dStr,mStr)
            data[i,j] = dist

            ## threshold the distance relative to the longer of the two strings
            t = max(len(dStr), len(mStr))
            if dist < URL_THRESH * t:
                print "++",
                data[i,j] = dist
            else:
                print "--",
                data[i,j] = 100000000
            print dist, t, dStr, mStr, df, mf
#             if dist < 1:
#                 print '\t- perfect match', df, dStr, mf, mStr

#     exit()

    print data

    N = max(x,y)
    print "Resizing", x, y, " to ", N
    #data.resize((N,N))

    matrix = numpy.ones(shape=(N,N)) * 100000000
    matrix[:x,:y] = data

    #matrix = numpy.copy(data)

    mwbgm = bipartitematching.Munkres()
    indexes = mwbgm.compute(matrix)
    bipartitematching.print_matrix(matrix, msg='Lowest cost through this matrix:')
    total = 0
    for row, column in indexes:
        if row < x and column < y:
            value = data[row][column]
            if value > 10000000:
                continue
            total += value
            print '(%d, %d) -> %d ' % (row, column, value),
            print '==> %s = %s' %(dTrStr[row][0], mTrStr[column][0])
    print 'total cost: %d' % total

    print "*"*100
Ejemplo n.º 53
0
 for i in toktok.tokenize(s):
     if re.match(r_exp_an, i):
         print("REGEXP start test:\t", end="")
         if i.lower() in stopwords:
             r_oraciones.append(i)
             print('Stopwords test OK:', i)
         elif re.match(r_exp_n, i):
             r_oraciones.append(i)
             print('REGEXP-Numbers test OK:', i)
         else:
             print("Search " + str(i) + ": ")
             x = None
             rnk = 3
             for j in datos:
                 if (len(i) + 3) >= len(j):
                     distancia = edit_distance(i.lower(), j)
                     if (distancia < rnk):
                         rnk = distancia
                         if (rnk == 0):
                             x = j
                             print(i, j,
                                   distancia, rnk, x, str((len(i) + 3)),
                                   str(len(j)), (len(i) + 3) >= len(j))
                             break
                         else:
                             x = j if (rnk < 2) else None
             if x is None:
                 print('\tChanged->No')
                 r_oraciones.append(i)
             else:
                 print('\tChanged->Yes: ' + x)
Ejemplo n.º 54
0
real_words = set(model)

# generate Test Dataset
data = pd.DataFrame(columns=["Correct", "Misspelling"])
f = open("data/misspelling.txt", "r")
j = 0

alphabet = set('abcdefghijklmnopqrstuvwxyz')
for i in tqdm(f):
    # print(j)
    # if j > 50:
    # 	break
    j += 1
    if i[0] == "$":
        correct = i[1:].lower().strip()
    else:
        i = i.lower().strip()
        if not (i in real_words) and not (set(i) - alphabet) and not (
                set(correct) - alphabet) and 0 < edit_distance(correct,
                                                               i) <= 2:
            data = data.append({
                'Correct': correct,
                'Misspelling': i
            },
                               ignore_index=True)

data.to_csv(path_or_buf='data/testdata.txt',
            sep=' ',
            index=False,
            header=False)
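
# Quick illustration of the filter above (assuming nltk's edit_distance, as
# the loop uses): a pair is kept only when the misspelling is not itself a
# real word, both sides are purely alphabetic, and 0 < edit distance <= 2.
print(edit_distance("receive", "recieve"))  # 2 -> kept
print(edit_distance("receive", "reciev"))   # 3 -> dropped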
Ejemplo n.º 55
0
def get_values(entities, domain):
    _random, bayes_random = {}, {}
    bayes_no_variation, bayes_variation = {}, {}
    siddharthan, deemter = {}, {}

    for _id in entities:
        evaluation = p.load(open(os.path.join(properties.evaluation_dir, _id)))

        for fold in evaluation:
            if fold not in bayes_random:
                _random[fold] = {
                    'y_real': [],
                    'y_pred': [],
                    'string': [],
                    'jaccard': []
                }
                bayes_random[fold] = {
                    'y_real': [],
                    'y_pred': [],
                    'string': [],
                    'jaccard': []
                }
                bayes_no_variation[fold] = {
                    'y_real': [],
                    'y_pred': [],
                    'string': [],
                    'jaccard': []
                }
                bayes_variation[fold] = {
                    'y_real': [],
                    'y_pred': [],
                    'string': [],
                    'jaccard': []
                }
                siddharthan[fold] = {
                    'y_real': [],
                    'y_pred': [],
                    'string': [],
                    'jaccard': []
                }
                deemter[fold] = {
                    'y_real': [],
                    'y_pred': [],
                    'string': [],
                    'jaccard': []
                }

            for item in evaluation[fold]:
                item_domain = get_domain(item['features']['fname'])

                if domain == item_domain or domain == '':
                    string_real = item['real']['reference']
                    string_random = item['random']['reference']
                    string_bayes_random = item['bayes_random']['reference'][0][
                        0]
                    string_bayes_no_variation = item['bayes_no_variation'][
                        'reference'][0][0]
                    string_bayes_variation = item['bayes_variation'][
                        'reference'][0][0]
                    string_siddharthan = item['siddharthan']['reference']
                    string_deemter = item['deemter']['reference']

                    dist_random = edit_distance(string_random, string_real)
                    dist_bayes_random = edit_distance(string_bayes_random,
                                                      string_real)
                    dist_bayes_no_variation = edit_distance(
                        string_bayes_no_variation, string_real)
                    dist_bayes_variation = edit_distance(
                        string_bayes_variation, string_real)
                    dist_siddharthan = edit_distance(string_siddharthan,
                                                     string_real)
                    dist_deemter = edit_distance(string_deemter, string_real)

                    tokens_real = set(nltk.word_tokenize(string_real))
                    tokens_random = set(nltk.word_tokenize(string_random))
                    tokens_bayes_random = set(
                        nltk.word_tokenize(string_bayes_random))
                    tokens_bayes_no_variation = set(
                        nltk.word_tokenize(string_bayes_no_variation))
                    tokens_bayes_variation = set(
                        nltk.word_tokenize(string_bayes_variation))
                    tokens_siddharthan = set(
                        nltk.word_tokenize(string_siddharthan))
                    tokens_deemter = set(nltk.word_tokenize(string_deemter))

                    jaccard_random = jaccard_distance(tokens_random,
                                                      tokens_real)
                    jaccard_bayes_random = jaccard_distance(
                        tokens_bayes_random, tokens_real)
                    jaccard_bayes_no_variation = jaccard_distance(
                        tokens_bayes_no_variation, tokens_real)
                    jaccard_bayes_variation = jaccard_distance(
                        tokens_bayes_variation, tokens_real)
                    jaccard_siddharthan = jaccard_distance(
                        tokens_siddharthan, tokens_real)
                    jaccard_deemter = jaccard_distance(tokens_deemter,
                                                       tokens_real)

                    bayes_random[fold]['y_real'].append(item['real']['label'])
                    bayes_random[fold]['y_pred'].append(
                        item['bayes_random']['label'][0])
                    bayes_random[fold]['string'].append(dist_bayes_random)
                    bayes_random[fold]['jaccard'].append(jaccard_bayes_random)

                    bayes_no_variation[fold]['y_real'].append(
                        item['real']['label'])
                    bayes_no_variation[fold]['y_pred'].append(
                        item['bayes_no_variation']['label'][0])
                    bayes_no_variation[fold]['string'].append(
                        dist_bayes_no_variation)
                    bayes_no_variation[fold]['jaccard'].append(
                        jaccard_bayes_no_variation)

                    bayes_variation[fold]['y_real'].append(
                        item['real']['label'])
                    bayes_variation[fold]['y_pred'].append(
                        item['bayes_variation']['label'][0])
                    bayes_variation[fold]['string'].append(
                        dist_bayes_variation)
                    bayes_variation[fold]['jaccard'].append(
                        jaccard_bayes_variation)

                    _random[fold]['y_real'].append(item['real']['label'])
                    _random[fold]['y_pred'].append(item['random']['label'])
                    _random[fold]['string'].append(dist_random)
                    _random[fold]['jaccard'].append(jaccard_random)

                    siddharthan[fold]['y_real'].append(item['real']['label'])
                    siddharthan[fold]['y_pred'].append(
                        item['siddharthan']['label'])
                    siddharthan[fold]['string'].append(dist_siddharthan)
                    siddharthan[fold]['jaccard'].append(jaccard_siddharthan)

                    deemter[fold]['y_real'].append(item['real']['label'])
                    deemter[fold]['y_pred'].append(item['deemter']['label'])
                    deemter[fold]['string'].append(dist_deemter)
                    deemter[fold]['jaccard'].append(jaccard_deemter)
    return _random, bayes_random, bayes_no_variation, bayes_variation, siddharthan, deemter
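
# Toy illustration of the two metrics combined above: nltk's edit_distance
# works on raw strings, jaccard_distance on token sets (word_tokenize needs
# the nltk 'punkt' data to be available).
import nltk
from nltk.metrics.distance import edit_distance, jaccard_distance

real = "the red house"
pred = "the red housing"
print(edit_distance(pred, real))  # 3 character edits
tokens_real = set(nltk.word_tokenize(real))
tokens_pred = set(nltk.word_tokenize(pred))
print(jaccard_distance(tokens_pred, tokens_real))  # (4 - 2) / 4 = 0.5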
Ejemplo n.º 56
0
def validation(model,
               criterion,
               evaluation_loader,
               converter,
               opt,
               eval_data=None):
    """ validation or evaluation """
    for p in model.parameters():
        p.requires_grad = False

    n_correct = 0
    norm_ED = 0
    max_length = opt.batch_max_length
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    if 'Transformer' in opt.SequenceModeling:
        text_pos = torch.arange(1,
                                max_length + 2,
                                dtype=torch.long,
                                device='cuda').expand(
                                    evaluation_loader.batch_size, -1)

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        # Debug visualization (disabled): it dumps one input image with OpenCV
        # and then exits, which would abort evaluation after the first batch.
        # print(image_tensors.size())
        # img = image_tensors[100].squeeze().mul_(0.5).add_(0.5).mul_(
        #     255).numpy()
        # print(img.shape)
        # cv2.imshow('1', img)
        # cv2.waitKey(0)
        # cv2.destroyAllWindows()
        # exit()
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        with torch.no_grad():
            image = image_tensors.cuda()
            # For max length prediction
            length_for_pred = torch.cuda.IntTensor([opt.batch_max_length] *
                                                   batch_size)
            text_for_pred = torch.cuda.LongTensor(
                batch_size, opt.batch_max_length + 1).fill_(0)

            if 'Transformer' in opt.SequenceModeling:
                text_for_loss, length_for_loss, text_pos_for_loss = converter.encode(
                    labels, opt.batch_max_length)
            elif 'CTC' in opt.Prediction:
                text_for_loss, length_for_loss = converter.encode(labels)
            else:
                text_for_loss, length_for_loss = converter.encode(
                    labels, opt.batch_max_length)

        start_time = time.time()
        if 'Transformer' in opt.SequenceModeling:
            batch_text_pos = text_pos[:batch_size]
            preds = model(image,
                          text_for_pred,
                          is_train=False,
                          tgt_pos=batch_text_pos)
            forward_time = time.time() - start_time
            # print('test pred',preds[0].size(),text_for_loss.shape[1] - 1)
            preds = preds[:, :text_for_loss.shape[1] - 1, :]

            target = text_for_loss[:, 1:]  # without [GO] Symbol
            # print('pred',preds.size(),target.size())
            # print('pred[0]',preds[0],target[0])
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding) then decode index to character
            # print('cost',cost)
            # exit()
            _, preds_index = preds.max(2)
            # print('preds_index',preds_index,length_for_pred)
            # exit()
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)
        elif 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred).log_softmax(2)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for the CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCloss format
            cost = criterion(preds, text_for_loss, preds_size, length_for_loss)

            # Select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_index = preds_index.transpose(1, 0).contiguous().view(-1)
            preds_str = converter.decode(preds_index.data, preds_size.data)

        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)
        print('forward_time', forward_time * 1000, 'ms')
        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy.
        for pred, gt in zip(preds_str, labels):
            if 'Transformer' in opt.SequenceModeling:
                pred = pred[:pred.find('</s>')]
                gt = gt[:gt.find('</s>')]
            elif 'Attn' in opt.Prediction:
                # prune after "end of sentence" token ([s])
                pred = pred[:pred.find('[s]')]
                gt = gt[:gt.find('[s]')]

            if pred == gt:
                n_correct += 1
            if len(gt) == 0:
                norm_ED += 1  # guard against division by zero on empty labels
            else:
                norm_ED += edit_distance(pred, gt) / len(gt)

    accuracy = n_correct / float(length_of_data) * 100

    return (valid_loss_avg.val(), accuracy, norm_ED, preds_str, labels,
            infer_time, length_of_data)
Ejemplo n.º 57
0
def find_correct_case(word, case_mode, structures):
    """Select the best case between a set of already encountered cases

    Parameters:
        word (:func:`str`): Word to correct
        case_mode (int): Choice between lower or upper case (extra value for the undecided case)
        structures (dict): List of structures needed to perform the choice
    Returns:
        :func:`str` - Corrected word
    """
    variations = {
        key: structures["occurence_map"][key]
        for key in structures["altcase"][word.lower()]
    }
    variations = sorted(variations.iteritems(),
                        key=operator.itemgetter(1),
                        reverse=True)

    tmp_vars = []
    if case_mode == 0:  # Upper case spelling
        for var in variations:
            _word = var[0]
            if _word[0].isupper() and sum(char.isupper()
                                          for char in _word) > 2:
                tmp_vars.append(var)

        if len(tmp_vars) == 0:
            tmp_vars = variations
    elif case_mode == 1:  # Lower case with capital initial
        for var in variations:
            _word = var[0]
            if _word[0].isupper() and sum(char.isupper()
                                          for char in _word) <= 2:
                tmp_vars.append(var)

        if len(tmp_vars) == 0:
            tmp_vars = variations
    else:  # case_mode == -1 (no capital letters found)
        tmp_vars = variations

    max_occ = tmp_vars[0][1]
    dist_vars = {
        term: edit_distance(word, term)
        for term, occ in tmp_vars if occ == max_occ
    }

    if len(dist_vars) == 1:
        return dist_vars.keys()[0]

    # Several terms with the max occurrence still exist
    dist_vars = sorted(dist_vars.iteritems(), key=operator.itemgetter(1))

    min_dist = dist_vars[0][1]
    min_dist_vars = [term for term, dist in dist_vars if dist == min_dist]

    if len(min_dist_vars) == 1:
        return min_dist_vars[0]

    # Several terms with the same Levenshtein distance exist
    term_ascii_code = {
        term: [ord(ch) for ch in term]
        for term in min_dist_vars
    }

    for ascii_code in term_ascii_code.values():
        for i in xrange(len(ascii_code)):
            code = ascii_code[i]

            # Non a-zA-Z chars will have a 0 value
            if code < 65 or 90 < code < 97 or code > 122:
                ascii_code[i] = 0

    if case_mode >= 0:
        ascii_val = min(term_ascii_code.values())

        t = [t for t, v in term_ascii_code.items() if v == ascii_val]

        if len(t) > 1:
            raise ValueError("Too many value in final array")

        return t[0]
    else:
        ascii_val = max(term_ascii_code.values())

        t = [t for t, v in term_ascii_code.items() if v == ascii_val]

        if len(t) > 1:
            raise ValueError("Too many value in final array")

        return t[0]
Ejemplo n.º 58
0
if __name__ == '__main__':
    a = ' Herbert Karagan'
    b = 'Karajan hervert wisloW  '
    # their distance is 2

    q = 'hervert'
    w = 'erberth'
    e = 'Hervert'
    # their distance is 3

    y = ' qW rJRs Pin'
    w = 'PiN qw qweqwe lñlqw svkf RHqT'
    j = 'Qw RjrS pInqw'

    serie1 = pd.Series(['qwe we', 'Ana Palacios', 'pedro biescas'])
    serie2 = pd.Series([' we qweR', ' Palacios ewAna ', ' biescas pedro'])

    print(edit_distance(q, w))
    print(edit_distance(q, e))
    print(lev_no_case_sens(q, e))
    print(bow_dist(a, b))
    # print(list(permutations('123', 2)))
    print(bow_dist(y, w))
    print(bow_dist(y, w, case_sens=True))
    print(bow_dist(y, j))
    print(bow_dist(y, j, case_sens=True))

    print(dist_string_to_series('pedro viehscas', serie2))
    print(dist_series_to_series(serie1, serie2))
    print(dist_series_to_series_paralell(serie1, serie2))
Ejemplo n.º 59
0
 def distance(self, first_word, second_word):
     return edit_distance(first_word, second_word)
Ejemplo n.º 60
0
def score_prediction(y_true, y_pred):
    """Score predictions on IAM, using Levenshtein distance
       to calculate the character error rate (CER)

    Parameters
    ----------
    y_true: list
        list of ground truth labels
    y_pred: list
        list of predicted labels

    Returns
    -------
    CER: float
        character error rate
    WER: float
        word error rate
    """
    words_identified = 0
    total_normalized_distance = 0.0

    for i in range(len(y_pred)):
        # exact-match check for the word error rate
        if y_true[i] == y_pred[i]:
            words_identified += 1

        # length-normalized Levenshtein distance for the character error rate
        levenshtein_distance = edit_distance(y_true[i], y_pred[i])
        n_char = np.maximum(len(y_true[i]), len(y_pred[i]))
        total_normalized_distance += levenshtein_distance / n_char

    CER = float(total_normalized_distance) / len(y_true)
    WER = (len(y_pred) - words_identified) / len(y_pred)

    return CER, WER
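
# Example usage on a toy batch: one exact match and one single-character error.
y_true = ["hello", "world"]
y_pred = ["hello", "w0rld"]
cer, wer = score_prediction(y_true, y_pred)
print(cer)  # (0/5 + 1/5) / 2 = 0.1
print(wer)  # 1 of 2 words wrong -> 0.5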