Example #1
def A_star_search(tree, source_word, goal_word):
    # The set of nodes already evaluated
    closedSet = set()

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    openSet = set()
    openSet.add(source_word)

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, cameFrom will eventually contain the
    # most efficient previous step.
    cameFrom = {}

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    gScore = {source_word: 0}

    # For each node, the total cost of getting from the start node to the goal
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    fScore = {source_word: editdistance.distance(source_word, goal_word)}

    while openSet:
        current = find_min(fScore, openSet)  # Pick the word in openSet with the lowest fScore

        # If this word is the one we are looking for then return the reconstructed path
        if current == goal_word:
            return reconstruct_path(cameFrom, current)

        # Update the open/closed sets
        openSet.remove(current)
        closedSet.add(current)

        for new_node in bk_search(tree, current, 1):
            neighbour = new_node[1]  # bk_search yields (distance, word) pairs
            if neighbour in closedSet:  # Skip neighbours that have already been evaluated
                continue
            # The distance from start to a neighbor
            tentative_gScore = gScore.get(current, sys.maxsize) + editdistance.distance(current, neighbour)

            if neighbour not in openSet:  # Discover a new node
                openSet.add(neighbour)
            elif tentative_gScore >= gScore.get(neighbour, sys.maxsize):  # Not a better path
                continue

            # This path is the best until now. Record it!
            cameFrom[neighbour] = current
            gScore[neighbour] = tentative_gScore
            fScore[neighbour] = gScore.get(neighbour) + editdistance.distance(neighbour, goal_word)

    # If we can't find a path between these two words, return a path containing only the source word
    return reconstruct_path(cameFrom, source_word)
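
The routine above leans on three helpers that are not shown here: bk_search (a BK-tree lookup, sketched after Example #23 below), find_min, and reconstruct_path. The latter two could look roughly like this; a minimal sketch of the usual A* bookkeeping, not the original implementation:

import sys

def find_min(fScore, openSet):
    # Word in openSet with the smallest f-score; words with no recorded score count as infinity.
    return min(openSet, key=lambda word: fScore.get(word, sys.maxsize))

def reconstruct_path(cameFrom, current):
    # Follow the cameFrom links back to the source and return the path in source-to-goal order.
    path = [current]
    while current in cameFrom:
        current = cameFrom[current]
        path.append(current)
    return list(reversed(path))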
Example #2
def get_bot_accuracies(bot, scored_qa_pairs=None, min_qa_bot_confidence=.2):
    """ Compare answers from bot to answers in test set

    >>> from qary.skills import glossary_bots
    >>> bot = glossary_bots.Bot()
    >>> scored_qa_pairs = [dict(question='What is RMSE?', answer='Root Mean Square Error', score=.9, topic='ds')]
    >>> get_bot_accuracies(bot=bot, scored_qa_pairs=scored_qa_pairs)[0]['bot_accuracy']
    1.0
    >>> scored_qa_pairs = [dict(question='What is RMSE?', answer='root-mean-sqr-error', score=.9, topic='ds')]
    >>> get_bot_accuracies(bot=bot, scored_qa_pairs=scored_qa_pairs)[0]
    {'question': 'What is RMSE?',
     'answer': 'root-mean-sqr-error',
     'score': 0.9,
     'topic': 'ds',
     'bot_answer': 'Root Mean Square Error',
     'bot_w2v_similarity': 0.64...,
     'bot_ed_distance': 0.52...,
     'bot_ed_distance_low': 0.31...,
     'bot_ed_distance_folded': 0.15...,
     'bot_accuracy': 0.65...}
    """
    if scored_qa_pairs is None:
        scored_qa_pairs = load_qa_dataset()
    if isinstance(scored_qa_pairs, str):
        scored_qa_pairs = load_qa_dataset(scored_qa_pairs)
    validated_qa_pairs = []
    for truth in scored_qa_pairs:
        texts = scrape_wikipedia.find_document_texts(topic=truth['topic'],
                                                     max_results=10)
        replies = []
        for context in texts:
            bot.reset_context(context)
            replies = sorted(bot.reply(truth['question']))  # replies are (confidence, answer) pairs
            if replies and replies[-1][0] > min_qa_bot_confidence:
                break
        replies = replies or [(0, "Sorry, I don't know.")]
        truth['bot_answer'] = replies[-1][1]
        truth['bot_w2v_similarity'] = nlp(truth['bot_answer']).similarity(
            nlp(truth['answer']))
        truth['bot_ed_distance'] = distance(
            truth['answer'], truth['bot_answer']) / len(truth['answer'])
        truth['bot_ed_distance_low'] = distance(
            truth['answer'].lower().strip(),
            truth['bot_answer'].lower().strip()) / len(truth['answer'].strip())
        truth['bot_ed_distance_folded'] = distance(
            fold_characters(truth['answer']),
            fold_characters(truth['bot_answer'])) / len(
                truth['answer'].strip())
        truth['bot_accuracy'] = .5 * truth['bot_w2v_similarity'] + .5 * (
            1 - (truth['bot_ed_distance'] + truth['bot_ed_distance_low'] +
                 truth['bot_ed_distance_folded']) / 3)
        validated_qa_pairs.append(dict(truth))

    return validated_qa_pairs
Example #3
def cer(_pred, _true, norm=True):
    """
    Computes the Character Error Rate, defined as the edit distance.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    _pred, _true = _pred.replace(" ", ""), _true.replace(" ", "")
    if norm:
        l = len(_true) if len(_true) > 0 else 1
        return float(editdistance.distance(_pred, _true)) / l
    else:
        return float(editdistance.distance(_pred, _true))
Example #4
def main():
    with open(json_file) as f:
        label_dict = json.load(f)
    label_dict = {int(key): val for key, val in label_dict.items()}
    actual_df = pd.DataFrame.from_dict(label_dict, orient='index')
    actual_df.reset_index(level=0, inplace=True)
    actual_df.columns = ['label', 'actual_word']

    predicted_df = pd.read_csv(predicted_csv_file)
    predicted_df = predicted_df.loc[predicted_df.groupby('predicted_label')
                                    ['confidence'].idxmax()]
    predicted_df = predicted_df.rename(columns={
        'word': 'predicted_word',
        'predicted_label': 'label'
    })

    combined_df = actual_df.merge(predicted_df, on='label')
    combined_df = combined_df.sort_values(by=['label'])
    combined_df['edit_distance'] = combined_df.apply(
        lambda row: ed.distance(row.actual_word, row.predicted_word), axis=1)

    print("Number of total rows:", combined_df.shape[0])
    print("Number of rows where edit distance is zero:",
          (combined_df.edit_distance == 0).sum())

    combined_df.to_csv("comparison.csv")
Example #5
    def calc_pattern_reliability(self):
        """ calculate the average Edist-distance between the spatial pattern of each core assembly and each of its significant patterns"""

        nCores = self.get_ncores()
        ed_cores_mats = [[]] * nCores
        ed_cores_means = [[]] * nCores
        core_pidx = self.get_core_PatchIdx()

        for c in np.arange(nCores):
            raster = self.get_patterns_raster()[c]
            nPatterns = raster.shape[1]
            ed_mat = np.zeros(nPatterns)
            ed_mat[:] = np.nan

            core_binary = np.zeros(raster.shape[0])
            core_binary[core_pidx[c][0][0]] = 1

            for i in np.arange(nPatterns):
                ed_mat[i] = ed.distance(core_binary.tolist(),
                                        raster[:, i].tolist())

            ed_cores_mats[c] = ed_mat
            ed_cores_means[c] = np.nanmean(ed_mat)

        ed_grandmean = np.nanmean(ed_cores_means)

        return ed_cores_mats, ed_cores_means, ed_grandmean
Example #6
    def evaluate_image(self, gt, pred):
        correct_num = 0
        pred_sum_num = 0
        gt_sum_num = 0

        for gt_text, pred_text in zip(gt, pred):
            pred_num = len(pred_text)
            gt_num = len(gt_text)
            dist = distance(pred_text, gt_text)
            # Characters credited as correct: length of the longer string minus the edit distance
            correct_num += max(pred_num, gt_num) - dist
            pred_sum_num += pred_num
            gt_sum_num += gt_num

        precision = correct_num / pred_sum_num
        recall = correct_num / gt_sum_num
        hmean = 0 if (precision + recall) == 0 else 2.0 * \
                precision * recall / (precision + recall)
        per_sample_metric = {
            'pred_num': pred_sum_num,
            'gt_num': gt_sum_num,
            'correct_num': correct_num,
            'precision': precision,
            'recall': recall,
            'hmean': hmean
        }
        return per_sample_metric
Example #7
 def update(self, preds, targets):
     for i in range(len(preds)):
         targets_list, pred_list = list(
             filter(lambda a: a != 0, targets[i].tolist())), preds[i]
         self.edit_distance += editdistance.distance(
             targets_list, pred_list)
         self.target_length += len(targets_list)
Example #8
def compute_cer(predicts: List[Char], targets: List[Char],
                indistinguish: bool) -> Tuple[torch.Tensor, torch.Tensor]:
    '''
    Calculate CER distance between two strings or two lists of strings

    Params:
    -------
    - predicts: List of predicted characters
    - targets: List of target characters
    - indistinguish: set to True for a case-insensitive comparison, False for case-sensitive

    Returns:
    --------
    - distances: tensor of edit distances, one per prediction/target pair
    - n_references: tensor with the number of characters in each target
    '''
    assert type(predicts) == type(
        targets), 'predicts and targets must be the same type'
    assert len(predicts) == len(
        targets), 'predicts and targets must have the same length'

    if indistinguish:
        predicts = [list(map(str.lower, predict)) for predict in predicts]
        targets = [list(map(str.lower, target)) for target in targets]

    distances = torch.tensor([
        ed.distance(predict, target)
        for predict, target in zip(predicts, targets)
    ])
    num_references = torch.tensor(list(map(len, targets)))
    return distances, num_references
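
A hypothetical call to compute_cer, assuming torch and editdistance (imported as ed) are available and each sample is given as a list of characters:

preds = [list("Hello"), list("wrld")]
targets = [list("hello"), list("world")]
distances, n_references = compute_cer(preds, targets, indistinguish=True)
print(distances)      # tensor([0, 1]): "Hello" matches once lower-cased, "wrld" needs one insertion
print(n_references)   # tensor([5, 5])
print(distances.sum().item() / n_references.sum().item())  # corpus-level CER: 0.1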
Example #9
def error_rate(hyps, targets):
    verbose = 0

    assert len(hyps) == len(targets)
    tot_edits = 0.0
    tot_len = 0.0
    idx = 0
    for h, t in zip(hyps, targets):
        distance = editdistance.distance(np.array(h), np.array(t))

        if verbose > 0:
            # If needed, pass the 'alphabet' as an argument so the sequences can be compared as strings.
            # CHECK: Make sure there are no blanks / class #0 in here.
            print("error_rate() [" + str(idx) + "] hyps:    " +
                  str(tensorList2list(h)))
            print("error_rate() [" + str(idx) + "] targets: " +
                  str(tensorList2list(t)))
            print("error_rate() [" + str(idx) + "] distance: " + str(distance))

        tot_edits += distance
        tot_len += len(t)
        idx += 1
    # end for

    # Compute character error rate (CER) == label error rate (LER)
    cer = (tot_edits * 100.0) / tot_len

    return cer
Example #10
    def _get_most_similar_entity(self, text, entities, keywords, unknown_dist, unknown):
        """
        Every entity is paired with its list of keywords.
        entities = [...]
        keywords = [[], [], ...]
        """
        text  = text.translate(str.maketrans('', '', string.punctuation))
        words = text.split(" ")
        words = [self._morph.parse(word)[0].normal_form for word in words]
        min_dist = 10000000
        min_entity = None
        for entity_keywords, entity in zip(keywords, entities):
            # Skip entities that have no keywords
            if len(entity_keywords) == 0:
                continue
            entity_keywords = [self._morph.parse(keyword)[0].normal_form for keyword in entity_keywords]
            for word in words:
                word = self._morph.parse(word)[0].normal_form
                # Distance from this word to its closest keyword for the entity
                entity_distance = min([editdistance.distance(word, keyword) for keyword in entity_keywords])

                if entity_distance < min_dist:
                    min_entity = entity
                    min_dist = entity_distance

        if min_dist < unknown_dist:
            return min_entity
        else:
            return unknown
Example #11
def replace_suspect_word_to_sentence(word, sent, dis=1):
    sent_pinyin = pypinyin.pinyin(sent, style=pypinyin.TONE3)
    sent_pinyin = [
        i[0][:-1] if i[0][-1] in tone else i[0] for i in sent_pinyin
    ]
    sent_chars = list(sent)
    word_pinyin = pypinyin.pinyin(word, style=pypinyin.TONE3)
    word_pinyin = [
        i[0][:-1] if i[0][-1] in tone else i[0] for i in word_pinyin
    ]
    word_len = len(word_pinyin)
    sent_len = len(sent_pinyin)
    replace_pos = []
    for i in range(sent_len - word_len + 1):
        sent_word = sent_pinyin[i:i + word_len]
        for s in sent_word:
            if len(s) == 0:
                break
            if len(s) == 1:
                if s not in letters:
                    break
        sent_word_edit = ''.join(sent_word)
        word_edit = ''.join(word_pinyin)
        if editdistance.distance(sent_word_edit, word_edit) <= dis:
            replace_pos.append(i)
    for pos in replace_pos:
        sent_chars[pos:pos + word_len] = word
    res = ''.join(sent_chars)
    return res
Example #12
 def compute_distance(self, predict: str, target: str) -> float:
     """
     Compute edit distance between two strings
     """
     distance = ed.distance(predict, target)
     distance = float(distance) / len(target)
     return distance
Example #13
def clean_up(df, threshold, field):
    '''
        Homogenises a field of the data frame by replacing similar entries with a canonical form,
        defined as the most frequent form within that group of similar strings.

        Parameters:
            df              The bottin pandas dataframe to be cleaned.
            threshold       A float (0-1] representing the maximum relative distance for 2 strings to be considered similar
            field           A string containing the name of the field to be cleaned up. The field must be string-valued

        returns:
            clean_df        The cleaned up dataframe
    '''
    clean_df = df.copy()
    series = df[field]
    counts = df[field].value_counts(sort=True)
    unique = series.unique()

    for i, str1 in enumerate(unique):
        for str2 in unique[i+1:]:
            dist = 2*distance(str1, str2)/(len(str1) + len(str2))
            if (dist > 0) and (dist <= threshold):
                if counts[str2] < counts[str1]:
                    canon = str1
                    abr = str2
                else:
                    canon = str2
                    abr = str1
                series.replace(to_replace=abr, value=canon, inplace=True)

    clean_df[field] = series
    return clean_df
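
A minimal, hypothetical run of clean_up, assuming pandas and `from editdistance import distance` are imported in the module that defines it:

import pandas as pd

df = pd.DataFrame({'street': ['St-Denis', 'St Denis', 'St-Denis', 'Sherbrooke']})
cleaned = clean_up(df, threshold=0.3, field='street')
print(cleaned['street'].tolist())
# Expected: ['St-Denis', 'St-Denis', 'St-Denis', 'Sherbrooke']
# 'St Denis' is within the threshold of the more frequent 'St-Denis' and gets replaced.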
Example #14
    def _min_distance_between_texts(self, src: str, target: str) -> int:
        src_norms, _     = self._text_to_normal_forms(src)
        tar_norms, _     = self._text_to_normal_forms(target)
        target_text      = " ".join(tar_norms)
        cur_min_distance = sys.maxsize

        # For each starting word, grow a window of up to self._max_window_size additional normal forms
        for ind, word in enumerate(src_norms):
            cur_text = word
            cur_min_distance = min(editdistance.distance(cur_text, target_text), cur_min_distance)
            for word1 in src_norms[ind+1:ind+1+self._max_window_size]:
                cur_text = cur_text + " " + word1
                cur_min_distance = min(editdistance.distance(cur_text, target_text), cur_min_distance)

        return cur_min_distance

            
Example #15
 def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
     N = preds.shape[0]
     for ind in range(N):
         pred = [_ for _ in preds[ind].tolist() if _ not in self.ignore_tokens]
         target = [_ for _ in targets[ind].tolist() if _ not in self.ignore_tokens]
         distance = editdistance.distance(pred, target)
         error = distance / max(len(pred), len(target))
         self.error = self.error + error
     self.total = self.total + N
Example #16
 def compute_distance(self, predict: str, target: str) -> float:
     """
     Compute edit distance between two strings
     """
     predict = "".join(predict).split(" ")
     target = "".join(target).split(" ")
     distance = ed.distance(predict, target)
     distance = float(distance) / len(target)
     return distance
Example #17
def check_strict(gt, pred):
    # Returns 2 for an exact match, 1 for a near match (edit distance below 30% of the longer string), 0 otherwise.
    if abs(len(gt) - len(pred)) >= 2:
        return 0
    dis = editdistance.distance(gt, pred)
    if dis == 0:
        return 2
    elif dis < max(len(gt), len(pred)) * 0.3:
        return 1
    else:
        return 0
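
A few hypothetical calls to illustrate the thresholds used by check_strict:

print(check_strict("hello", "hello"))  # 2: exact match
print(check_strict("hello", "helo"))   # 1: 1 edit, below 0.3 * max(5, 4)
print(check_strict("hello", "hxyzo"))  # 0: 3 edits is too many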
Example #18
 def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
     for ind in range(preds.shape[0]):
         pred = [
             _ for _ in preds[ind].tolist() if _ not in self.ignore_tokens
         ]
         target = [
             _ for _ in targets[ind].tolist() if _ not in self.ignore_tokens
         ]
         self.dist_leven += editdistance.distance(pred, target)
         self.len_total += max(len(pred), len(target))
Example #19
def cer(_pred, _true, norm=True):
    """
    Computes the Character Error Rate using the `editdistance` library.

    Parameters
    ----------
    _pred : str
        space-separated sentence (prediction)
    _true : str
        space-separated sentence (ground truth)
    norm : bool
        divide by length of ground truth
    """
    _pred, _true = _pred.replace(" ", ""), _true.replace(" ", "")
    if norm:
        l = len(_true) if len(_true) > 0 else 1
        return float(editdistance.distance(_pred, _true)) / l
    else:
        return float(editdistance.distance(_pred, _true))
Example #20
 def _levenshtein_candidates(self, predict_word):
     """Return all dictionary words at the minimum edit distance from predict_word."""
     candidates = list()
     dist = dict()
     for word in self.dict_words:
         dist.update({word: ed.distance(predict_word, word)})
     min_dist = min(dist.items(), key=lambda x: x[1])[1]
     for key, value in dist.items():
         if value == min_dist:
             candidates.append(key)
     return candidates
Example #21
def findWord(parola, insieme):
    """Return (0, parola) for an exact match, otherwise (minimum edit distance, closest word in insieme)."""
    if parola in insieme:
        return 0, parola
    minDistance = sys.maxsize
    nearWord = None
    for word in insieme:
        d = distance(parola, word)
        if minDistance > d:
            minDistance = d
            nearWord = word
    return minDistance, nearWord
Example #22
def test(model,
         pe,
         TestSet,
         max_len,
         make_mask,
         class_num,
         converter,
         error_analysis=True):
    model.eval()
    loader = DataLoader(TestSet,
                        batch_size=250,
                        shuffle=False,
                        num_workers=8,
                        pin_memory=True,
                        drop_last=False)
    CCR = 0.0
    n_wrong = 0  # number of mispredicted samples
    start = time.time()
    for images, labels in loader:
        if error_analysis:
            images_norm = []
            for item in images:
                images_norm.append(norm(item))
            images_norm = torch.stack(images_norm, dim=0)
        else:
            images_norm = images
        texts = predict(model, pe, images_norm, max_len, make_mask, class_num,
                        converter)
        for i, (text, label) in enumerate(zip(texts, labels)):
            text = text[:text.find('[s]')]
            if text != label:
                n_wrong += 1
                if error_analysis:
                    try:
                        name = label + " " + text + ".jpg"
                        ToPIL(
                            images[i]).save("./error_analysis_TwoDAttention/" +
                                            name)
                    except:
                        pass
            try:
                NED = 1 - editdistance.distance(text, label) / max(
                    len(text), len(label))
                CCR += NED
            except:
                pass
    CCR /= TestSet.len
    CCR = round(100 * CCR, 2)
    WCR = (TestSet.len - n_wrong) / TestSet.len
    WCR = round(100 * WCR, 2)
    end = time.time()
    print(f"CCR:{CCR}%;WCR:{WCR}%;time consumed:{time_interval(end-start)}")
    torch.cuda.empty_cache()
    return CCR
Example #23
def tree_insertion(tree, word):
    """Insert a word into a BK-tree, where each node has the form [word, {edit_distance: child_node}]."""
    if len(tree) == 0:
        tree.append([word, {}])
        return
    node = tree[0]
    while node is not None:
        node_word = node[0]
        distance = editdistance.distance(word, node_word)
        parent = node
        # Descend into the child stored at this distance; attach a new leaf if there is none.
        node = node[1].get(distance)
        if not node:
            parent[1][distance] = [word, {}]
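
Example #1 calls a bk_search helper that is not shown on this page. A minimal sketch of such a BK-tree range query, assuming the [word, {distance: child}] node layout used by tree_insertion above and yielding (distance, word) pairs:

import editdistance

def bk_search(tree, query, tolerance):
    """Yield (distance, word) for every word in the BK-tree within `tolerance` edits of `query`."""
    if len(tree) == 0:
        return
    stack = [tree[0]]
    while stack:
        node_word, children = stack.pop()
        d = editdistance.distance(query, node_word)
        if d <= tolerance:
            yield (d, node_word)
        # By the triangle inequality, only subtrees whose distance key lies in
        # [d - tolerance, d + tolerance] can contain matches.
        for child_distance, child in children.items():
            if d - tolerance <= child_distance <= d + tolerance:
                stack.append(child)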
Example #24
def compute_norm_wer(pred_tokens, tgt_tokens, split_token=' '):
    # type: (List[str], List[str], str) -> float
    pred_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(pred_tokens, split_token)
    ]
    tgt_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(tgt_tokens, split_token)
    ]
    wer = ed.distance(pred_words, tgt_words) / len(tgt_words)
    return wer
Example #25
 def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
     N = preds.shape[0]
     for ind in range(N):
         pred = [
             _ for _ in preds[ind].tolist() if _ not in self.ignore_tokens
         ]  # e.g. tensor([0, 2, 1]) -> [2] once the ignore tokens are filtered out
         target = [
             _ for _ in targets[ind].tolist() if _ not in self.ignore_tokens
         ]
         distance = editdistance.distance(
             pred, target)  # number of insertions, deletions, and substitutions between the two sequences
         error = distance / max(len(pred), len(target))
         self.error = self.error + error  # is this initialized to 0?
     self.total = self.total + N
Example #26
    def test_swap(self):
        typos_trans.mode = 'swap'
        x = typos_trans.transform(data_sample, n=1)
        self.assertTrue(1 == len(x))

        for sample in x:
            self.assertTrue(sample.get_text('x') != data_sample.get_text('x'))
            self.assertTrue(editdistance.distance(sample.get_text('x'),
                                                data_sample.get_text('x')) <= 4)

        special_sample = SASample({'x': '', 'y': "negative"})
        self.assertEqual([], typos_trans.transform(special_sample))
        special_sample = SASample({'x': '~!@#$%^7890"\'', 'y': "negative"})
        self.assertEqual(1, len(typos_trans.transform(special_sample)))
Example #27
    def dist(self, o):
        import editdistance

        # replace NaN with 0
        # sd = np.where(self.data == self.nanValue, 0, self.data)
        # od = np.where(o.data == self.nanValue, 0, o.data)

        # neutralize NaNs
        sd = self.data.copy()
        od = o.data.copy()
        sd[sd == self.nanValue] = od[sd == self.nanValue]
        od[od == self.nanValue] = sd[od == self.nanValue]

        return editdistance.distance(str(sd), str(od))
Example #28
def find_match(source_word):
	"""Finds the best match for a source word"""

	min_dist = 100
	# min_dist = len(source_word) * 2
	optimal_words = []

	with open('common_words.txt', 'r') as target_file:
		# FIXME: Runtime of this is O(n^2). Can we improve this?
		for line in target_file:
			target_word = line.rstrip()
			d = distance(source_word, target_word)

			if d == min_dist:
				# Add this word to the list
				optimal_words.append(target_word)

			if d < min_dist:
				min_dist = d
				# re-initialize the list, with only this word as a possible correction
				optimal_words = [target_word]

	return choice(optimal_words)
Example #29
def compute_global_wer(pred_tokens, tgt_tokens, split_token=' '):
    # type: (List[str], List[str], str) -> Tuple[int, int]
    pred_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(pred_tokens, split_token)
    ]
    tgt_words = [
        ''.join(word_tokens)
        for word_tokens in split_by_token(tgt_tokens, split_token)
    ]

    dist = ed.distance(pred_words, tgt_words)
    num_refs = len(tgt_words)
    return dist, num_refs
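
compute_global_wer (and compute_norm_wer in Example #24) rely on a split_by_token helper that is not shown; a plausible version, plus a hypothetical corpus-level aggregation (assumes `import editdistance as ed` as above):

import itertools

def split_by_token(tokens, split_token):
    # Group a flat token sequence into words, dropping the separator tokens themselves.
    return [list(group) for is_sep, group in
            itertools.groupby(tokens, key=lambda t: t == split_token) if not is_sep]

pairs = [compute_global_wer(list("the cat"), list("the cart")),
         compute_global_wer(list("a dog"), list("a dog"))]
total_dist = sum(d for d, _ in pairs)
total_refs = sum(n for _, n in pairs)
print(total_dist / total_refs)  # 1 substituted word out of 4 reference words -> 0.25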
Example #30
def find_string_distances(embeddings, words, phoc_levels):
    '''
    Given `embeddings` (a 2-D array of shape m x n, where m is the number of words and n is the embedding size),
    the array of `words`, and the list of PHOC levels: map each row of `embeddings` back to a word, then return
    the list of edit distances between each mapped word and the corresponding actual word.
    '''
    mapped_words = []
    for i in range(embeddings.shape[0]):
        mapped_word = predict_word_from_embedding(embeddings[i], phoc_levels,
                                                  len(words[i]))
        mapped_words.append(mapped_word)

    distance_array = [
        ed.distance(words[i], mapped_words[i]) for i in range(len(words))
    ]

    return distance_array