Example #1
0
def group_clone_VJ_cdr3(dico_same_VJ, dicoSeq, Clone_threshold):
    """Group sequences sharing a V/J gene pair into clones by CDR3 similarity.

    :param dico_same_VJ: dict mapping a V/J identifier to a list of sequence
        ids that share that V/J pair
    :param dicoSeq: dict mapping a stripped sequence id to a record where
        index 1 holds a gene name (split on "*" below) and index 2 the CDR3
        string
    :param Clone_threshold: minimum similarity (1 - normalized distance) for
        a CDR3 to join an existing sub-group
    :return: dict {VJ_ID: {representative_CDR3: [sequence ids]}}
    """
    VJ_ID_diff_CDR3 = {}
    # NOTE(review): never populated or returned — appears to be dead.
    dicoclone_vj_cdr3 = {}

    for VJ_ID in dico_same_VJ.keys():
        # NOTE(review): assigned but never used.
        sub_sub_group = 0
        #print (VJ_ID,"VJ_ID,VJ_ID")
        VJ_ID_diff_CDR3[VJ_ID] = {}
        for seq in dico_same_VJ[VJ_ID]:
            # candidate sub-groups this CDR3 is similar enough to join:
            # {representative_CDR3: ('+', similarity)}
            sub_gourp_dist = {}
            CDR3_seq = dicoSeq[seq.rstrip()][2]
            print(VJ_ID_diff_CDR3[VJ_ID].keys())
            if len(VJ_ID_diff_CDR3[VJ_ID].keys()) != 0:
                # Compare this CDR3 against every existing representative.
                Sub_gourp = VJ_ID_diff_CDR3[VJ_ID].keys()
                for g in Sub_gourp:
                    print(g, dicoSeq[seq.rstrip()][2])
                    print(
                        "aaaaa",
                        1 - hamming_distance(dicoSeq[seq.rstrip()][2], g) /
                        float(len(g)))
                    # Same length -> Hamming similarity against threshold.
                    if len(dicoSeq[seq.rstrip()][2]) == len(g):
                        if 1 - (hamming_distance(dicoSeq[seq.rstrip()][2], g) /
                                float(len(g))) >= Clone_threshold:
                            print("here!")
                            sub_gourp_dist[g] = ('+', 1 - (hamming_distance(
                                dicoSeq[seq.rstrip()][2], g) / float(len(g))))

                    # Different lengths: only considered when the gene name
                    # (before "*") ends in "6"; then use Levenshtein
                    # normalized by the longer of the two CDR3s.
                    elif dicoSeq[seq.rstrip()][1].split("*")[0][-1] == "6":
                        length = max(len(dicoSeq[seq.rstrip()][2]), len(g))
                        if 1 - (levenshtein_distance(dicoSeq[seq.rstrip(
                        )][2], g) / float(length)) >= Clone_threshold:
                            sub_gourp_dist[g] = (
                                '+', 1 -
                                (levenshtein_distance(dicoSeq[seq.rstrip()][2],
                                                      g) / float(length)))

                if sub_gourp_dist == {}:
                    # No group was similar enough: start a new sub-group
                    # keyed by this CDR3.
                    VJ_ID_diff_CDR3[VJ_ID][CDR3_seq] = [seq]
                else:
                    # Several candidates: pick the one with the best global
                    # pairwise protein alignment score (skbio, gap penalty 25).
                    dist_loc = {}
                    for key in sub_gourp_dist.keys():
                        seqs = [
                            skbio.Protein(CDR3_seq,
                                          metadata={'id': "CDR3_seq"}),
                            skbio.Protein(key, metadata={'id': "key"})
                        ]
                        #print (seqs[0],seqs[1])
                        msa = skbio.alignment.global_pairwise_align_protein(
                            seqs[0], seqs[1], 25)
                        # msa[1] is the alignment score.
                        dist_loc[key] = float(msa[1])
                    print(dist_loc, "dist_loc")
                    best_coressp = (max(dist_loc.items(),
                                        key=operator.itemgetter(1))[0])
                    VJ_ID_diff_CDR3[VJ_ID][best_coressp].append(seq)
            else:
                # First sequence for this V/J pair seeds the first sub-group.
                VJ_ID_diff_CDR3[VJ_ID][CDR3_seq] = [seq]
            print(sub_gourp_dist)
    #print (VJ_ID_diff_CDR3)
    return VJ_ID_diff_CDR3
def find_city(city1, city2):
    """Three-way city comparison.

    Returns 1 when either city is unknown ('U'), 0 when the names are at
    most one edit apart, and -1 otherwise.
    """
    if 'U' in (city1, city2):
        return 1
    return 0 if levenshtein_distance(city1, city2) <= 1 else -1
def find_zipcode(zip1, zip2):
    """Three-way zip-code comparison.

    Returns 1 when either zip is unknown ('U'), 0 when the stringified
    codes are at most one edit apart, and -1 otherwise.
    """
    if 'U' in (zip1, zip2):
        return 1
    return 0 if levenshtein_distance(str(zip1), str(zip2)) <= 1 else -1
    def function_for_remove_duplicates(self,
                                       similar_column='full_name',
                                       threshold=2):
        """
        Function for process data, remove duplicate people records.

        Two rows count as duplicates when the Levenshtein distance between
        their ``similar_column`` values is at most ``threshold`` AND their
        ``date_of_birth`` values are equal.

        :param similar_column: column by which find duplicates in data
        :param threshold: maximum value for returned Levenshtein distance between two samples in data
        :return: data without duplicates in similar_column.
        """
        # Use a set: a row can match several earlier rows, and dropping the
        # same index label twice (as the original per-label drop loop did)
        # raises a KeyError.
        dupl_indexes = set()
        rows_number = self.unique_data.shape[0]
        for i in tqdm(range(rows_number - 1)):
            distances = np.array([
                levenshtein_distance(
                    self.unique_data[similar_column].values[i],
                    self.unique_data[similar_column].values[j])
                for j in range(i + 1, rows_number)
            ])
            # Positions are relative to row i+1; shift back to frame rows.
            matching_indexes = np.where(distances <= threshold)[0] + i + 1

            d_b1 = self.unique_data['date_of_birth'].iloc[i]
            dupl_indexes.update(
                self.unique_data.index[match] for match in matching_indexes
                if self.unique_data['date_of_birth'].iloc[match] == d_b1)

        # One vectorised drop instead of a drop per label.
        return self.unique_data.drop(index=list(dupl_indexes))
Example #5
0
def create_random_pairs(positive_instances, positive_pairs_all_datasets,
                        existing_negatives):
    """Create one random negative concept pair per positive instance.

    :param positive_instances: DataFrame with 'source' and 'target' columns
    :param positive_pairs_all_datasets: known positive pairs, consulted via
        is_existing_pair to avoid recreating them
    :param existing_negatives: already known negative pairs, likewise excluded
    :return: (new_negative_pairs, distances) — the created (label1, label2)
        tuples and the Levenshtein distance of each pair (lower-cased)
    """

    # Fixed seed makes the sampled negatives reproducible across runs.
    random.seed(42)
    # holds the Levenshtein distance of each concept pair
    distances = []

    # tracks already created negative pairs as tuples, i.e. (l1,l2), to avoid duplicate creation
    new_negative_pairs = []

    for i, row in tqdm(positive_instances.iterrows(),
                       total=positive_instances.shape[0]):
        label1 = row['source']

        # initialise random index
        random_index = i

        # make sure that no term pair duplicates or reverse duplicates are created
        # comparing to both positive and negative concept pairs
        # NOTE: label2 is first bound inside the loop body; this is safe only
        # because 'random_index == i' is True on the first test and 'or'
        # short-circuits before label2 would be evaluated.
        while random_index == i or\
            is_existing_pair(positive_pairs_all_datasets, label1, label2) or\
            is_existing_pair(existing_negatives, label1, label2) or\
            (label1, label2) in new_negative_pairs or (label2, label1) in new_negative_pairs\
            or label1.lower() == label2.lower():

            # choose a new random index and source vs target and get a new pairing term

            random_index = random.randint(0, positive_instances.shape[0] - 1)
            source_or_target = random.choice(['source', 'target'])
            label2 = positive_instances.loc[random_index][source_or_target]

        distances.append(levenshtein_distance(label1.lower(), label2.lower()))
        new_negative_pairs.append((label1, label2))

    return new_negative_pairs, distances
def lambda_handler(event, context):
    """Get Levenshtein Distance Lambda Handler

    Computes the Levenshtein distance between event['nombre'] and
    event['x'] and returns it in an API Gateway proxy response.

    Parameters
    ----------
    event: dict, required
        API Gateway Lambda Proxy Input Format

        Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format

    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------
    API Gateway Lambda Proxy Output Format: dict

        Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html
    """
    distance = levenshtein_distance(event['nombre'], event['x'])
    body = json.dumps({"result": distance})
    return {"statusCode": 200, "body": body}
Example #7
0
def levenshtein_distance_norm(str1, str2):
    '''
    Normalized edit distance in [0, 1]; smaller means more similar.
    The denominator is clamped to at least 1 so two empty strings
    yield 0 instead of dividing by zero.
    '''
    longest = max(len(str1), len(str2), 1)
    return levenshtein_distance(str1, str2) / longest
Example #8
0
 def _parse_comorbs_sql(self, comorbs: pd.DataFrame, conflicts: str,
                        edit_threshold: int):
     """Reconcile incoming comorbidity names with the registered keys.

     Each previously unseen name is either merged into / skipped for a
     conflicting existing key (depending on ``conflicts``: "merge",
     "raise", or anything else to warn-and-skip) or registered as a new
     key in the ComorbKey table.

     :param comorbs: DataFrame with a ``comorb_name`` column
     :param conflicts: conflict policy — "merge", "raise", or other (skip)
     :param edit_threshold: edit-distance bound used to flag conflicts
     :return: the (possibly filtered/merged) comorbs DataFrame
     """
     for x in comorbs.comorb_name.unique():
         if x not in self._comorb_keys:
             # NOTE(review): this keeps keys whose edit distance is >= the
             # threshold, i.e. the most DISSIMILAR keys; a "similar" match
             # would normally use <= — confirm intended direction.
             similar = list(
                 filter(
                     lambda k: levenshtein_distance(k, x) >= edit_threshold,
                     self._comorb_keys))
             if len(similar) > 1:
                 err = f"{x} conflicts with more than one comorbidity keys: {similar}"
                 if conflicts == "raise":
                     raise ValueError(err)
                 warn(
                     f"{err}; ignoring conflict, comorbiditiy {x} will be skipped"
                 )
                 comorbs = comorbs[comorbs.comorb_name != x]
             elif len(similar) == 1:
                 err = f"{x} conflicts with existing comorbidity {similar[0]}"
                 if conflicts == "merge":
                     warn(f"{err}; merging conflict with existing value")
                     # Rewrite only the conflicting rows. The original
                     # lambda shadowed the loop variable ``x`` and replaced
                     # EVERY value in the column with similar[0].
                     comorbs.loc[comorbs["comorb_name"] == x,
                                 "comorb_name"] = similar[0]
                 elif conflicts == "raise":
                     raise ValueError(err)
                 else:
                     warn(
                         f"{err}; ignoring conflict, comorbiditiy {x} will be skipped"
                     )
             else:
                 self._comorb_keys.append(x)
                 curr = self._config.db_connection.cursor()
                 # sqlite3 parameters must be a sequence: (x,), not the
                 # bare string (a str would be iterated character-wise).
                 curr.execute(
                     "INSERT INTO ComorbKey (comorb_name) VALUES (?)", (x,))
                 self._config.db_connection.commit()
     return comorbs
    def calc_batch_levenshtein(predicted_word_idx,
                               targets,
                               vocab: Vocabulary,
                               verbose=False):
        """Mean Levenshtein distance between decoded predictions and targets.

        :param predicted_word_idx: tensor of predicted word-index sequences
            (one row per batch element)
        :param targets: tensor of target word-index sequences, same batch size
        :param vocab: vocabulary used to turn word indices back into sentences
        :param verbose: when True, print the first prediction/target pair
            of the batch along with its distance
        :return: mean of the per-sentence Levenshtein distances
        """
        # Move both tensors to CPU and convert to plain lists of index arrays.
        predicted_word_idx = list(predicted_word_idx.cpu().numpy())
        targets = list(targets.cpu().numpy())

        distances = []
        for index, predicted_sentence_word_idx in enumerate(
                predicted_word_idx):
            sentence_to_str = TrainingUtils.word_idx_to_caption_sentence(
                predicted_sentence_word_idx, vocab)
            target_sentence = TrainingUtils.word_idx_to_caption_sentence(
                targets[index], vocab)
            levenshtein_metric = levenshtein_distance(target_sentence,
                                                      sentence_to_str)
            distances.append(levenshtein_metric)

            if verbose:
                print(f'\nPredicted: {sentence_to_str}')
                print(f'Target: {target_sentence}')
                print(f'Levenshtein distance: {levenshtein_metric}\n')
                # Only report the first pair of the batch.
                verbose = False

        return np.mean(distances)
def find_DOB(dob1, dob2):
    """Three-way date-of-birth comparison.

    Returns 1 when either value is unknown ('U'), 0 when the two values
    are at most one edit apart, and -1 otherwise.
    """
    if 'U' in (dob1, dob2):
        return 1
    return 0 if levenshtein_distance(dob1, dob2) <= 1 else -1
Example #11
0
def address(a1, a2):
    """True when two address strings match exactly or, ignoring case,
    differ by at most 3 edits. Falsy inputs never match (unless equal)."""
    if a1 == a2:
        return True
    if not (a1 and a2):
        return False
    return levenshtein_distance(a1.lower(), a2.lower()) <= 3
Example #12
0
def vendor(v1, v2):
    """True when two vendor names are identical, exactly or ignoring case
    (the edit-distance bound of 0 requires an exact lowercase match).
    Falsy inputs never match (unless equal)."""
    if v1 == v2:
        return True
    if not (v1 and v2):
        return False
    return levenshtein_distance(v1.lower(), v2.lower()) <= 0
Example #13
0
def img2txt2(imgName, textFile, outfile):
    """OCR an image, write the recognised words to *outfile*, and print
    character- and word-level error rates against the ground truth.

    :param imgName: path of the image to recognise
    :param textFile: path of the ground-truth text (read via getText)
    :param outfile: path where the recognised words are written
    """
    # read image
    img = cv2.imread(imgName)

    # real text
    realText = ' '.join(getText(textFile))

    # segment character from image
    print("Segmenting image to characters ...")
    segmentedChars = img2Chars(img)

    # loading the model
    model = loadModel(MODEL_NAME)


    
    # Predict each segmented character and accumulate words.
    # assumes segmentedChars[i][2] holds the character crops of word i —
    # TODO confirm against img2Chars.
    f = open(outfile, 'w')
    allText = ""
    for i in tqdm(range(len(segmentedChars))):
        currentWord = ""
        for c in segmentedChars[i][2]:
            prediction = model.predict([prepareCharImg(c)])
            char = chars_decode[prediction[0]]
            currentWord += char
            allText += char
        allText += ' '
        f.write(currentWord)
        f.write(' ')
    f.close()

    # Dump ground truth and prediction together for manual inspection.
    with open("hello.txt", 'w') as f:
        f.write(realText)
        f.write("\n\n\n\n")
        f.write(allText)
    
    # Character-level error rate.
    print("\n\n accuracy:")
    print(len(allText), len(realText))
    print(levenshtein_distance(realText, allText) / len(realText))

    # Word-level error: sum of per-word edit distances.
    # NOTE(review): raises IndexError when fewer words were predicted than
    # exist in the ground truth.
    print("another accuracy")
    realList = realText.split(' ')
    allList = allText.split(' ')
    errors = 0
    for i in range(len(realList)):
        errors += levenshtein_distance(realList[i], allList[i])
    print(errors / len(realText))
Example #14
0
 def _get_similarity_score_levenshtein(self, new_words):
     """Levenshtein-based similarity in [0, 1] between ``self.words`` and
     ``new_words`` (1 = identical). Empty reference words contribute no
     edits; the score normalises by the combined character counts."""
     total_edits = 0
     for idx in range(self.nwords):
         reference = self.words[idx]
         if reference != '':
             total_edits += levenshtein_distance(reference, new_words[idx])
     denominator = sum(self._wordlens) + sum(len(w) for w in new_words)
     return 1 - total_edits / denominator
Example #15
0
def getTranslationNear(msgid_to_search, percent):
    """Return (distance, translation) pairs from the global ``datastore``
    whose msgid is within ``1 + int(percent * len(msgid_to_search))`` edits
    of the query, sorted by ascending distance."""
    limit = 1 + int(percent * len(msgid_to_search))
    candidates = [(levenshtein_distance(msgid, msgid_to_search),
                   datastore[msgid]) for msgid in datastore]
    close_enough = [pair for pair in candidates if pair[0] < limit]
    close_enough.sort(key=lambda pair: pair[0])
    return close_enough
Example #16
0
 def cer(self, s1, s2):
     """
     Computes the Character Error Rate, defined as the edit distance
     between the two sentences after removing all spaces.
     Arguments:
         s1 (string): space-separated sentence
         s2 (string): space-separated sentence
     """
     stripped1 = s1.replace(' ', '')
     stripped2 = s2.replace(' ', '')
     return levenshtein_distance(stripped1, stripped2)
Example #17
0
def products(p1, p2):
    """True when two product dicts describe the same item: identical
    (case-insensitive) name, matching price (via util.floatCompare) and
    equal amount.

    :param p1: dict with 'name', 'price' (string, comma or dot decimal)
        and 'amount'
    :param p2: dict with 'name', 'price' (numeric-parseable) and 'amount'
    """
    n1 = p1['name'].lower()
    n2 = p2['name'].lower()
    if levenshtein_distance(n1, n2) <= 0:
        try:
            price1 = float(p1['price'].replace(',', '.'))
        except Exception:
            # Malformed/missing price means no match. Narrowed from a bare
            # 'except' so KeyboardInterrupt/SystemExit propagate.
            return False
        if util.floatCompare(price1, float(p2['price'])):
            return p1['amount'] == p2['amount']
    return False
    # Removed: an unreachable second 'if levenshtein_distance(...)' block
    # that followed the unconditional 'return False' above.
def get_distance(DataLoaderContainer, y_pred, y):
    """Edit distance between the greedy-decoded prediction and the target.

    y_pred is decoded by taking the argmax over dim 1; both index streams
    are mapped to characters via DataLoaderContainer.index_to_char.
    """
    best_indices = torch.max(y_pred, dim=1)[1]
    decode = DataLoaderContainer.index_to_char
    predicted_text = ''.join(decode[idx]
                             for idx in best_indices.detach().cpu())
    target_text = ''.join(decode[idx] for idx in y.detach().cpu())
    return levenshtein_distance(predicted_text, target_text)
Example #19
0
def evaluate_text(message, goal_text, verbose=VERBOSE):
    """Score *message* against *goal_text*.

    Returns the Levenshtein distance between the message's text and the
    goal as a length-1 tuple (the shape DEAP requires). When *verbose* is
    truthy, each evaluated message is printed with its distance.
    """
    dist = levenshtein_distance(message.get_text(), goal_text)
    if verbose:
        # Output format kept identical: "<message>\t[Distance: <d>]"
        print("{msg!s}\t[Distance: {dst!s}]".format(msg=message, dst=dist))
    return (dist, )  # Length 1 tuple, required by DEAP
Example #20
0
def getNearestWord(word, wordsList):
    """Return the entry of wordsList with the smallest edit distance to
    word. Ties keep the earliest candidate; an empty wordsList returns
    word itself."""
    best, best_dist = word, 1000000000
    for candidate in wordsList:
        candidate_dist = levenshtein_distance(word, candidate)
        if candidate_dist < best_dist:
            best, best_dist = candidate, candidate_dist
    return best
Example #21
0
    def getDistance(self):
        """Compute the Levenshtein distance between the first two lines of
        self.ipFile, store it in self.distance and write it to self.opFile.
        """
        with open(self.ipFile) as f:
            lines = f.readlines()
            print(lines[0])
            print(lines[1])
            self.distance = levenshtein_distance(lines[0].strip(), lines[1].strip())

        # 'with' guarantees the output handle is closed even if the write
        # raises — the original open/close pair leaked it on error.
        with open(f'{self.opFile}', 'w') as fout:
            fout.write(f"{str(self.distance)}")
Example #22
0
def calculateDistance(row1, row2):
    """Mixed-type distance between two rows over the global explVariables.

    Numeric fields add their squared difference; 'Review Title' adds
    1 minus the normalized edit distance; 'titleSentiment' adds 1 when
    equal. Returns the square root of the accumulated total.
    """
    total = 0  # renamed from 'sum', which shadowed the builtin
    for var in explVariables:
        if var not in stringFields:
            total += (row1[var] - row2[var]) ** 2
        elif var == 'Review Title':
            total += (1 - levenshtein_distance(row1[var], row2[var]) /
                      max(len(row1[var]), len(row2[var])))
        elif var == 'titleSentiment' and row1[var] == row2[var]:
            total += 1
    return math.sqrt(total)
Example #23
0
def find_diff(line, input):
    """Return line if any entry of input (starting at index 1) is exactly
    one edit away from it; otherwise the string "None"."""
    for candidate in input[1:]:
        if levenshtein_distance(line, candidate) == 1:
            return line
    return "None"
Example #24
0
def process_message(msg, tabs, min_levenshtein_ratio, test_mode_prefix=False):
    """Translate msg with each table code in tabs; return the first
    translation whose relative edit distance from msg exceeds
    min_levenshtein_ratio (optionally prefixed in test mode), or None."""
    msg_len = len(msg)
    for code in tabs:
        translated = translate(code, msg)
        ratio = levenshtein_distance(msg, translated) / msg_len
        if ratio > min_levenshtein_ratio:
            print(" code=%s ratio=%lf => %s" % (code, ratio, translated))
            if test_mode_prefix:
                translated = test_mode_prefix + translated
            return translated
    return None
Example #25
0
def crossref_is_similar(cr_info, bib_info, max_levenshtein_distance):
    """True when the CrossRef record has a title within
    max_levenshtein_distance edits of the (brace-cleaned) BibTeX title."""
    if not cr_parser.has_title(cr_info):
        return False
    entry_title = cleaner.clean_braces(bib_parser.get_title(bib_info))
    crossref_title = cr_parser.get_title(cr_info)
    return levenshtein_distance(crossref_title,
                                entry_title) <= max_levenshtein_distance
def compare_address(address1, address2):
    """Three-way comparison of two tokenised addresses (lists of words).

    Returns 1 when either side is unknown ('U'), 0 when the accumulated
    edit distance is zero or small relative to the shorter address, and
    -1 otherwise. For the last shared word only the common-length prefixes
    are compared (the trailing word may be truncated).
    """
    if address1 == 'U' or address2 == 'U':
        return 1
    levenshtein_sum = 0
    min_street = min(len(address1), len(address2))
    for x in range(min_street):
        # if we are comparing the last word compare the shortest combination
        if x == min_street - 1:
            min_word_length = min(len(address1[x]), len(address2[x]))
            temp1 = address1[x][:min_word_length]
            temp2 = address2[x][:min_word_length]
            # BUG FIX: the original computed this distance and discarded it.
            levenshtein_sum += levenshtein_distance(temp1, temp2)
        else:
            levenshtein_sum += levenshtein_distance(address1[x], address2[x])
    if levenshtein_sum == 0:
        return 0
    if min_street / levenshtein_sum < 1:
        return -1
    else:
        return 0
Example #27
0
def filter_name(raw_name, list_name):
    """Find the best match for raw_name in list_name by edit distance.

    A candidate is accepted when its Levenshtein distance to raw_name is
    at most 2. Long names (>= 30 chars) get a second pass comparing only
    candidate prefixes of raw_name's length (the name may have been
    truncated upstream), accepted at distance <= 4.

    :param raw_name: the (possibly noisy/truncated) name to resolve
    :param list_name: the reference names to match against
    :return: the matched reference name, or "?" when nothing is close enough
    """
    def _report(stage, color, lds, best):
        # Diagnostic line; output is identical to the original prints.
        print(
            colored(
                "[%d] %s (= %s) min_ld = %d" %
                (stage, raw_name, list_name[lds.index(best)], best), color))

    ld_list = [levenshtein_distance(e, raw_name) for e in list_name]
    min_ld = min(ld_list)
    if min_ld <= 2:
        # Direct hit — close enough without any truncation tricks.
        return list_name[ld_list.index(min_ld)]

    name_length = len(raw_name)
    if name_length < 30:
        _report(1, 'red', ld_list, min_ld)
        return "?"

    _report(1, 'yellow', ld_list, min_ld)
    # the name might be truncated: retry against candidate prefixes
    ld_list = [
        levenshtein_distance(e[0:name_length], raw_name) for e in list_name
    ]
    min_ld = min(ld_list)
    if min_ld > 4:
        _report(2, 'red', ld_list, min_ld)
        return "?"
    _report(2, 'green', ld_list, min_ld)
    return list_name[ld_list.index(min_ld)]
Example #28
0
def calculateErrorRate(groundTruthPath="NewDataset/text/", predictedPath="OutputTextFiles/", statsFile="CER.txt"):
    """Write the per-file and average Character Error Rate (edit distance
    over ground-truth length) of every file in predictedPath to statsFile."""
    with open(statsFile, 'w') as stats:
        files = os.listdir(predictedPath)
        totalError = 0
        for name in files:
            truth = ' '.join(getText(groundTruthPath + name))
            predicted = ' '.join(getText(predictedPath + name))
            fileError = levenshtein_distance(truth, predicted) / len(truth)
            totalError += fileError
            stats.write(f"file: {name}\t\tCER: {fileError}\n")
        totalError /= len(files)
        stats.write(f"Total CER: {totalError}")
Example #29
0
def is_similar(value, strings, settings):
    """
    Checks is a string is similar to one in a set of strings
    :param value:
    :param strings:
    :param settings:
    :return:
    """
    # Lazy generator: settings["s"] is only touched when strings is
    # non-empty, matching the original loop's behavior.
    return any(
        levenshtein_distance(value, s) < (len(value) / settings["s"])
        for s in strings)
Example #30
0
def filtered_similars(wordlist, forbidden):
    """Candidates from similar_list(wordlist), excluding any word that is
    within LEVENSHTEIN_THRESHOLD edits of a forbidden parent or contains
    one as a substring."""
    def _allowed(word):
        return not any(
            levenshtein_distance(parent, word) <= LEVENSHTEIN_THRESHOLD
            or parent in word for parent in forbidden)

    return [(word, dist) for word, dist in similar_list(wordlist)
            if _allowed(word)]
def levenshtein(a, b):
    """Return a similarity score in [0, 1] (1 = identical).

    The length denominator is clamped to at least 1 so that two empty
    strings score 1.0 instead of raising ZeroDivisionError — same guard as
    levenshtein_distance_norm elsewhere in this file.
    """
    d = levenshtein_distance(a, b) / max(len(a), len(b), 1)
    return (1 - d)
Example #32
0
 def _execute(self, str1, str2):
     """Run the base-class bookkeeping, then return the Levenshtein
     distance between the two strings."""
     # Explicit unbound base call keeps whatever LDAlgorithm._execute does;
     # its return value is intentionally ignored.
     LDAlgorithm._execute(self, str1, str2)
     return levenshtein_distance(str1, str2)