def extract_features(df):
    features = pd.DataFrame()

    print('extracting space-split word sequence features...')

    df['q1_words'] = df.question1.map(space_split)
    df['q2_words'] = df.question2.map(space_split)

    features['str_leven1'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_words, r.q2_words, method=1),
        axis=1)
    features['str_leven2'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_words, r.q2_words, method=2),
        axis=1)
    features['str_jaccard'] = df.apply(
        lambda r: distance.jaccard(r.q1_words, r.q2_words), axis=1)
    #features['str_hamming'] = df.apply(lambda r: distance.hamming(r.q1_words, r.q2_words, normalized=True), axis=1)
    #features['str_sorensen'] = df.apply(lambda r: distance.sorensen(r.question1, r.question2), axis=1)

    print('extracting stemmed word sequence features...')

    df['q1_stems'] = df.question1.map(stem)
    df['q2_stems'] = df.question2.map(stem)

    features['stem_leven1'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_stems, r.q2_stems, method=1),
        axis=1)
    features['stem_leven2'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_stems, r.q2_stems, method=2),
        axis=1)
    features['stem_jaccard'] = df.apply(
        lambda r: distance.jaccard(r.q1_stems, r.q2_stems), axis=1)

    return features.fillna(.0)
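# Usage sketch (not from the original source): a minimal, assumed way to call
# extract_features. space_split and stem are crude stand-ins for the helpers the
# original module defines; pandas and the `distance` package are assumed available.
import pandas as pd
import distance

space_split = lambda s: s.split()                              # assumed helper: whitespace tokenizer
stem = lambda s: [w.lower().rstrip('s') for w in s.split()]    # assumed helper: toy "stemmer"

toy = pd.DataFrame({
    'question1': ["How do I learn Python?", "What is machine learning?"],
    'question2': ["How can I learn Python quickly?", "What is deep learning?"],
})
print(extract_features(toy))   # six distance columns, one row per question pair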
def thelevenstein():
    verbose = 0
    mypath = '~/test'
    ## get the list of all files
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    ## read all the text files up front so each file is read only once
    fileData = [
        ' '.join(open(join(mypath, f), 'r').read().split()[1:])
        for f in onlyfiles
    ]
    ## initialize an empty dataframe
    new_df = pd.DataFrame()
    ## iterate over all file pairs and compute the similarity metric
    i = 0
    for f1 in fileData:
        # print 'currently processing ', onlyfiles[i]
        j = 0
        for f2 in fileData:
            if i <= j:
                new_df.loc[i, j] = distance.nlevenshtein(f1.lower().strip(),
                                                         f2.lower().strip(),
                                                         method=2)
            if verbose and (j % 100 == 0):
                print('currently processing', onlyfiles[i], ' with ',
                      onlyfiles[j])
            j += 1
        i += 1
    new_df.columns = onlyfiles
    new_df.index = onlyfiles
    print('all calculations made. Exporting to csv')
    new_df.to_csv('document_similarity_levenstein_business.csv',
                  encoding='utf-8')
    print('Export to csv done!')
def names_are_similar(db_name, patents_name):
    if patents_name is None: return False
    patents_name = standardize_name(patents_name)
    dist1 = distance.nlevenshtein(db_name, patents_name, method=1)
    dist2 = distance.nlevenshtein(db_name,
                                  patents_name.split(" ")[0],
                                  method=1)
    dist3 = distance.nlevenshtein(db_name.split(" ")[0],
                                  patents_name,
                                  method=1)
    response = sum([
        dist1 < 0.2, dist2 < 0.2, dist3 < 0.2, (dist2 == 0) * 2,
        (dist3 == 0) * 2
    ]) > 1
    if response: print("--Matched:", patents_name)
    return response
def get_pairs(*lists, **options):
	pairs = options.get('pairs', [])
	method = options.get('method', 1)	# method 1 for shortest alignment, 2 for longest
	# cache the result because it is time-consuming
	use_cache = options.get('use_cache', True)
	if use_cache and os.path.exists(CACHE_FILENAME):
			with open(CACHE_FILENAME, 'r') as f:
				cache = f.read().splitlines()
			for line in cache:
				pairs.append(filter(lambda x: x.strip(), map(lambda x: x.strip("' \""), line.split('***'))))
	else:
		for prime in lists[0]:
			pair = [ prime ]
			for minors in lists[1:]:
				# calculate its edit distance to the prime
				distances = map(lambda minor: distance.nlevenshtein(prime, minor, method), minors)
				# get the value whose levenshtein distance to the prime is the minimum
				most_matched = lambda l: minors[ l.index( min(l) ) ] 
				
				candidate = most_matched(distances)
				pair.append(candidate)
			pairs.append(pair)

		with open(CACHE_FILENAME, 'w') as f:
			for pair in pairs:
				f.write('***'.join(pair))	# write to files to cache
				f.write(os.linesep)

	return pairs
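# Quick illustration (an assumption for clarity, not part of the source) of the
# normalization referred to in the `method` comment above: method 1 divides the
# edit distance by the length of the shortest alignment between the sequences,
# method 2 by the length of the longest one.
import distance

print(distance.nlevenshtein("abc", "acd", method=1))   # 2 edits / alignment of length 3 -> ~0.667
print(distance.nlevenshtein("abc", "acd", method=2))   # 2 edits / alignment of length 4 -> 0.5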
def lev(doc1, doc2):
    txt1 = open(doc1).read()
    txt2 = open(doc2).read()
    p = distance.nlevenshtein(txt1.lower().strip(),
                              txt2.lower().strip(),
                              method=2)
    return p
def val(net, test_dataset, criterion, max_iter=2):
    print('Start val')

    for p in crnn.parameters():
        p.requires_grad = False

    net.eval()
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=opt.batchSize,
        num_workers=int(opt.workers),
        sampler=dataset.randomSequentialSampler(test_dataset, opt.batchSize),
        collate_fn=dataset.alignCollate(imgH=opt.imgH,
                                        imgW=opt.imgW,
                                        keep_ratio=opt.keep_ratio))
    val_iter = iter(data_loader)

    i = 0
    n_correct = 0
    loss_avg = utils.averager()
    test_distance = 0
    max_iter = min(max_iter, len(data_loader))
    for i in range(max_iter):
        data = next(val_iter)  # works on both Python 2 and 3 iterators
        i += 1
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        if ifUnicode:
            cpu_texts = [clean_txt(tx.decode('utf-8')) for tx in cpu_texts]
        t, l = converter.encode(cpu_texts)
        utils.loadData(text, t)
        utils.loadData(length, l)

        preds = crnn(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)

        _, preds = preds.max(2)
        # preds = preds.squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred.strip() == target.strip():
                n_correct += 1
            # print(distance.levenshtein(pred.strip(), target.strip()))
            test_distance += distance.nlevenshtein(pred.strip(),
                                                   target.strip(),
                                                   method=2)
    raw_preds = converter.decode(preds.data, preds_size.data,
                                 raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):

        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
    accuracy = n_correct / float(max_iter * opt.batchSize)
    test_distance = test_distance / float(max_iter * opt.batchSize)
    testLoss = loss_avg.val()
    #print('Test loss: %f, accuracy: %f' % (testLoss, accuracy))
    return testLoss, accuracy, test_distance
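# Side note (illustrative assumption, not part of the source): the quantity summed
# into test_distance above is a normalized edit distance between the decoded text
# and the ground truth, i.e. a character-error-rate-like score in [0, 1].
import distance

pred, target = "hel1o world", "hello world"
print(distance.nlevenshtein(pred.strip(), target.strip(), method=2))   # 1 edit over 11 chars -> ~0.09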
def inlevenshtein(seq1, seqs, max_dist=0.1):
    for seq2 in seqs:
        dist1 = distance.levenshtein(seq1, seq2, max_dist=2)
        if dist1 != -1:
            dist2 = distance.nlevenshtein(seq1, seq2)
            if dist2 <= max_dist:
                yield dist2, seq2
    def identify_keywords(self, key_words, headers):
        key_words = [x.lower() for x in key_words]
        headers = [x.lower() for x in headers]
        all_words = []
        all_words.extend(key_words)
        all_words.extend(headers)
        print(all_words)
        features = []
        for column_word in all_words:
            feature_dict = {}
            feature_dict['word'] = column_word
            for word in key_words:
                feature_dict[word] = distance.nlevenshtein(word,
                                                           column_word,
                                                           method=1)
            features.append(feature_dict)

        data_frame = pd.DataFrame(features)
        train_df = data_frame.drop(['word'], axis=1)
        kmeans = KMeans(n_clusters=len(key_words),
                        random_state=0).fit(train_df)
        clusters = kmeans.labels_.tolist()
        duplicates = set([x for x in clusters if clusters.count(x) > 1])

        df = pd.DataFrame({'header': all_words, 'cluster': clusters})
        header_pairs = []
        for duplicate in duplicates:
            header_pairs.append(df['header'][df['cluster'] == duplicate])
        return header_pairs
def find_pairs(*lists):
	pairs = []
	
	cache_file = 'cache.txt'
	if os.path.exists(cache_file):
		with open(cache_file, 'r') as f:
			cache = f.read().splitlines()
		for line in cache:
			pair = filter(lambda x: x.strip(),  line.split('***'))
			if pair:	# to avoid empty list
				pairs.append(pair)
	else:
		with open(cache_file, 'w') as f:
			for prime in lists[0]:
				pair = [ prime ]
				for minors in lists[1:]:
					similarty = map(lambda minor: distance.nlevenshtein(prime, minor, method=2), minors)
					most_matched = lambda l: minors[ l.index( min(l) ) ]
					candidate = most_matched(similarty)
					pair.append(candidate)
				pairs.append(pair)

				f.write('***'.join(pair))
				f.write(os.linesep)


	return pairs
def evaluate_pretain(searcher, voc, test_x, test_y):
    ### Format input sentence as a batch
    # words -> indexes
    x_indexes_batch = [indexesFromSentence(voc, test_x)]
    y_indexes_batch = indexesFromFPs(voc, test_y)
    # y_indexes_batch = indexesFromSentence(voc, test_y)
    # y_indexes_batch = indexesFromSentence(voc, test_y)
    # Create lengths tensor
    lengths = torch.Tensor([len(indexes) for indexes in x_indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(x_indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, 100)
    tokens = tokens[:-1]
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    # print(test_x)
    # print(x_indexes_batch)
    # print([voc.index2word[token] for token in x_indexes_batch[0]])
    # print(test_y)
    # print(decoded_words)
    reference = [decoded_words]
    candidate = [voc.index2word[token] for token in y_indexes_batch][:-1]
    print(test_x)
    print(''.join(decoded_words))
    print(''.join(candidate))
    # print(test_y)
    score = sentence_bleu(reference, candidate)
    dis = 1 - distance.nlevenshtein(decoded_words, candidate)
    print(dis)
    print('-' * 80)
    return score, 1 if dis >= 0.6 else 0
    def levenshtein_long(self, string_one, questions_list, print_flag=False):
        bigger = 1
        frase = ""
        index = -1
        i = 0
        for element in questions_list:

            compare = distance.nlevenshtein(string_one,
                                            element.lower(),
                                            method=2)
            #print "Score: " + str(distance.levenshtein(string_one, element))

            if print_flag:
                print "Normalizado: " + str(compare)
                print "Sentence: " + element
                print "Index number: ", i, "\n"

            if compare < bigger:
                bigger = compare
                frase = element
                index = i
            i += 1

        ans = frase, index
        return ans
def compute_similarity(X):
    """
    Compute similarity matrix with mean of 3 distances
    :param X: List of contracts ssdeep hashes
    :return: Similarity matrix
    """
    jaccard_matrix = pdist(X, lambda x, y: distance.jaccard(x[0], y[0]))
    np.savetxt("../data/jaccard_matrix.csv",
               np.asarray(squareform(jaccard_matrix)),
               delimiter=",")

    sorensen_matrix = pdist(X, lambda x, y: distance.sorensen(x[0], y[0]))
    np.savetxt("../data/sorensen_matrix.csv",
               np.asarray(squareform(sorensen_matrix)),
               delimiter=",")

    # normalized, so that the results can be meaningfully compared
    # method=1 means the shortest alignment between the sequences is taken as factor
    levenshtein_matrix = pdist(
        X, lambda x, y: distance.nlevenshtein(x[0], y[0], method=1))
    np.savetxt("../data/levenshtein_matrix.csv",
               np.asarray(squareform(levenshtein_matrix)),
               delimiter=",")

    mean_matrix = 1 - np.mean(np.array(
        [jaccard_matrix, sorensen_matrix, levenshtein_matrix]),
                              axis=0)
    np.savetxt("../data/similarity_matrix.csv",
               np.asarray(mean_matrix),
               delimiter=",")

    print("Similarity matrix computed.")
    return mean_matrix
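# Per-pair sketch (toy, assumed inputs; not from the source) of the three normalized
# distances that compute_similarity averages: each lies in [0, 1], so their mean does
# too, and 1 - mean is a similarity.
import numpy as np
import distance

h1, h2 = "3:abcdefg:hij", "3:abcxefg:hik"   # hypothetical ssdeep-like hash strings
dists = [distance.jaccard(h1, h2),
         distance.sorensen(h1, h2),
         distance.nlevenshtein(h1, h2, method=1)]
print(1 - np.mean(dists))   # similarity of this single pair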
def get_pairs(*lists, **options):
    pairs = options.get('pairs', [])
    method = options.get('method',
                         1)  # method 1 for shortest alignment, 2 for longest
    # cache the result because it is time-consuming
    use_cache = options.get('use_cache', True)
    if use_cache and os.path.exists(CACHE_FILENAME):
        with open(CACHE_FILENAME, 'r') as f:
            cache = f.read().splitlines()
        for line in cache:
            pairs.append(
                filter(lambda x: x.strip(),
                       map(lambda x: x.strip("' \""), line.split('***'))))
    else:
        for prime in lists[0]:
            pair = [prime]
            for minors in lists[1:]:
                # calculate its edit distance to the prime
                distances = map(
                    lambda minor: distance.nlevenshtein(prime, minor, method),
                    minors)
                # get the value whose levenshtein distance to the prime is the minimum
                most_matched = lambda l: minors[l.index(min(l))]

                candidate = most_matched(distances)
                pair.append(candidate)
            pairs.append(pair)

        with open(CACHE_FILENAME, 'w') as f:
            for pair in pairs:
                f.write('***'.join(pair))  # write to files to cache
                f.write(os.linesep)

    return pairs
def str_levenshtein_1(str1, str2):
    #str1_list = str1.split(' ')
    #str2_list = str2.split(' ')
    res = distance.nlevenshtein(str1, str2, method=1)
    return res
def norm_edist(df):
    id_u = sorted(list(set(df.loc[:, 'id'])))
    srcs_l = []
    for idx in range(0, len(id_u)):
        if idx == len(id_u) - 1:
            break
        else:
            print 'source', id_u[idx]
            id_bool = df.loc[:, 'id'] == id_u[idx]
            src = df.loc[:, 'content'][id_bool]
            src_l = []
            i = 0
            for s in src:
                i += 1
                res_mat = np.zeros((len(id_u) - (idx + 1), 5))
                ii = 0
                for iii in range(idx + 1, len(id_u)):
                    trgt = df.loc[:, 'content'][df.loc[:, 'id'] == id_u[iii]]
                    d = []
                    for t in trgt:
                        d.append(distance.nlevenshtein(s, t, method=1))
                    res_mat[ii, 0] = i
                    res_mat[ii, 1] = id_u[idx]
                    res_mat[ii, 2] = id_u[iii]
                    res_mat[ii, 3] = np.min(d)
                    res_mat[ii, 4] = np.std(d)
                    ii += 1
                src_l.append(res_mat)
                src_mat = np.vstack(src_l)
            srcs_l.append(src_mat)
    srcs_mat = np.vstack(srcs_l)
    return srcs_mat
def str_levenshtein_1(str1, str2):
    str1_list = str1.split(' ')
    str2_list = str2.split(' ')
    res = distance.nlevenshtein(str1, str2, method=1)
    return res
 def calcTitleHashFeats(title1, title2, featVector):
     if title1 is None or title2 is None or title1 == '' or title2 == '':
         featVector.append(1)
         return
     title1 = '%x' % Simhash(get_features(normalize(title1))).value
     title2 = '%x' % Simhash(get_features(normalize(title2))).value
     t2 = distance.nlevenshtein(title1, title2)
     featVector.append(t2)
def get_similar(seq, dismatched, max_norm_distance=0.5):
    measured = [
        distance.nlevenshtein(seq, line[0], method=2) for line in dismatched
    ]
    if measured and min(measured) < max_norm_distance:
        return dismatched.pop(measured.index(min(measured)))
    else:
        return None
 def calcAbstractHashFeats(abstract1, abstract2, featVector):
     if abstract1 is None or abstract2 is None or abstract1 == '' or abstract2 == '':
         featVector.append(1)
         return
     abstract1 = '%x' % Simhash(get_features(abstract1)).value
     abstract2 = '%x' % Simhash(get_features(abstract2)).value
     t2 = distance.nlevenshtein(abstract1, abstract2)
     featVector.append(t2)
def title_similarity_np(row1, row2, method="difflib"):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(row1[1], row2[1], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row1[1], row2[1])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row1[1], row2[1])
    return difflib.SequenceMatcher(None, row1[1], row2[1]).quick_ratio()
def compare_files(similar_fp, base_content):
    similar_content = parser.from_file(similar_fp)['content']
    similar_content = tika_compare_clean(similar_content)

    leven_dist = distance.nlevenshtein(base_content, similar_content)

    if leven_dist <= .001:
        return similar_fp
def train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = []):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()

                temp = [
                        1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
                        lev.jaro(st1,st2),
                        lev.jaro_winkler(st1,st2),
                        lev.ratio(st1,st2),
                        distance.sorensen(st1,st2),
                        jaccard(set(st1),set(st2)),
                        1. - distance.nlevenshtein(st1,st2,method=1),
                        1. - distance.nlevenshtein(st1,st2,method=2),
                        dice_coefficient(st1,st2,lenGram=2),
                        dice_coefficient(st1,st2,lenGram=3),
                        dice_coefficient(st1,st2,lenGram=4),
                        cosineWords(st1,st2,dictTrain,tfidf_matrix_train),
                        cosineBigrams(st1,st2,dictTrainBigrams,tfidf_matrix_trainBigrams,lenGram)
                    ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))


    X = np.array(allTrainX,dtype=float)
    y = np.array(allTrainY,dtype=float)
    clf = svm.LinearSVC(C=1.,dual=False,loss='l2', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1.,dual=False, penalty='l1')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)


    return clf,clf2
def calculate_nlevenshtein(actual: ndarray, predicted: ndarray) -> float:
    distances = []

    for row in range(actual.shape[0]):
        distances.append(
            nlevenshtein(np.array2string(actual[row]),
                         np.array2string(predicted[row])))

    return float(np.mean(distances))
def DL_Distance(str1, str2):
    print(str1, str2)
    print("distance 1: ", distance.nlevenshtein(str1, str2))
    print("distance 2: ", damerau_levenshtein_distance(str1, str2))
    dls = (damerau_levenshtein_distance(str1, str2) /
           max(len(str1), len(str2)))
    print("distance 3: ", dls)

    print("distance 4: ", distance.jaccard(str1, str2))
    def levenshtein(self, other):
        """
        Computes the edit distance between this log and the other one, operating
        on the entire log rather than on name sequences only.
        """
        a = [str(version) for version in self.iter_versions()]
        b = [str(version) for version in other.iter_versions()]

        return nlevenshtein(a, b)
def extract_basic_distance_feat(df):
    ## jaccard coef/dice dist of n-gram
    print "generate jaccard coef and dice dist for n-gram"
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["origsent", "candsent"]
    for stem in ["", "_stem"]:
        for dist in dists:
            for gram in grams:
                for i in range(len(feat_names) - 1):
                    for j in range(i + 1, len(feat_names)):
                        target_name = feat_names[i]
                        obs_name = feat_names[j]
                        df["%s_of_%s_between_%s_%s%s" %
                           (dist, gram, target_name, obs_name, stem)] = list(
                               df.apply(lambda x: compute_dist(
                                   x[target_name + "_" + gram + stem], x[
                                       obs_name + "_" + gram + stem], dist),
                                        axis=1))

    print "generate rest all features"
    gram_ext = [
        "_unigram", "_bigram", "_trigram", "_char_unigram", "_char_bigram",
        "_char_trigram"
    ]
    for stem in ["", "_stem"]:
        for gram in gram_ext:
            df["levenshtein_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.nlevenshtein(
                    x["origsent" + gram + stem],
                    x["candsent" + gram + stem],
                    method=2),
                         axis=1))
            df["sorensen_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.sorensen(
                    x["origsent" + gram + stem], x["candsent" + gram + stem]),
                         axis=1))
            df["cosine_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: cosine(x["origsent" + gram + stem], x[
                    "candsent" + gram + stem]),
                         axis=1))
            df["precision_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem], x["candsent" + gram + stem],
                    x["origsent" + gram + stem]),
                         axis=1))
            df["recall1gram_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem], x["candsent" + gram + stem],
                    x["candsent" + gram + stem]),
                         axis=1))
            df["f1gram_%s%s" % (gram, stem)] = list(
                df.apply(
                    lambda x: fmeasure(x["precision_%s%s" %
                                         (gram, stem)], x["recall1gram_%s%s" %
                                                          (gram, stem)]),
                    axis=1))
def getPairFeatures(session):

  totalTime = 1.0 + (session[-1][QTIME] - session[0][QTIME]).total_seconds()
  for i in range(len(session) - 1):
    for j in range(i + 1, len(session)):
      e1 = session[i]
      e2 = session[j]
      jaccard = 1.0 - distance.jaccard(e1[QUERY].split(), e2[QUERY].split())
      edit = 1.0 - distance.nlevenshtein(e1[QUERY].split(), e2[QUERY].split())
      timeDiff = ((e2[QTIME] - e1[QTIME]).total_seconds()) / totalTime * 1.0
      #normalized distance
      dist = (j - i) * 1.0 / len(session)
      urlMatch = -1
      if CLICKU in e1 and CLICKU in e2:
        urlMatch = 1.0 - distance.nlevenshtein(e1[CLICKU], e2[CLICKU])
      cosine = get_cosine(text_to_vector(e1[QUERY]), text_to_vector(e2[QUERY]))
      edgeScore = .20 * cosine + .20 * jaccard + .20 * edit + .15 * dist + .15 * timeDiff + .10 * urlMatch
      yield i, j, edgeScore, cosine, jaccard, edit, dist, timeDiff, urlMatch
    def levenshtein(self, other):
        """
        Computes the edit distance between this log and the other one, operating
        on the entire log rather than on name sequences only.
        """
        a = [str(version) for version in self.iter_versions()]
        b = [str(version) for version in other.iter_versions()]

        return nlevenshtein(a,b)
def title_similarity_pd(row, method='difflib'):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(
            row["title"], row["title_R"], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row["title"], row["title_R"])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row["title"], row["title_R"])
    return difflib.SequenceMatcher(None, row["title"],
                                   row["title_R"]).quick_ratio()
def is_similar_by_levenstein(first, second, threshold):
    """Check rather two lists are similar by Levenstein similarity metrics up to threshold
    :param first: one of the lists
    :param second: another list
    :param threshold: similarity threshold
    :return: boolean value is elements are similar up to threshold
    """
    if (1 - distance.nlevenshtein(first, second)) >= threshold:
        return True
    return False
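# Usage sketch (assumption): nlevenshtein accepts token lists as well as strings,
# so the check works on event sequences directly.
events_a = ["login", "browse", "checkout"]
events_b = ["login", "browse", "pay"]
print(is_similar_by_levenstein(events_a, events_b, 0.6))   # similarity 1 - 1/3 = ~0.67 >= 0.6 -> True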
def categorize_peptide_distance(annotation1, annotation2):
    #Determining if it is I/L Substitution
    if annotation1.replace("I", "L") == annotation2.replace("I", "L"):
        #I/L Substitution
        return "I/L Substitution"

    annotation1_sequence_only = re.sub(r'[0-9.+-]+', '', annotation1)
    annotation2_sequence_only = re.sub(r'[0-9.+-]+', '', annotation2)

    string_distance = distance.nlevenshtein(annotation1_sequence_only, annotation2_sequence_only, method=1)
    #Detecting Site locatization of PTMs
    if string_distance < 0.01:
        return "PTM Localization"

    hamming_distance = 0

    if len(annotation1_sequence_only) == len(annotation2_sequence_only):
        hamming_distance = distance.hamming(annotation1_sequence_only, annotation2_sequence_only)

        if hamming_distance == 2:
            return "Double Amino Substitution"

        if hamming_distance == 1:
            #Seeing if it is a deamidation
            annotation1_contains_deamidation = False
            annotation2_contains_deamidation = False

            if annotation1.find("+0.984") != -1:
                annotation1_contains_deamidation = True
            if annotation2.find("+0.984") != -1:
                annotation2_contains_deamidation = True

            if annotation1_contains_deamidation != annotation2_contains_deamidation:
                #Probably should check for Q->E
                return "Deamidation"


            #Checking for Q->K Substitution

    #Determining String Distance
    string_distance = distance.nlevenshtein(annotation1, annotation2, method=1)

    return "UNKNOWN"
def accuracy(first, second):
    """Calculates similarity metrics for two lists (the order of parameters doesn't matter)
    :param first: list with predicted events
    :param second: list with true events
    :return: tuple of similarity metrics: normalized Levenshtein, normalized Damerau-Levenshtein, and Jaccard
    """
    n_levenstein = 1 - distance.nlevenshtein(first, second)
    n_damerau_levenshtein = 1 - damerau_levenshtein_distance(
        first, second) / max(len(first), len(second))
    jaccard = 1 - distance.jaccard(first, second)
    return n_levenstein, n_damerau_levenshtein, jaccard
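# Usage sketch (assumptions: damerau_levenshtein_distance comes from
# pyxdameraulevenshtein, inputs may be strings or token lists, Python 3 division).
print(accuracy("abcd", "abdc"))
# nlevenshtein: 2 substitutions over length 4 -> similarity 0.5
# Damerau-Levenshtein: 1 transposition over length 4 -> similarity 0.75
# Jaccard: identical character sets -> similarity 1.0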
def adjusted_similarity(dfp1, dfp2):
    """
    Compute the similarity between two data function items.
    :param dfp1: first data function item
    :param dfp2: second data function item
    :return: similarity score in [0, 1]
    """
    if dfp1 in dfp2 or dfp2 in dfp1:
        return 1

    return 1 - distance.nlevenshtein(dfp1, dfp2, method=1)
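# Usage sketch (assumed inputs): containment short-circuits to 1, otherwise the
# score is 1 minus the normalized edit distance.
print(adjusted_similarity("name", "username"))        # substring -> 1
print(adjusted_similarity("user name", "username"))   # 1 - 1/9 -> ~0.89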
def compareList(l1, l2):
    result = 1000
    for i in l1:
        for j in l2:
            current = distance.nlevenshtein(i, j, method=2)
            if result > current:
                result = current
            if result == 0:
                break

    return result
def levenshtein_similarity(str1,str2):
        '''
        Implements the basic Levenshtein algorithm, providing a similarity measure between two strings:
        1 minus the Levenshtein distance normalised by the length of the longest sequence, giving a score in the 0-1 range.
        '''
        #sim_score=self.load_sim_from_memory(str1, str2)
        #if sim_score is None:
        from distance import nlevenshtein
        dist=nlevenshtein(str1, str2, method=1)
        sim_score= 1 - dist
        
        return sim_score
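# Usage sketch (assumption): with method=1 the edit distance is normalised by the
# longer string, so the score stays within [0, 1].
print(levenshtein_similarity("kitten", "sitting"))   # 1 - 3/7 -> ~0.57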
def compare_word(word1, word2: str) -> float:

    if len(word1) <= 2 or len(word2) <= 2:
        return 1000.0

    if len(word1) == 3 and len(word2) == 3:
        if word1 == word2:
            return 0.001
        else:
            return 1000.0
    else:
        return ds.nlevenshtein(word1, word2, method=1)
def get_features(raw_data):
    fet_data = pd.DataFrame()

    print "extracting count features..."
    fet_data["q_len"] = raw_data["query"].map(word_len)
    fet_data["t_len"] = raw_data["product_title"].map(word_len)
    fet_data["d_len"] = raw_data["product_description"].map(word_len)

    print "extracting basic distance features from q and t..."
    fet_data["nleven1"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=1), axis=1)
    fet_data["nleven2"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=2), axis=1)
    fet_data["sorensen"] = raw_data.apply(lambda x: distance.sorensen(x.q, x.t), axis=1)
    fet_data["jaccard"] = raw_data.apply(lambda x: distance.jaccard(x.q, x.t), axis=1)
    fet_data["ncd"] = raw_data.apply(lambda x: ncd(x.q, x.t), axis=1)

    print "extracting basic distance features from q_ex and t..."
    fet_data["sorensen_ex"] = raw_data.apply(lambda x: distance.sorensen(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["jaccard_ex"] = raw_data.apply(lambda x: distance.jaccard(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["ncd_ex"] = raw_data.apply(lambda x: ncd(get_uniq_words_text(x.q_ex), x.t), axis=1)

    return fet_data
def levenshtein_similarity(str1,str2):
        '''
        Implements the basic Levenshtein algorithm, providing a similarity measure between two strings:
        1 minus the Levenshtein distance normalised by the length of the longest sequence, giving a score in the 0-1 range.
        
        e.g., http://www.pris.net.cn/wp-content/uploads/2013/12/PRIS2013.notebook.pdf
        '''
        #sim_score=self.load_sim_from_memory(str1, str2)
        #if sim_score is None:
        from distance import nlevenshtein
        dist=nlevenshtein(str1, str2, method=1)
                
        sim_score= 1 - dist
        
        return sim_score
def check_text(text, entry=None):
    text_results = []
    text_contexts = set()
    if text != "":
        for keyword in keywords:
            result = nlevenshtein(text, keyword, method=2)
            if result <= 0.4:
                print "Match! " + text + " is close to " + keyword + " (" + str(result) + ")"
                if entry and "phish_detail_url" in entry:
                    text_contexts.add(entry["phish_detail_url"])
                else:
                    text_contexts.add(keyword)
                text_results.append(text)
                break

    return text_results, text_contexts
		def test_edges(scheme_exp,scheme_obs):
			import pystats
			edges_exp=scheme2edges(scheme_exp)
			edges_obs=scheme2edges(scheme_obs)
			#print scheme_exp,scheme_obs

			#return pystats.mean([int(e in edges_obs) for e in set(edges_exp)])
			import distance
			dist=distance.nlevenshtein(scheme_exp, scheme_obs)
			#for s1,s2 in zip(scheme_exp,scheme_obs):
			#	if 0 in [s1,s2] and {s1,s2}!={0}:
			#		dist+=2
			print scheme_exp,'\t',scheme_obs,'\t',dist
			#if scheme_exp==(1,1) and scheme_obs!=(1,1):
			#	dist+=2
			dist = dist * 10**(1/len(scheme_exp))
			return dist
    def cell_difference(self, cell1, cell2):
        """
        return a single value indicating the extent to which cell 1 is like cell 2.
        :param cell1: a list of lines of code
        :param cell2: a list of lines of code
        :return:
        """
        cell1_concatenation = ""
        cell2_concatenation = ""
        for line_in_cell1 in cell1:
            cell1_concatenation += line_in_cell1
        for line_in_cell2 in cell2:
            cell2_concatenation += line_in_cell2

        difference = distance.nlevenshtein(cell1_concatenation, cell2_concatenation)

        return difference
def detect_trend(df):
    with open('shadow_words.txt', 'r') as f:
        shadow_tags = f.read().splitlines()
    with open('bad_list_total.txt', 'r') as f:
        shadow_tags += f.read().splitlines()
    shadow_tags.append('')
    top_tags = df.loc[df[df.shape[1]-1]>5000]
    drop_tags = [x for x in shadow_tags if x in top_tags.index]
    top_tags = top_tags.drop(drop_tags)

    #model = pycast.methods.ExponentialSmoothing(smoothingFactor=0.1,valuesToForecast=1)
    model = pycast.methods.HoltWintersMethod(seasonLength=4)
    forecast = []
    prev_times_s = map(lambda x: (x-prev_times[0]).total_seconds(), prev_times)
    top_tags.fillna(method='pad', inplace=True, axis=0)
    for index in top_tags.index:
        ts = zip(prev_times_s, df.ix[index])
        preds = model.execute(ts)
        pred = preds[-1][1] / preds[-10][1]
        forecast.append((index, pred))
    forecast.sort(key=lambda x: x[1], reverse=True)
    candidates = [x[0] for x in forecast[0:500]]
    candidates.sort(key=len, reverse=True)
    print candidates
    top = []
    for t1 in candidates:
        skip = False
        for t2 in top:
            #print t1, t2
            #print distance.nlevenshtein(t1, t2, method=1)
            if t1 != t2 and distance.nlevenshtein(t1, t2, method=1) < 0.4:
                skip = True
                break
        if skip:
            continue
        top.append(t1)
    top_sorted = []
    for tag in forecast[0:500]:
        if tag[0] in top:
            top_sorted.append(tag[0])
        if len(top_sorted) >= NUM_TOP_TAGS:
            break
    return top_sorted
	def getTags(self,keywords):
		predtags=set([])
		keywordlist = []
		for keyword in keywords:
			keywordlist.append(keyword.split(' '))
################### check if a tag is completely in a keyword#################################
		for wordlist in self.taglist:
			flag=0
			for keyword in keywordlist:
				if(set(wordlist[0])<=set(keyword)):
					flag=1
					break
			if(flag==1):
				predtags.add(self.tags[wordlist[1]])
############################################## step completed###############################		
		for keyword in keywords:
			for tag in self.taglem:
				score = distance.nlevenshtein(tag[0],keyword)
				if(0.2 > score):
					predtags.add(self.tags[tag[1]])
		return set(predtags)
def template_distance(template1, template2):
    return distance.nlevenshtein(
        template1.raw_str.strip().split(),
        template2.raw_str.strip().split()
    )
def stats(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = [],plotX=False):
    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            pass

    dimMatrix = 16
    predict = np.zeros((i+1,dimMatrix))


    clf1,clf2 = train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete=delete)

    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            a = line.rstrip().split("\t")

            ## create same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()

            temp = [
            1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
            lev.jaro(st1,st2),
            lev.jaro_winkler(st1,st2),
            lev.ratio(st1,st2),
            distance.sorensen(st1,st2),
            jaccard(set(st1),set(st2)),
            1. - distance.nlevenshtein(st1,st2,method=1),
            1. - distance.nlevenshtein(st1,st2,method=2),
            dice_coefficient(st1,st2,lenGram=2),
            dice_coefficient(st1,st2,lenGram=3),
            dice_coefficient(st1,st2,lenGram=4),
            cosineWords(st1,st2),
            cosineBigrams(st1,st2)]

            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.

            predict[i,:-3] = temp
            predict[i,-3] = clf1.decision_function(np.array(temp,dtype=float))
            predict[i,-2] = clf2.decision_function(np.array(temp,dtype=float))
            predict[i,-1] = a[-1]


    if plotX:
        labelsM = ["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"]
        f1matrix = np.zeros((100,dimMatrix-1))

        fig = plt.figure()
        fig.set_size_inches(9,6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0,1,100):
            iC += 1
            for j in range(dimMatrix-1):
                t = np.array(predict[:,j])
                if j >= dimMatrix-3:
                    t = (t - np.min(t))/(np.max(t)-np.min(t))
                f1matrix[iC,j] = f1_score(y_pred=t>i ,y_true=predict[:,-1])
        F1scores = []
        for j in range(dimMatrix-1):
            F1scores.append(np.max(f1matrix[:,j]))
            #ax.plot(np.linspace(0,1,100),f1matrix[:,j],label=labelsM[j],color=tableau20[j])
        ax.bar(range(dimMatrix-1),F1scores)
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)

        AUCScores = []
        for j in range(dimMatrix-1):
            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = roc_curve(predict[:,-1], predict[:,j])
            AUCScores.append(auc(fpr, tpr))


            # Plot ROC curve
            ax.plot(fpr, tpr, label=labelsM[j],color=tableau20[j])
            ax.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.0])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Curve')

        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix-1),AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
def search():
    """ search the database for names or passwords """
    admin.list_titulo='' # delete page title
    app.logger.debug('entering search')
    try:
        query_name = request.form['inputName'] or None
        query_password = request.form['inputPassword'] or None
        query_distance = request.form['inputDistance'] or '40'
        query_distance = 1.0 - (float(query_distance)/100.0)
        query_name_len = query_password_len = 0

        app.logger.debug(u"{0} {1}".format(query_name, query_password))

        if query_name is not None:
            query_name = query_name.strip()
            query_name_len = len(query_name)

        if query_password is not None:
            query_password = normalize_passport(query_password)
            query_password_len = len(query_password)

        score = []
        start_time = timeit.default_timer()
        
        if ((query_name is not None) or 
            (query_password is not None)):

            query_st = u'select rowid, word, distance from spell_{0} where word match ? order by distance limit 10' 

            param = ''
            query_spell_ref = ''
            root_filter= '/admin/entity/?flt0_5='
            query_filter_entity = ''

            if query_name is not None:
                query_stc = query_st.format(u'whole_name', query_distance)
                query_spell_ref = 'select entity_id from names where spell_ref=? limit 1'
                param = query_name
            else:
                query_stc = query_st.format(u'passport', query_distance)
                query_spell_ref = 'select entity_id from passports where spell_ref=? limit 1'
                param = query_password

            app.logger.debug(u''+query_stc+' '+param)            
            cursor=apsw_con.cursor()
            cursor2=apsw_con.cursor()

            for rowid, word, distance in cursor.execute(query_stc, (param,)):

                # normalized distance by the longest alignment (method=2); converted to % below
                d = nlevenshtein(param.upper(), word.upper(), method=2)

                app.logger.debug(u'Distance between {0} and {1} is {2}'.format(
                            param,
                            word,
                            d
                            ))


                if d<=query_distance:
                    # find spell reference
                    for rf in cursor2.execute(query_spell_ref, (rowid,)):
                        if (len(query_filter_entity)==0):
                            query_filter_entity=rf[0]
                        else:
                            query_filter_entity+='%2C'+rf[0]    

                    score.append( (rowid, word, (1-d)*100) )

            et = u'Execution time: {0} s'.format(
                    timeit.default_timer() - start_time)

            admin.ent_ctrl.list_titulo=u"Results for {0} with\
             {1}% of similarity. ({2})".format(
                param, (1-query_distance)*100, et
             )
            return redirect(root_filter+query_filter_entity)


            # http://localhost:5000/admin/entity/?flt2_5=EU40%2CUN40

            """
            return render_template(
                'index.html',
                query_name=u"{0}".format(param),
                score=score,
                similarity=(1-query_distance)*100,
                execution_time=et)
            """    
        else:
            return redirect('/admin')

    except Exception, e:
        msg="Rendering error: {0}".format(e)
        app.logger.error(msg)
        return render_template('400.html', msg=msg)
def get_similar(seq, dismatched, max_norm_distance=0.5):
	measured = [ distance.nlevenshtein(seq, line[0], method=2) for line in dismatched ]
	if measured and min(measured) < max_norm_distance:
		return dismatched.pop(measured.index(min(measured)))
	else:
		return None
 def findEditDistance(self, qFeat):
   #print self.query, qFeat.query, distance.nlevenshtein(self.query, qFeat.query,method=1), distance.nlevenshtein(self.query, qFeat.query,method=2)
   edit = 1.0 - distance.nlevenshtein(self.query, qFeat.query, method=1)
   return edit
import sys

import distance
import nltk
import numpy as np
import sklearn.cluster

f = open(sys.argv[1], "r")
read_line  = f.readlines()
words = []	
z=[]
for line in read_line:
	line=line.translate(None,'\n')
	r = line.split(',')
	o = r[0] + r[1]
	z=[]
	z.append(r[0])
	z.append(r[1])
	print nltk.pos_tag(z)
	words.append(o)

lev_similarity = -1*np.array([[distance.nlevenshtein(w1,w2,method=2) for w1 in words] for w2 in words])
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
		 exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
		 y = np.nonzero(affprop.labels_==cluster_id)
		 output = ""
		 for j in y:
		 	for k in j:
		 		output = output+','+words[k]
		 print exemplar,":",output
# cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
#		 cluster_str = ", ".join(cluster)
#		 print(" - *%s:* %s" % (exemplar, cluster_str))

###### The code below gives suggestions for a spell checker ############

import nltk
import distance

#Array dict_words stores the words in the English dictionary
dict_words=nltk.corpus.words.words('en')

#Array to store similar words
sim_words = []

#Array to store similarity values for each word in sim_words array
sim_dist = []


for word in dict_words:
    if distance.nlevenshtein(word, "applicablity", method=1) <= 0.25 :
        sim_words.append(word)
        sim_dist.append(distance.nlevenshtein(word, "applicablity", method=1))


#Minimum value among all the sim_dist values, which represents the highest similarity
min_value = min(sim_dist)

#Extracting the index of the min_value
index_min_val= sim_dist.index(min(sim_dist))

#Suggesting a word for the misspelling
print(sim_words[index_min_val])
def logline_distance(logline1, logline2):
    return distance.nlevenshtein(
        logline1.text.strip().split(),
        logline2.text.strip().split())
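# Usage sketch (hypothetical LogLine stand-in, not from the source): the distance is
# computed over whitespace tokens, so one differing token out of four gives 0.25.
from collections import namedtuple

LogLine = namedtuple("LogLine", "text")
print(logline_distance(LogLine("error at line 10"), LogLine("error at line 42")))   # -> 0.25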