def get_cosine_distances(self, tokens, ngrams):
    start_time = time.time()
    cos = Cosine(ngrams)
    distances = np.array([[int(100 * cos.distance(w1, w2)) for w1 in tokens]
                          for w2 in tokens])
    end_time = time.time()
    logging.info("Cosine distances computation time: %s seconds",
                 round(end_time - start_time, 2))
    return distances
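A minimal standalone sketch of the same pairwise computation, assuming the python-string-similarity package (module similarity.cosine); the token list is invented:

import numpy as np
from similarity.cosine import Cosine

tokens = ["apple", "apples", "orange"]   # hypothetical input tokens
cos = Cosine(2)                          # 2-character shingles
distances = np.array([[int(100 * cos.distance(w1, w2)) for w1 in tokens]
                      for w2 in tokens])
print(distances)   # 0 on the diagonal; up to 100 for unrelated tokens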
Example #2
def get_similarity_score():
    cosine = Cosine(2)
    with open('model_res') as fm, open('real_res') as fr:
        f = [line.rstrip('\n') for line in fm]
        g = [line.rstrip('\n') for line in fr]
    ctr = sum(cosine.similarity_profiles(cosine.get_profile(a),
                                         cosine.get_profile(b))
              for a, b in zip(f, g))
    return ctr / len(f)
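To try the scorer without the original result files, two throwaway fixtures with matching line counts can be written first (file names follow the snippet; contents are invented):

# Hypothetical fixture files for get_similarity_score().
with open('model_res', 'w') as fh:
    fh.write('hello world\nfoo bar\n')
with open('real_res', 'w') as fh:
    fh.write('hello world!\nfoo baz\n')

print(get_similarity_score())   # mean per-line cosine similarity in [0, 1]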
Example #3
    def triangulate(self, tweet, loc):

        print('Triangulating: ' + tweet)
        cosine = Cosine(2)
        cos_tweet = cosine.get_profile(tweet)

        with open("clean/clean_rss.txt", "r") as clean_rss:

            for rss in clean_rss:
                rss = rss.split('\n')[0]
                cos_rss = cosine.get_profile(rss)
                cos_result = cosine.similarity_profiles(cos_tweet, cos_rss)

                if cos_result > 0.7:
                    print('\t[PASS: ' + str(cos_result) + '] ' + rss)
                    return True
                else:
                    print('\t[FAIL: ' + str(cos_result) + '] ' + rss)

        with open("clean/clean_retweet.txt", "r") as clean_rt:

            for rtweet in clean_rt:
                rt = rtweet.rsplit(' ', 1)[0]
                rt_loc = rtweet.split('\n')[0].rsplit(' ', 1)[1]
                cos_rt = cosine.get_profile(rt)

                if loc == rt_loc:
                    cos_result = cosine.similarity_profiles(cos_tweet, cos_rt)
                    if cos_result > 0.7:
                        print('\t[PASS: ' + str(cos_result) + '] ' + rt)
                        return True
                    else:
                        print('\t[FAIL: ' + str(cos_result) + '] ' + rt)

        with open('clean/clean_tweet.txt', 'r') as clean_tweet:

            for ctweet in clean_tweet:
                ct = ctweet.rsplit(' ', 1)[0]
                ct_loc = ctweet.split('\n')[0].rsplit(' ', 1)[1]
                cos_ct = cosine.get_profile(ct)

                if loc == ct_loc:
                    cos_result = cosine.similarity_profiles(cos_tweet, cos_ct)
                    if cos_result > 0.7 and cos_result != 1.0:
                        print('\t[PASS: ' + str(cos_result) + '] ' + ct)
                        return True
                    else:
                        print('\t[FAIL: ' + str(cos_result) + '] ' + ct)

        print('\tNo matching results found...')
        return False
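Exercising triangulate() needs the three cleaned text files in place, with retweet/tweet lines ending in a location token (inferred from the rsplit parsing above); a minimal fixture with invented contents could look like:

import os

# Hypothetical inputs matching the formats triangulate() parses.
os.makedirs('clean', exist_ok=True)
with open('clean/clean_rss.txt', 'w') as fh:
    fh.write('storm warning issued for the coast\n')
with open('clean/clean_retweet.txt', 'w') as fh:
    fh.write('storm warning on the coast London\n')
with open('clean/clean_tweet.txt', 'w') as fh:
    fh.write('heavy rain expected tonight London\n')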
Example #4
def choose_the_best_clause(union, question):
    if USE_COSINE:
        cos = Cosine(1)
        article_and_clause_no = []
        max_cosine = 0
        for ref in union:
            article, clause = ref[1]
            clause = corpus[article]['clauses'][clause]['text']
            cur_cos = cos.similarity(clause, question)
            if max_cosine < cur_cos:
                max_cosine = cur_cos
                article_and_clause_no = ref[1]
        return article_and_clause_no
    else:
        union.sort(key=lambda x: x[0], reverse=True)
        return union[0][1]
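The function reads module-level USE_COSINE and corpus globals and expects union entries shaped like (score, (article, clause_no)); a toy invocation under those assumptions:

# Hypothetical globals matching the shapes the function reads.
USE_COSINE = True
corpus = {
    'art1': {'clauses': {0: {'text': 'payment is due in thirty days'}}},
    'art2': {'clauses': {0: {'text': 'either party may terminate'}}},
}
union = [(3, ('art1', 0)), (2, ('art2', 0))]

print(choose_the_best_clause(union, 'when is payment due?'))
# -> the (article, clause_no) pair whose clause text is most cosine-similar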
Example #5
def match(num):
    stop_plate = [0, "", "0"]
    suspected = ''
    from similarity.cosine import Cosine
    cosine = Cosine(2)
    s0 = str(num)
    p0 = cosine.get_profile(s0)   # profile of the query plate, computed once
    with open('lic.csv') as fh:
        for s1 in fh:
            s1 = s1.replace("\n", "")
            if s1 in stop_plate:
                continue
            print(s1)
            p1 = cosine.get_profile(s1)
            if cosine.similarity_profiles(p0, p1) >= 0.65:
                suspected += s1 + " "
    return suspected
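A quick way to exercise match() is to write a throwaway lic.csv first (plate values are invented):

# Hypothetical plate file; rows listed in stop_plate are skipped.
with open('lic.csv', 'w') as fh:
    fh.write('ABC123\n0\nABD123\nXYZ999\n')

print(match('ABC128'))   # expected: 'ABC123 ' (2-shingle cosine >= 0.65)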
Example #6
    def title_similarity(self, page):
        try:
            s1 = normalize(self.reference.title)
            s2 = normalize(page.title)
            n = 3
            p1_trigrams = Counter(nltk.ngrams(s1, n))
            p2_trigrams = Counter(nltk.ngrams(s2, n))
            p1_grams = Counter(nltk.ngrams(s1, 1))
            p2_grams = Counter(nltk.ngrams(s2, 1))
            cosine = Cosine(1)
            # Mean of the trigram- and unigram-profile similarities.
            similarity = (cosine.similarity_profiles(p1_trigrams, p2_trigrams)
                          + cosine.similarity_profiles(p1_grams, p2_grams)) / 2

        except ZeroDivisionError:
            similarity = 0
        return similarity
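The intended score is the mean of the trigram- and unigram-profile similarities; on plain strings the same averaging looks like this (lowercasing stands in for the original normalize(), which is an assumption):

from collections import Counter
import nltk
from similarity.cosine import Cosine

def avg_title_similarity(t1, t2):
    s1, s2 = t1.lower(), t2.lower()   # stand-in for normalize()
    cosine = Cosine(1)
    tri = cosine.similarity_profiles(Counter(nltk.ngrams(s1, 3)),
                                     Counter(nltk.ngrams(s2, 3)))
    uni = cosine.similarity_profiles(Counter(nltk.ngrams(s1, 1)),
                                     Counter(nltk.ngrams(s2, 1)))
    return (tri + uni) / 2

print(avg_title_similarity('Deep Learning', 'Deep learning'))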
Example #7
def footer(df, kgram=2, TOP_LINES=5):

    df['isFooter'] = False
    pgs = df['page'].unique()

    cosine = Cosine(kgram)
    for pg in pgs[1:]:

        prev_idx = df.index[df['page'] == (pg - 1)]
        pres_idx = df.index[df['page'] == pg]

        for ln in range(TOP_LINES):

            prev_ln = prev_idx[-1 * (ln + 1)]
            pres_ln = pres_idx[-1 * (ln + 1)]

            s0 = df.loc[prev_ln, 'text']
            s1 = df.loc[pres_ln, 'text']

            skip = 0
            if s0.isdigit():
                df.loc[prev_ln, 'isFooter'] = True
                skip = 1

            if s1.isdigit():
                df.loc[pres_ln, 'isFooter'] = True
                skip = 1

            if skip == 1 or len(s0) < kgram or len(s1) < kgram:
                continue

            #print(s0,",", s1)
            p0 = cosine.get_profile(s0)
            p1 = cosine.get_profile(s1)

            sim = cosine.similarity_profiles(p0, p1)
            if sim > 0.9:
                df.loc[prev_ln, 'isFooter'] = True
                df.loc[pres_ln, 'isFooter'] = True
                #print(pg,",", ln, ",", s0,",", s1,",", sim)

    return df
Example #8
def met_cosine(s1, s2, n):
    cosine = Cosine(n)
    p1 = cosine.get_profile(s1)
    p2 = cosine.get_profile(s2)
    val = cosine.similarity_profiles(p1, p2)
    return val
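The helper makes it easy to see how the shingle size n changes the score (strings are invented); larger shingles are stricter, so the value typically drops as n grows:

for n in (1, 2, 3):
    print(n, met_cosine('kitten', 'sitting', n))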
Example #9
    def similarity(self, question, answer):

        # Load the stopword list (one entry per line).
        stopword = self.read_from(folder_path + '上证专用停用词.txt')
        stopwords = [sw.strip('\n').strip(' ') for sw in stopword]
        # print(stopwords)

        # Segment both strings with jieba and drop stopwords.
        meaningful_words1 = [w for w in jieba.cut(str(question)) if w not in stopwords]
        meaningful_words2 = [w for w in jieba.cut(str(answer)) if w not in stopwords]
        s2 = ''.join(meaningful_words1)
        s3 = ''.join(meaningful_words2)
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        # Collect every metric in a fixed order:
        # [cosine sim, cosine dist, Damerau, Jaccard dist, Jaccard sim,
        #  Jaro-Winkler dist, Jaro-Winkler sim, Levenshtein, LCS, MetricLCS,
        #  2-gram, norm. Levenshtein dist, norm. Levenshtein sim, OSA,
        #  QGram, Sorensen-Dice dist, Sorensen-Dice sim, weighted Levenshtein]
        line_sim = [
            a1.similarity(s2, s3), a1.distance(s2, s3),
            b1.distance(s2, s3),
            c1.distance(s2, s3), c1.similarity(s2, s3),
            d1.distance(s2, s3), d1.similarity(s2, s3),
            e1.distance(s2, s3),
            f1.distance(s2, s3),
            g1.distance(s2, s3),
            h1.distance(s2, s3),
            i1.distance(s2, s3), i1.similarity(s2, s3),
            j1.distance(s2, s3),
            k1.distance(s2, s3),
            l1.distance(s2, s3), l1.similarity(s2, s3),
            m1.distance(s2, s3),
        ]

        return line_sim
Example #10
import matplotlib.pyplot as plt
import re
import json
from similarity.jarowinkler import JaroWinkler
from similarity.qgram import QGram
from similarity.cosine import Cosine
from similarity.ngram import NGram
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
import csv

with open('qgram_human_mobility.txt') as f:
    data = json.load(f)

cosine = Cosine(2)

profiles = [cosine.get_profile(row[0].strip('"')) for row in data]

for profile in profiles:
    print(profile)

profile_sim = []
Example #11
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine
lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)
str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))
Example #12
def dashboard_carrer():

    with open('.data/test.csv', 'rb') as f:
        result = chardet.detect(f.read())

    p_test = pd.read_csv(".data/test.csv", encoding=result['encoding'])
    #Encoding
    p_test = p_test.replace('Enjoy', 5)
    p_test = p_test.replace('Slightly Enjoy', 4)
    p_test = p_test.replace('Neutral', 3)
    p_test = p_test.replace('Slightly Disagree', 2)
    p_test = p_test.replace('Strongly Disagree', 1)

    #Realistic Questions
    realistic = p_test[[
        'I like to work on cars', 'I like to build things',
        'I like to take care of animals',
        'I like putting things together or assembling things',
        'I like to cook', 'I am a practical person', 'I like working outdoors'
    ]]
    #Investigative Questions
    investigative = p_test[[
        'I like to do puzzles', 'I like to do experiments', 'I enjoy science',
        'I enjoy trying to figure out how things work',
        'I like to analyze things (problems/situations)',
        'I like working with numbers  or charts', 'I am good at math'
    ]]
    #Artistic Questions
    artistic = p_test[[
        'I am good at working independently',
        'I like to read about art and music', 'I enjoy creative writing',
        'I am a creative person', 'I like to play instruments or sing',
        'I like acting in plays', 'I like to draw'
    ]]
    #Social Questions
    social = p_test[[
        'I like to work in teams', 'I like to teach or train people',
        'I like trying to help people solve their problems',
        'I am interested in healing people',
        'I enjoy learning about other cultures',
        'I like to get into discussions about issues around me',
        'I like helping people'
    ]]
    #Enterprising Questions
    enterprising = p_test[[
        'I am an ambitious person who set goals for myself',
        'I like to try to influence or persuade people',
        'I like selling things', 'I am quick to take on new responsibilities',
        'I would like to start my own business', 'I like to give speeches',
        'I like to lead'
    ]]
    #Conventional Questions
    conventional = p_test[[
        'I like to organize things',
        'I wouldn’t mind working 8 hours per day in an office',
        'I pay attention to details', 'I like to do filing or typing',
        'I am good at keeping records of my work',
        'I would like to work in an office'
    ]]

    #Summing Up
    realistic['R'] = realistic.sum(axis=1)
    investigative['I'] = investigative.sum(axis=1)
    artistic['A'] = artistic.sum(axis=1)
    social['S'] = social.sum(axis=1)
    enterprising['E'] = enterprising.sum(axis=1)
    conventional['C'] = conventional.sum(axis=1)

    code = realistic['R']
    code = code.to_frame()
    code['I'] = investigative['I']
    code['A'] = artistic['A']
    code['S'] = social['S']
    code['E'] = enterprising['E']
    code['C'] = conventional['C']

    n = 3

    # `it` is assumed to be operator.itemgetter; keep the names of the
    # top-n scoring columns per row (`.items()` replaces the removed
    # Series.iteritems()).
    new_d = [
        list(map(it(0),
                 row[1:].sort_values(ascending=False)[:n].items()))
        for _, row in code.iterrows()
    ]

    std = pd.DataFrame(new_d)

    std['code'] = std[0] + std[1] + std[2]

    #std has the test code
    std = std.drop([0, 1, 2], axis=1)

    #Read the course data
    course = pd.read_csv(".data/course.csv")

    df = pd.MultiIndex.from_product(
        [std["code"], course["course_code"], course["Course_short"]],
        names=["code", "course_code", "course"]).to_frame(index=False)

    df = df.dropna()

    #Cosine Similarity
    cosine = Cosine(2)
    df["p0"] = df["code"].apply(lambda s: cosine.get_profile(s))
    df["p1"] = df["course_code"].apply(lambda s: cosine.get_profile(s))
    df["cosine_sim"] = [
        cosine.similarity_profiles(p0, p1)
        for p0, p1 in zip(df["p0"], df["p1"])
    ]
    df.drop(["p0", "p1"], axis=1, inplace=True)

    #Sorting the Values
    top_n = df.sort_values(['cosine_sim'],
                           ascending=False).groupby(df['code'].values).head(3)

    options = top_n["course"].to_numpy()

    # selecting rows based on condition
    rec = course.loc[course['Course_short'].isin(options)]

    recommendations = json.loads(rec.to_json(orient='records'))

    return render_template('./dashboard_carrer.html',
                           title='Dashboard - Carrer',
                           std=std,
                           recommendations=recommendations)
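The matching step boils down to comparing a student's three-letter RIASEC code to each course code with 2-shingle cosine similarity; in isolation it looks like this (codes are invented):

cosine = Cosine(2)
p_student = cosine.get_profile('RIA')   # hypothetical student code
p_course = cosine.get_profile('RIS')    # hypothetical course code
print(cosine.similarity_profiles(p_student, p_course))   # shared 'RI' shingle -> 0.5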
Example #13
    def cosine(self, s0, s1):
        cosine = Cosine(15)
        p0 = cosine.get_profile(s0)
        p1 = cosine.get_profile(s1)
        # print('Cosine similarity "%s" vs "%s"' % (s0, s1))
        return cosine.similarity_profiles(p0, p1)
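With a shingle size of 15, any string shorter than 15 characters produces an empty profile (and a zero or undefined similarity), so this wrapper only makes sense for long inputs; a quick check with invented sentences:

c = Cosine(15)
a = 'the quick brown fox jumps over the lazy dog'
b = 'the quick brown fox leaps over the lazy dog'
print(c.similarity_profiles(c.get_profile(a), c.get_profile(b)))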
Example #14
# Imports needed by the snippet below; the similarity.* module paths follow
# the python-string-similarity package layout used in the other examples.
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.damerau import Damerau
from similarity.optimal_string_alignment import OptimalStringAlignment
from similarity.jarowinkler import JaroWinkler
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram
from similarity.qgram import QGram
from similarity.sorensen_dice import SorensenDice
from similarity.cosine import Cosine
from similarity.jaccard import Jaccard
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

# Initialize at import time
levenshtein = Levenshtein()
norm_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
ngram = NGram()
qgram = QGram()
dice = SorensenDice()
cos = Cosine(5)
jaccard = Jaccard(5)

similarity_functions = [
    norm_levenshtein.similarity, lambda a, b: 1 - metric_lcs.distance(a, b),
    lambda a, b: 1 - ngram.distance(a, b), cos.similarity, dice.similarity
]


def mono_vector0(tup1, tup2):

    str1 = ' '.join(tup1).lower()
    str2 = ' '.join(tup2).lower()

    simv = list(map(lambda x: x(str1, str2), similarity_functions))