def get_cosine_distances(self, tokens, ngrams):
    """Build the pairwise cosine-distance matrix for ``tokens``.

    Args:
        tokens: Iterable of strings; it is traversed once per row and once
            per column, so it must be re-iterable (e.g. a list).
        ngrams: Value passed to ``Cosine(...)`` to configure the metric.

    Returns:
        A NumPy array where entry ``[a][b]`` is
        ``int(100 * cos.distance(tokens[b], tokens[a]))``, i.e. the distance
        scaled by 100 and truncated to an integer.
    """
    start_time = time.time()
    cos = Cosine(ngrams)
    distances = np.array(
        [[int(100 * cos.distance(w1, w2)) for w1 in tokens] for w2 in tokens]
    )
    end_time = time.time()
    # The original line fused the timestamp assignment with the arguments of
    # an output call whose name was lost; print() is assumed here.
    print("Cosine distances computation time: " + str(round(end_time - start_time, 2)) + " seconds")
    return distances
# Example 2
def get_similarity_score():
    """Return the mean line-by-line cosine similarity of two result files.

    Line ``i`` of ``model_res`` is compared with line ``i`` of ``real_res``
    using ``Cosine(2)`` profiles, after removing newline characters.

    Returns:
        The sum of the per-line similarities divided by the number of lines
        in ``model_res``; ``0.0`` when ``model_res`` is empty.

    Raises:
        IndexError: If ``real_res`` has fewer lines than ``model_res``.
    """
    cosine = Cosine(2)
    # Context managers close both files; the original leaked the handles.
    with open('model_res') as model_file, open('real_res') as real_file:
        model_lines = model_file.readlines()
        real_lines = real_file.readlines()

    if not model_lines:
        # Avoid dividing by zero when there is nothing to score.
        return 0.0
    if len(real_lines) < len(model_lines):
        raise IndexError("'real_res' has fewer lines than 'model_res'")

    total = 0
    for model_line, real_line in zip(model_lines, real_lines):
        total += cosine.similarity_profiles(
            cosine.get_profile(model_line.replace("\n", "")),
            cosine.get_profile(real_line.replace("\n", "")),
        )
    return total / len(model_lines)
# Example 3
def choose_the_best_clause(union, question):
    """Pick the ``(article, clause)`` reference that best answers ``question``.

    Args:
        union: Sequence of candidates; ``ref[0]`` is used as a score and
            ``ref[1]`` is an ``(article, clause)`` pair indexing ``corpus``.
        question: Text compared against each candidate clause.

    Returns:
        When ``USE_COSINE`` is truthy, the ``ref[1]`` whose clause text has
        the highest ``Cosine(1)`` similarity to ``question`` (``[]`` if no
        candidate scores above zero). Otherwise the ``ref[1]`` of the
        candidate with the highest ``ref[0]`` (``[]`` if ``union`` is empty).
    """
    if not USE_COSINE:
        # This fallback used to sit after the ``return`` of the cosine branch,
        # so it was unreachable and the function returned None instead.
        if not union:
            return []
        # max() yields the first maximal item, matching a stable descending
        # sort, without reordering the caller's list.
        return max(union, key=lambda x: x[0])[1]

    cos = Cosine(1)
    article_and_clause_no = []
    max_cosine = 0
    for ref in union:
        article, clause = ref[1]
        clause_text = corpus[article]['clauses'][clause]['text']
        cur_cos = cos.similarity(clause_text, question)
        if max_cosine < cur_cos:
            max_cosine = cur_cos
            article_and_clause_no = ref[1]
    return article_and_clause_no
# Example 4
    def triangulate(self, tweet, loc):
        """Check whether ``tweet`` is corroborated by previously stored text.

        The tweet's ``Cosine(2)`` profile is compared, in order, against
        every line of ``clean/clean_rss.txt``, then against the lines of
        ``clean/clean_retweet.txt`` and ``clean/clean_tweet.txt`` whose last
        space-separated token equals ``loc``. A line passes when its
        similarity exceeds 0.7; for ``clean_tweet.txt`` a similarity of
        exactly 1.0 is additionally rejected.

        Args:
            tweet: Text to verify.
            loc: Location token that retweet/tweet lines must end with.

        Returns:
            ``True`` at the first passing line, ``False`` if none passes.
        """
        print('Triangulating: ' + tweet)
        cosine = Cosine(2)
        cos_tweet = cosine.get_profile(tweet)

        def passes(text, accept_identical=True):
            # Score one candidate and report PASS/FAIL with its similarity.
            # The PASS message was masked in the original and the FAIL message
            # was unreachable after ``return True``; both are restored here.
            cos_result = cosine.similarity_profiles(cos_tweet, cosine.get_profile(text))
            ok = cos_result > 0.7 and (accept_identical or cos_result != 1.0)
            print('\t[' + ('PASS' if ok else 'FAIL') + ': ' + str(cos_result) + '] ' + text)
            return ok

        with open("clean/clean_rss.txt", "r") as clean_rss:
            for rss in clean_rss:
                if passes(rss.split('\n')[0]):
                    return True

        located_sources = (
            ("clean/clean_retweet.txt", True),
            ("clean/clean_tweet.txt", False),
        )
        for path, accept_identical in located_sources:
            with open(path, "r") as located_file:
                for line in located_file:
                    parts = line.split('\n')[0].rsplit(' ', 1)
                    if len(parts) != 2:
                        # No location token on this line; the original raised
                        # IndexError here.
                        continue
                    text, line_loc = parts
                    # Only build a profile for lines from the requested location.
                    if line_loc == loc and passes(text, accept_identical):
                        return True

        print('\tNo matching results found...')
        return False
# Example 5
def match(num):
    """Return the entries of ``lic.csv`` that resemble ``num``.

    Args:
        num: Value to look up; it is converted with ``str`` before comparing.

    Returns:
        A string containing every line of ``lic.csv`` whose ``Cosine(2)``
        profile similarity to ``str(num)`` is at least 0.65, each followed by
        a single space. Lines equal to ``""`` or ``"0"`` are skipped.
    """
    from similarity.cosine import Cosine

    stop_plate = [0, "", "0"]
    cosine = Cosine(2)
    # The query profile does not change, so build it once outside the loop.
    p0 = cosine.get_profile(str(num))

    matches = []
    with open('lic.csv') as plates:
        for line in plates:
            s1 = line.replace("\n", "")
            if s1 in stop_plate:
                # The original ``if`` had no body; placeholder entries are skipped.
                continue
            if cosine.similarity_profiles(p0, cosine.get_profile(s1)) >= 0.65:
                matches.append(s1)
    # Keep the original output format: every match is followed by one space.
    return "".join(plate + " " for plate in matches)
# Example 6
    def title_similarity(self, page):
        """Score how closely ``page.title`` matches ``self.reference.title``.

        Both titles are passed through ``normalize`` and compared twice with
        ``Cosine.similarity_profiles``: once on 3-gram counts and once on
        1-gram counts (built with ``nltk.ngrams``).

        Args:
            page: Object exposing a ``title`` attribute.

        Returns:
            The mean of the two similarities, or ``0`` when computing them
            raises ``ZeroDivisionError``.
        """
        try:
            s1 = normalize(self.reference.title)
            s2 = normalize(page.title)
            n = 3
            p1_trigrams = Counter(nltk.ngrams(s1, n))
            p2_trigrams = Counter(nltk.ngrams(s2, n))
            p1_grams = Counter(nltk.ngrams(s1, 1))
            p2_grams = Counter(nltk.ngrams(s2, 1))
            cosine = Cosine(1)
            similarity = cosine.similarity_profiles(p1_trigrams, p2_trigrams)
            # Accumulate rather than overwrite: the original discarded the
            # trigram score, which made the following division by two
            # meaningless.
            similarity += cosine.similarity_profiles(p1_grams, p2_grams)
            similarity = similarity / 2
        except ZeroDivisionError:
            similarity = 0
        return similarity
# Example 7
def footer(df, kgram=2, TOP_LINES=5):
    """Flag repeated or numeric trailing lines of consecutive pages as footers.

    For every page value after the first one in ``df['page'].unique()``, the
    last ``TOP_LINES`` rows of that page are paired with the last rows of page
    ``pg - 1`` (last with last, second-to-last with second-to-last, ...).
    A row is flagged when its text is all digits, or when the paired texts
    have a ``Cosine(kgram)`` profile similarity above 0.9.

    Args:
        df: DataFrame with ``page`` and ``text`` columns; modified in place by
            adding/overwriting the boolean ``isFooter`` column.
        kgram: Value passed to ``Cosine(...)``; texts shorter than this are
            not compared.
        TOP_LINES: Number of rows, counted from the bottom of each page, to
            inspect.

    Returns:
        The same ``df`` object.
    """
    df['isFooter'] = False
    pgs = df['page'].unique()

    cosine = Cosine(kgram)
    for pg in pgs[1:]:
        prev_idx = df.index[df['page'] == (pg - 1)]
        pres_idx = df.index[df['page'] == pg]

        # Never look further back than the shorter page has rows; the original
        # raised IndexError on pages with fewer than TOP_LINES rows.
        depth = min(TOP_LINES, len(prev_idx), len(pres_idx))
        for offset in range(1, depth + 1):
            prev_ln = prev_idx[-offset]
            pres_ln = pres_idx[-offset]

            s0 = df.loc[prev_ln, 'text']
            s1 = df.loc[pres_ln, 'text']

            has_page_number = False
            if s0.isdigit():
                df.loc[prev_ln, 'isFooter'] = True
                has_page_number = True
            if s1.isdigit():
                df.loc[pres_ln, 'isFooter'] = True
                has_page_number = True

            if has_page_number or len(s0) < kgram or len(s1) < kgram:
                # The original ``if`` had no body; these pairs are not compared.
                continue

            p0 = cosine.get_profile(s0)
            p1 = cosine.get_profile(s1)
            if cosine.similarity_profiles(p0, p1) > 0.9:
                df.loc[prev_ln, 'isFooter'] = True
                df.loc[pres_ln, 'isFooter'] = True

    return df
def met_cosine(s1, s2, n):
    """Return the ``Cosine(n)`` profile similarity between ``s1`` and ``s2``."""
    metric = Cosine(n)
    return metric.similarity_profiles(
        metric.get_profile(s1),
        metric.get_profile(s2),
    )
# Example 9
    def similarity(self, question, answer):
        """Compute a battery of string metrics between a question and an answer.

        Both inputs are tokenised with ``jieba.cut``; tokens found in the
        stop-word file are dropped and the remaining tokens are concatenated
        before being compared.

        NOTE(review): the statements that stored the stop words, kept the
        filtered tokens and filled the result list were missing from this
        copy. They are reconstructed here; the order of the returned values
        follows the order in which the metrics were computed. Confirm this
        against the callers.

        Args:
            question: Object whose ``str()`` form is the question text.
            answer: Object whose ``str()`` form is the answer text.

        Returns:
            A list of 18 values: cosine similarity, cosine distance, Damerau
            distance, Jaccard distance, Jaccard similarity, Jaro-Winkler
            distance, Jaro-Winkler similarity, Levenshtein distance, LCS
            distance, metric-LCS distance, n-gram distance, normalized
            Levenshtein distance, normalized Levenshtein similarity, optimal
            string alignment distance, q-gram distance, Sorensen-Dice
            distance, Sorensen-Dice similarity, weighted Levenshtein distance.
        """
        # A set gives constant-time membership tests while filtering tokens.
        stopwords = set()
        for sw in self.read_from(folder_path + '上证专用停用词.txt'):
            stopwords.add(sw.strip('\n').strip(' '))

        meaningful_words1 = [
            word for word in jieba.cut(str(question)) if word not in stopwords
        ]
        meaningful_words2 = [
            word for word in jieba.cut(str(answer)) if word not in stopwords
        ]
        s2 = ''.join(meaningful_words1)
        s3 = ''.join(meaningful_words2)

        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        cos_s = a1.similarity(s2, s3)
        cos_d = a1.distance(s2, s3)
        dam = b1.distance(s2, s3)
        jac_d = c1.distance(s2, s3)
        jac_s = c1.similarity(s2, s3)
        jar_d = d1.distance(s2, s3)
        jar_s = d1.similarity(s2, s3)
        lev = e1.distance(s2, s3)
        lon = f1.distance(s2, s3)
        met = g1.distance(s2, s3)
        ngr = h1.distance(s2, s3)
        nor_d = i1.distance(s2, s3)
        nor_s = i1.similarity(s2, s3)
        opt = j1.distance(s2, s3)
        qgr = k1.distance(s2, s3)
        sor_d = l1.distance(s2, s3)
        sor_s = l1.similarity(s2, s3)
        wei = m1.distance(s2, s3)

        line_sim = [
            cos_s, cos_d, dam, jac_d, jac_s, jar_d, jar_s, lev, lon, met,
            ngr, nor_d, nor_s, opt, qgr, sor_d, sor_s, wei,
        ]
        return line_sim
import matplotlib.pyplot as plt
import re
import json
from similarity.jarowinkler import JaroWinkler
from similarity.qgram import QGram
from similarity.cosine import Cosine
from similarity.ngram import NGram
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
import csv
from similarity.qgram import QGram

# Load the JSON document stored in 'qgram_human_mobility.txt'.
with open('qgram_human_mobility.txt') as f:
    data = json.load(f)

# Cosine metric configured with 2, shared by the rest of the script.
cosine = Cosine(2)

profiles = []

# Each record's first element has surrounding double quotes stripped.
# NOTE(review): nothing visible uses ``temp`` or fills ``profiles``; the rest
# of this loop body appears to be missing from this copy.
for i in range(len(data)):

    temp = data[i][0].strip('"')


# NOTE(review): the body of this loop is missing in this copy, so the file
# does not parse as written; restore it from the original source.
for i in range(len(profiles)):


profile_sim = []
# Example 11
from triathon_text_similarity.prediction import *
from triathon_text_similarity.similar_text import *
from similarity.cosine import Cosine
import pandas as pd

# Cosine metric configured with 2.
cosine = Cosine(2)

# Tab-separated input; quoting=3 is csv.QUOTE_NONE, so quote characters are
# read as ordinary text.
dataset_text = pd.read_csv('./advocates.tsv', delimiter='\t', quoting=3)

counter = 0
input_list = []
compare_list = []
# First pass over rows 0..999: count rows whose 'Liked' value is 1.
for i in range(0, 1000):
    if dataset_text['Liked'][i] == 1:
        counter += 1
        if counter < 41:
            # NOTE(review): the body of this ``if`` is missing in this copy;
            # presumably it appended text to ``input_list`` - confirm.

final_text_str = ''.join(input_list)
# Second pass over the same rows.
# NOTE(review): ``counter`` is not reset before this loop, so it continues
# from the value reached in the first pass - confirm this is intended.
for i in range(1000):
    if dataset_text['Liked'][i] == 1:
        counter += 1
        if counter > 41:
            # NOTE(review): the body of this ``if`` is missing in this copy;
            # presumably it appended text to ``compare_list`` - confirm.

final_compare_text_str = ''.join(compare_list)
# Example 12
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine
# Compare two sentences with three different string metrics.
lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)
str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

# Each label is printed immediately before the value it describes. The
# original printed the Levenshtein distance with no label and then showed the
# normalized Levenshtein similarity under the 'Levenshtein distance:' label.
print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))
# Example 13
def dashboard_carrer():
    # View function: scores the questionnaire answers in '.data/test.csv' in
    # six interest groups (R, I, A, S, E, C), derives a code per respondent,
    # compares that code with every course code from '.data/course.csv' using
    # Cosine(2) profiles, and renders 'dashboard_carrer.html'.
    # NOTE(review): several statements below are cut off in this copy
    # (unclosed calls and list literals), so the function does not parse as
    # written; each spot is marked.

    # Detect the file encoding from its raw bytes.
    # NOTE(review): the argument and closing parenthesis of chardet.detect(
    # are missing here.
    with open('.data/test.csv', 'rb') as f:
        result = chardet.detect(

    # Load the answers and map the answer labels to the numbers 5..1.
    p_test = pd.read_csv(".data/test.csv", encoding=result['encoding'])
    p_test = p_test.replace('Enjoy', 5)
    p_test = p_test.replace('Slightly Enjoy', 4)
    p_test = p_test.replace('Neutral', 3)
    p_test = p_test.replace('Slightly Disagree', 2)
    p_test = p_test.replace('Strongly Disagree', 1)

    # NOTE(review): each of the six column selections below is missing its
    # closing ``]]``.
    #Realistic Questions
    realistic = p_test[[
        'I like to work on cars', 'I like to build things',
        'I like to take care of animals',
        'I like putting things together or assembling things',
        'I like to cook', 'I am a practical person', 'I like working outdoors'
    #Investigative Questions
    investigative = p_test[[
        'I like to do puzzles', 'I like to do experiments', 'I enjoy science',
        'I enjoy trying to figure out how things work',
        'I like to analyze things (problems/situations)',
        'I like working with numbers  or charts', 'I am good at math'
    #Artistic Questions
    artistic = p_test[[
        'I am good at working independently',
        'I like to read about art and music', 'I enjoy creative writing',
        'I am a creative person', 'I like to play instruments or sing',
        'I like acting in plays', 'I like to draw'
    #Social Questions
    social = p_test[[
        'I like to work in teams', 'I like to teach or train people',
        'I like trying to help people solve their problems',
        'I am interested in healing people',
        'I enjoy learning about other cultures',
        'I like to get into discussions about issues around me',
        'I like helping people'
    #Enterprising Questions
    enterprising = p_test[[
        'I am an ambitious person who set goals for myself',
        'I like to try to influence or persuade people',
        'I like selling things', 'I am quick to take on new responsibilities',
        'I would like to start my own business', 'I like to give speeches',
        'I like to lead'
    #Conventional Questions
    conventional = p_test[[
        'I like to organize things',
        'I wouldn’t mind working 8 hours per day in an office',
        'I pay attention to details', 'I like to do filing or typing',
        'I am good at keeping records of my work',
        'I would like to work in an office'

    # Row-wise total of each group, stored in a new single-letter column.
    #Summing Up
    realistic['R'] = realistic.sum(axis=1)
    investigative['I'] = investigative.sum(axis=1)
    artistic['A'] = artistic.sum(axis=1)
    social['S'] = social.sum(axis=1)
    enterprising['E'] = enterprising.sum(axis=1)
    conventional['C'] = conventional.sum(axis=1)

    # Collect the six totals into one frame with columns R, I, A, S, E, C.
    code = realistic['R']
    code = code.to_frame()
    code['I'] = investigative['I']
    code['A'] = artistic['A']
    code['S'] = social['S']
    code['E'] = enterprising['E']
    code['C'] = conventional['C']

    n = 3

    # NOTE(review): the element expression and the closing ``]`` of this
    # comprehension are missing; presumably it picks ``n`` column labels per
    # row, since columns 0..2 are concatenated below - confirm.
    new_d = [
        for _, row in code.iterrows()

    std = pd.DataFrame(new_d)

    # Concatenate the first three columns into a single 'code' column.
    std['code'] = std[0] + std[1] + std[2]

    #std has the test code
    std = std.drop([0, 1, 2], axis=1)

    #Read the course data
    course = pd.read_csv(".data/course.csv")

    # Cartesian product of the test codes with both course columns, as rows.
    df = pd.MultiIndex.from_product(
        [std["code"], course["course_code"], course["Course_short"]],
        names=["code", "course_code", "course"]).to_frame(index=False)

    df = df.dropna()

    # Build a profile for each code and course code, then score each row.
    # NOTE(review): the closing ``]`` of the ``cosine_sim`` comprehension is
    # missing.
    #Cosine Similarity
    cosine = Cosine(2)
    df["p0"] = df["code"].apply(lambda s: cosine.get_profile(s))
    df["p1"] = df["course_code"].apply(lambda s: cosine.get_profile(s))
    df["cosine_sim"] = [
        cosine.similarity_profiles(p0, p1)
        for p0, p1 in zip(df["p0"], df["p1"])
    df.drop(["p0", "p1"], axis=1, inplace=True)

    # NOTE(review): the remaining arguments of sort_values( are missing.
    #Sorting the Values
    top_n = df.sort_values(['cosine_sim'],

    options = top_n["course"].to_numpy()

    # selecting rows based on condition
    rec = course.loc[course['Course_short'].isin(options)]

    # Round-trip through JSON to get a list of plain record dicts.
    recommendations = json.loads(rec.to_json(orient='records'))

    # NOTE(review): the remaining arguments of render_template( are missing.
    return render_template('./dashboard_carrer.html',
                           title='Dashboard - Carrer',
# Example 14

from similarity.cosine import Cosine
import sys
import os
import re
import subprocess

# Module-level state shared by the functions below.
# Three independent, initially empty lists.
clean_times, clean_lines, part_file_names = [], [], []
# Pattern with two capture groups separated by ' --> '.
time_regex = r'(.+) --> (.+)'
cosine = Cosine(2)
# File-name settings start out empty (strings are immutable, so sharing one
# initial value is safe).
mySrtFileName = mediaFileName = mediaFileExtension = ""
accuracy_decimal = 0.0

def make_part(startTime, endTime, partNum):
    # Builds an ffmpeg command that copies the span startTime..endTime of
    # ``musicFileName`` into a file under 'parts', then runs it.
    # NOTE(review): ``musicFileName`` is not defined in the visible code (the
    # module defines ``mediaFileName``) - confirm where it comes from.
    # NOTE(review): the argument tuple below is cut off; the format string has
    # three ``%s`` placeholders but only two values are visible.
    musicFileCopyName = "parts%sCleanPart%s%s" % (os.sep, partNum,

    # ``musicFile`` is not used in the visible lines; the open() at least
    # fails early when the source file cannot be opened.
    with open(musicFileName) as musicFile:
        # '-ss'/'-to' carry the start and end times; '-c copy' and '-y' are
        # passed to ffmpeg unchanged.
        bashCommand = "ffmpeg -y -i \"%s\" -ss %s -to %s -c copy \"%s\"" % (
            musicFileName, startTime, endTime, musicFileCopyName)
        # NOTE(review): the remaining Popen( arguments are missing in this
        # copy. The command is a single interpolated string, so file names
        # containing quotes would break it - consider passing a list.
        bashCall = subprocess.Popen(bashCommand,
        output, error = bashCall.communicate()
# Example 15
 def cosine(self, s0, s1):
     """Return the ``Cosine(15)`` profile similarity between ``s0`` and ``s1``."""
     metric = Cosine(15)
     first_profile, second_profile = metric.get_profile(s0), metric.get_profile(s1)
     return metric.similarity_profiles(first_profile, second_profile)
# Example 16
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

# Initialized at import time: one shared instance per string metric.
levenshtein = Levenshtein()
norm_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
ngram = NGram()
qgram = QGram()
dice = SorensenDice()
cos = Cosine(5)
jaccard = Jaccard(5)

# Callables taking two strings and returning a similarity score; metrics that
# only expose a distance are converted with ``1 - distance``.
# NOTE(review): the closing bracket was missing in this copy, so further
# entries may have been lost - compare with the original source.
similarity_functions = [
    norm_levenshtein.similarity,
    lambda a, b: 1 - metric_lcs.distance(a, b),
    lambda a, b: 1 - ngram.distance(a, b),
    cos.similarity,
    dice.similarity,
]

def mono_vector0(tup1, tup2):

    str1 = ' '.join(tup1).lower()
    str2 = ' '.join(tup2).lower()

    simv = list(map(lambda x: x(str1, str2), similarity_functions))