Example #1
0
    def triangulate(self, tweet, loc):
        """Check *tweet* against the cleaned RSS, retweet and tweet corpora.

        Returns True as soon as any stored line has a 2-gram cosine
        similarity > 0.7 to the tweet; otherwise False.  For the retweet
        and tweet corpora the stored location token must also equal *loc*.
        """
        print('Triangulating: ' + tweet)
        cosine = Cosine(2)
        cos_tweet = cosine.get_profile(tweet)

        with open("clean/clean_rss.txt", "r") as clean_rss:
            for rss in clean_rss:
                rss = rss.split('\n')[0]  # drop the trailing newline
                cos_result = cosine.similarity_profiles(
                    cos_tweet, cosine.get_profile(rss))
                if cos_result > 0.7:
                    # Fix: this output was redacted to '******' (a syntax
                    # error); restore printing the similarity score.
                    print('\t[PASS: ' + str(cos_result) + '] ' + rss)
                    return True
                print('\t[FAIL: ' + str(cos_result) + '] ' + rss)

        with open("clean/clean_retweet.txt", "r") as clean_rt:
            for rtweet in clean_rt:
                rt = rtweet.rsplit(' ', 1)[0]  # text part
                # last whitespace-separated token is the location
                rt_loc = rtweet.split('\n')[0].rsplit(' ', 1)[1]
                if loc == rt_loc:
                    # Profile only when the location matches (the original
                    # profiled every line unconditionally).
                    cos_result = cosine.similarity_profiles(
                        cos_tweet, cosine.get_profile(rt))
                    if cos_result > 0.7:
                        print('\t[PASS: ' + str(cos_result) + '] ' + rt)
                        return True
                    print('\t[FAIL: ' + str(cos_result) + '] ' + rt)

        with open('clean/clean_tweet.txt', 'r') as clean_tweet:
            for ctweet in clean_tweet:
                ct = ctweet.rsplit(' ', 1)[0]
                ct_loc = ctweet.split('\n')[0].rsplit(' ', 1)[1]
                if loc == ct_loc:
                    cos_result = cosine.similarity_profiles(
                        cos_tweet, cosine.get_profile(ct))
                    # != 1.0 presumably skips the tweet matching its own
                    # stored copy — TODO confirm.
                    if cos_result > 0.7 and cos_result != 1.0:
                        print('\t[PASS: ' + str(cos_result) + '] ' + ct)
                        return True
                    print('\t[FAIL: ' + str(cos_result) + '] ' + ct)

        print('\tNo matching results found...')
        return False
Example #2
0
def get_similarity_score():
    """Return the mean per-line cosine similarity of 'model_res' vs 'real_res'.

    Lines are paired positionally; assumes both files have the same number
    of lines — TODO confirm against the writer of these files.
    Raises ZeroDivisionError if 'model_res' is empty (as the original did).
    """
    cosine = Cosine(2)
    # Fix: the original opened both files without ever closing them.
    with open('model_res') as model_file, open('real_res') as real_file:
        model_lines = [line.replace("\n", "") for line in model_file]
        real_lines = [line.replace("\n", "") for line in real_file]
    total = sum(
        cosine.similarity_profiles(cosine.get_profile(m),
                                   cosine.get_profile(r))
        for m, r in zip(model_lines, real_lines))
    return total / len(model_lines)
Example #3
0
    def title_similarity(self, page):
        """Similarity between *page*'s title and the reference title.

        Averages the trigram and unigram cosine similarities of the two
        normalized titles; returns 0 when either profile is empty
        (similarity_profiles raises ZeroDivisionError on a zero norm).
        """
        try:
            s1 = normalize(self.reference.title)
            s2 = normalize(page.title)
            n = 3
            p1_trigrams = Counter(nltk.ngrams(s1, n))
            p2_trigrams = Counter(nltk.ngrams(s2, n))
            p1_grams = Counter(nltk.ngrams(s1, 1))
            p2_grams = Counter(nltk.ngrams(s2, 1))
            cosine = Cosine(1)
            # Bug fix: the original computed the trigram similarity and then
            # immediately overwrote it with the unigram similarity before
            # dividing by 2, discarding the trigram score entirely.  The
            # trailing "/ 2" shows an average of the two was intended.
            tri_sim = cosine.similarity_profiles(p1_trigrams, p2_trigrams)
            uni_sim = cosine.similarity_profiles(p1_grams, p2_grams)
            similarity = (tri_sim + uni_sim) / 2
        except ZeroDivisionError:
            similarity = 0
        return similarity
Example #4
0
def match(num, threshold=0.65):
    """Return plates from 'lic.csv' similar to *num*, space-concatenated.

    A plate matches when its 2-gram cosine similarity to str(num) is at
    least *threshold* (default 0.65, the original hard-coded cutoff).
    Placeholder plates (0, "", "0") are skipped.
    """
    from similarity.cosine import Cosine
    stop_plate = [0, "", "0"]
    suspected = ''
    cosine = Cosine(2)
    # Profile the query plate once instead of once per candidate line.
    p0 = cosine.get_profile(str(num))
    # Fix: the original opened the file without ever closing it.
    with open('lic.csv') as plates:
        for s1 in plates:
            s1 = s1.replace("\n", "")
            if s1 in stop_plate:
                continue
            print(s1)
            p1 = cosine.get_profile(s1)
            if cosine.similarity_profiles(p0, p1) >= threshold:
                suspected += s1 + " "
    return suspected
Example #5
0
def footer(df, kgram=2, TOP_LINES=5):
    """Flag repeated page-footer lines in *df*.

    Compares the last TOP_LINES text lines of each page against the same
    positions on the previous page (despite the name, negative indexing
    below takes lines from the page *bottom*).  A line is marked
    ``isFooter`` when it is purely numeric (a page number) or when its
    kgram-cosine similarity to the corresponding line on the previous
    page exceeds 0.9.  Assumes df has 'page' and 'text' columns and that
    each page has at least TOP_LINES rows — TODO confirm with callers.
    Returns the mutated df.
    """
    df['isFooter'] = False
    pgs = df['page'].unique()

    cosine = Cosine(kgram)
    # Start from the second page so each page has a predecessor to compare.
    for pg in pgs[1:]:

        prev_idx = df.index[df['page'] == (pg - 1)]
        pres_idx = df.index[df['page'] == pg]

        for ln in range(TOP_LINES):

            # -(ln+1): the ln-th line counted from the end of the page.
            prev_ln = prev_idx[-1 * (ln + 1)]
            pres_ln = pres_idx[-1 * (ln + 1)]

            s0 = df.loc[prev_ln, 'text']
            s1 = df.loc[pres_ln, 'text']

            # Purely numeric lines are page numbers: mark them directly and
            # skip the similarity test (digits rarely repeat verbatim).
            skip = 0
            if s0.isdigit():
                df.loc[prev_ln, 'isFooter'] = True
                skip = 1

            if s1.isdigit():
                df.loc[pres_ln, 'isFooter'] = True
                skip = 1

            # Also skip strings shorter than the k-gram size (no profile).
            if (skip == 1) | (len(s0) < kgram) | (len(s1) < kgram):
                continue

            #print(s0,",", s1)
            p0 = cosine.get_profile(s0)
            p1 = cosine.get_profile(s1)

            # Near-identical lines on consecutive pages are footers.
            sim = cosine.similarity_profiles(p0, p1)
            if (sim > 0.9):
                df.loc[prev_ln, 'isFooter'] = True
                df.loc[pres_ln, 'isFooter'] = True
                #print(pg,",", ln, ",", s0,",", s1,",", sim)

    return (df)
def met_cosine(s1, s2, n):
    """Return the cosine similarity of *s1* and *s2* over n-gram profiles."""
    metric = Cosine(n)
    profile_a = metric.get_profile(s1)
    profile_b = metric.get_profile(s2)
    return metric.similarity_profiles(profile_a, profile_b)
# Build an n-gram profile for each record; the text sits in column 0 of
# every row of `data` (defined earlier in the file), wrapped in quotes.
for row in data:
    profiles.append(cosine.get_profile(row[0].strip('"')))

# Debug dump of the computed profiles.
for profile in profiles:
    print(profile)

# For each profile, sum its similarity against every profile (including
# itself) — an O(n^2) all-pairs score, preserved from the original.
profile_sim = []

for i, profile in enumerate(profiles):
    sim_score = sum(
        cosine.similarity_profiles(profile, other) for other in profiles)
    profile_sim.append([i, sim_score])
    print(i)  # progress indicator

# Fix: dropped the redundant outfile.close() — the with-block already
# closes the file on exit.
with open('cosine_similarity.txt', 'w', encoding="ISO-8859-1") as outfile:
    json.dump(profile_sim, outfile)
Example #8
0
# Debug output of earlier-computed state (defined above this chunk).
print(input_list)
print(counter)

same = similar_text_score()
# NOTE(review): txt_true, txt_pred and y_pred2 below are assigned but never
# used in the visible code — the calls further down use final_text_str /
# final_compare_text_str instead; confirm whether these are dead fixtures.
txt_true = "The term international child abduction is generally synonymous with international parental kidnapping, child snatching, and child stealing.[1] However, the more precise legal usage of international child abduction originates in private international law and refers to the illegal removal of children from their home by an acquaintance or family member to a foreign country. In this context, 'illegal' is normally taken to mean 'in breach of custodial rights' and 'home' is defined as the child's habitual residence"

txt_pred = "What is today called 'parental kidnapping,' 'international child abduction,', 'parental child abduction' and 'parental child trafficking' has existed as long as different legal jurisdictions and international borders have—though often under different names. None of these names achieved the modern day broad acceptance of terms like international child abduction. Lacking a common set of terminology or specifically designed laws to address the, at the time, poorly defined problem, researchers on the history of cross-border child abduction must search for terms like 'custodial interference,' 'contempt of child custody orders,' 'legal kidnapping' or, in cases where children were viewed more as property than as individual subjects of rights, name variations on theft, child-maintenance debt and smuggling, among others."

y_pred2 = "New Delhi: At least 35 people were killed and over 200 injured on Sunday when over dozen coaches of two superfast Express trains got derailed in Uttar Pradesh and Assam, raising concerns once again about the patchy safety record of Indian Railways.Thirty-five have been confirmed dead and over 140 injured as the Howrah-Delhi-Kalka Express got derailed near Fatehpur Malwa in Uttar Pradesh. The incident took place around 12:30 pm. Approximately 1200 people were travelling on board Kalka Mail.The train was travelling from Howrah to New Delhi and was moving at the speed of 108 Km/Hr when the driver used emergency brakes to slow it down, which led to the derailment, sources claimed."

# Compare the two texts under several string metrics (lcs, qgram, cosine,
# jarowinkler are instances constructed elsewhere in the file).
print(lcs.distance(final_text_str, final_compare_text_str))

print(qgram.distance('hello', 'world'))

print(
    cosine.similarity_profiles(cosine.get_profile(final_text_str),
                               cosine.get_profile(final_compare_text_str)))

print(jarowinkler.similarity(final_text_str, final_compare_text_str))

same.similarity_score(txt_true=final_text_str, txt_pred=final_compare_text_str)

# Train/evaluate the prediction model end-to-end on a generated dataset.
pred = prediction()
pred.generate_a_dataset(qnt_of_rand_num=200, dataset_name='lawers200.csv')
pred.train_model(csv_file='lawers200.csv', save_model_name='model200.pickle')
pred.load_model_and_pred(model_name='model200.pickle',
                         rating=0.95,
                         wins=6,
                         time_diff=196)

# NOTE(review): imports placed mid-file rather than at the top — presumably
# more code follows; conventionally these belong in the file header.
import numpy as np
import pandas as pd
Example #9
0
def dashboard_carrer():
    """Render the career dashboard with course recommendations.

    Reads the RIASEC questionnaire answers from .data/test.csv, scores the
    six interest dimensions (Realistic/Investigative/Artistic/Social/
    Enterprising/Conventional), derives each student's 3-letter Holland
    code, and recommends the top-3 courses whose course_code is most
    cosine-similar to that code.  Assumes the CSV columns match the
    question texts listed below — TODO confirm against the survey export.
    """
    # Detect the CSV's encoding before parsing (survey exports vary).
    with open('.data/test.csv', 'rb') as f:
        result = chardet.detect(f.read())

    p_test = pd.read_csv(".data/test.csv", encoding=result['encoding'])
    #Encoding
    # Map the Likert-scale answers to 1..5.  NOTE(review): the labels mix
    # "Enjoy" and "Disagree" wording — confirm this matches the survey.
    p_test = p_test.replace('Enjoy', 5)
    p_test = p_test.replace('Slightly Enjoy', 4)
    p_test = p_test.replace('Neutral', 3)
    p_test = p_test.replace('Slightly Disagree', 2)
    p_test = p_test.replace('Strongly Disagree', 1)

    #Realistic Questions
    realistic = p_test[[
        'I like to work on cars', 'I like to build things',
        'I like to take care of animals',
        'I like putting things together or assembling things',
        'I like to cook', 'I am a practical person', 'I like working outdoors'
    ]]
    #Investigative Questions
    investigative = p_test[[
        'I like to do puzzles', 'I like to do experiments', 'I enjoy science',
        'I enjoy trying to figure out how things work',
        'I like to analyze things (problems/situations)',
        'I like working with numbers  or charts', 'I am good at math'
    ]]
    #Artistic Questions
    artistic = p_test[[
        'I am good at working independently',
        'I like to read about art and music', 'I enjoy creative writing',
        'I am a creative person', 'I like to play instruments or sing',
        'I like acting in plays', 'I like to draw'
    ]]
    #Social Questions
    social = p_test[[
        'I like to work in teams', 'I like to teach or train people',
        'I like trying to help people solve their problems',
        'I am interested in healing people',
        'I enjoy learning about other cultures',
        'I like to get into discussions about issues around me',
        'I like helping people'
    ]]
    #Enterprising Questions
    enterprising = p_test[[
        'I am an ambitious person who set goals for myself',
        'I like to try to influence or persuade people',
        'I like selling things', 'I am quick to take on new responsibilities',
        'I would like to start my own business', 'I like to give speeches',
        'I like to lead'
    ]]
    #Conventional Questions
    conventional = p_test[[
        'I like to organize things',
        'I wouldn’t mind working 8 hours per day in an office',
        'I pay attention to details', 'I like to do filing or typing',
        'I am good at keeping records of my work',
        'I would like to work in an office'
    ]]

    #Summing Up
    # NOTE(review): assigning into these column-selections is chained
    # assignment and may raise pandas' SettingWithCopyWarning; a .copy()
    # on each selection above would silence it — confirm before changing.
    realistic['R'] = realistic.sum(axis=1)
    investigative['I'] = investigative.sum(axis=1)
    artistic['A'] = artistic.sum(axis=1)
    social['S'] = social.sum(axis=1)
    enterprising['E'] = enterprising.sum(axis=1)
    conventional['C'] = conventional.sum(axis=1)

    # Assemble one frame with the six dimension totals per student.
    code = realistic['R']
    code = code.to_frame()
    code['I'] = investigative['I']
    code['A'] = artistic['A']
    code['S'] = social['S']
    code['E'] = enterprising['E']
    code['C'] = conventional['C']

    # Take the 3 highest-scoring dimension letters per student.
    n = 3

    # NOTE(review): `it` is presumably operator.itemgetter (imported
    # elsewhere); Series.iteritems() was removed in pandas 2.0 — confirm
    # the pinned pandas version.
    new_d = [
        list(map(it(0),
                 (row[1:].sort_values(ascending=False)[:n].iteritems())))
        for _, row in code.iterrows()
    ]

    std = pd.DataFrame(new_d)

    # Concatenate the top-3 letters into the 3-letter Holland code.
    std['code'] = std[0] + std[1] + std[2]

    #std has the test code
    std = std.drop([0, 1, 2], axis=1)

    #Read the course data
    course = pd.read_csv(".data/course.csv")

    # Cross-join every student code with every course for scoring.
    df = pd.MultiIndex.from_product(
        [std["code"], course["course_code"], course["Course_short"]],
        names=["code", "course_code", "course"]).to_frame(index=False)

    df = df.dropna()

    #Cosine Similarity
    # Score each (student code, course_code) pair with 2-gram cosine.
    cosine = Cosine(2)
    df["p0"] = df["code"].apply(lambda s: cosine.get_profile(s))
    df["p1"] = df["course_code"].apply(lambda s: cosine.get_profile(s))
    df["cosine_sim"] = [
        cosine.similarity_profiles(p0, p1)
        for p0, p1 in zip(df["p0"], df["p1"])
    ]
    df.drop(["p0", "p1"], axis=1, inplace=True)

    #Sorting the Values
    # Top 3 most similar courses per student code.
    top_n = df.sort_values(['cosine_sim'],
                           ascending=False).groupby(df['code'].values).head(3)

    options = top_n["course"].to_numpy()

    # selecting rows based on condition
    rec = course.loc[course['Course_short'].isin(options)]

    recommendations = json.loads(rec.to_json(orient='records'))

    return render_template('./dashboard_carrer.html',
                           title='Dashboard - Carrer',
                           std=std,
                           recommendations=recommendations)
Example #10
0
 def cosine(self, s0, s1):
     """Return the cosine similarity of *s0* and *s1* over 15-gram profiles."""
     metric = Cosine(15)
     return metric.similarity_profiles(metric.get_profile(s0),
                                       metric.get_profile(s1))