Example 1
def mergeItemsDiscretes(item1, item2):
	diff_price = feature_diffPrice(item1['price'], item2['price'])
	diff_price = discretizaDiffPrice(diff_price)
	jw_title = jf.jaro_winkler(item1['title'], item2['title'])
	jw_desc  = jf.jaro_winkler(item1['description'], item2['description'])
	simi_json,eq_keys_json = feature_attrJson(item1, item2)
	diff_latlon = feature_diffLatLon(item1['lon'], item1['lat'], item2['lon'], item2['lat'])

	metroID1 = '0'
	if(item1['metroID'] != ''):
		metroID1 = item1['metroID']

	metroID2 = '0'
	if(item2['metroID'] != ''):
		metroID2 = item2['metroID']

	json = {
		'x':
		[
			int(item1['itemID'] == item2['itemID']),
			
			int(item1['categoryID']),
			int(item2['categoryID']),
			int(item1['categoryID'] == item2['categoryID']),

			int(item1['metroID'] == ''),
			int(item2['metroID'] == ''),
			int(round(float(metroID1))),
			int(round(float(metroID2))),
			int(item1['metroID'] == item2['metroID']),

			int(item1['locationID'] == ''),
			int(item2['locationID'] == ''),
			int(item1['locationID']),
			int(item2['locationID']),
			int(item1['locationID'] == item2['locationID']),

			int(item1['price']	== item2['price']),
			int(item1['price'] 	== ''),
			int(item2['price'] 	== ''),
			int(item1['price'] 	== '1.0'),
			int(item2['price'] 	== '1.0'),
			diff_price,

			round(jw_title * 10),
			round(jw_desc * 10),

			int(item1['attrsJSON'] == ''),
			int(item2['attrsJSON'] == ''),
			round(round(simi_json * 100)/10),
			round(round(eq_keys_json * 100)/10),

			round(((1/diff_latlon)/1000))
		]
	}

	return json
def get_closest_jaro_winkler(needle, haystack):
	# return the element of haystack with the highest Jaro-Winkler similarity to needle
	closest = None
	for x in haystack:
		if closest is None:
			closest = (x, jellyfish.jaro_winkler(needle, x))
		else:
			temp = (x, jellyfish.jaro_winkler(needle, x))
			if temp[1] > closest[1]:
				closest = temp
	if closest is None:
		return None
	return closest[0]
def test_jellyfish():
    text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.'
    text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi'
    lst1 = normalize(text1)
    lst2 = normalize(text2)
    text_norm1 = ' '.join(lst1)
    text_norm2 = ' '.join(lst2)
    print(jellyfish.jaro_distance(text1, text2))
    print(jellyfish.jaro_distance(text_norm1, text_norm2))
    print(jellyfish.jaro_winkler(text1, text2))
    print(jellyfish.jaro_winkler(text_norm1, text_norm2))
    print(jellyfish.nysiis(text1))
    print(jellyfish.nysiis(text2))
    exit()
Example 4
    def score(self,s,t):
        '''
        Returns the similarity score
        '''
        similar = namedtuple('Similar',['r1','r2','sim'])
        similarity=[]
        tfidfdict = self.builddict()
        for i,ti in enumerate(s.split(" ")):
            for j,tj in enumerate(t.split(" ")):
                dist = jf.jaro_winkler(ti,tj)
                if dist >= THRESHOLD:
                    similarity.append(similar(i,j,
                                                 dist*tfidfdict.get(ti)* tfidfdict.get(tj)))
    
        similarity.sort(reverse=True,key=lambda x:x.sim)

        sused = np.array([False]*len(s),dtype=bool)
        tused = np.array([False]*len(t),dtype=bool)
    
    
        #check that the term are counted only once
        sim = 0.0
        for s in similarity:
            if(sused[s.r1] | tused[s.r2]):
                continue;
            sim+=s.sim
            sused[s.r1] = True
            tused[s.r2] = True
        return sim  
Example 5
def alldist(filex, filey):
    xread = open(filex, 'r').read()
    yread = open(filey, 'r').read()
    lvd = jellyfish.levenshtein_distance(xread,yread)
    dlvd= jellyfish.damerau_levenshtein_distance(xread,yread)
    spsum = spamsum.match(xread,yread)
    spsum = 100 - spsum
    spsum = float(spsum/100.00)
#    print lvd
    res = float( lvd / 100.00 )
    dres= float(dlvd / 100.00 )
#    print res
#    print "Levenshtein Distance=",res
    jaro = jellyfish.jaro_distance(xread,yread)
## Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread,yread)
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
#   print "Jaro Distance = ",jaro
    ham = jellyfish.hamming_distance(xread,yread)
    ham = float ( ham / 100.00)
    print "Hamming Distance = ", ham
#	print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2))
#	print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1))
#    print "Spamsum Match score: ", spsum
    kl = kldiv(tokenize(xread), tokenize(yread))

    return res, dres , jaro, jarowink, ham, kl, spsum
Example 6
def jaro_winkler_similarity(s, t):
    """ Jaro-Winkler Similarity """

    jw_sim = jellyfish.jaro_winkler(s, t)


    return jw_sim
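A minimal usage sketch for the wrapper above; the expected values match the jellyfish test cases shown in Example 17 below, and identical strings score 1.0:

print(jaro_winkler_similarity("martha", "marhta"))    # ~0.9611
print(jaro_winkler_similarity("dixon", "dicksonx"))   # ~0.8133
print(jaro_winkler_similarity("same", "same"))        # 1.0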
Example 7
def find_string_similarity(first_str, second_str, normalized=False, ignore_list=[]):
    """ Calculates matching ratio between two strings

    Args:
        first_str (str) : First String
        second_str (str) : Second String
        normalized (bool) : if True, the method removes special characters and extra whitespace
                            from the strings before calculating the matching ratio
        ignore_list (list) : list of substrings to be substituted with "" in both strings


    Returns:
       Float Value : a matching ratio between 1.0 (most matching) and 0.0 (not matching),
                    computed using difflib's SequenceMatcher and jellyfish's jaro_winkler algorithms
                    with equal weightage given to each

    Examples:
        >>> find_string_similarity("hello world","Hello,World!",normalized=True)
        1.0
        >>> find_string_similarity("entrepreneurship","entreprenaurship")
        0.95625
        >>> find_string_similarity("Taj-Mahal","The Taj Mahal",normalized= True,ignore_list=["the","of"])
        1.0
    """
    first_str = process_str_for_similarity_cmp(first_str, normalized=normalized, ignore_list=ignore_list)
    second_str = process_str_for_similarity_cmp(second_str, normalized=normalized, ignore_list=ignore_list)
    match_ratio = (difflib.SequenceMatcher(None, first_str, second_str).ratio() + jellyfish.jaro_winkler(unicode(first_str), unicode(second_str)))/2.0
    return match_ratio
Example 8
def row_stats(in_row):
    """Compute additional stats for each row"""
    out_row = {}
    index, row = in_row

    out_row['index'] = index
    out_row['positives'] = row.count()
    out_row['distincts'] = row.dropna().unique().size
    out_row['max'] = row.value_counts().max()

    labels = row.dropna()

    if labels.size < 2:
        out_row['resemblance'] = np.nan
    else:
        similarities = []

        for l1, l2 in itertools.combinations(labels, 2):
            s = jellyfish.jaro_winkler(l1, l2)
            similarities.append(s)

        out_row['resemblance'] = np.mean(similarities)
        out_row['resemblance_min'] = np.min(similarities)
        out_row['resemblance_max'] = np.max(similarities)

    return out_row
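A small usage sketch for row_stats; the DataFrame below is hypothetical and simply illustrates the (index, row) tuples produced by DataFrame.iterrows(), with numpy, itertools and jellyfish imported as in the snippet above:

import pandas as pd

labels_df = pd.DataFrame({"src1": ["cat", "dog"], "src2": ["cat", None], "src3": ["cast", "dog"]})
stats = [row_stats(item) for item in labels_df.iterrows()]   # one stats dict per row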
Example 9
    def score(s,t):
        
        similar = namedtuple('Similar',['r1','r2','sim'])
        similarity=[]
        for i,ti in enumerate(s.split(" ")):
            for j,tj in enumerate(t.split(" ")):
                dist = jf.jaro_winkler(ti,tj)
                if dist >= THRESHOLD:
                    similarity.append(similar(i,j,
                                                 dist*tfidfdict[ti]*tfidfdict[tj]))
    
        similarity.sort(reverse=True,key=lambda x:x.sim)

        sused = np.array([False]*len(s),dtype=bool)
        tused = np.array([False]*len(t),dtype=bool)
    
    
        
        sim = 0.0
        for s in similarity:
            if(sused[s.r1] | tused[s.r2]):
                continue;
            sim+=s.sim
            sused[s.r1] = True
            tused[s.r2] = True
        return sim  
Example 10
def best_match(s, categories, top_n=5):
    """Return the top N best matches from your categories with the best match
    in the 0th position of the return list.
    Usage:
            >>> best_match('ilinois', ['Michigan', 'Ohio', 'Illinois'], 2)
            [('Illinois', 96), ('Michigan', 22)]

    :param s: str value to find best match
    :param categories: list values to compare against
    :param top_n: number of matches to return
    :returns: list of tuples (guess, percentage)
    """
    scores = []
    for cat in categories:
        scores.append((cat, jellyfish.jaro_winkler(
            s.encode('ascii', 'replace').upper(),
            cat.encode('ascii', 'replace').upper()
        )))

    scores = sorted(scores, key=lambda x: x[1])
    scores = scores[-top_n:]
    scores = [(score[0], int(score[1] * 100)) for score in scores]
    scores.reverse()

    return scores
def get_jaro_winkler_avg(row1, row2):
	sum = 0
	for columnIndex in xrange(1,15):
		a = row1[columnIndex]
		b = row2[columnIndex]
		sum += jellyfish.jaro_winkler(a, b)
	return sum / 14.0
Example 12
def best_match(s, categories, top_n=5):
    """
    Return the top N best matches from your categories with the best match
    in the 0th position of the return list. The comparison does not check
    the first element of the category name, only the second element.

    Usage:
            >>> best_match('illinois', [ ('_', 'Michigan'),
                                         ('_', 'Ohio'),
                                         ('_', 'Illinois') ],
                                        2)
            [('_', 'Illinois', 96), ('_', 'Michigan', 22)]

    Args:
        s: str value to find best match
        categories: list of tuples to compare against. needs to be
        [('table1', 'value1'), ('table2', 'value2')]
        top_n: number of matches to return

    Returns:
        list of tuples (table, guess, percentage)

    """

    # print 'starting match on {}'.format(s)
    scores = []
    for cat in categories:
        # verify that the category has two elements, if not, then just
        # return _ for the first category. Need this because fuzzy_in_set uses the
        # same method
        table_name = '_'
        category = None
        if isinstance(cat, tuple):
            table_name = cat[0]
            category = cat[1]
        else:
            category = cat

        scores.append(
            (
                table_name,
                category,
                jellyfish.jaro_winkler(
                    s.encode('ascii', 'replace').lower(),
                    category.encode('ascii', 'replace').lower()
                )
            )
        )

        # sort first by the ones

    # print 'all scores for {} are {}'.format(s, scores)
    scores = sorted(scores, cmp=sort_scores)
    # take the top n number of matches
    scores = scores[:top_n]
    # convert to hundreds
    scores = [(score[0], score[1], int(score[2] * 100)) for score in scores]
    # print 'ending all categories match of {} with scores {}'.format(s, scores)

    return scores
Example 13
def is_same_label(label1, label2):
    # noop cannot be same with non-noop
    if (label1 == "noop" and label2 != "noop") or (label1 != "noop" and label2 == "noop"):
        return [False, "", "noop", 0]
    if label1 == label2:
        return [True, label1, "identical", 1]
    l1 = label1.lower()
    l2 = label2.lower()
    if l1 == l2:
        return [True, l1, "case", 1]
    fl1 = get_filtered_label(l1)
    fl2 = get_filtered_label(l2)
    if fl1 == fl2:
        return [True, fl1, "stopword", 1]
    # TODO: string comparison, sentence analysis
    # import difflib
    # sim_score = difflib.SequenceMatcher(None, fl1,fl2).ratio()
    import jellyfish
    sim_score = jellyfish.jaro_winkler(fl1, fl2)
    if sim_score >= SIM_THRESHOLD:
        # print "fuzzy", sim_score,
        # print "[fuzzy]", sim_score, l1, "===", l2  # fl1, "===", fl2, "|||",
        final_label = max([l1, l2], key=len)
        return [True, final_label, "sim", sim_score]
    # print "diff", sim_score, l1, "===", l2
    return [False, "", "", sim_score]
Example 14
    def build_uid_hash(self):
        for user in self.users:
           self.uid_hash[user.id] = [user]

        for i, user_i in enumerate(self.users):
            if user_i.id in self.marked:
                continue

            for j, user_j in enumerate(self.users):
                if j < i + 1:
                    continue

                cn_i = user_i.data['cn']
                cn_j = user_j.data['cn']
                distance = jaro_winkler(cn_i, cn_j)
                if distance >= DuplicatesKiller.RESEMBLANCE_CUANTUM:
                    self.uid_hash[user_i.id].append(user_j)
                    self.marked.add(user_j.id)

        for k, v in self.uid_hash.items():
            # at least one resemblance for current object
            if len(v) >= 2:
                self.groups.append(v)

        return self.groups
Example 15
 def organize_results(self):
     self.structured_references = defaultdict(lambda: defaultdict(list))
     self.tagContradictions = []
     self.notReferenced = []
     #print self.scrape_results
     for category in self.scrape_results:
         for file_name in self.scrape_results[category]:
             for tag in self.scrape_results[category][file_name]:
                 finds = self.scrape_results[category][file_name][tag][0]
                 known_files = []
                 if len(self.scrape_results[category][file_name][tag]) > 1:
                     known_files = self.scrape_results[category][file_name][tag][1:]
                 referenced = False
                 for find in finds:
                     idtag, title = find[0], find[1]
                     print idtag, title
                     for tag2 in self.structured_references:
                         for id in self.structured_references[tag2]:
                             if id in idtag or idtag in id:
                                 referenced = True
                                 if len(self.structured_references[tag2][id]) == 0:
                                     if title.strip() != '':
                                         self.structured_references[tag2][id] = [title, {title: [file_name]}]
                                 elif title in self.structured_references[tag2][id][1] or max(
                                         [jf.jaro_winkler(unicode(title), unicode(t)) for t in
                                          self.structured_references[tag2][id][1]]) > 0.85:
                                     if title.strip() != '':
                                         self.structured_references[tag2][id][1][title].append(file_name)
                                 else:
                                     if title.strip() != '':
                                         self.structured_references[tag2][id][1][title] = [file_name]
                             else:
                                 if len(self.structured_references[tag2][id]) != 0:
                                     if title in self.structured_references[tag2][id][1] or max(
                                             [jf.jaro_winkler(unicode(title), unicode(t)) for t in
                                              self.structured_references[tag2][id][1]]) > 0.85:
                                         if idtag.strip() == '' and title.strip() != '':
                                             referenced = True
                                             self.structured_references[tag2][id][1][title].append(file_name)
                                         elif title.strip() != '':
                                             self.tagContradictions.append(
                                                 [file_name, self.scrape_results[category][file_name], tag2, id])
                 if not referenced:
                     self.notReferenced.append([file_name, self.scrape_results[category][file_name]])
     print len(self.structured_references)
     self.vote_and_restructure()
     self.flip_references()
Example 16
 def mc_is_close_match(self):
     """True if the given candidate is a close match
     only missing a word like Inc or Ltd
     """
     stripped = list(
         self.kb.common_stripped.get(self.mc['candidate'], ['']))[0]
     if jf.jaro_winkler(self.mention_text, stripped) > 0.95:
         return True
     return False
Example 17
    def test_jaro_winkler(self):
        cases = [("dixon", "dicksonx", 0.8133),
                 ("dicksonx", "dixon", 0.8133),
                 ("martha", "marhta", 0.9611),
                 ("dwayne", "duane", 0.84)]

        for (s1, s2, value) in cases:
            actual = jellyfish.jaro_winkler(s1, s2)
            self.assertAlmostEqual(actual, value, places=4)
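Note on API drift: newer jellyfish releases renamed jaro_winkler() to jaro_winkler_similarity() (and jaro_distance() to jaro_similarity()), so a hedged compatibility shim like the following keeps the examples above importable on either version:

try:
    from jellyfish import jaro_winkler            # older jellyfish releases
except ImportError:                               # newer releases only expose the *_similarity names
    from jellyfish import jaro_winkler_similarity as jaro_winkler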
Example 18
    def jaro_winkler_apply(x):

        try:
            return jellyfish.jaro_winkler(x[0], x[1])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err
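A quick sketch of how the helper above behaves on individual pairs (the tuples are hypothetical row values; in practice it is applied row-wise to a two-column frame of string pairs, with numpy and pandas imported as above):

import numpy as np

print(jaro_winkler_apply(("martha", "marhta")))   # ~0.9611
print(jaro_winkler_apply((np.nan, "duane")))      # nan, because one side is null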
Example 19
def cluster(df):
    df1 = df[['nname', 'nzip', 'snum']].reset_index()
    dfm = df1.merge(df1, on='nzip')
    #dfm = dfm[dfm.index_x != dfm.index_y]
    dfm['g1'] = dfm[['nname_x','nname_y']].apply(lambda x: 1 if jellyfish.jaro_winkler(x[0],x[1]) > 0.8 else 0, axis=1)
    dfm['g2'] = dfm[['snum_x','snum_y']].apply(lambda x: 1 if x[0] == x[1] else 0, axis=1)
    dfc = clusterdf(dfm)
    dd = dfc.groupby(['cluster_id', 'index_x']).first().reset_index()
    ddd = dfc.groupby('cluster_id').first()
    return df.ix[ddd.index_x]
Example 20
def best_match(s, categories, top_n=5):
    """Return the top N best matches from your categories."""
    scores = []
    for cat in categories:
        scores.append((cat, jellyfish.jaro_winkler(s.upper(), cat.upper())))

    scores = sorted(scores, key=lambda x: x[1])
    scores = scores[-top_n:]
    scores = [(score[0], int(score[1] * 100)) for score in scores]
    scores.reverse()

    return scores
def measure_mrn_similarity(ssn1, ssn2, sign):
    if ssn1 == "" or ssn2 == "" or ssn1 is None or ssn2 is None:
        return 0

    r1 = jellyfish.jaro_winkler(ssn1, ssn2)
    r2 = 1 - jellyfish.hamming_distance(ssn1, ssn2) / len(ssn1)

    if sign == "t":
        print("jw-{} vs hd-{}".format(r1, r2))
    elif sign == "w":
        return max(r1, r2)
Example 22
def fuzzy_match_with_ref_list(nouns, skill_list):
    possible_skills = set()
    for noun in nouns:
        max_score = 0.00
        for skill in skill_list:
            jaro_score = jellyfish.jaro_winkler(skill, noun)
            if jaro_score > max_score:
                max_score = jaro_score
                ref_skill = skill
        if max_score >= 0.88:
            possible_skills.add(ref_skill)
    return possible_skills
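A hypothetical call illustrating the 0.88 cutoff (both input lists are made up):

nouns = ["pyton", "statistics", "gardening"]
skill_list = ["python", "statistics", "machine learning"]
print(fuzzy_match_with_ref_list(nouns, skill_list))
# expected: {'python', 'statistics'}; 'gardening' scores well below 0.88 against every skill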
Example 23
def searchstratergy1(record1,record2):
    '''
    Input: Two strings
    Output: The similarity score if its above the threshold
    Example: Jaro-winkler with threshold 0.9
    '''
    
    score = j.jaro_winkler(record1, record2)
    if score >= JARO_THRESHOLD:
        return score
    else:
        return 0
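For illustration only, assuming the surrounding module imports jellyfish as j and defines JARO_THRESHOLD (both assumptions), the threshold behaviour looks like this:

import jellyfish as j
JARO_THRESHOLD = 0.9  # hypothetical threshold, in line with the docstring

print(searchstratergy1("martha", "marhta"))    # ~0.9611, returned because it clears 0.9
print(searchstratergy1("dixon", "dicksonx"))   # 0, since ~0.8133 falls below the threshold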
Example 24
def jaro_match(tup):
    import jellyfish

    birth = tup[0]
    death = tup[1]

    birth_name = birth.first_name + ' ' + birth.last_name
    death_name = death.first_name + ' ' + death.last_name

    if jellyfish.jaro_winkler(birth_name, death_name) > JARO_THRESH:
        return True

    return False
Example 25
def match_title(anidb_title, absolute_titles):
    max_simi = 0

    for title in absolute_titles:
        simi = jellyfish.jaro_winkler(anidb_title, title.encode('utf-8'))

        if simi > 0.9 and simi > max_simi:
            max_simi = simi

    if max_simi > 0:
        return max_simi
    else:
        return 0
Example 26
def person_similarity(p1, p2, year_window, parent_sims=False, people_dict1=None,
                      people_dict2=None):
    # Don't match inaccurate and accurate dates (unrealistic)
    if (p1.byear is not None) != (p2.byear is not None):
        return 0
    # Don't match if dates are too different
    if (p1.byear is not None) and (p2.byear is not None) and \
                    abs(p1.byear-p2.byear) > year_window:
        return 0
    
    terms = 0
    sim_sum = 0
    
    sim_sum += jellyfish.jaro_winkler(p1.clean_first_name, p2.clean_first_name)
    terms += 1
    
    sim_sum += jellyfish.jaro_winkler(p1.clean_last_name, p2.clean_last_name)
    terms += 1
    
    sim = sim_sum / float(terms)
    
    if parent_sims and people_dict1 is not None and people_dict2 is not None:
        parent_factor = 0.5
        n_parents = 0
        recursive = False
        if p1.dad is not None and p2.dad is not None:
            sim += parent_factor * person_similarity(
                people_dict1[p1.dad], people_dict2[p2.dad], year_window,
                recursive, people_dict1, people_dict2)
            n_parents += 1

        if p1.mom is not None and p2.mom is not None:
            sim += parent_factor * person_similarity(
                people_dict1[p1.mom], people_dict2[p2.mom], year_window,
                recursive, people_dict1, people_dict2)
            n_parents += 1
        sim /= float(1 + parent_factor * n_parents)
    return sim
Example 27
def commit_localization(graph):
    """Computes the relative number of directories modified by a commit."""
    result = []
    for commit in graph.iterate_commits():
        paths = map(os.path.dirname, graph.commit_files[commit])
        if len(paths) <= 1:
            result.append(len(paths))
            continue
        similarity_scores = []
        for pair_of_files in itertools.combinations(paths, 2):
            distance = jellyfish.jaro_winkler(*pair_of_files)
            similarity_scores.append(distance)
        result.append(float(sum(similarity_scores)) / max(len(similarity_scores), 1))
    return result
	def addressMatches(self, businessAddress, addressTerms) :

		print "For %s, comparing %s to %s..."%(self.NICKNAME, businessAddress, addressTerms)
  
		# Compare the city, state and zip first; if these don't match, then exit.
		for addressTerm in addressTerms[1:] :
			if addressTerm not in businessAddress[1:] :
				return False

		# use the Jaro-Winkler distance algo on the street-address component
		jarowinkdist = jellyfish.jaro_winkler(businessAddress[0], addressTerms[0])
		if(jarowinkdist > 0.80):
			print "Match!"
			return True
		return False
Example 29
def mergeItemsGeneral(item1, item2):
	diff_price = feature_diffPrice(item1['price'], item2['price'])
	diff_latlon = feature_diffLatLon(item1['lon'], item1['lat'], item2['lon'], item2['lat'])
	simi_json,eq_keys_json = feature_attrJson(item1, item2)
	
	json = {
				'x': 
				[
					int(item1['categoryID'] == item2['categoryID']),
					int(item1['locationID'] == item2['locationID']),
					int(item1['metroID'] 	== item2['metroID']),
					int(item1['metroID'] == ''),
					int(item2['metroID'] == ''),
					int(item1['price']		== item2['price']),
					int(item1['price'] 		== ''),
					int(item2['price'] 		== ''),
					int(item1['price'] 		== '1.0'),
					int(item2['price'] 		== '1.0'),
					diff_price,
					jf.jaro_winkler(item1['title'], item2['title']),
					jf.jaro_winkler(item1['description'], item2['description']),				
					int(item1['attrsJSON'] == ''),
					int(item2['attrsJSON'] == ''),
					diff_latlon,
					simi_json,
					eq_keys_json
				]
	}
	# in the following loop, I am assuming the two items always have the same category!
	for cat in categories.keys():
		if(int(item1['categoryID']) == cat):
			json['x'].append(1)
		else:
			json['x'].append(0)


	return json
Example 30
 def score(self,s,t):
     '''
     Input: s - multi-word string
            t - multi-word string
     Output : score
     Note: In single word string, score = jaro-winkler score
     '''
     cummax = 0
     for ws in s.split(" "):
         # reset the per-word maximum for each token of s, so one good match
         # does not leak into the scores of the following tokens
         maxscore = 0
         for wt in t.split(" "):
             maxscore = max(maxscore, j.jaro_winkler(ws, wt))
         cummax += maxscore

     return cummax/len(s.split(" "))
Example 31
def are_strings_similar(string_a, string_b):
    d = jellyfish.jaro_winkler(string_a, string_b)
    return d >= 0.9
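Two quick calls showing the 0.9 cutoff in practice (values from the jellyfish test cases in Example 17):

print(are_strings_similar("martha", "marhta"))    # True  (~0.9611 >= 0.9)
print(are_strings_similar("dixon", "dicksonx"))   # False (~0.8133 <  0.9)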
Example 32
def social_graph_creation(G, dataframe):
    import numpy as np
    actor_tot_list = []
    actor_buffer_list = [
    ]  # List used to check if an article is a perfect replica of the previous one
    for actor_list, theme_list in zip(dataframe.V2ENHANCEDPERSONS.unique(),
                                      dataframe.V2ENHANCEDTHEMES.unique()):
        actor_temp_list, offset_temp_list = [], []

        if not isinstance(actor_list, float):
            max_offset_diff = maximum_offset_difference(actor_list, theme_list)
            for actor in actor_list.split(';'):
                [actor_temp, offset_temp] = actor.split(',')

                if offset_temp not in offset_temp_list:
                    offset_temp_list.append(offset_temp)

                    # Compute similarity between actor_temp and all actors in the tot_list
                    if actor_tot_list:
                        similarity_max = np.max([
                            jellyfish.jaro_winkler(actor_temp, actor2)
                            for actor2 in actor_tot_list
                        ])
                        index_max = np.argmax([
                            jellyfish.jaro_winkler(actor_temp, actor2)
                            for actor2 in actor_tot_list
                        ])
                        actor_max = actor_tot_list[index_max]

                        nb_identical_names = len(
                            set(actor_temp.split(' '))
                            & set(actor_max.split(' ')))

                    else:
                        similarity_max = 0
                        nb_identical_names = 0

                    # Condition to correct the name if there is a misdetected 'A'
                    if actor_temp[0:2] == 'A ':
                        actor_temp = actor_temp[2:]

                    if 'Kanzler Joseph' in actor_temp:
                        actor_max = 'Youssef Chahed'
                        similarity_max, nb_identical_names = 1, 1

                    if similarity_max > 0.7 and nb_identical_names > 0:  # This actor is already present in the list
                        actor_temp = actor_max
                    else:
                        actor_tot_list.append(actor_temp)
                        G.add_node(actor_temp)

                    if actor_temp not in actor_temp_list:
                        actor_temp_list.append(actor_temp)

        if actor_temp_list != actor_buffer_list:
            actor_buffer_list = actor_temp_list

            nb_actors = len(actor_temp_list)
            #print("Actor list: ", nb_actors, actor_temp_list)

            # Edge creation between the actors of the article
            for index1 in range(0, len(actor_temp_list)):
                actor1 = actor_temp_list[index1]
                offset1 = int(offset_temp_list[index1])
                for index2 in range(index1 + 1, len(actor_temp_list)):
                    actor2 = actor_temp_list[index2]
                    offset2 = int(offset_temp_list[index2])
                    weight_edge = np.abs(offset2 - offset1) / (
                        max_offset_diff * nb_actors)
                    #print("Weight: ", weight_edge)

                    if G.has_edge(actor1, actor2):
                        G[actor1][actor2]['weight'] += weight_edge
                    else:
                        G.add_edge(actor1, actor2, weight=weight_edge)
Example 33
def fuzzy_value_scoring(values_list1, values_list2):
    """
	string pairwise matcher
	NB only best matches are taken this is not all by all
	gets fuzzy pair match based on jarowinkler
	returns dict with mean, stc and 0.9 qualtile
	for jarowinkler, damerau levenshtein and hamming distances

	If the number of values is too long (>1000) the most frequently
	used values are taken as best representatives. This is to make
	computation doable.


	"""
    if len(values_list1) > 0 and len(values_list2) > 0:

        if len(values_list1) > 1000 or len(values_list2) > 1000:
            if len(values_list1) > 1000:
                x = value_info.get(facet1)
                value_df = pd.DataFrame(columns=['frequency']).from_dict(
                    x, orient='index').reset_index().rename(columns={
                        "index": "value",
                        0: "frequency"
                    }).sort_values(['frequency'], ascending=False).head(n=1000)
                values_list1 = value_df['value'].tolist()
            if len(values_list2) > 1000:
                x = value_info.get(facet2)
                value_df = pd.DataFrame(columns=['frequency']).from_dict(
                    x, orient='index').reset_index().rename(columns={
                        "index": "value",
                        0: "frequency"
                    }).sort_values(['frequency'], ascending=False).head(n=1000)
                values_list2 = value_df['value'].tolist()

        if len(values_list1) > len(values_list2):
            short_list = values_list2
            long_list = values_list1
        else:
            short_list = values_list1
            long_list = values_list2

        # calculate the best fuzzy matches
        best_match_list = []
        for value1 in short_list:
            jaro_distance_list = []
            for value2 in long_list:

                try:
                    damerau_levenshtein_distance = jellyfish.damerau_levenshtein_distance(
                        value1, value2)
                except ValueError:
                    damerau_levenshtein_distance = py_jellyfish.damerau_levenshtein_distance(
                        value1, value2)

                jaro_winkler = jellyfish.jaro_winkler(value1, value2)
                hamming_distance = jellyfish.hamming_distance(value1, value2)

                jaro_tuple = (value1, value2, jaro_winkler,
                              damerau_levenshtein_distance, hamming_distance)
                jaro_distance_list.append(jaro_tuple)
            best_match = max(jaro_distance_list, key=lambda x: x[2])
            best_match_list.append(best_match)
        df = pd.DataFrame(best_match_list,
                          columns=[
                              'facet1', 'facet2', 'jaro_distance',
                              'damerau_levenshtein_distance',
                              'hamming_distance'
                          ])

        jaro_distance_quant = df['jaro_distance'].quantile(0.9)
        jaro_distance_mean = df['jaro_distance'].mean()
        jaro_distance_std = df['jaro_distance'].std()
        damerau_levenshtein_distance_quant = df[
            'damerau_levenshtein_distance'].quantile(0.9)
        damerau_levenshtein_distance_mean = df[
            'damerau_levenshtein_distance'].mean()
        damerau_levenshtein_distance_std = df[
            'damerau_levenshtein_distance'].std()
        hamming_distance_quant = df['hamming_distance'].quantile(0.9)
        hamming_distance_mean = df['hamming_distance'].mean()
        hamming_distance_std = df['hamming_distance'].std()

        results = {
            'jaro_distance_quant': jaro_distance_quant,
            'jaro_distance_mean': jaro_distance_mean,
            'jaro_distance_std': jaro_distance_std,
            'damerau_levenshtein_distance_quant':
            damerau_levenshtein_distance_quant,
            'damerau_levenshtein_distance_mean':
            damerau_levenshtein_distance_mean,
            'damerau_levenshtein_distance_std':
            damerau_levenshtein_distance_std,
            'hamming_distance_quant': hamming_distance_quant,
            'hamming_distance_mean': hamming_distance_mean,
            'hamming_distance_std': hamming_distance_std
        }
        # so a good match will be a high mean, low std. The quantile is prob better than mean.

        return results
    else:

        # 'N.A.' returned if one or both of the facets dont have any values.


        results = {'jaro_distance_quant':'N.A.', \
        'jaro_distance_mean':'N.A.', \
        'jaro_distance_std':'N.A.', \
        'damerau_levenshtein_distance_quant':'N.A.', \
        'damerau_levenshtein_distance_mean':'N.A.', \
        'damerau_levenshtein_distance_std':'N.A.', \
        'hamming_distance_quant':'N.A.', \
        'hamming_distance_mean':'N.A.', \
        'hamming_distance_std':'N.A.'}

        return results
Example 34
    def score(self, s, t):
        """ Returns the soft tf-idf similarity """

        # Check to see whether a model exists; otherwise default to degenerate solution
        if (self.LOG_IDF is None) | (self.CORPUS_VOCAB is
                                     None) | (self.OOV_IDF_VAL is None):
            self.logger.info(
                "Either (or both) IDF or corpus vocabulary parameters not given "
                +
                "Defaulting to degenerate mode where corpus consists only of the "
                + "two strings given as input.")
            self.compute_query_idf([s, t])

        # Get V(w,S) and V(w,T) (along with vocab lists for s and t)
        try:
            (s_vocab, vprime_ws, vprime_ws_norm) = self.compute_VwS(s)
            (t_vocab, vprime_wt, vprime_wt_norm) = self.compute_VwS(t)
        except ValueError:
            self.logger.info("string got stop-listed; most likely b/c " \
                    "it is of length 1, with the only character being a " \
                    "non-normalized punctuation mark. (i.e. '.')")
            sim = 0.0
            return sim

        #compute D(w,T) for all w
        max_vT = dict()
        jw_sims = dict()
        for w in s_vocab:
            max_vT[w] = dict()
            max_vT[w]['score'] = 0.0
            max_vT[w]['max_v'] = ''
            jw_sims[w] = dict()
            for v in t_vocab:
                dist = jf.jaro_winkler(w, v)
                jw_sims[w][v] = dist
                if (dist >= max_vT[w]['score']):
                    max_vT[w]['score'] = dist
                    max_vT[w]['max_v'] = v
        self.logger.debug("max_vT: {0}".format(max_vT))

        # compute soft tf-idf sim
        sim = 0.0
        self.logger.debug(s_vocab)
        for w in s_vocab:
            for v in t_vocab:
                if (jw_sims[w][v] >= self.THRESHOLD):
                    inner_sum = (vprime_ws[w] / vprime_ws_norm) * (
                        vprime_wt[max_vT[w]['max_v']] /
                        vprime_wt_norm) * max_vT[w]['score']
                    self.logger.debug(
                        u"(w,vprime_ws[w],vprime_ws_norm): ({0},{1},{2})".
                        format(w, vprime_ws[w], vprime_ws_norm))
                    self.logger.debug(
                        u"(max_vT[w]['max_v'],vprime_wt[max_vT['max_v'],vprime_wt_norm): ({0},{1},{2})"
                        .format(max_vT[w]['max_v'],
                                vprime_wt[max_vT[w]['max_v']], vprime_wt_norm))
                    self.logger.debug(u"(max_vT[w]['score']): ({0})".format(
                        max_vT[w]['score']))
                    self.logger.debug(u"(w,v,inner_sum): ({0},{1},{2})".format(
                        w, v, inner_sum))
                    sim += inner_sum
                    break

        self.logger.debug("Soft TF-IDF Similarity: {0}".format(sim))

        return sim
Example 35
# Reconstructed prologue (imports and the start of get_cosine), inferred from the
# variables and helpers referenced below; the original snippet starts mid-function.
import re
import math
from collections import Counter

import jellyfish as j

WORD = re.compile(r'\w+')


def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)


text1 = u'I am happy.'
text2 = u'I am very happy.'

vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)

cosine = get_cosine(vector1, vector2)
print "sentences are \n", text1, "\n", text2
print 'Cosine distance :', cosine

##########################################################
#--> JAro distance between sentences and other distance as well
lvd = j.damerau_levenshtein_distance((text1), (text2))
jd = j.jaro_winkler((text1), (text2))
print "levenshtein_distance :", lvd
print "Jaro distance :", jd
def closest_word(word):
    # check closest GLOVE word
    return max(glove, key=lambda x: jellyfish.jaro_winkler(x, word))
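closest_word relies on a module-level iterable named glove; a hypothetical sketch:

glove = ["martha", "dixon", "duane"]   # hypothetical vocabulary
print(closest_word("marhta"))          # 'martha', the entry with the highest Jaro-Winkler similarity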
X_test = np.zeros((len(X_test_1a), maxsents, maxlen), dtype = 'int32')

print('Loading Death certificates...')

death_cert = [X_test_1a, X_test_1b, X_test_1c, X_test_1d, X_test_2]

for m in range(len(death_cert)):
    part = death_cert[m]
    for i, sentences in enumerate(part):
        sentences = tokenize.sent_tokenize( sentences )
        k = 0
        for j, sent in enumerate(sentences):
            wordTokens = text_to_word_sequence(sent)
            for _ , word in enumerate(wordTokens):
                if word_index.get(word) == None: 
                    aux = [(jellyfish.jaro_winkler(k,word),v) for k,v in word_index.items()]
                    if k < maxlen and max(aux)[1] < max_features:
                        X_test[i,m,k] = max(aux)[1]
                        k = k + 1
                else:
                    if k < maxlen and word_index.get(word) < max_features:
                        X_test[i,m,k] = word_index.get(word)
                        k = k + 1
                    
print('Loading bic...')

bic_components = [X_test_bic, X_test_bic_admiss, X_test_bic_sit]
for m in range(len(bic_components)):
    bic_part = bic_components[m]
    for i, sentences in enumerate(bic_part):
        sentences = tokenize.sent_tokenize( sentences )
Example 38
	def getFeatures(self, a, b):
		# feature vector
		f = {}
		aa, ab = self.authors[a], self.authors[b]
		name_para = (('mid', 'name_middle'), ('first', 'name_first'), ('last', 'name_last'))
		for id_f, id_o in name_para:
			la, lb = len(aa[id_o]), len(ab[id_o])
			if la == 0 or lb == 0: #at least one lacks the name part
				f[id_f] = 3 if (la == lb) else 2
			elif aa[id_o] == ab[id_o]: #full name match
				f[id_f] = 5 if (la > 1) else 4
			elif la > 1 and lb > 1: #full names supplied and no match
				f[id_f] = 0
			elif aa[id_o][0] == ab[id_o][0]: #at least one is initial and initials match
				f[id_f] = 4
			else: #initials don't match
				f[id_f] = 1


		if aa['fullname_tfidf'] is not None and ab['fullname_tfidf'] is not None:
			f['fullname_sharedidf'] = shared_terms_sum(aa['fullname_tfidf'], ab['fullname_tfidf'])

		if aa['affil_tfidf'] is not None and ab['affil_tfidf'] is not None:
			f['has_affil'] = 2
		elif aa['affil_tfidf'] is not None or ab['affil_tfidf'] is not None:
			f['has_affil'] = 1
		else:
			f['has_affil'] = 0

		if f['has_affil'] != 2:
			f['affil_sharedidf'] = np.nan
		else:
			f['affil_sharedidf'] = shared_terms_sum(aa['affil_tfidf'], ab['affil_tfidf'])

		if aa['name_last'] == ab['name_last'] and (
			(aa['name_first'] == ab['name_middle'] and not aa['name_middle']) or
			(ab['name_first'] == aa['name_middle'] and not ab['name_middle'])
		):
			if len(aa['name_first']) > 1:
				f['firstmidswap'] = 2
			else:
				f['firstmidswap'] = 1
		else:
			f['firstmidswap'] = 0

		# 1 = off by two, 2 = off by one
		f['offbylastone'] = 0
		
		la, lb = len(aa['fullname']), len(ab['fullname'])
		if aa['fullname'].startswith(ab['fullname']):
			f['subsetprefix'] = lb
			if la - lb <= 2:
				f['offbylastone'] = 3 - (la - lb)
		elif ab['fullname'].startswith(aa['fullname']):
			f['subsetprefix'] = la
			if lb - la <= 2:
				f['offbylastone'] = 3 - (lb - la)
		else:
			f['subsetprefix'] = 0
		
		f['lastidf'] = 0 if (aa['name_last'] != ab['name_last'] or not aa['name_last']) else aa['lastname_idf']
		f['iFfLidf'] = 0 if (aa['iFfL'] != ab['iFfL'] or not aa['iFfL']) else aa['iFfL_idf']
		f['exact'] = int(aa['fullname_joined'] == ab['fullname_joined'] and len(aa['fullname_joined']) > 0)
		f['jaro_distance'] = 0 if (':' in aa['fullname'] or ':' in ab['fullname']) else jellyfish.jaro_distance(aa['fullname'], ab['fullname'])
		f['jaro_winkler'] = 0 if (':' in aa['fullname'] or ':' in ab['fullname']) else jellyfish.jaro_winkler(aa['fullname'], ab['fullname'])
		f['jarow_first'] = jellyfish.jaro_winkler(aa['name_first'], ab['name_first'])
		f['jarow_mid'] = jellyfish.jaro_winkler(aa['name_middle'], ab['name_middle'])
		f['jarow_last'] = jellyfish.jaro_winkler(aa['name_last'], ab['name_last'])
		f['jarow_firstmid'] = jellyfish.jaro_winkler(aa['name_first']+aa['name_middle'], ab['name_first']+ab['name_middle'])
		f['jarow_midlast'] = jellyfish.jaro_winkler(aa['name_middle']+aa['name_last'], ab['name_middle']+ab['name_last'])
		
		f['suffix'] = int(aa['name_suffix'] == ab['name_suffix'] and len(aa['name_suffix']) > 0)
		f['metaphone'] = int(aa['metaphone_fullname'] == ab['metaphone_fullname'] and len(aa['metaphone_fullname']) > 0)

		f.update(self.PFG.getEdgeFeatures(a, b))

		return f
Example 39
def get_target_pert_indices(gse_gsm_info):
    """
    Best match the perturbation samples with control samples
    Args:
        gse_gsm_info: the GSE and GSM info tuple

    Returns:
        the GSE and GSM info tuple
    """
    key, val = gse_gsm_info
    gse_id, pert_agent, gsm_ids, ctrl_text, ctrl_indices = key
    pert_texts, pert_indices = val
    target_index = max_score = max_days_diff = None

    # Search for time-based samples
    ctrl_days_text = re.search("\d+\s*(d(ays?)?|h((ours?)|(r|rs)?))",
                               ctrl_text,
                               flags=re.IGNORECASE)
    pert_days_texts = [
        re.search("\d+\s*(d(ays?)?|h((ours?)|(r|rs)?))",
                  x,
                  flags=re.IGNORECASE) for x in pert_texts
    ]

    if ctrl_days_text is None:
        ctrl_days_text = re.search("\d+", "0")

    # If both control and perturbation samples contain time-based texts,
    # Match the perturbation sample with the maximum time difference to the control sample
    if ctrl_days_text is not None and any(x is not None
                                          for x in pert_days_texts):
        ctrl_days_num = int(re.search("\d+", ctrl_days_text.group()).group())
        for i, pert_days_text in enumerate(pert_days_texts):
            if pert_days_text is not None:
                pert_days_num = int(
                    re.search("\d+", pert_days_text.group()).group())
                days_diff = pert_days_num - ctrl_days_num

                if days_diff >= 0 and (max_days_diff is None
                                       or days_diff > max_days_diff):
                    max_days_diff = days_diff
                    target_index = i

    # Match the perturbation sample with the highest text similarity with the control sample
    else:
        for i, pert_text in enumerate(pert_texts):
            score = jellyfish.jaro_winkler(ctrl_text, pert_text)
            if max_score is None or score > max_score:
                max_score = score
                target_index = i

    if target_index is None:
        return None

    gsm_ids = np.array(gsm_ids)
    ctrl_indices = list(ctrl_indices)
    target_pert_indices = pert_indices[target_index]

    # Create string for microarray analysis
    microarray_grouping = np.chararray(len(gsm_ids), unicode=True)
    microarray_grouping[:] = "X"
    microarray_grouping[ctrl_indices] = "0"
    microarray_grouping[target_pert_indices] = "1"
    microarray_grouping = "".join(microarray_grouping)

    return gse_id, (pert_agent, "|".join(gsm_ids[ctrl_indices]),
                    "|".join(gsm_ids[target_pert_indices]),
                    microarray_grouping)
Example 40
def partition_tuples(zagat, fodors, match_tuples,\
 unmatch_tuples, possible_tuples):
    '''
    Iterates through all possible combinations of entries from zagat and
     fodors dataframes and computes tuples. Sends each possible combination
      to its respective dataframe

    Inputs:
        zagat(Pandas Dataframe): zagat dataframe
        fodors(Pandas Dataframe): fodors dataframe
        match_tuples(list): list of tuples to be classified as matches
        unmatch_tuples(list): list of tuples to be classified as unmatches
        possible_tuples(list): list of tuples to be classified as possible
         matches

    Outputs:
        matches_df: dataframe of matches
        possible_df: dataframe of possible matches
        unmatches_df: dataframe of non matches

    '''
    column_index = (['z_restaurant', 'z_city', 'z_address',\
     'f_restaurant', 'f_city', 'f_address'])
    matches_rows = []
    unmatches_rows = []
    possible_rows = []

    for i in range(len(zagat)):

        for j in range(len(fodors)):

            z_restaurant = zagat['restaurant'][i]
            f_restaurant = fodors['restaurant'][j]

            z_city = zagat['city'][i]
            f_city = fodors['city'][j]

            z_address = zagat['address'][i]
            f_address = fodors['address'][j]

            r_score = jellyfish.jaro_winkler(z_restaurant, f_restaurant)
            c_score = jellyfish.jaro_winkler(z_city, f_city)
            a_score = jellyfish.jaro_winkler(z_address, f_address)

            tup = (util.get_jw_category(r_score), util.get_jw_category\
                (c_score), util.get_jw_category(a_score))

            if tup in match_tuples:
                matches_rows.append([z_restaurant, z_city, z_address,\
                 f_restaurant, f_city, f_address])

            elif tup in unmatch_tuples:
                unmatches_rows.append([z_restaurant, z_city, z_address,\
                 f_restaurant, f_city, f_address])

            elif tup in possible_tuples:
                possible_rows.append([z_restaurant, z_city, z_address,\
                 f_restaurant, f_city, f_address])

    matches_df = pd.DataFrame(data=matches_rows, columns=column_index)
    unmatches_df = pd.DataFrame(data=unmatches_rows, columns=column_index)
    possible_df = pd.DataFrame(data=possible_rows, columns=column_index)

    return matches_df, possible_df, unmatches_df
Example 41
 def closest_match_neighbors(self, search_name):
     line_penalty = lambda x: 100 * (x.parent_station[0] in search_name.split("_")[-1])
     d = lambda x: fish.jaro_winkler(unicode(x.stop_name.lower()), unicode(search_name.lower())) + line_penalty(x)
     distances = self.data.apply(d, axis=1)
     i = np.argmax(distances)
     return self.data.stop_name[i], self.data.stop_lat[i], self.data.stop_lon[i], self.data.parent_station[i]
Example 42
    def parse(self, bedes_version, schema_version):
        # parse correct bedes version
        bedes = BedesParser(bedes_version)
        bedes.save()

        # check for manual mappings CSV file
        the_path = os.path.join(os.path.dirname(__file__), '../../lib/bedes',
                                bedes_version)
        if not os.path.isfile("%s/manual_mapping_table.csv" % (the_path)):
            raise FileNotFoundError(
                "Cannot find the manual_mapping_table.csv file in lib/bedes/{} directory"
                .format(bedes_version))

        # read data from manual mappings CSV file and store in local dict variable
        csv_file = open("%s/manual_mapping_table.csv" % (the_path), mode='r')
        manual_mappings_file = csv.DictReader(csv_file)
        manual_mappings = {}
        for term in manual_mappings_file:
            manual_mappings[term['BSync String']] = term['BEDES String']

        # read the fields from the database, right now default to schema 0.3
        schema = Schema.objects.filter(version=schema_version).first()
        results = {}
        for attribute in schema.attributes.all().order_by('id'):

            # use id as the key since name is not unique
            results[attribute.id] = []

            # run function to find and replace words that are in the manual mappings table
            bsync_term = self.manual_mapping(attribute.name, manual_mappings)

            for bt in bedes.terms:
                distance = jellyfish.jaro_winkler(bsync_term.lower(),
                                                  bt['Term'].lower())

                if distance >= 0.98:
                    results[attribute.id].append({
                        "attribute_name": attribute.name,
                        "transformed_name": bsync_term,
                        "attribute_path": attribute.path,
                        "bedes_term": bt['Term'],
                        "bedes_object": bt,
                        "distance": distance,
                        "term_or_lo": 'Term'
                    })

            # if no matches found in BEDES terms, check list options
            if not results[attribute.id]:
                for be in bedes.enumerations:
                    # .lower() function used to neutralize upper/lower case discrepancies (there are many in enumerations/list options)
                    distance = jellyfish.jaro_winkler(
                        bsync_term.lower(), be['List-Option'].lower())

                    if distance >= 0.98:
                        results[attribute.id].append({
                            "attribute_name":
                            attribute.name,
                            "transformed_name":
                            bsync_term,
                            "attribute_path":
                            attribute.path,
                            "bedes_term":
                            be['List-Option'],
                            "bedes_object":
                            be,
                            "distance":
                            distance,
                            "term_or_lo":
                            'List-Option'
                        })

            # sort matched terms by distance value (highest matched in first index position)
            results[attribute.id] = sorted(results[attribute.id],
                                           key=lambda k: -k['distance'])

            if not results[attribute.id]:
                # didn't find any term-to-term or term-to-list-option matches, start word-level matching

                words_data = defaultdict(list)

                bsync_words = []
                # split BSync term into a list of individual words
                bsync_words = self.acronym_check(
                    re.findall('[^:^(^)^,^ ][^,^ ^:^(^)]*', bsync_term))

                # go through entire list, converting to an array that assigns an availability flag to each individual word
                for i in range(len(bsync_words)):
                    bsync_words[i] = [bsync_words[i], True]

                if len(bsync_words) > 2:
                    restart_process = True
                    while restart_process:
                        restart_process = False

                        # generate word groups, starting with largest groups first
                        for number_of_words in range(
                                len(bsync_words) - 1, 1, -1):
                            word_groups = {}

                            # search through bsync_words for all possible combinations of word groups of proper length
                            # made up of concurrent words with availability flags set to True, save in dict word_groups
                            for starting_index in range(
                                    len(bsync_words) - number_of_words + 1):
                                word_construction = ''
                                word_construction_success = True
                                for i in range(
                                        starting_index,
                                        starting_index + number_of_words):
                                    if bsync_words[i][1]:
                                        word_construction += bsync_words[i][
                                            0] + ' '
                                    else:
                                        word_construction_success = False
                                        break
                                if word_construction_success:
                                    word_groups[word_construction.strip(
                                        ' ')] = starting_index

                            # attempt to match each word group generated against BEDES terms and list options using jaro_winkler distance
                            for word_group in word_groups.keys():

                                for bt in bedes.terms:
                                    distance = jellyfish.jaro_winkler(
                                        word_group.lower(), bt['Term'].lower())

                                    if distance >= 0.98:

                                        for i in range(
                                                word_groups[word_group],
                                                word_groups[word_group] +
                                                number_of_words):
                                            bsync_words[i][1] = False

                                        words_data['matched_to_term'].append(
                                            word_group)
                                        words_data['matched_term_URL'].append(
                                            bt['URL'])
                                        words_data['term_or_lo'].append('Term')

                                        # start process to break out of loops, restart entire word grouping process (now with appropriate words unavailable for grouping)
                                        restart_process = True
                                        break

                                if restart_process:
                                    break
                                else:
                                    # if no matches found in BEDES terms, check list options
                                    for be in bedes.enumerations:

                                        distance = jellyfish.jaro_winkler(
                                            word_group.lower(),
                                            be['List-Option'].lower())

                                        if distance >= 0.98:
                                            for i in range(
                                                    word_groups[word_group],
                                                    word_groups[word_group] +
                                                    number_of_words):
                                                bsync_words[i][1] = False

                                            words_data[
                                                'matched_to_term'].append(
                                                    word_group)
                                            words_data[
                                                'matched_term_URL'].append(
                                                    be['URL'])
                                            words_data['term_or_lo'].append(
                                                'List-Option')

                                            # start process to break out of loops, restart entire word grouping process (now with appropriate words unavailable for grouping)
                                            restart_process = True
                                            break

                            if restart_process:
                                break

                # check through remaining individual words that weren't matched as part of a word group
                # individual words will only match 1-to-1 and will also search through individual words
                # of BEDES terms and list options for partial matches
                for i in range(len(bsync_words)):
                    if bsync_words[i][1]:
                        bsync_word = bsync_words[i][0]

                        # run word matching function against BEDES terms and list options (enumerations)
                        term_match_status, term_match_URL = self.word_matching(
                            bsync_word, bedes.terms)
                        lo_match_status, lo_match_URL = self.word_matching(
                            bsync_word, bedes.enumerations)

                        if term_match_status == 'Matched-Term':
                            words_data['matched_to_term'].append(bsync_word)
                            words_data['matched_term_URL'].append(
                                term_match_URL)
                            words_data['term_or_lo'].append('Term')

                        else:
                            if lo_match_status == 'Matched-Term':
                                words_data['matched_to_term'].append(
                                    bsync_word)
                                words_data['matched_term_URL'].append(
                                    lo_match_URL)
                                words_data['term_or_lo'].append('List-Option')

                            else:
                                if term_match_status == 'Matched-Word':
                                    words_data['matched_to_word'].append(
                                        bsync_word)
                                    words_data[
                                        'matched_word_example_URL'].append(
                                            term_match_URL)

                                else:
                                    if lo_match_status == 'Matched-Word':
                                        words_data['matched_to_word'].append(
                                            bsync_word)
                                        words_data[
                                            'matched_word_example_URL'].append(
                                                lo_match_URL)

                                    else:
                                        words_data['unmatched_words'].append(
                                            bsync_word)

                results[attribute.id].append({
                    "attribute_name":
                    attribute.name,
                    "transformed_name":
                    bsync_term,
                    "attribute_path":
                    attribute.path,
                    "word_matching":
                    True,
                    "term_or_lo":
                    words_data['term_or_lo'],
                    "matched_to_term":
                    words_data['matched_to_term'],
                    "matched_term_URL":
                    words_data['matched_term_URL'],
                    "matched_to_word":
                    words_data['matched_to_word'],
                    "matched_word_example_URL":
                    words_data['matched_word_example_URL'],
                    "unmatched_words":
                    words_data['unmatched_words']
                })

        # store the results to CSV
        the_path = os.path.join(os.path.dirname(__file__), '../../lib/bedes',
                                bedes_version, "schema" + schema_version)

        print("THE PATH: {}".format(the_path))
        if not os.path.exists(the_path):
            os.makedirs(the_path)

        unique_column_words = {}
        unique_column_words['matched_to_term'] = []
        unique_column_words['matched_to_word'] = []
        unique_column_words['unmatched_words'] = []
        content_uuids = []
        with open("%s/bedes-mappings-terms.csv" % (the_path), 'w',
                  newline='') as file:
            writer = csv.writer(file, delimiter=',')

            # write row of column headers
            writer.writerow([
                'attribute_name', 'transformed_name', 'attribute_id',
                'attribute_path', 'bedes_content_uuid', 'bedes_term',
                'bedes_category', 'bedes_definition', 'bedes_url', 'distance',
                'match_type', 'matched_to_term', 'term_or_lo',
                'matched_term_URL', 'matched_to_word',
                'matched_word_example_URL', 'unmatched_words'
            ])
            for id, be in results.items():
                if len(be) > 0 and 'bedes_object' in be[0]:
                    # pull the relevant fields depending on whether a term or a list option was matched
                    if be[0]['term_or_lo'] == 'Term':
                        output_category = be[0]['bedes_object']['Category']
                        output_definition = be[0]['bedes_object'][
                            'Term-Definition']
                        output_match_type = 'Term-to-Term Match'
                    elif be[0]['term_or_lo'] == 'List-Option':
                        output_category = ''
                        output_definition = be[0]['bedes_object'][
                            'List-Option-Definition']
                        output_match_type = 'Term-to-List-Option Match'

                    out = [
                        be[0]['attribute_name'], be[0]['transformed_name'], id,
                        be[0]['attribute_path'],
                        be[0]['bedes_object']['Content-UUID'],
                        be[0]['bedes_term'], output_category,
                        output_definition, be[0]['bedes_object']['URL'],
                        be[0]['distance'], output_match_type, '', '', '', '',
                        '', ''
                    ]
                    content_uuids.append(be[0]['bedes_object']['Content-UUID'])
                else:
                    # output word matching data if no direct term matches were found
                    if 'word_matching' in be[0]:

                        # determine the match type tag depending on the specific combination of word match types
                        if len(be[0]['matched_to_term']) > 0:
                            if len(be[0]['matched_to_word']) > 0:
                                if len(be[0]['unmatched_words']) > 0:
                                    output_match_type = 'Words: Term, Word, Unmatched'
                                else:
                                    output_match_type = 'Words: Term, Word'
                            elif len(be[0]['unmatched_words']) > 0:
                                output_match_type = 'Words: Term, Unmatched'
                            else:
                                output_match_type = 'Words: all Term'
                        elif len(be[0]['matched_to_word']) > 0:
                            if len(be[0]['unmatched_words']) > 0:
                                output_match_type = 'Words: Word, Unmatched'
                            else:
                                output_match_type = 'Words: all Word'
                        else:
                            output_match_type = 'Words: all Unmatched'

                        # store all unique words from each word-matching column
                        for key in unique_column_words.keys():
                            if be[0][key] != []:
                                for word in be[0][key]:
                                    if word not in unique_column_words[key]:
                                        unique_column_words[key].append(word)

                        out = [
                            be[0]['attribute_name'], be[0]['transformed_name'],
                            id, be[0]['attribute_path'], '', '', '', '', '',
                            '', output_match_type,
                            ',  '.join(be[0]['matched_to_term']),
                            ',  '.join(be[0]['term_or_lo']),
                            ', '.join(be[0]['matched_term_URL']),
                            ',  '.join(be[0]['matched_to_word']),
                            ', '.join(be[0]['matched_word_example_URL']),
                            ',  '.join(be[0]['unmatched_words'])
                        ]
                    else:
                        # this branch should never run when everything is working; search for 'debug134' in the output file to confirm it hasn't
                        out = [
                            be[0]['attribute_name'], '', id,
                            be[0]['attribute_path'], '', '', 'debug134', '',
                            '', '', '', '', '', '', '', '', ''
                        ]
                writer.writerow(out)

        # count the unique BEDES Content-UUIDs
        unique_cnt = len(set(content_uuids))
        self.stdout.write(
            '*******There are {} unique BEDES terms to add*******'.format(
                unique_cnt)
        )  # not sure if this code is relevant or accurate anymore

        # output file with list of unique words from each word-matching column - useful data to find candidates for manual mapping
        with open("%s/bsync_unique_words.csv" % (the_path), 'w',
                  newline='') as file:
            writer = csv.writer(file, delimiter=',')

            # write out row of column headers
            writer.writerow([
                'Unique Words Matched to Term',
                'Unique Words Matched to Words', 'Unique Unmatched Words'
            ])

            mt_length = len(unique_column_words['matched_to_term'])
            mw_length = len(unique_column_words['matched_to_word'])
            uw_length = len(unique_column_words['unmatched_words'])
            rows = max(mt_length, mw_length, uw_length)
            for i in range(rows):
                val1 = ''
                val2 = ''
                val3 = ''
                if i + 1 <= mt_length:
                    val1 = unique_column_words['matched_to_term'][i]
                if i + 1 <= mw_length:
                    val2 = unique_column_words['matched_to_word'][i]
                if i + 1 <= uw_length:
                    val3 = unique_column_words['unmatched_words'][i]
                writer.writerow([val1, val2, val3])

        results = {}
        for enumeration in Enumeration.objects.filter(schema=schema):
            results[enumeration.id] = []

            # retrieve associated attribute ID for CSV
            attrs = AttributeEnumerationClass.objects.filter(
                enumeration_class=enumeration.enumeration_class)

            associated_attrs = []
            for attr in attrs:
                associated_attrs.append(attr.attribute_id)
            print(associated_attrs)

            for be in bedes.enumerations:
                distance = jellyfish.jaro_winkler(enumeration.name,
                                                  be['List-Option'])

                if distance >= 0.95:
                    results[enumeration.id].append({
                        "enumeration_name":
                        enumeration.name,
                        "bedes_term":
                        be['List-Option'],
                        "bedes_object":
                        be,
                        "distance":
                        distance,
                        "associated_attribute_ids":
                        ' '.join([str(item) for item in associated_attrs])
                    })
            results[enumeration.id] = sorted(results[enumeration.id],
                                             key=lambda k: -k['distance'])
            if not results[enumeration.id]:
                # didn't find anything
                results[enumeration.id].append({
                    "enumeration_name":
                    enumeration.name,
                    "associated_attribute_ids":
                    ' '.join([str(item) for item in associated_attrs])
                })

        # store the results to CSV
        content_uuids = []
        with open("%s/bedes-mappings-enumerations.csv" % (the_path),
                  'w',
                  encoding='utf-8',
                  newline='') as file:
            writer = csv.writer(file, delimiter=',')
            # headers: enumeration name, enumeration id,
            # bedes Content-UUID, bedes term, bedes definition, bedes URL, bedes Related Term UUID, distance
            writer.writerow([
                'enum_name', 'enum_id', 'bedes_content_uuid', 'bedes_term',
                'bedes_definition', 'bedes_url', 'bedes_related_term_uuid',
                'distance', 'associated_attribute_ids'
            ])
            for enum, be in results.items():
                if len(be) > 0 and 'bedes_object' in be[0]:
                    content_uuids.append(be[0]['bedes_object']['Content-UUID'])
                    out = [
                        be[0]['enumeration_name'], enum,
                        be[0]['bedes_object']['Content-UUID'],
                        be[0]['bedes_term'],
                        be[0]['bedes_object']['List-Option-Definition'],
                        be[0]['bedes_object']['URL'],
                        be[0]['bedes_object']['Related-Term-UUID'],
                        be[0]['distance'], be[0]['associated_attribute_ids']
                    ]

                else:
                    out = [
                        be[0]['enumeration_name'], enum, '', '', '', '', '',
                        '', be[0]['associated_attribute_ids']
                    ]
                writer.writerow(out)

        # count the unique BEDES Content-UUIDs
        unique_cnt = len(set(content_uuids))
        self.stdout.write(
            '*******There are {} unique BEDES enum values to add*******'.
            format(unique_cnt))

        self.stdout.write('Finished parsing bedes')
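
The mapping logic above repeatedly applies the same pattern: score a bsync term against every BEDES term or list option with Jaro-Winkler and keep the best match above a fixed threshold. A minimal, self-contained sketch of that pattern (the term list, the threshold value and the function name below are illustrative assumptions, not taken from the BEDES data):

import jellyfish

def best_bedes_match(name, terms, threshold=0.95):
    # score the candidate name against every term and keep the best one
    scored = [(term, jellyfish.jaro_winkler(name.lower(), term.lower()))
              for term in terms]
    best_term, best_score = max(scored, key=lambda pair: pair[1])
    return (best_term, best_score) if best_score >= threshold else None

# hypothetical usage with made-up term names
print(best_bedes_match("Premises Name", ["Premises Name", "Premises Level", "Owner Name"]))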
Ejemplo n.º 43
0
 def _is_similar(self, line1, line2):
     similarity_score = jellyfish.jaro_winkler(line1, line2)
     log.debug('comparing: ({}, {}) similarity_score: {}'.format(
         line1, line2, similarity_score))
     return similarity_score >= self.threshold
Ejemplo n.º 44
0
import pandas as pd
import jellyfish
import nltk
data = pd.read_csv('TestdafFT01.txt', sep='\t', encoding='UTF-8')
df = data.copy()  # DataFrame.append was removed in pandas 2.0; work on a copy of the loaded data
pattern = input("Please enter search pattern:\n")
distance = []
for index, row in df.iterrows():
    dist = jellyfish.jaro_winkler(row['Antwort'], pattern)
    distance.append(dist if dist > 0 else 0)
df['dist'] = distance
final_df = df.sort_values('dist', ascending=False)
# final_df = final_df[(final_df['lexicalfuzz'] > (lexicalVariance))]
final_df.to_csv('jwdist.tsv', index=False, sep='\t')
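
A shorter, behavior-equivalent way to compute the score column above, assuming the same 'Antwort' column and pattern variable (a sketch, not part of the original script):

# vectorized form of the scoring loop above (sketch)
df['dist'] = df['Antwort'].apply(lambda answer: jellyfish.jaro_winkler(answer, pattern))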
Ejemplo n.º 45
0
def headquarters():
    positive = 0
    negative = 0
    not_found = 0

    f_not_found = open("not_found.txt", "w")
    f_negative = open("negative.txt", "w")
    f_positive = open("positive.txt", "w")

    tuples_not_found = set()

    for t in results:
        # first, try a direct match
        org_extracted = t[0].decode("utf8").upper().strip()
        locations_groundtruth = ground_truth.get(org_extracted)

        # if its a direct match with a ground truth organization, compare the locations
        if locations_groundtruth:
            loc_extracted = t[1].decode("utf8").upper().strip()
            found = False
            for locations in locations_groundtruth:
                # some locations in DBpedia contain different references, e.g., city, state
                # e.g.,: AUBURN HILLS, MICHIGAN
                # split and compare with both

                # in case it was found and got outside the for-loop below
                # no need to check more references
                if found == True:
                    break
                locations_parts = locations.split(",")
                for loc in locations_parts:
                    # match locations with Jaro-Winkler, keep those >=0.8 similarity score
                    score = jellyfish.jaro_winkler(
                        loc_extracted.encode("utf8"),
                        loc.strip().encode("utf8"))
                    if score >= 0.8:
                        f_positive.write(t[0] + '\t' + t[1] + '\n')
                        positive += 1
                        found = True
                        break

                    # if ground-truth (from DBpedia) is a country, and extracted is a city
                    # check if the city is in that country
                    elif loc in countries:
                        if loc_extracted.encode("utf8") in country_cities[loc]:
                            f_positive.write(t[0] + '\t' + t[1] + '\t' + '\n')
                            positive += 1
                            found = True
                            break

                    #TODO
                    # if ground-truth (from DBpedia) is a city, and extracted location is a country
                    # check if that city is located in that country only
                    # elif

            if found == False:
                negative += 1
                f_negative.write(
                    t[0] + '\t' + t[1] + '\t\t:' +
                    ';'.join(locations_groundtruth).encode("utf8") + '\n')

        else:
            tuples_not_found.add(t)

    # try to expand the acronyms
    names_found = set()
    for name in tuples_not_found:
        # if it is a single token with all uppercase letters
        if len(name[0].split()) == 1 and name[0].isupper():
            found = False
            # get all the possible expansions that match this acronym
            expansions = acronyms.get(name[0])
            if expansions:
                # check if any of these expansions is an organization in the
                # ground_truth database and if it is, extract the locations
                for e in expansions:
                    locations_groundtruth = ground_truth.get(e.upper())
                    if locations_groundtruth:
                        for location in locations_groundtruth:
                            locations_parts = location.split(",")
                            for loc in locations_parts:
                                # approximate similarity
                                score = jellyfish.jaro_winkler(
                                    loc.encode("utf8"), name[1].upper())
                                if score >= 0.8:
                                    #f_positive.write(name[0]+' ('+e+')\t'+name[1]+'\t'+str(avg_score)+'\n')
                                    f_positive.write(name[0] + ' (' + e +
                                                     ')\t' + name[1] + '\n')
                                    positive += 1
                                    found = True
                                    names_found.add(name)
                                    break

                        if (found == True):
                            break

    for n in names_found:
        tuples_not_found.remove(n)

    # for tuples not found query Freebase
    # cache of strings that were already queried to Freebase
    queried = []
    for line in fileinput.input(
            '/home/dsbatista/gigaword/ground-truth/freebase-queried.txt'):
        queried.append(line.strip())
    fileinput.close()

    # file to save Freebase query results
    output = codecs.open(
        '/home/dsbatista/gigaword/ground-truth/freebase-output.txt', 'a',
        "utf-8")

    # open file for append; update 'freebase-queried.txt' with newly issued queries
    f_queried = open(
        '/home/dsbatista/gigaword/ground-truth/freebase-queried.txt', "a")

    tuples_found = []

    for t in tuples_not_found:
        org = t[0].strip()
        # for now do not query acronyms to Freebase with ~=, too many false positives
        if not (len(t[0].split()) == 1 and t[0].isupper()):
            # first check if that query string was already issued to Freebase
            # if not, query Freebase and save the result
            if org not in queried:
                if org == "Star-Times": continue
                response = queryFreebase(org)
                queried.append(org)
                if response != 'error':
                    try:
                        if response['result']:
                            print "found:\t", org
                            parseResponse(org, response, output)
                        else:
                            print "not found:\t", org
                        f_queried.write(org + '\n')
                        f_queried.flush()

                    except TypeError, e:
                        print org
                        print e
                        print response
                        f_queried.close()
                        output.close()
                        sys.exit(0)

                    except Exception, e:
                        print org
                        print e
                        print response
                        f_queried.close()
                        output.close()
                        sys.exit(0)
Ejemplo n.º 46
0
 def jaro_dist(scan_res, desired):
     scan_line = get_file_as_string(scan_res)
     desired_line = get_file_as_string(desired)
     return jellyfish.jaro_winkler(scan_line,
                                   desired_line,
                                   long_tolerance=True)
Ejemplo n.º 47
0
def similarity(ori, inp):
    return (jellyfish.jaro_winkler(inp, ori))
Ejemplo n.º 48
0
def get_similarity(value_to_check: str, against: str) -> float:
    result = jellyfish.jaro_winkler(value_to_check, against)
    if value_to_check.startswith(against):
        result += 1.0
    return result
Ejemplo n.º 49
0
def word_similarity(word_to_compare='Vignir',
                    list_of_words=["Heigigr","Beðurni"],
                    return_top_n=20,
                    use_cut_off=False,
                    cut_off=0.5,
                    sim_measure='Levenshtein',  # 'SequenceMatcher', 'Jaro-Winkler' or 'Hamming'
                    min_characters=2,  # 0 for no restriction
                    filter_non_capital_letters=True
                    ):
    
    """Compare similarity between a word and a list of words

    Returns list of similar words/names based on a similarity measure
    
    Args:
        word_to_compare (str) - word to compare with each value in the list
        list_of_words (lst) - list of strings to compare against
        return_top_n (int) - return only the top n results based on the similarity measure
        use_cut_off (bool) - whether to apply a cut-off value based on similarity
        cut_off (float) - cut-off value
        sim_measure (str) - one of 'Levenshtein', 'SequenceMatcher', 'Jaro-Winkler' or 'Hamming'
        min_characters (int) - keep only candidate words longer than this many characters
        filter_non_capital_letters (bool) - keep only title-cased candidate words

    Returns:
         pandas DataFrame of candidate words sorted by similarity (descending)
         
    """       
    word_similarity_list=[]
    for word in list_of_words:
        dict_Words ={}
        dict_Words['word_to_compare']=word_to_compare
        dict_Words['word_to_compare_against']=word
        if sim_measure=='Levenshtein':
            ##dict_Words['similarity']=Levenshtein.ratio(word_to_compare, word)
            dict_Words['similarity']=jellyfish.levenshtein_distance(word_to_compare, word)*-1
            dict_Words['similarity_measure']='Levenshtein'
        elif sim_measure=='SequenceMatcher':
            dict_Words['similarity']=SequenceMatcher(None,word_to_compare, word).ratio()
            dict_Words['similarity_measure']='SequenceMatcher'
            #https://docs.python.org/2.4/lib/sequencematcher-examples.html
        elif sim_measure=='Jaro-Winkler':
            dict_Words['similarity']=jellyfish.jaro_winkler(word_to_compare, word)
            dict_Words['similarity_measure']='Jaro-Winkler'
        elif sim_measure=='Hamming':
            dict_Words['similarity']=jellyfish.hamming_distance(word_to_compare, word)*-1
            dict_Words['similarity_measure']='Hamming'
        word_similarity_list.append(dict_Words)
        
    #Convert to frame
    df_word_similarity = pd.DataFrame(word_similarity_list)
   
    #Sort
    df_word_similarity=df_word_similarity.sort_values(by='similarity', ascending=False)
    
    #Return top results
    if return_top_n>0:
        if len(df_word_similarity)>return_top_n:
            df_word_similarity=df_word_similarity[0:return_top_n]
    else:
        return df_word_similarity[0:0]
    
    #Whether to use cutoff        
    if use_cut_off:
        df_word_similarity=df_word_similarity[df_word_similarity.similarity>cut_off]
    
    #Filter min characters
    if min_characters>0:
        df_word_similarity=df_word_similarity[df_word_similarity.word_to_compare_against.str.len()>min_characters]
        
    # Filter out words that do not start with a capital letter
    if filter_non_capital_letters:
        df_word_similarity=df_word_similarity[df_word_similarity.word_to_compare_against.str.istitle()]
    
    return df_word_similarity 
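
A possible usage example for word_similarity (the candidate names below are arbitrary, and pandas, jellyfish and difflib.SequenceMatcher are assumed to be imported as the function requires):

# rank a few candidate names against 'Vignir' with Jaro-Winkler
candidates = word_similarity(word_to_compare='Vignir',
                             list_of_words=['Vignir', 'Vigdis', 'Birgir'],
                             return_top_n=3,
                             sim_measure='Jaro-Winkler')
print(candidates[['word_to_compare_against', 'similarity']])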
Ejemplo n.º 50
0
 def get_version_name_similarity(self, candidate):
     import jellyfish
     return jellyfish.jaro_winkler(self.version_name, candidate.version_name)
Ejemplo n.º 51
0
 def jaro_winkler_distance(self, row):
     gn_name = self.df_source.loc[row['geonamesid'], 'name']
     sn_name = self.df_target.loc[row['swissnamesid'], 'NAME']
     dist = jellyfish.jaro_winkler(gn_name, sn_name)
     return dist
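
jaro_winkler_distance above is written to be applied row by row; a presumed call site inside the same class (df_links and the column name are assumptions about the surrounding code) could look like:

# presumed usage: score every candidate geonames/swissnames pair row-wise
df_links['name_similarity'] = df_links.apply(self.jaro_winkler_distance, axis=1)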
Ejemplo n.º 52
0
def word_similarity(s1, s2):
    return jellyfish.jaro_winkler(unicode(s1.lower()), unicode(s2.lower()))
Ejemplo n.º 53
0
 def jaro(self, cand1, cand2):
     return jellyfish.jaro_winkler(cand1, cand2)
Ejemplo n.º 54
0
def theme_network_creation(G_themes, list_actor, dataframe, themes_of_interest,
                           tf_idf):
    '''
    Creation of a graph between the actors and the themes. For each theme mentioned in the articles, 
    we draw an edge between this theme and the closest actor in terms of offset. This will give us a
    bipartite graph, with the actors on one side and the themes on the other side. The goal is to see if 
    some actors are strongly linked to very specific themes, as detected by GDELT
    '''
    uncommon_theme = [
        'GOV_DIVISIONOFPOWER', 'HATE_SPEECH', 'INFO_HOAX',
        'POLITICAL_PRISONER', 'MEDIA_CENSORSHIP'
    ]
    for actor_list, theme_list, doc_id in zip(
            dataframe.V2ENHANCEDPERSONS.unique(),
            dataframe.V2ENHANCEDTHEMES.unique(), dataframe.GKGRECORDID):

        actor_list_temp, offset_list_temp = [], []
        #print("begin: ", actor_list, theme_list, doc_id)

        if not isinstance(actor_list, float):
            for actor in actor_list.split(';'):
                actor_list_temp.append(actor.split(',')[0])
                offset_list_temp.append(int(actor.split(',')[1]))

        # First, we need to get the themes and their respective offset in two separate lists

        if not isinstance(theme_list, float) and not isinstance(
                actor_list, float):

            #print("Here: ", doc_id)
            number_theme = len(theme_list)
            max_offset_diff = maximum_offset_difference(actor_list, theme_list)

            for theme in theme_list.split(';'):
                if theme:
                    theme_temp = theme.split(',')[0]
                    offset_temp = int(theme.split(',')[1])

                    if theme_temp in themes_of_interest:

                        if not G_themes.has_node(theme_temp):
                            G_themes.add_node(theme_temp)

                        index_actor = np.argmin(
                            np.abs([
                                offset - offset_temp
                                for offset in offset_list_temp
                            ]))
                        actor_offset = actor_list_temp[index_actor]

                        # We need to find this actor in the nodes of the network

                        # score the offset-nearest actor against every known actor once
                        similarities = [
                            jellyfish.jaro_winkler(actor_offset, actor2)
                            for actor2 in list_actor
                        ]
                        similarity_max = np.max(similarities)
                        index_max = np.argmax(similarities)
                        actor_max = list_actor[index_max]
                        '''
                        for (actor, offset_actor) in zip(actor_list_temp, offset_list_temp):
                            offset_diff = np.abs(offset_actor - offset_temp)
                            
                            similarity_max = [jellyfish.jaro_winkler(actor, actor2) for 
                                                     actor2 in list_actor]
                            index_max = np.argmax(similarity_max)
                            actor_max = list_actor[index_max]
                        
                        # The weight associated with this theme and article is extracted from the tf-idf dictionary
                            weight_theme = tf_idf[doc_id][theme_temp] * (1 - offset_diff / max_offset_diff)

                        # Now that we have the theme and the actor, we can draw an edge between the two

                            if G_themes.has_edge(actor_max, theme_temp):
                                G_themes[actor_max][theme_temp]['weight'] += weight_theme
                            else:
                                #print("New edge! ", actor_max, theme_temp)
                                G_themes.add_edge(actor_max, theme_temp, weight = weight_theme)
                        '''
                        #print("Theme: ", doc_id, theme_temp)
                        weight_theme = tf_idf[doc_id][theme_temp]

                        if G_themes.has_edge(actor_max, theme_temp):
                            G_themes[actor_max][theme_temp][
                                'weight'] += weight_theme
                        else:
                            G_themes.add_edge(actor_max,
                                              theme_temp,
                                              weight=weight_theme)

    return G_themes
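
The actor selection above picks the actor nearest in character offset to the theme and then snaps that string to the most similar existing node name; condensed into a small helper (the function name is an assumption, not part of the original):

import numpy as np
import jellyfish

def closest_node(name, node_names):
    # snap an extracted actor string to the most similar existing node name
    scores = [jellyfish.jaro_winkler(name, node) for node in node_names]
    return node_names[int(np.argmax(scores))]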
Ejemplo n.º 55
0
def _get_jaro(word1, word2):
    """ Calculate Jaro-Winkler distance between two words """
    return jellyfish.jaro_winkler(unicode(word1), unicode(word2))
Ejemplo n.º 56
0
def tracker_message_handler(message):

    tracker_magnitude_regexes = [{
        "label":
        "Spend",
        "units":
        "$",
        "regex":
        re.compile(r"(\$[0-9]?[0-9]\.?[0-9]?[0-9]?)|[0-9]?[0-9]\.[0-9][0-9]",
                   flags=re.IGNORECASE)
    }, {
        "label":
        "Calories",
        "units":
        "Cal",
        "regex":
        re.compile(r"[0-9][0-9]+ ?cal|calo?r?i?e?s? ?[0-9][0-9]",
                   flags=re.IGNORECASE)
    }, {
        "label":
        "Distance",
        "units":
        "km",
        "regex":
        re.compile(r"[0-9][0-9]* ?(k.?m?|m.?i?)", flags=re.IGNORECASE)
    }]

    from_user = message.from_user
    chat_info = message.chat

    dash_message_id = str(datetime.datetime.now()) + str(from_user.id)
    dash_message_id = hashlib.md5(dash_message_id.encode('utf-8')).hexdigest()

    latest_tracker_message = {
        "message_id": dash_message_id,
        "chat_id": chat_info.id,
        "type": "tracker",
        "status": "Unassigned",
        "title": "Unassigned",
        "user_id": from_user.id,
        "user_name": "{} {}".format(from_user.first_name, from_user.last_name),
        "datetime_logged": str(datetime.datetime.now()),
        "message_date": message.date,
        "input_datetime": str(datetime.datetime.now()),
        "content": "Unassigned",
        "magnitude": 30,
        "units": "$",
        "attributes": {
            "example_attribute_1": "example_attribute_value"
        },
        "estimate": "NA"
    }

    for search_logic in tracker_magnitude_regexes:
        regexp = search_logic["regex"]
        label = search_logic["label"]
        regex_search_result = regexp.search(message.text)
        #look for the magnitude

        if regex_search_result:
            # figure out what type of magnitude it is

            latest_tracker_message["status"] = label
            magnitude = regex_search_result.group(1)
            try:
                latest_tracker_message["magnitude"] = float(
                    re.sub("[a-z]|[A-Z]|\\$", "", magnitude))
                latest_tracker_message["estimate"] = "{} {}".format(
                    re.sub("[a-z]|[A-Z]|\\$", "", magnitude),
                    search_logic["units"])

            except Exception as e:
                print(e)
                #set_trace()
            description = message.text.replace(magnitude,
                                               "").replace("\t", '').strip()
            message_distance_arr = [
                (x, jellyfish.jaro_winkler(description, x))
                for x in distinct_message_bins
                if jellyfish.jaro_winkler(description, x) > 0.91
            ]

            #assign it accordingly

            #TODO
            #handle the exact match case better

            #TODO NEXT
            ## this is also writing empty values when it doesn't recognize the message
            if len(message_distance_arr) >= 1:
                most_likely_title, dist = max(message_distance_arr,
                                              key=operator.itemgetter(1))
                latest_tracker_message["title"] = most_likely_title
                latest_tracker_message["content"] = most_likely_title

                #update_firebase_with_message(latest_tracker_message)
                update_google_sheet_tracker(latest_tracker_message)


#                 with open("{}/dash_app/data/tracker_data/{}.json".format(TELEGRAMPA_PROJECT_HOME,dash_message_id),"w") as f:
#                     json.dump(latest_tracker_message, f, indent= 2)
            else:
                #if it is unrecognized ask user if they want to create a new list
                latest_tracker_message["title"] = description
                latest_tracker_message["content"] = description

                msg = bot.reply_to(
                    message, 'Would you like to create a new tracker list?')
                try:
                    bot.register_next_step_handler(
                        msg, lambda x: confirm_new_tracker(
                            x, latest_tracker_message))
                    update_google_sheet_tracker(latest_tracker_message)

                except Exception as e:
                    print(e)
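
The fuzzy assignment to an existing tracker bin above reduces to one max-over-threshold step; isolated as a sketch (distinct_message_bins and the 0.91 threshold come from the handler above, the helper name is an assumption):

def match_to_bin(description, bins, threshold=0.91):
    # return the closest existing bin above the threshold, else None
    scored = [(b, jellyfish.jaro_winkler(description, b)) for b in bins]
    if not scored:
        return None
    best_bin, best_score = max(scored, key=lambda pair: pair[1])
    return best_bin if best_score > threshold else None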
Ejemplo n.º 57
0
        sys.stdout.flush()

        all_res.append(snapshot_res)
        if stratified_attribute == 'screen_name':
            source_name = source_screen_names[source_user]
            target_name = target_screen_names[testing_map[source_user]]
        else:
            source_name = source_user_names[source_user]
            target_name = target_user_names[testing_map[source_user]]

        if source_name is None:
            source_name = u''
        if target_name is None:
            target_name = u''

        name_dis_list.append(jellyfish.jaro_winkler(source_name, target_name))

    print time.time() - start_time, 'seconds used.'

print 'all_res', len(all_res)
all_res = np.array(all_res)
name_dis_list = np.array(name_dis_list)

np.save(PATH + OUTPUT + 'all_res.npy', all_res)
np.save(PATH + OUTPUT + 'name_dis_list.npy', name_dis_list)

precision = np.mean(all_res, axis=0) if len(all_res) > 0 else 0.
print precision

f = open(PATH + OUTPUT + 'report.txt', 'w')
f.write(str(precision))
Ejemplo n.º 58
0
def test_jaro_winkler_deprecation(jf):
    # backwards compatibility function
    from jellyfish import jaro_winkler

    with pytest.deprecated_call():
        assert jaro_winkler("a", "a") == 1
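
The deprecation test above reflects that newer jellyfish releases renamed the function to jaro_winkler_similarity; a small compatibility shim (a sketch, assuming a jellyfish version where jaro_winkler_similarity exists) could look like:

import jellyfish

try:
    # newer releases expose the renamed function
    jaro_winkler = jellyfish.jaro_winkler_similarity
except AttributeError:
    # older releases only have the original (now deprecated) name
    jaro_winkler = jellyfish.jaro_winkler

print(jaro_winkler("martha", "marhta"))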
Ejemplo n.º 59
0
     info = sentence.replace("\n", "").split(" ")
     ##        print str(info) + " vs. " + str(cl)
     if distance == "stringdist":
         caseline = StringDist.compare(info, cl.split(" "))
     if distance == "levensthein":
         dist = jellyfish.levenshtein_distance(
             unicode(sentence.replace("\n", "")), unicode(cl))
         caseline = (decimal.Decimal(
             max(len(sentence.replace("\n", "")), len(cl))) -
                     dist) / decimal.Decimal(
                         max(len(sentence.replace("\n", "")), len(cl)))
     ##                print "levensthein distance:" + str(caseline)
     ##                print "Jaro distance:" + str(caseline)
     if distance == "jaro-winkler":
         try:
             caseline = jellyfish.jaro_winkler(
                 unicode(sentence.replace("\n", "")), unicode(cl))
         except:
             caseline = 0
     ##                print "jaro-winkler distance:" + str(caseline)
     if distance == "w2vec":
         model = train_W2vecmodel()
         caseline = StringDist.compare_Word2vec(info, cl, model)
     #print caseline
     if caseline == 1:
         break
     if caseline > best_case:
         best_case = caseline
         decided_class = cl
 if best_case >= 0.3:
     nb_individuals += 1
     if dict_classes.has_key(decided_class):
Ejemplo n.º 60
0
 def get_handle_similarity(self, candidate):
     import jellyfish
     return jellyfish.jaro_winkler(self.handle, candidate.handle)