Example #1
 async def similarity(username: str, password: str) -> bool:
     # Accept the password only when it is sufficiently dissimilar
     # to the username (Jaro-Winkler similarity at most 0.6).
     jw = JaroWinkler()
     return jw.similarity(username, password) <= 0.6
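A quick way to exercise the check above (the driver below is mine, not part of the original snippet; exact scores depend on the strsimpy version):

import asyncio
from strsimpy.jaro_winkler import JaroWinkler

# 'admin' vs 'admin123' share a long prefix, so their similarity is well above 0.6
print(asyncio.run(similarity('admin', 'admin123')))  # False: too similar
print(asyncio.run(similarity('admin', 'hunter2')))   # True: sufficiently different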
Example #2
 def compare_distance(self, other):
     '''Return the similarity of the two records' names when their
     addresses are clearly the same (similarity > 0.85); otherwise 0.
     '''
     jarowinkler = JaroWinkler()
     dist_1 = jarowinkler.similarity(self.data['name'].lower(),
                                     other.data['name'].lower())
     dist_2 = jarowinkler.similarity(self.data['address'].lower(),
                                     other.data['address'].lower())
     if dist_2 > 0.85:
         return dist_1
     return 0
Example #3
def jaro_winkler(keyword, domain):
    """Compute the Jaro-Winkler similarity of two strings.

    Args:
        keyword: First string to compare.
        domain: Second string to compare.

    Returns:
        float: Jaro-Winkler similarity between 0.0 and 1.0.

    """
    jarowinkler = JaroWinkler()
    return jarowinkler.similarity(keyword, domain)
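The keyword/domain parameter names suggest a typosquatting check. A minimal smoke test, assuming JaroWinkler is imported from strsimpy.jaro_winkler (values approximate):

print(jaro_winkler('paypal', 'paypa1'))  # ~0.93: near-identical strings
print(jaro_winkler('paypal', 'github'))  # 0.0: no characters in common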
Example #4
def check_for_text_match(self, str1, strlist):
    # First pass: case-sensitive comparison against every candidate.
    jarowinkler = JaroWinkler()
    similarities = []
    for str2 in strlist:
        similarities.append(jarowinkler.similarity(str1, str2))
    index_max = np.argmax(similarities)
    # Compare the best similarity, not its index, against the threshold.
    if similarities[index_max] >= .70:
        return strlist[index_max]
    # Second pass: retry case-insensitively before giving up.
    similarities = []
    for str2 in strlist:
        similarities.append(jarowinkler.similarity(str1.lower(), str2.lower()))
    index_max = np.argmax(similarities)
    if similarities[index_max] >= .70:
        return strlist[index_max]
    else:
        return None
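Since self is unused in the body above, the method can be smoke-tested standalone by passing None for it (a sketch of mine, not the original author's harness):

import numpy as np
from strsimpy.jaro_winkler import JaroWinkler

match = check_for_text_match(None, 'jaro winkler', ['Jaro Winkler', 'Levenshtein'])
print(match)  # 'Jaro Winkler': the case-sensitive first pass already clears 0.70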
Example #5
 def build_column(self, data):
     left_col, right_col, algo = (self.cfg.get(p)
                                  for p in ["left", "right", "algo"])
     normalized = self.cfg.get("normalized", False)
     if algo == "levenshtein":
         if normalized:
             similarity = strsimpy.normalized_levenshtein.NormalizedLevenshtein()
         else:
             similarity = strsimpy.levenshtein.Levenshtein()
     elif algo == "damerau-levenshtein":
         similarity = strsimpy.damerau.Damerau()
         if normalized:
             similarity = SimilarityNormalizeWrapper(similarity)
     elif algo == "jaro-winkler":
         similarity = JaroWinkler()
     elif algo == "jaccard":
         similarity = strsimpy.jaccard.Jaccard(int(self.cfg.get("k", 3)))
         if normalized:
             similarity = SimilarityNormalizeWrapper(similarity)
     else:
         raise ValueError(f"unknown algo: {algo}")
     distances = data[[left_col, right_col]].fillna("").apply(
         lambda rec: similarity.distance(*rec), axis=1)
     return pd.Series(distances, index=data.index, name=self.name)
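SimilarityNormalizeWrapper is referenced but never defined in this excerpt. A plausible minimal reconstruction (an assumption, not the source's actual class) scales a raw edit distance by the longer string's length so the result lands in [0, 1]:

class SimilarityNormalizeWrapper:
    """Hypothetical wrapper: squashes an unbounded distance into [0, 1]."""

    def __init__(self, inner):
        self.inner = inner

    def distance(self, s1, s2):
        longest = max(len(s1), len(s2))
        return self.inner.distance(s1, s2) / longest if longest else 0.0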
Example #6
def similarity_function(similarity_measure):
    if similarity_measure == 'exact':
        return exact_similarity
    elif similarity_measure == 'mlcs':
        return lambda s1, s2: 1 - MetricLCS().distance(s1, s2)
    elif similarity_measure == 'nlevs':
        return lambda s1, s2: 1 - NormalizedLevenshtein().distance(s1, s2)
    elif similarity_measure == 'jaro':
        return JaroWinkler().similarity
    else:
        raise ValueError('Invalid similarity measure.')
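For example, with the corresponding strsimpy import in scope:

from strsimpy.normalized_levenshtein import NormalizedLevenshtein

sim = similarity_function('nlevs')
print(sim('kitten', 'sitting'))  # 1 - 3/7, i.e. ~0.571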
Example #7
def create_or_update_matches(obscurities: List[str]) -> None:
    if not os.path.exists('jaro_sim.csv'):
        print('Running experiment similarities...')
        jaro = JaroWinkler().similarity
        obscurity_count: Dict[str, int] = {
            x: obscurities.count(x)
            for x in obscurities
        }
        with open('jaro_sim.csv', 'a') as file_jaro_sim, \
                open('jaro_sim_orphan.csv', 'a') as file_jaro_sim_orphan:
            jaro_csv_writer = csv.writer(file_jaro_sim)
            orphan_csv_writer = csv.writer(file_jaro_sim_orphan)
            fields: List[str] = [
                'Obscurity', 'Count', 'KGV1', 'Similarity1', 'KGV2',
                'Similarity2', 'KGV3', 'Similarity3'
            ]
            jaro_csv_writer.writerow(fields)
            orphan_csv_writer.writerow(fields)

            current_known_good_values = get_known_good_values()
            for obscurity in sorted(set(obscurities)):
                sim_val: List[Tuple[str, float]] = []
                for good_value in current_known_good_values:
                    similarity_value: float = jaro(obscurity, good_value)
                    sim_val.append((good_value, similarity_value))
                # Sort once after collecting all candidates, not inside the loop.
                sim_val.sort(reverse=True, key=operator.itemgetter(1))
                topthree: List[Union[str, float, int]] = []
                topthree.extend([x for y in sim_val[:3] for x in y])
                output = [obscurity, obscurity_count[obscurity]] + topthree
                if obscurity_count[obscurity] <= 10:
                    orphan_csv_writer.writerow(output)
                else:
                    jaro_csv_writer.writerow(output)
    else:
        print('Simulation file already exists.')
        return
Example #8
import csv
import json

from strsimpy.jaro_winkler import JaroWinkler

COMPARE_MODE = ['csv', 'json', 'string']
METHOD = JaroWinkler()
SPLITTERS = ','
EXCLUSIONS = '_- '
REPLACEMENTS = {'0': 'Oo'}
THRESHOLD = 0.98


def clean_string(string,
                 splitters=SPLITTERS,
                 exclusions=EXCLUSIONS,
                 replacements=REPLACEMENTS):
    s = string
    for splitter in splitters:
        s = s.split(splitter)[0]
    for exclusion in exclusions:
        s = s.replace(exclusion, '')
    for new, olds in replacements.items():
        for old in olds:
            s = s.replace(old, new)
    s = s.lower()
    s = s.strip()
    return s


def compare_string(s1, s2, method=METHOD, clean=True):
    # The body is truncated in the original listing. A minimal completion
    # consistent with the constants above (an assumption, not the original
    # author's code): optionally normalize both strings, then report a
    # match when the similarity clears THRESHOLD.
    if clean:
        s1, s2 = clean_string(s1), clean_string(s2)
    return method.similarity(s1, s2) >= THRESHOLD
Example #9
]  # comment IDs of parent comments in WOdin mastery threads for wind and earth-weak
wodinThreadIds = [
    ['k8pd7q'], ['k8petf'],  # thread IDs for individual phys/mag weak threads for lightning-weak
    ['kj1gdp'], ['kj1fcw'],  # water-weak
    ['lc3fe6'], ['lc3fey'],  # fire-weak
]
sbTypes = [
    'LBO', 'LBG', 'ADSB', 'SASB', 'AASB', 'GSB+', 'CSB', 'AOSB', 'USB', 'OSB',
    'GSB', 'BSB', 'SSB', 'Unique'
]  # cleanSbNames() maps to these
heroNameList = getHeroNameList()
strsim = JaroWinkler()  # string similarity module for catching typos/abbreviations

## Run for Dreambreaker
outputLines = []  # buffer to put output strings into
summaryLines = ['#Summary table\n\n\n']
appendTableHeader(summaryLines, sbTypes)
summaryLines[-2] = summaryLines[-2].replace('|Hero|Used', '|Realm')
summaryLines[-1] = summaryLines[-1][4:]
teamTableTextLines = []
for threadId in dbThreadIds:
    submission = reddit.submission(id=threadId[0])
    threadTitle = submission.title
    realm = threadTitle[threadTitle.find("(") + 1:threadTitle.find(")")]
    print('\n*****************\n{}\n*****************\n'.format(threadTitle))
    commentsList = []
    postUrl = []
Example #10
class STSWikiReader:
    """Reads data from website, creates a lookup map of item names, and does
        soft string matching to find possible mentions of the item parsed
    """
    strcmp = JaroWinkler()

    def __init__(self, name, links, ignore_list, parse_names):
        self.last_update = datetime.datetime.utcnow()
        self.name = name
        self.links = links
        self.ignore_list = ignore_list
        self.parse_names = parse_names
        self.base_set = set()
        self.real_names = set()
        self.fake_name_map = dict()
        self.cur = None
        self.max_name_word_cnt = 0
        self.max_match = 0
        self.FORCE_IGNORE_NAME = '~~FORCE~IGNORE~~'

        self.update_info()

    def format_name(self, name):
        """Used to get a clean, uniform name with pesky characters removed"""
        return self._rm_double_space(
            self._rm_symbol(self._rm_squote(self._rm_hyph(name.lower()))))

    def _rm_symbol(self, name):
        """removes odd characters that should never be in a obj name"""
        return name.replace('?', ' ').replace(',', ' ').replace('.', ' ') \
            .replace('!', ' ').replace('(', ' ').replace(')', ' ') \
            .replace(':', ' ').replace('"', ' ').replace('+', ' ') \
            .replace('[', ' ').replace(']', ' ')

    def _rm_squote(self, name):
        """removes single quotes"""
        return name.replace("'", '').replace('’', '')

    def _lower(self, name):
        """exists to pass along to alternative names func"""
        return name.lower()

    def _rm_hyph(self, name):
        """swaps typical joining characters with spaces"""
        return name.replace('-', ' ').replace('_', ' ')

    def _rm_beta(self, name):
        """removes beta tag (possible error from wiki)"""
        return name.replace('_beta', '').replace('_Beta', '') \
            .replace('Beta', '').replace('beta', '')

    def _append_s(self, name):
        """makes things plural
            (simple method prone to error, but will do for now)
        """
        return f'{name}s'

    def _rm_double_space(self, name):
        while '  ' in name:
            pos = name.find('  ')
            name = name[:pos] + name[pos + 1:]
        return name

    def _rm_article_at_start(self, name):
        articles = ['the', 'a', 'an']
        test_name = name.lower()
        for article in articles:
            if test_name.startswith(article + ' '):
                return name[len(article) + 1:]
        return name

    def _gen_alternative_names(self, name):
        """creates a massive list of possible mistypes for a
            specific name, used as an aid for matching user input
        """
        names = set()
        actions = [
            self._rm_symbol, self._rm_squote, self._lower,
            self._rm_article_at_start, self._rm_hyph, self._rm_beta,
            self._append_s
        ]
        # Weird edge case for beta tag on wiki vs beta the card
        if name.lower().strip() == 'beta':
            actions.remove(self._rm_beta)

        for outer in range(len(actions)):
            temp_name = name
            for inner in range(len(actions) - outer):
                temp_name = self._rm_double_space(
                    actions[outer + inner](temp_name))
                names.add(temp_name)
        return list(names)

    def update_info(self):
        """goes to the web and finds information provided by the links"""
        log(f'Updating {self.name}s...')
        seen_list = set()

        # fetch data from links and update object with most recent info
        for link in self.links:
            res = requests.get(link, verify=False)
            for cur_name in self.parse_names(
                    soup(res.text, features="html.parser")):
                if cur_name.lower() in self.ignore_list:
                    continue
                seen_list.add(cur_name)
                # if we haven't seen it before, add it to our look up list.
                if (cur_name not in self.base_set) \
                        and (not cur_name.startswith('Category:')):
                    self.base_set.add(cur_name)
                    self.real_names.add(cur_name)
                    self.fake_name_map[cur_name] = cur_name
                    self.max_name_word_cnt = max(self.max_name_word_cnt,
                                                 len(cur_name.split(' ')))

                    for new_name in self._gen_alternative_names(cur_name):
                        if new_name.strip():
                            self.base_set.add(new_name)
                            self.fake_name_map[new_name] = cur_name

        # handle deleted data from wiki
        recalc_max_name_word_cnt = False
        for cur_name in self.real_names - seen_list:
            for new_name in self._gen_alternative_names(cur_name):
                self.base_set.remove(new_name)
                del self.fake_name_map[new_name]

            if not recalc_max_name_word_cnt \
                    and self.max_name_word_cnt == len(cur_name.split(' ')):
                recalc_max_name_word_cnt = True
            self.base_set.remove(cur_name)
            self.real_names.remove(cur_name)
            del self.fake_name_map[cur_name]

        if recalc_max_name_word_cnt:
            self.max_name_word_cnt = 0
            for cur_name in self.real_names:
                self.max_name_word_cnt = max(self.max_name_word_cnt,
                                             len(cur_name.split(' ')))

        # finalize update
        self.last_update = datetime.datetime.utcnow()
        log(f'Found {len(self.real_names)} {self.name}s')

    def check_if_similar(self, name):
        """uses similarity check to see if the passed in name may match
            any of our found or generated names
        """
        name = self.format_name(name)
        split_name = name.split(' ')
        word_thresh = 0.9**len(split_name)
        self.max_match = 0
        self.cur = None
        for item_name in self.base_set:
            split_item_name = item_name.split(' ')
            if len(split_name) == len(split_item_name):
                word_check = 1
                for i in range(len(split_name)):
                    word_check *= self.strcmp.similarity(
                        split_name[i], split_item_name[i])
                    word_check *= self.strcmp.similarity(
                        split_name[i][::-1], split_item_name[i][::-1])

                if word_check > self.max_match:
                    self.max_match = word_check
                    if word_check >= word_thresh:
                        self.cur = self.fake_name_map[item_name]
        return self.cur is not None

    def check_if_exists(self, name, update=True):
        """Used to check if a name is a perfect match for any found
            names or is close enough to call a match
        """
        if update and datetime.datetime.utcnow() - self.last_update \
                > datetime.timedelta(days=15):
            self.update_info()

        if name.lower() in self.ignore_list:
            self.cur = self.FORCE_IGNORE_NAME
            self.max_match = 1
            return True

        res = name in self.real_names
        if res:
            self.cur = name
            self.max_match = 1
        elif name in self.fake_name_map.keys():
            self.cur = self.fake_name_map[name]
            self.max_match = 1
            res = True
        else:
            res = self.check_if_similar(name)
        return res
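The per-word score in check_if_similar multiplies forward and reversed Jaro-Winkler similarities, which is why the threshold is 0.9 ** len(split_name) rather than a flat cutoff. A rough single-word illustration (scores approximate):

from strsimpy.jaro_winkler import JaroWinkler

strcmp = JaroWinkler()
fwd = strcmp.similarity('ironclad', 'ironclda')              # ~0.98, helped by the shared prefix
rev = strcmp.similarity('ironclad'[::-1], 'ironclda'[::-1])  # ~0.96, no prefix bonus reversed
print(fwd * rev >= 0.9 ** 1)  # True: a single transposed pair still matches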
Example #11
from strsimpy.damerau import Damerau
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.ngram import NGram
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.optimal_string_alignment import OptimalStringAlignment
from strsimpy.qgram import QGram

qgram = QGram(2)
print(qgram.distance('ABCD', 'ABCE'))

twogram = NGram(2)
print(twogram.distance('ABCD', 'ABTUIO'))

s1 = 'Adobe CreativeSuite 5 Master Collection from cheap 4zp'
s2 = 'Adobe CreativeSuite 5 Master Collection from cheap d1x'
fourgram = NGram(4)
print(fourgram.distance(s1, s2))

jarowinkler = JaroWinkler()
print(jarowinkler.similarity('My string', 'My tsring'))
print(jarowinkler.similarity('My string', 'My ntrisg'))

optimal_string_alignment = OptimalStringAlignment()
print(optimal_string_alignment.distance('CA', 'ABC'))

damerau = Damerau()
print(damerau.distance('ABCDEF', 'ABDCEF'))
print(damerau.distance('ABCDEF', 'BACDFE'))
print(damerau.distance('ABCDEF', 'ABCDE'))
print(damerau.distance('ABCDEF', 'BCDEF'))
print(damerau.distance('ABCDEF', 'ABCGDEF'))
print(damerau.distance('ABCDEF', 'POIU'))

normalized_levenshtein = NormalizedLevenshtein()
Example #12
from faktotum.typing import Entities, Pipeline, TaggedTokens
from faktotum.utils import (
    align_index,
    cosine_similarity,
    extract_features,
    get_best_candidate,
    group_mentions,
    pool_tokens,
    predict_labels,
    sentencize,
    vectorize_context,
)

NER_MODELS = NamedEntityRecognition()
NED_MODELS = NamedEntityDisambiguation()
JARO_WINKLER = JaroWinkler()


def nel(text: str, kb: KnowledgeBase, domain: str) -> TaggedTokens:
    """Named Entity Linking.

    Parameters
    ----------
    text : str
        The text to process.
    kb : KnowledgeBase
        The knowledge base to link entities.
    domain : str
        Domain of the text, either `literary-texts` or `press-texts`.

    Returns
Example #13
    last_song = json.loads(f.read())
    f.close()
logger.write('   JSON: Last song was %s by %s\n' %
             (last_song[0], last_song[1]))

############### LIKING Song #######

if liked:
    network.get_track(artist, title).love()
    logger.write('   LOVE: Loved the Song on LastFm\n')
else:
    network.get_track(artist, title).unlove()
    logger.write('   LOVE: Unloved the Song on LastFm\n')

############### String Compare ####
jarowinkler = JaroWinkler()

if last_song[0] != title:  # check so that this program doesn't scrobble the song multiple times
    last_scrobble = network.get_user(
        lastFmCreds['username']).get_recent_tracks(limit=1)

    logger.write('   LastFM: Last song was %s by %s\n' %
                 (last_scrobble[0][0].title, last_scrobble[0][0].artist))

    if jarowinkler.similarity(str(last_scrobble[0][0].title.lower()),
                              title.lower()) < 0.9:  # check that "nobody else" scrobbled the song
        unix_timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
        if 'album' in locals():
            network.scrobble(artist=artist,
                             title=title,
Example #14
f = open("akcigerhastaligi.txt", encoding="utf8")
text = f.read()


def basic_clean(text):
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('turkish')
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]


words = basic_clean(text)
unigrams = nltk.ngrams(words, 1)
unigramsFrequency = Counter(unigrams)
valuesOfUnigrams = list(unigramsFrequency.values())
unigramlist = list(unigramsFrequency)

for x in range(len(unigramlist)):
    if valuesOfUnigrams[x] > 4:
        print(unigramlist[x], "is used", valuesOfUnigrams[x], "times")

jarowinkler = JaroWinkler()
print(jarowinkler.similarity('öksürük', 'öksürk'))
print(jarowinkler.similarity('akciğer', 'akciğr'))
print(jarowinkler.similarity('kanser', 'akciğr'))
print(jarowinkler.similarity('kanser', 'öksürk'))
print(jarowinkler.similarity('akciğer', 'öksürk'))

print('öksürk     öksürük')
print('akciğr     akciğer')
print('kansr      kanser')
Example #15
def string_distance(a, b):
    jarowinkler = JaroWinkler()
    return jarowinkler.similarity(a, b) + jaccard(a, b)
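The bare jaccard call is not defined in this excerpt; a minimal stand-in using strsimpy (my assumption, not the original helper) would be:

from strsimpy.jaccard import Jaccard

def jaccard(a, b):
    # hypothetical helper: 2-shingle Jaccard similarity
    return Jaccard(2).similarity(a, b)

Note that despite its name, string_distance then returns a combined similarity in [0, 2]: higher means more alike.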
Example #16
# from .ngram import NGram
# from .normalized_levenshtein import NormalizedLevenshtein
# from .optimal_string_alignment import OptimalStringAlignment
# from .qgram import QGram
# from .shingle_based import ShingleBased
# from .sorensen_dice import SorensenDice
# from .string_distance import StringDistance
# from .string_similarity import StringSimilarity
# from .weighted_levenshtein import WeightedLevenshtein
# from .sift4 import SIFT4Options, SIFT4

cosine = Cosine(2)
sorensenDice = SorensenDice(2)
jaccard = Jaccard(2)
qgram = QGram(2)
jaroWinkler = JaroWinkler()
normalizedLevenshtein = NormalizedLevenshtein()
stringSimilarity = StringDistance()  # StringDistance is strsimpy's abstract interface
s0 = '烟台大学人文学院'
s1 = '江西农业大学'
# print(cosine.get_profile(s1))
# print(sorensenDice.get_profile(s1))
print(cosine.similarity(s0, s1))
print(sorensenDice.similarity(s0, s1))
print(jaccard.similarity(s0, s1))
print(jaroWinkler.similarity(s0, s1))
print(normalizedLevenshtein.similarity(s0, s1))
# print(qgram.distance(s0, s1))
similarity_list = [jaroWinkler, cosine, jaccard, normalizedLevenshtein]

er_process_with_similarity(path_o1, path_o2, path_t,
Example #17
    if (title.find('(') > -1):
        strA = title[:title.find('(')]
        strB = title[title.find('(') + 1:title.find(',')]
    return [strA, strB]


# stringsAB = [[[a, b] for a, b in getStringsAB(str(title))] for title in df['original_title'].to_list()]
stringsAB = [getStringsAB(str(title)) for title in df['title'].to_list()]
npStringsAB = np.array(stringsAB)

# %%
levenshtein = Levenshtein()
normalized_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
twogram = NGram(2)
qgram = QGram(2)
cosine = Cosine(2)

strAs = npStringsAB[:, 0].tolist()
strBs = npStringsAB[:, 1].tolist()
results = {
    'str A': strAs,
    'str B': strBs,
    # 'Levenshtein': [
    #     levenshtein.distance(str1a, str1b),
Example #18
        csv_reader = csv.reader(file)
        lst_experiments = next(csv_reader)

obscurities: List[str] = []

for obscurity in lst_experiments:
    if obscurity not in known_good_values and obscurity not in pynmrstar.definitions.NULL_VALUES:
        obscurities.append(obscurity)

#####
obscurity_count: Dict[str, int] = {x: obscurities.count(x)
                                   for x in obscurities}
#####

jaro = JaroWinkler().similarity
sift = SIFT4().distance
threegram = NGram(3).distance

mapping = {'jaro': jaro, 'sift': sift, 'threegram': threegram}

for algorithm in mapping:
    print(f'Running {algorithm}...')
    with open(f'{algorithm}_sim.csv', 'w') as file_a, \
            open(f'{algorithm}_sim_orphan.csv', 'w') as file_b:
        csv_writer_a = csv.writer(file_a)
        csv_writer_b = csv.writer(file_b)
        fields: List[str] = [
            'Obscurity', 'Count', 'KGV1', 'Similarity1', 'KGV2', 'Similarity2',
            'KGV3', 'Similarity3'