Example No. 1
def compute_levensthein_distances_in_ground_truth():
    # Computes the Levenshtein distances within similar and non-similar pairs of the ground truth
    #
    labels, docs = load_data()
    s, y_t = load_SME_binary()
    df = pd.read_excel(r'test-lab.xlsx')
    dval = df.values

    for i in range(len(dval)):
        doc1 = dval[i, 0].strip()
        doc1_ = doc1
        for x in labels:
            if x.startswith(doc1):
                doc1_ = x
        doc2 = dval[i, 1].strip()
        doc2_ = doc2
        for x in labels:
            if x.startswith(doc2):
                doc2_ = x
        label = dval[i, 2]
        m = StringMatcher(seq1=docs[labels.index(doc1_)],
                          seq2=docs[labels.index(doc2_)])
        print(doc1, doc2, label, m.ratio(), sep=",")

    return
Example No. 2
    async def getBannedUser(ctx, user):

        user = str(user).lower()

        matcher = StringMatcher()

        userdata = []

        for _user in await ctx.guild.bans():

            _user = _user.user

            member = {
                "name": _user.name,
                "fullname": _user.name + "#" + str(_user.discriminator),
                "mention": "<@!" + str(_user.id) + ">",
                "nickname": _user.display_name,
                "discriminator": str(_user.discriminator),
                "discrim2": "#" + str(_user.discriminator),
                "id": str(_user.id)
            }

            data = {}

            for item in member:

                matcher.set_seqs(user, member[item].lower())

                data[item] = matcher.quick_ratio()

            data["_id"] = member["id"]

            userdata.append(data)

        matches = {}

        for entry in userdata:

            for key in entry:

                if key != "_id":

                    if entry[key] > .5:

                        matches[entry["_id"]] = entry[key]

                        break

        if not matches:

            raise ValueError(f"NoSuchUser: {user}")

        user_id = int(max(matches.items(), key=itemgetter(1))[0])

        for banned in await ctx.guild.bans():

            if banned.user.id == user_id:

                return banned.user
Example No. 3
def levenshtein(string1, string2):
    if string1 is None:
        string1 = ""
    if string2 is None:
        string2 = ""

    string_matcher = StringMatcher(seq1=string1.lower(), seq2=string2.lower())
    return string_matcher.distance()
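A quick smoke test for this wrapper (a sketch assuming the function above is in scope; the import matches the path used in the later examples and is resolved when the function is called):

from Levenshtein.StringMatcher import StringMatcher

# "kitten" -> "sitting" is the textbook case: 3 edits
print(levenshtein("kitten", "sitting"))  # -> 3
# None is coerced to "", so the distance is the other string's length
print(levenshtein(None, "abc"))          # -> 3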
Example No. 5
    def compare(self, first_statement, second_statement):

        statement = first_statement.lower()

        other_statement = second_statement.lower()

        similarity = SequenceMatcher(None, statement, other_statement)

        return round(similarity.ratio(), 10)
Example No. 6
def get_match_score(phrase, words, min_distance=2):
    score = 0
    phrase_len = len(''.join(phrase))
    for p in phrase:
        matcher = StringMatcher(seq1=p)
        for w in words:
            matcher.set_seq2(w)
            match_distance = matcher.distance()
            if match_distance <= min_distance:
                score += max(0, len(p) - match_distance) / phrase_len
    return score
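A small worked example (a sketch with invented inputs; StringMatcher must be importable as in the other examples):

from Levenshtein.StringMatcher import StringMatcher

# Invented inputs: every phrase token is fuzzily matched against the word list.
phrase = ["postgres", "database"]
words = ["postgress", "databse", "server"]
# "postgres"~"postgress" and "database"~"databse" are each 1 edit away,
# so the score is ((8 - 1) + (8 - 1)) / 16 = 0.875
print(get_match_score(phrase, words))  # -> 0.875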
Example No. 7
def match_ngrams(tokens: List, min_similarity=.96):
    found_topics = defaultdict(list)
    matches = set()
    for n in range(3, 0, -1):
        # i indexes the same token in the text whether we're matching by unigram, bigram, or trigram
        for i, grams in enumerate(ngrams(tokens, n)):
            # if we already matched the current token to a topic, don't reprocess it
            if i in matches:
                continue
            # otherwise unsplit the ngram for matching so ('quick', 'brown') => 'quick brown'
            gram = " ".join(grams)
            try:
                # if there isn't an exact match on the first 4 characters of the ngram and a topic, move on
                topic_block = TOPIC_STEMS[gram[:4]]
            except KeyError:
                continue
            for topic in topic_block:
                # otherwise look for an inexact match
                match_ratio = StringMatcher(None, topic, gram).ratio()
                if match_ratio >= min_similarity:
                    try:
                        # if a 'primary label' exists for the current topic, use it instead of the matched topic
                        topic = CSO['primary_labels'][topic]
                    except KeyError:
                        pass
                    # note the tokens that matched the topic and how closely
                    found_topics[topic].append({'matched': gram, 'similarity': match_ratio})
                    # don't reprocess the current token
                    matches.add(i)
    return found_topics
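A toy run (a sketch: TOPIC_STEMS and CSO are module globals in the original, so the stand-ins below are invented, and ngrams is assumed to be nltk's; these lines must precede the function definition because of the List annotation):

from collections import defaultdict
from typing import List
from nltk.util import ngrams  # assumption: the original's ngrams() helper
from Levenshtein.StringMatcher import StringMatcher

# Invented stand-ins for the module globals referenced above
TOPIC_STEMS = {"mach": ["machine learning"], "neur": ["neural networks"]}
CSO = {"primary_labels": {"neural networks": "artificial neural networks"}}

print(dict(match_ngrams(["machine", "learning", "with", "neural", "networks"])))
# -> {'machine learning': [{'matched': 'machine learning', 'similarity': 1.0}],
#     'artificial neural networks': [{'matched': 'neural networks', 'similarity': 1.0}]}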
Example No. 8
    def make_comparator():
        matcher = StringMatcher()

        def compare(str1, str2):
            matcher.set_seqs(str1, str2)
            return matcher.distance()
        return compare
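A closure like this lets one matcher be reused across many comparisons (a small usage sketch, assuming make_comparator is reachable as a plain function and the StringMatcher import from earlier examples is in scope):

compare = make_comparator()
print(compare("flaw", "lawn"))  # -> 2 (delete 'f', append 'n')
print(compare("same", "same"))  # -> 0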
Example No. 9
    def statistic_similarity(self):
        """Function that finds the similarity between the previously extracted concepts and topics in the ontology
        Returns:
            found_topics (dictionary): containing the found topics with their similarity and the n-gram analysed.
        """

        found_topics = dict()
        concepts = self.paper.get_syntactic_chunks()
        for concept in concepts:
            matched_trigrams = set()
            matched_bigrams = set()
            for comprehensive_grams in self.get_ngrams(concept):
                position = comprehensive_grams["position"]
                size = comprehensive_grams["size"]
                grams = comprehensive_grams["ngram"]
                # if we already matched the current token to a topic, don't reprocess it
                if size <= 1 and (position in matched_bigrams
                                  or position - 1 in matched_bigrams):
                    continue
                if size <= 2 and (position in matched_trigrams
                                  or position - 1 in matched_trigrams
                                  or position - 2 in matched_trigrams):
                    continue
                # otherwise unsplit the ngram for matching so ('quick', 'brown') => 'quick brown'
                gram = " ".join(grams)
                try:
                    # if there isn't an exact match on the first 4 characters of the ngram and a topic, move on
                    # topic_block = [key for key, _ in self.cso.topics.items() if key.startswith(gram[:4])]
                    topic_block = self.cso.topic_stems[gram[:4]]

                except KeyError:
                    continue
                for topic in topic_block:
                    # otherwise look for an inexact match
                    match_ratio = StringMatcher(None, topic, gram).ratio()
                    if match_ratio >= self.min_similarity:
                        try:
                            # if a 'primary label' exists for the current topic, use it instead of the matched topic
                            topic = self.cso.primary_labels[topic]
                        except KeyError:
                            pass
                        # note the tokens that matched the topic and how closely
                        if topic not in found_topics:
                            found_topics[topic] = list()
                        found_topics[topic].append({
                            'matched': gram,
                            'similarity': match_ratio
                        })
                        # don't reprocess the current token
                        if size == 2:
                            matched_bigrams.add(position)
                        elif size == 3:
                            matched_trigrams.add(position)

                        # explanation bit
                        if topic not in self.explanation:
                            self.explanation[topic] = set()

                        self.explanation[topic].add(gram)
        return found_topics
Example No. 10
    def find_matches(self, candidates, verbose=False):
        if verbose:
            print('Searching for matches')

        found_topics = dict()

        for candidate in candidates:
            matched_trigrams = set()
            matched_bigrams = set()
            for comprehensive_grams in self.__get_ngrams__(candidate):
                position = comprehensive_grams["position"]
                size = comprehensive_grams["size"]
                grams = comprehensive_grams["ngram"]
                # if we already matched the current token to a topic, don't reprocess it
                if size <= 1 and (position in matched_bigrams
                                  or position - 1 in matched_bigrams):
                    continue
                if size <= 2 and (position in matched_trigrams
                                  or position - 1 in matched_trigrams
                                  or position - 2 in matched_trigrams):
                    continue
                # otherwise unsplit the ngram for matching so ('quick', 'brown') => 'quick brown'
                gram = ' '.join(grams)
                if verbose:
                    print(gram)

                try:
                    # if there isn't an exact match on the first 4 characters of the ngram and a topic, move on
                    #topic_block = [key for key, _ in self.cso.topics.items() if key.startswith(gram[:4])]
                    topic_block = self.key_stems[gram[:4]]
                except KeyError:
                    continue

                for topic in topic_block:
                    # otherwise look for an inexact match
                    match_ratio = StringMatcher(None, topic, gram).ratio()
                    if match_ratio >= self.min_similarity:
                        #try:
                        #    # if a 'primary label' exists for the current topic, use it instead of the matched topic
                        #    topic = self.cso.primary_labels[topic]
                        #except KeyError:
                        #    pass
                        # note the tokens that matched the topic and how closely
                        if gram in found_topics:
                            current_similarity = found_topics[gram][
                                'similarity']
                            if current_similarity >= match_ratio:
                                continue

                        found_topics[gram] = {
                            'matched': topic,
                            'similarity': match_ratio
                        }
                        # don't reprocess the current token
                        if size == 2:
                            matched_bigrams.add(position)
                        elif size == 3:
                            matched_trigrams.add(position)

        return found_topics
Example No. 11
def char_to_char(source: str, target: str) -> np.ndarray:
    """Find the character adjacency matrix mapping source string chars to target string chars.

    Uses StringMatcher from Levenshtein package to find non-overlapping matching subsequences in
    input strings. Uses the result to create a character adjacency matrix from source to target.
    (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher.get_matching_blocks)

    Args:
        source (str): string of source chars.
        target (str): string of target chars.

    Returns:
        np.ndarray adjacency matrix mapping chars in the source str to chars in the target str.

    """
    sm = StringMatcher(seq1=source, seq2=target)
    mb = sm.get_matching_blocks()
    return _mat_from_blocks(mb, len(source), len(target))
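_mat_from_blocks is not shown in this example; a plausible sketch (hypothetical, not the original implementation) sets a 1 for every aligned character pair in the (source_start, target_start, size) matching blocks:

import numpy as np

def _mat_from_blocks(mb, n_source: int, n_target: int) -> np.ndarray:
    # The block list ends with a (n_source, n_target, 0) sentinel,
    # which the zero-length inner range skips harmlessly.
    mat = np.zeros((n_source, n_target))
    for source_start, target_start, size in mb:
        for k in range(size):
            mat[source_start + k, target_start + k] = 1.0
    return mat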
Example No. 12
def levenshtein_distance(statement, other_statement):
    """
    Compare two statements based on the Levenshtein distance
    of each statement's text.

    For example, there is a 65% similarity between the statements
    "where is the post office?" and "looking for the post office"
    based on the Levenshtein distance algorithm.

    :return: The percent of similarity between the text of the statements.
    :rtype: float
    """
    import sys

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    PYTHON = sys.version_info[0]

    # Return 0 if either statement has a falsy text value
    if not statement.text or not other_statement.text:
        return 0

    # Get the lowercase version of both strings
    if PYTHON < 3:
        statement_text = unicode(statement.text.lower())
        other_statement_text = unicode(other_statement.text.lower())
    else:
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

    similarity = SequenceMatcher(
        None,
        statement_text,
        other_statement_text
    )

    # Calculate a decimal percent of the similarity
    percent = int(round(100 * similarity.ratio())) / 100.0

    return percent
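The 65% figure in the docstring can be reproduced with the difflib fallback (a quick check; python-Levenshtein's ratio() can differ slightly because it weights substitutions differently):

from difflib import SequenceMatcher

similarity = SequenceMatcher(None, "where is the post office?",
                             "looking for the post office")
print(int(round(100 * similarity.ratio())) / 100.0)  # -> 0.65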
Example No. 13
def levenshtein_distance(statement, other_statement):
    """
    Compare two statements based on the Levenshtein distance
    of each statement's text.

    For example, there is a 65% similarity between the statements
    "where is the post office?" and "looking for the post office"
    based on the Levenshtein distance algorithm.

    :return: The percent of similarity between the text of the statements.
    :rtype: float
    """
    import sys

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    PYTHON = sys.version_info[0]

    # Return 0 if either statement has a falsy text value
    if not statement.text or not other_statement.text:
        return 0

    # Get the lowercase version of both strings
    if PYTHON < 3:
        statement_text = unicode(statement.text.lower())
        other_statement_text = unicode(other_statement.text.lower())
    else:
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

    similarity = SequenceMatcher(
        None,
        statement_text,
        other_statement_text
    )

    # Calculate a decimal percent of the similarity
    percent = round(similarity.ratio(), 2)

    return percent
Example No. 14
def compute_levensthein_distances_in_clusters(output_file="",
                                              embedder="doc2vec"):
    # Computes the Levenshtein distances within clusters
    #
    # In:
    # embedder - "doc2vec" or "sbert"

    doc_labels, docs = load_data()
    if embedder == "doc2vec":
        l, x, labels = perform_clustering()
    elif embedder == "sbert":
        l, x, labels = sbert_labels()
    else:
        print("wrong input to eval method")
        return
    valid_labels = set(l)
    for lbl in valid_labels:
        if lbl == -1:
            continue
        print(lbl, end=",")
        # Create a subset of labels that belong to cluster lbl
        current_cluster = []
        for k, v in labels.items():
            if v == lbl:
                current_cluster.append(k)
        # breakpoint()
        for a, b in combinations(current_cluster, 2):
            idx_a = -1
            idx_b = -1
            for i in range(len(doc_labels)):
                if doc_labels[i].startswith(a):
                    idx_a = i
                elif doc_labels[i].startswith(b):
                    idx_b = i
            doc1 = docs[idx_a]
            doc2 = docs[idx_b]
            # print(doc1,doc2,"a b", a,b)
            # assert(doc1)
            # assert(doc2)
            m = StringMatcher(seq1=doc1, seq2=doc2)
            print(m.ratio(), end=",")
        print()
    return
Example No. 15
def levenshtein_distance(statement, other_statement):
    """
    Compare two statements based on the Levenshtein distance
    (fuzzy string comparison) of each statement's text.

    :return: The percent of similarity between the text of the statements.
    :rtype: float
    """
    import sys
    import warnings

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    PYTHON = sys.version_info[0]

    # Return 0 if either statement has a falsy text value
    if not statement.text or not other_statement.text:
        return 0

    # Get the lowercase version of both strings
    if PYTHON < 3:
        statement_text = unicode(statement.text.lower())
        other_statement_text = unicode(other_statement.text.lower())
    else:
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

    similarity = SequenceMatcher(
        None,
        statement_text,
        other_statement_text
    )

    # Calculate a decimal percent of the similarity
    percent = int(round(100 * similarity.ratio())) / 100.0

    return percent
Example No. 16
    def compare(self, statement, other_statement):
        """
        比较两个输入

        :return: 返回两个句子之间的相似度
        :rtype: 浮点型
        """

        # Return 0 if either statement has a falsy text value
        if not statement.text or not other_statement.text:
            return 0

        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

        similarity = SequenceMatcher(None, statement_text,
                                     other_statement_text)

        # Calculate a decimal percent of the similarity
        percent = round(similarity.ratio(), 2)

        return percent
Example No. 17
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """
        import sys

        # Use python-Levenshtein if available
        try:
            from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
        except ImportError:
            from difflib import SequenceMatcher

        PYTHON = sys.version_info[0]

        # Return 0 if either statement has a falsy text value
        if not statement.text or not other_statement.text:
            return 0

        # Get the lowercase version of both strings
        if PYTHON < 3:
            statement_text = unicode(statement.text.lower()) # NOQA
            other_statement_text = unicode(other_statement.text.lower()) # NOQA
        else:
            statement_text = str(statement.text.lower())
            other_statement_text = str(other_statement.text.lower())

        similarity = SequenceMatcher(
            None,
            statement_text,
            other_statement_text
        )

        # Calculate a decimal percent of the similarity
        percent = round(similarity.ratio(), 2)

        return percent
Example No. 18
    def _search(self, artist, title=None):
        artist = self._remove_accents(artist.lower())
        if title is not None:
            search = urllib.parse.quote_plus("%s %s" % (artist, title))
        else:
            search = urllib.parse.quote_plus(artist.encode('utf-8'))
        url = u'http://www.songlyrics.com/index.php?section=search&searchW=%s&submit=Search&searchIn1=artist&searchIn3=song' % search
        user_agent = "(X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
        headers = {'User-Agent': user_agent}
        req = Request(url, headers=headers)
        content = urlopen(req)
        page = lxml.html.parse(content)
        links = page.getroot().cssselect('.serpresult')
        resp = []
        #search_original = u"%s - %s" % (artist.lower(), title.lower())
        if len(links):
            for link in links:
                current_song = link.cssselect('h3>a')
                if len(current_song) and current_song[0].text is not None:
                    song_title = current_song[0].text.replace('Lyrics', '')
                    song_artist = link.cssselect('.serpdesc-2>p>a')[0].text

                    search = u"%s - %s" % (song_artist.lower(),
                                           song_title.lower())
                    dist_artist = StringMatcher(
                        seq1=artist.lower(),
                        seq2=song_artist.lower()).distance()
                    dist_title = 0
                    if title is not None:
                        dist_title = StringMatcher(
                            seq1=title.lower(),
                            seq2=song_title.lower()).distance()
                    #print (dist_artist, artist, song_artist)
                    #print (title, dist_title)
                    # accept when the artist is close and, if a title was given, the title is close too
                    if dist_artist < 5 and (title is None or dist_title < 10):
                        resp.append(current_song[0].get('href'))
        return resp
Example No. 19
    def get_loosly_matching_keyword(self, term):
        splitted_terms = self.tokenize_text(term)
        max_nb_commun = 0
        most_common_keys = []
        for key, key_tokens in self.tokenized_keys_.items():
            nb_words_common = 0
            for potential_word in splitted_terms:
                if potential_word in key_tokens:
                    nb_words_common += 1
            if nb_words_common > max_nb_commun:
                max_nb_commun = nb_words_common
                most_common_keys = []
            if nb_words_common == max_nb_commun:
                most_common_keys.append(key)

        min_distance = 9999999
        result = None
        for key in most_common_keys:
            match = StringMatcher(seq1=key, seq2=term)
            distance = match.distance()
            if distance < min_distance:
                min_distance = distance
                result = key
        return result
Example No. 20
def find_closest_match(types_dict, search_str):

    closest_match = ''
    closest_distance = DEFAULT_DISTANCE

    for key in types_dict:

        key_distance = StringMatcher(seq1=search_str, seq2=key).distance()

        if key_distance < closest_distance:
            closest_match = key
            closest_distance = key_distance

    closest_type = types_dict[closest_match]

    return closest_match, closest_type
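For instance (a sketch; DEFAULT_DISTANCE is a module constant in the original, so an invented stand-in is used here):

from Levenshtein.StringMatcher import StringMatcher

DEFAULT_DISTANCE = 10  # invented stand-in for the original constant

types_dict = {"address": "informable", "phone": "requestable"}
# "adress" is one edit from "address", so that key wins
print(find_closest_match(types_dict, "adress"))  # -> ('address', 'informable')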
Example No. 21
    def get_artists(self, region, country, number_pages=4):
        country = country.lower()
        country_iso = pycountry.countries.get(name=country).alpha_2
        artists = set()

        for page_number in range(1, number_pages + 1):
            url = "https://freemuse.org/regions" + "/" + region + "/" + country + "/page/" + str(
                page_number)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            items = soup.find_all(class_="item-list")
            for item in items:
                entry_title = item.find(class_="post-box-title")
                entry = item.find(class_="entry")
                title = entry_title.find('a').getText()
                article = entry.find('p').getText()
                doc = self._nlp(f'{title}. {article}')
                # sometimes actually ORG is relevant too
                entities = [
                    x.text for x in doc.ents
                    if x.label_ in ('PERSON', 'ORG')
                ]
                if entities:
                    for entity in set(entities):
                        # cross check person with musicbrainz so we make sure the artist exists in the specified country
                        # TODO: not very robust. find some way to double check
                        try:
                            query = musicbrainzngs.search_artists(
                                artist=entity, country=country_iso)
                            artist_in_musicbrainz = query['artist-list'][0][
                                'name']
                            dist_artist = StringMatcher(
                                seq1=artist_in_musicbrainz.lower(),
                                seq2=entity.lower()).distance()
                            if (dist_artist < 5
                                    and query['artist-list'][0]['country']
                                    == country_iso):
                                artists.add(artist_in_musicbrainz)
                                print(f'Success! Found artist {entity}')
                            else:
                                print(
                                    f'Found name {entity} but not in Musicbrainz, closest artists is {artist_in_musicbrainz}'
                                )
                        except Exception as e:
                            print(f'Something failed for {entity}', e)
        return {country_iso: list(artists)}
Example No. 22
def get_artists_from_freemuse(region, country):
    artists = set()
    # TODO: parse through all pages not just first one
    url = "https://freemuse.org/regions" + "/" + region + "/" + country
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    items = soup.find_all(class_="item-list")
    for item in items:

        entry_title = item.find(class_="post-box-title")
        entry = item.find(class_="entry")

        title = entry_title.find('a').getText()
        article = entry.find('p').getText()

        doc = nlp(f'{title}. {article}')
        # keep only persons
        # TODO: sometimes actually ORG is relevant too
        entities = [x.text for x in doc.ents if (x.label_ == 'PERSON')]
        # entities2 = [x.text for x in doc.ents if (x.label_ == 'ORG')]
        if entities:
            for entity in set(entities):
                # cross check person with musicbrainz so we make sure the artist exists in the specified country
                # TODO: not very robust. find some way to double check
                try:
                    artist_in_musicbrainz = musicbrainzngs.search_artists(
                        artist=entity,
                        country=pycountry.countries.get(
                            name=country).alpha_2)['artist-list'][0]['name']
                    dist_artist = StringMatcher(
                        seq1=artist_in_musicbrainz.lower(),
                        seq2=entity.lower()).distance()
                    # A higher value will allow to match different artists, I think between 5 and 10 should be a good range.
                    if dist_artist < 5:
                        artists.add(artist_in_musicbrainz)
                        print(f'Success! Found artist {entity}')
                    else:
                        print(
                            f'Found name {entity} but not in Musicbrainz, closest artists is {artist_in_musicbrainz} with distance {dist_artist}'
                        )
                except Exception as e:
                    print(f'Something failed for {entity}', e)
    return (artists)
Example No. 23
def get_requestable_from_sentence(sentence: str, requestable_dict):

    no_punctual_sentence = re.sub(r'[^\w\s]', '', sentence)
    lowercase_sentence = no_punctual_sentence.lower()
    split_sentence = re.split(r'\s+', lowercase_sentence)

    results = []
    for word, requestable in requestable_dict.items():

        if word in sentence:
            results.append((word, requestable))

    if len(results) > 0:
        return results

    for word in split_sentence:
        closest_match, closest_requestable = find_closest_match(
            requestable_dict, word)
        word_distance = StringMatcher(seq1=word, seq2=closest_match).distance()

        if word_distance <= 2:
            results.append((closest_match, closest_requestable))

    return results
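End to end (a sketch reusing the invented dict style and the DEFAULT_DISTANCE stand-in from the find_closest_match example; re must be imported for the function to run):

import re

requestable_dict = {"address": "informable", "phone": "requestable"}
# An exact containment hit returns immediately
print(get_requestable_from_sentence("What is the address?", requestable_dict))
# -> [('address', 'informable')]
# A misspelling falls through to the fuzzy path (edit distance <= 2)
print(get_requestable_from_sentence("What is the adress?", requestable_dict))
# -> [('address', 'informable')]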
Example No. 24
    def char_to_char(self, source: str, target: str) -> Matrix:
        # Run Levenshtein at character level.
        sm = StringMatcher(seq1=source, seq2=target)
        mb = sm.get_matching_blocks()
        return self._mat_from_blocks(mb, len(source), len(target))
Example No. 25
def levenshtein_ratio(s1, s2):
    m = StringMatcher(None, s1, s2)
    return truncate(m.ratio(), 2), m.distance()
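truncate is not defined in this snippet; a plausible stand-in (hypothetical: assuming it cuts to n decimal places instead of rounding) would be:

import math

def truncate(value, decimals):
    # Drop digits past the requested precision instead of rounding,
    # e.g. truncate(0.679, 2) -> 0.67 while round(0.679, 2) -> 0.68
    factor = 10 ** decimals
    return math.floor(value * factor) / factor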
Example No. 26
def is_typo(word, word_from_dict):
    sm = StringMatcher()
    sm.set_seq1(word)
    sm.set_seq2(word_from_dict)
    dist = sm.distance()
    return dist == 1 or (dist == 2 and fl(word, word_from_dict))
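fl is an external helper that only gates the distance-2 case; an invented permissive stub is enough to exercise the function (a sketch, assuming the StringMatcher import from earlier examples):

def fl(word, word_from_dict):
    # Invented stub: require matching first letters before accepting
    # a distance-2 pair as a typo.
    return word[:1] == word_from_dict[:1]

print(is_typo("cat", "car"))  # distance 1 -> True
print(is_typo("teh", "the"))  # distance 2, same first letter -> True
print(is_typo("dog", "cat"))  # distance 3 -> False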
Example No. 27
    def getMatch(self, seq1, seq2):
        m = StringMatcher(seq1=seq2, seq2=seq1)
        ratio = m.quick_ratio()
        ratio = round(ratio * 100, 2)
        self.scores.append((seq2, ratio))
Example No. 28
    async def getClosestUser(ctx, user, return_member=False):

        # Thanks to discord's new intent system, this entire function is broken if you are not verified.
        # Because of this, we migrated back to discord.User, which is what this now returns.
        return user

        user = str(user).lower()

        matcher = StringMatcher()

        userdata = []

        for _user in ctx.guild.members:

            member = {
                "name": _user.name,
                "fullname": _user.name + "#" + str(_user.discriminator),
                "mention": "<@!" + str(_user.id) + ">",
                "nickname": _user.display_name,
                "discriminator": str(_user.discriminator),
                "discrim2": "#" + str(_user.discriminator),
                "id": str(_user.id)
            }

            data = {}

            for item in member:

                matcher.set_seqs(user, member[item].lower())

                data[item] = matcher.quick_ratio()

            data["_id"] = member["id"]

            userdata.append(data)

        matches = {}

        for entry in userdata:

            for key in entry:

                if key != "_id":

                    if entry[key] > .5:

                        matches[entry["_id"]] = entry[key]

                        break

        if not matches:

            raise ValueError(f"NoSuchUser: {user}")

        user_id = int(max(matches.items(), key=itemgetter(1))[0])

        if not return_member:

            return bot.get_user(user_id)

        return ctx.guild.get_member(user_id)
Example No. 29
    def getMatchwithThreshold(self, seq1, seq2, score_cutoff=0):
        m = StringMatcher(seq1=seq2, seq2=seq1)
        ratio = m.quick_ratio()
        ratio = round(ratio * 100, 2)
        if ratio >= score_cutoff:
            self.scores.append((seq2, ratio))
Example No. 30
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.
        
        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """
        import sys
        from nltk import word_tokenize
        from chatterbot import utils
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
        global counter
        #global model
        # Use python-Levenshtein if available
        try:
            from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
        except ImportError:
            from difflib import SequenceMatcher
        
        PYTHON = sys.version_info[0]
        
        # Return 0 if either statement has a falsy text value
        if not statement or not other_statement:
            return 0
        # Get the lowercase version of both strings
        if PYTHON < 3:
            statement_text = unicode(statement.lower())
            other_statement_text = unicode(other_statement.lower())
        else:
            statement_text = str(statement.text.lower())
            other_statement_text = str(other_statement.text.lower())
        
        similarity = SequenceMatcher(
            None,
            statement_text,
            other_statement_text
        )
        counter += 1
        #print "calculating similarity ****************************************************************************",counter 
        # Calculate a decimal percent of the similarity
        percent = int(round(100 * similarity.ratio())) / 100.0

        sentence_1 = clean_sent(statement_text).lower().split()
        sentence_2 = clean_sent(other_statement_text).lower().split()

        tokens1 = (sentence_1)
        tokens2 = (sentence_2)
        # Remove all stop words from the list of word tokens
        s1 = utils.remove_stopwords(tokens1, language='english')
        s2 = utils.remove_stopwords(tokens2, language='english')
        #s1 = [w for w in sentence_1 if w not in stop_words]
        #s2 = [w for w in sentence_2 if w not in stop_words]
        
        distance = model.wmdistance(s1, s2)
        if distance == float("inf"):
            return percent
        elif percent > distance:
            if percent - distance < 0.25:
                #print other_statement_text, percent + 0.08, '%', '***DECENT MATCH****'
                #print 'percent: ', percent, 'distance: ', distance
                #print
                return percent + 0.08 + (0.15 * abs(1 - distance))
            else:
                #print other_statement_text, '*****CLOSE MATCH*****'
                #print 'percent: ', percent, 'distance: ', distance
                #print
                return percent + 1.0 + (0.15 * abs(1 - distance))
        elif percent > 0.4:
            if distance - percent < 0.15:
                #print other_statement_text, percent + 0.06, '%'
                #print 'percent: ', percent, 'distance: ', distance
                #print
                return percent + 0.06 + (0.15 * abs(1 - distance))
            else:
                #print other_statement_text, percent - 0.04, '%'
                #print 'percent: ', percent, 'distance: ', distance
                #print
                return (percent - 0.04) - (0.15 * abs(1 - distance))
        # fall back to the plain Levenshtein percent when no branch matches
        return percent
Example No. 31
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """

        PYTHON = sys.version_info[0]

        # Return 0 if either statement has a falsy text value
        if not statement.text or not other_statement.text:
            return 0

        # Get the lowercase version of both strings
        if PYTHON < 3:
            statement_text = unicode(statement.text.lower()) # NOQA
            other_statement_text = unicode(other_statement.text.lower()) # NOQA
        else:
            statement_text = str(statement.text.lower())
            other_statement_text = str(other_statement.text.lower())

        similarity = SequenceMatcher(
            None,
            statement_text,
            other_statement_text
        )

        # Calculate a decimal percent of the similarity
        percent = round(similarity.ratio(), 2)

        return percent
Example No. 32
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """

        # Return 0 if either statement has a falsy text value
        # if not statement.text or not other_statement.text:
        #     return 0
        #
        # statement_text = str(statement.text.lower())
        # other_statement_text = str(other_statement.text.lower())
        if not statement or not other_statement:
            return 0

        statement_text = str(statement.lower())
        other_statement_text = str(other_statement.lower())

        similarity = SequenceMatcher(None, statement_text,
                                     other_statement_text)

        # Calculate a decimal percent of the similarity
        percent = round(similarity.ratio(), 4)

        return percent
Example No. 33
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """

        PYTHON = sys.version_info[0]

        # Return 0 if either statement has a falsy text value
        if not statement.text or not other_statement.text:
            return 0

        # Get the lowercase version of both strings
        if PYTHON < 3:
            statement_text = unicode(statement.text.lower())  # NOQA
            other_statement_text = unicode(
                other_statement.text.lower())  # NOQA
        else:
            statement_text = str(statement.text.lower())
            other_statement_text = str(other_statement.text.lower())

        similarity = SequenceMatcher(None, statement_text,
                                     other_statement_text)

        # Calculate a decimal percent of the similarity
        percent = round(similarity.ratio(), 2)

        return percent