def compute_levensthein_distances_in_ground_truth():
    """Print the matcher ratio for every document pair of the ground truth.

    The pairs are read from ``test-lab.xlsx``: the first two columns hold the
    document names and the third the value printed as the pair's label.
    Each name is resolved by prefix to a label returned by ``load_data()``,
    and one line ``doc1,doc2,label,ratio`` is printed per spreadsheet row.
    """
    labels, docs = load_data()
    # The result is not used below; the call is kept exactly as it was.
    s, y_t = load_SME_binary()
    rows = pd.read_excel(r'test-lab.xlsx').values

    def resolve(name):
        # The last label starting with ``name`` wins; when no label matches,
        # the name itself is used for the lookup.
        resolved = name
        for candidate in labels:
            if candidate.startswith(name):
                resolved = candidate
        return resolved

    for row in rows:
        doc1 = row[0].strip()
        full_label1 = resolve(doc1)
        doc2 = row[1].strip()
        full_label2 = resolve(doc2)
        label = row[2]
        text1 = docs[labels.index(full_label1)]
        text2 = docs[labels.index(full_label2)]
        matcher = StringMatcher(seq1=text1, seq2=text2)
        print(doc1, doc2, label, matcher.ratio(), sep=",")
    def compare(self, first_statement, second_statement):
        """
        Compare two strings without regard to case.

        :param first_statement: First string to compare.
        :param second_statement: Second string to compare.
        :return: The matcher's similarity ratio, rounded to 10 decimals.
        :rtype: float
        """
        # Lowercase both inputs so the comparison is case-insensitive.
        lowered = [
            text.lower() for text in (first_statement, second_statement)
        ]
        matcher = SequenceMatcher(None, *lowered)
        return round(matcher.ratio(), 10)
Ejemplo n.º 3
0
def levenshtein_distance(statement, other_statement):
    """
    Score how similar the text of two statements is, ignoring case.

    ``StringMatcher`` from python-Levenshtein is used when that package can
    be imported; otherwise ``difflib.SequenceMatcher`` is the fallback.

    :param statement: Object exposing the text to compare as ``.text``.
    :param other_statement: Object exposing the text to compare as ``.text``.
    :return: The matcher's similarity ratio rounded to two decimals, or 0
        when either text is falsy.
    :rtype: float
    """
    import sys

    # Prefer the python-Levenshtein matcher; difflib is the fallback.
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    # Nothing to compare when either text is empty or missing.
    if not (statement.text and other_statement.text):
        return 0

    # Python 2 needs an explicit unicode conversion; Python 3 uses str.
    to_text = unicode if sys.version_info[0] < 3 else str  # NOQA
    first = to_text(statement.text.lower())
    second = to_text(other_statement.text.lower())

    return round(SequenceMatcher(None, first, second).ratio(), 2)
Ejemplo n.º 4
0
def levenshtein_distance(statement, other_statement):
    """
    Rate the case-insensitive similarity of two statements' texts.

    The matcher comes from python-Levenshtein when it is installed and from
    ``difflib`` otherwise.

    :param statement: Object whose ``.text`` is the first string.
    :param other_statement: Object whose ``.text`` is the second string.
    :return: The matcher's ratio expressed in whole hundredths (for example
        0.75), or 0 when either text is falsy.
    :rtype: float
    """
    import sys

    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        # python-Levenshtein is optional; the standard library also works.
        from difflib import SequenceMatcher

    # Guard clauses: a falsy text on either side scores zero.
    if not statement.text:
        return 0
    if not other_statement.text:
        return 0

    # Choose the text type that matches the running interpreter.
    if sys.version_info[0] >= 3:
        normalise = str
    else:
        normalise = unicode  # NOQA

    lowered_a = normalise(statement.text.lower())
    lowered_b = normalise(other_statement.text.lower())
    matcher = SequenceMatcher(None, lowered_a, lowered_b)

    # Round to a whole number of hundredths, then scale back to a fraction.
    hundredths = int(round(100 * matcher.ratio()))
    return hundredths / 100.0
def compute_levensthein_distances_in_clusters(output_file="",
                                              embedder="doc2vec"):
    """Print the pairwise matcher ratios of the documents in each cluster.

    For every cluster label other than ``-1`` one line is printed: the label
    followed by the ``StringMatcher`` ratio of every pair of documents
    assigned to that cluster, all comma-separated.

    :param output_file: Accepted for backward compatibility; not used.
    :param embedder: ``"doc2vec"`` (clusters from ``perform_clustering()``)
        or ``"sbert"`` (clusters from ``sbert_labels()``). Any other value
        prints an error message and returns.
    :return: None
    """
    import warnings

    doc_labels, docs = load_data()
    if embedder == "doc2vec":
        cluster_ids, _, labels = perform_clustering()
    elif embedder == "sbert":
        cluster_ids, _, labels = sbert_labels()
    else:
        print("wrong input to eval method")
        return

    def find_doc_index(prefix):
        # The last document label starting with ``prefix`` wins, as before.
        # None (rather than -1) marks "not found" so that a missing document
        # can never silently alias docs[-1].
        found = None
        for idx, doc_label in enumerate(doc_labels):
            if doc_label.startswith(prefix):
                found = idx
        return found

    for lbl in set(cluster_ids):
        # -1 is skipped -- presumably the clustering's noise label.
        if lbl == -1:
            continue
        print(lbl, end=",")

        # Keys of ``labels`` that were assigned to cluster ``lbl``.
        members = [key for key, value in labels.items() if value == lbl]

        # Resolve every member once and independently of the others. The old
        # per-pair scan used ``if a ... elif b ...``, so a label matching both
        # prefixes never updated the second index.
        positions = {key: find_doc_index(key) for key in members}

        for a, b in combinations(members, 2):
            idx_a = positions[a]
            idx_b = positions[b]
            if idx_a is None or idx_b is None:
                # Skip instead of comparing against an unrelated document.
                warnings.warn(
                    "no document found for cluster member(s) %r / %r; "
                    "pair skipped" % (a, b)
                )
                continue
            m = StringMatcher(seq1=docs[idx_a], seq2=docs[idx_b])
            print(m.ratio(), end=",")
        print()
    return
Ejemplo n.º 6
0
def levenshtein_distance(statement, other_statement):
    """
    Compare two statements based on the Levenshtein distance
    (fuzzy string comparison) of each statement's text.

    ``StringMatcher`` from python-Levenshtein is used when available;
    otherwise ``difflib.SequenceMatcher`` is the fallback.

    :param statement: Object exposing the text to compare as ``.text``.
    :param other_statement: Object exposing the text to compare as ``.text``.
    :return: The percent of similarity between the text of the statements
        (in whole hundredths), or 0 when either text is falsy.
    :rtype: float
    """
    import sys

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    PYTHON = sys.version_info[0]

    # Return 0 if either statement has a falsy text value. The second operand
    # must test ``other_statement``: previously ``statement`` was tested
    # twice, so a ``None`` text on the other side crashed in ``.lower()``.
    if not statement.text or not other_statement.text:
        return 0

    # Get the lowercase version of both strings
    if PYTHON < 3:
        statement_text = unicode(statement.text.lower())  # NOQA
        other_statement_text = unicode(other_statement.text.lower())  # NOQA
    else:
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

    similarity = SequenceMatcher(
        None,
        statement_text,
        other_statement_text
    )

    # Calculate a decimal percent of the similarity
    percent = int(round(100 * similarity.ratio())) / 100.0

    return percent
Ejemplo n.º 7
0
    def compare(self, statement, other_statement):
        """
        Rate how similar the texts of two statements are, ignoring case.

        The matcher is python-Levenshtein's ``StringMatcher`` when that
        package is importable and ``difflib.SequenceMatcher`` otherwise.

        :param statement: Object exposing the text to compare as ``.text``.
        :param other_statement: Object exposing the text as ``.text``.
        :return: The matcher's ratio rounded to two decimals, or 0 when
            either text is falsy.
        :rtype: float
        """
        import sys

        try:
            from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
        except ImportError:
            # Optional dependency missing: use the standard library instead.
            from difflib import SequenceMatcher

        # A falsy text on either side means there is nothing to compare.
        if not (statement.text and other_statement.text):
            return 0

        # Python 2 needs unicode objects; Python 3 strings already are.
        as_text = unicode if sys.version_info[0] < 3 else str  # NOQA
        lowered_pair = (
            as_text(statement.text.lower()),
            as_text(other_statement.text.lower()),
        )

        matcher = SequenceMatcher(None, *lowered_pair)
        return round(matcher.ratio(), 2)
Ejemplo n.º 8
0
    def compare(self, statement, other_statement):
        """
        Compare the two inputs.

        :param statement: Object exposing the text to compare as ``.text``.
        :param other_statement: Object exposing the text as ``.text``.
        :return: The similarity between the two sentences, rounded to two
            decimals, or 0 when either text is falsy.
        :rtype: float
        """
        # A falsy text on either side scores zero.
        if not statement.text:
            return 0
        if not other_statement.text:
            return 0

        # Compare the lowercase forms so that case does not matter.
        first = str(statement.text.lower())
        second = str(other_statement.text.lower())

        ratio = SequenceMatcher(None, first, second).ratio()
        return round(ratio, 2)
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        The score starts from the character-level similarity ratio of the
        lowercased texts and is then adjusted using ``model.wmdistance``
        (presumably gensim's Word Mover's Distance -- confirm against the
        module-level ``model``) computed on the stop-word-free tokens.

        Relies on the module-level names ``logging``, ``counter``,
        ``clean_sent``, ``model`` and ``infinity``.

        :param statement: Statement object exposing ``.text``, or a plain
            string.
        :param other_statement: Statement object exposing ``.text``, or a
            plain string.
        :return: The adjusted similarity score; 0 when either text is falsy.
            Because of the adjustments the score is not confined to [0, 1].
        :rtype: float
        """
        import sys
        from chatterbot import utils
        global counter

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

        # Use python-Levenshtein if available
        try:
            from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
        except ImportError:
            from difflib import SequenceMatcher

        # Accept statement objects and plain strings alike. Previously the
        # guard tested the objects while the Python 3 branch read ``.text``
        # and the Python 2 branch treated the inputs as strings.
        raw_text = getattr(statement, "text", statement)
        other_raw_text = getattr(other_statement, "text", other_statement)

        # Return 0 if either statement has a falsy text value
        if not raw_text or not other_raw_text:
            return 0

        # Get the lowercase version of both strings
        to_text = unicode if sys.version_info[0] < 3 else str  # NOQA
        statement_text = to_text(raw_text.lower())
        other_statement_text = to_text(other_raw_text.lower())

        similarity = SequenceMatcher(
            None,
            statement_text,
            other_statement_text
        )
        # Module-level tally of the comparisons performed so far.
        counter += 1

        # Calculate a decimal percent of the similarity
        percent = int(round(100 * similarity.ratio())) / 100.0

        tokens = clean_sent(statement_text).lower().split()
        other_tokens = clean_sent(other_statement_text).lower().split()

        # Remove all stop words from the list of word tokens
        s1 = utils.remove_stopwords(tokens, language='english')
        s2 = utils.remove_stopwords(other_tokens, language='english')

        # Computed once; the previous duplicate call's result was never used.
        distance = model.wmdistance(s1, s2)
        if distance == infinity:
            # No usable word-level distance: keep the plain character score.
            return percent

        # Term shared by every adjustment below.
        closeness = 0.15 * abs(1 - distance)

        if percent > distance:
            if percent - distance < 0.25:
                # "Decent match"
                return percent + 0.08 + closeness
            # "Close match"
            # NOTE(review): the 1.0 bonus dwarfs the 0.06-0.08 used in the
            # other branches -- confirm it is intended.
            return percent + 1.0 + closeness

        if percent > 0.4:
            if distance - percent < 0.15:
                return percent + 0.06 + closeness
            return (percent - 0.04) - closeness

        # Low character similarity and a larger word distance: this case used
        # to fall off the end and return None, which is not a usable score.
        return percent
def levenshtein_ratio(s1, s2):
    """Return ``(ratio, distance)`` for two strings.

    ``ratio`` is the ``StringMatcher`` ratio passed through
    ``truncate(..., 2)``; ``distance`` is the matcher's ``distance()``.
    """
    matcher = StringMatcher(None, s1, s2)
    ratio = truncate(matcher.ratio(), 2)
    distance = matcher.distance()
    return ratio, distance
Ejemplo n.º 11
-1
    def compare(self, statement, other_statement):
        """
        Rate the similarity of two statements' texts, ignoring case.

        :param statement: Object exposing the text to compare as ``.text``.
        :param other_statement: Object exposing the text as ``.text``.
        :return: The matcher's ratio rounded to two decimals, or 0 when
            either text is falsy.
        :rtype: float
        """
        running_python2 = sys.version_info[0] < 3

        # An empty or missing text on either side scores zero.
        if not (statement.text and other_statement.text):
            return 0

        left = statement.text.lower()
        right = other_statement.text.lower()

        # Convert to the interpreter's text type before matching.
        if running_python2:
            left, right = unicode(left), unicode(right)  # NOQA
        else:
            left, right = str(left), str(right)

        matcher = SequenceMatcher(None, left, right)
        return round(matcher.ratio(), 2)
Ejemplo n.º 12
-1
    def compare(self, statement, other_statement):
        """
        Score the case-insensitive similarity of two statements' texts.

        :param statement: Object exposing the text to compare as ``.text``.
        :param other_statement: Object exposing the text as ``.text``.
        :return: The matcher's ratio rounded to two decimals, or 0 when
            either text is falsy.
        :rtype: float
        """
        major = sys.version_info[0]

        # Guard clauses: a falsy text on either side scores zero.
        if not statement.text:
            return 0
        if not other_statement.text:
            return 0

        # ``unicode`` only exists on Python 2; Python 3 uses ``str``.
        convert = str if major >= 3 else unicode  # NOQA

        matcher = SequenceMatcher(
            None,
            convert(statement.text.lower()),
            convert(other_statement.text.lower()),
        )
        return round(matcher.ratio(), 2)
Ejemplo n.º 13
-1
    def compare(self, statement, other_statement):
        """
        Compare two plain strings without regard to case.

        Unlike the statement-object variants, the inputs are used directly
        as strings; no ``.text`` attribute is read.

        :param statement: First string to compare.
        :param other_statement: Second string to compare.
        :return: The matcher's ratio rounded to four decimals, or 0 when
            either input is falsy.
        :rtype: float
        """
        # A falsy input on either side scores zero.
        if not (statement and other_statement):
            return 0

        lowered = (str(statement.lower()), str(other_statement.lower()))
        matcher = SequenceMatcher(None, *lowered)

        return round(matcher.ratio(), 4)