Code example #1
File: comparisons.py  Project: Gustavo6046/ChatterBot
def synset_distance(statement, other_statement):
    """
    Calculate the similarity of two statements.
    This is based on the maximum synset path similarity between the words of the two sentences.

    This algorithm uses the `wordnet`_ functionality of `NLTK`_ to determine the similarity
    of two statements based on the path similarity between each token of each statement.
    This is essentially an evaluation of the closeness of synonyms.

    :return: The percent of similarity between the two statements, based on the closest synset distance.
    :rtype: float

    .. _wordnet: http://www.nltk.org/howto/wordnet.html
    .. _NLTK: http://www.nltk.org/
    """
    from nltk.corpus import wordnet
    from nltk import word_tokenize
    from chatterbot import utils
    import itertools

    tokens1 = word_tokenize(statement.text.lower())
    tokens2 = word_tokenize(other_statement.text.lower())

    # Remove all stop words from the list of word tokens
    tokens1 = utils.remove_stopwords(tokens1, language='english')
    tokens2 = utils.remove_stopwords(tokens2, language='english')

    # The maximum possible similarity is an exact match
    # Because path_similarity returns a value between 0 and 1,
    # max_possible_similarity is the number of words in the longer
    # of the two input statements.
    max_possible_similarity = max(
        len(statement.text.split()),
        len(other_statement.text.split())
    )

    max_similarity = 0.0

    # Get the highest matching value for each possible combination of words
    for combination in itertools.product(*[tokens1, tokens2]):

        synset1 = wordnet.synsets(combination[0])
        synset2 = wordnet.synsets(combination[1])

        if synset1 and synset2:

            # Get the highest similarity for each combination of synsets
            for synset in itertools.product(*[synset1, synset2]):
                similarity = synset[0].path_similarity(synset[1])

                if similarity and (similarity > max_similarity):
                    max_similarity = similarity

    if max_possible_similarity == 0:
        return 0

    return max_similarity / max_possible_similarity
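
Below is a minimal usage sketch of the function above, assuming ChatterBot is installed and the NLTK corpora it relies on (punkt, wordnet, stopwords) have been downloaded. The stand-in Statement object and the example sentences are purely illustrative and are not part of the original project.

from collections import namedtuple

# Minimal stand-in for ChatterBot's Statement object; synset_distance
# only reads the `text` attribute.
Statement = namedtuple('Statement', ['text'])

statement_a = Statement(text='What time does the shop open?')
statement_b = Statement(text='When does the store open?')

# Returns a float between 0 and 1
print(synset_distance(statement_a, statement_b))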
Code example #2
File: comparisons.py  Project: mohd14shoeb/Chatbot
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the two statements, based on the closest synset distance.
        :rtype: float

        .. _wordnet: http://www.nltk.org/howto/wordnet.html
        .. _NLTK: http://www.nltk.org/
        """
        from nltk.corpus import wordnet
        from nltk import word_tokenize
        from chatterbot import utils
        import itertools

        tokens1 = word_tokenize(statement.text.lower())
        tokens2 = word_tokenize(other_statement.text.lower())

        # Remove all stop words from the list of word tokens
        tokens1 = utils.remove_stopwords(tokens1, language='english')
        tokens2 = utils.remove_stopwords(tokens2, language='english')

        # path_similarity returns a value between 0 and 1.
        # max_possible_similarity here is the ratio of the word count of
        # the shorter statement to the word count of the longer statement.
        max_possible_similarity = min(
            len(statement.text.split()),
            len(other_statement.text.split())
        ) / max(
            len(statement.text.split()),
            len(other_statement.text.split())
        )

        max_similarity = 0.0

        # Get the highest matching value for each possible combination of words
        for combination in itertools.product(*[tokens1, tokens2]):

            synset1 = wordnet.synsets(combination[0])
            synset2 = wordnet.synsets(combination[1])

            if synset1 and synset2:

                # Get the highest similarity for each combination of synsets
                for synset in itertools.product(*[synset1, synset2]):
                    similarity = synset[0].path_similarity(synset[1])

                    if similarity and (similarity > max_similarity):
                        max_similarity = similarity

        if max_possible_similarity == 0:
            return 0

        return max_similarity / max_possible_similarity
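
Note that the normalization in this variant differs from code example #1: the denominator is the ratio of the shorter statement's word count to the longer statement's word count, rather than the longer word count itself. A small sketch of the two denominators, using made-up sentences for illustration:

sentence_1 = 'what time does the shop open'.split()           # 6 words
sentence_2 = 'when does the store open today please'.split()  # 7 words

# Denominator in code example #1: word count of the longer statement
denominator_v1 = max(len(sentence_1), len(sentence_2))  # 7

# Denominator in code example #2: shorter / longer word-count ratio
denominator_v2 = min(len(sentence_1), len(sentence_2)) / max(len(sentence_1), len(sentence_2))  # 6/7, about 0.857

print(denominator_v1, denominator_v2)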
Code example #3
File: test_utils.py  Project: PrabinPC/ChatterBot-1
    def test_remove_stop_words(self):
        from chatterbot import utils
        from chatterbot.utils import nltk_download_corpus

        nltk_download_corpus('stopwords')

        tokens = ['this', 'is', 'a', 'test', 'string']
        words = utils.remove_stopwords(tokens, 'english')

        # This example list of words should end up with only two elements
        self.assertEqual(len(words), 2)
        self.assertIn('test', list(words))
        self.assertIn('string', list(words))
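
For reference, a minimal standalone sketch of the same filtering done directly with NLTK's stopword corpus. It assumes that remove_stopwords simply drops tokens found in nltk.corpus.stopwords.words(language), which is the behaviour the test above expects; the actual ChatterBot implementation may differ in detail.

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)

def remove_stopwords_sketch(tokens, language='english'):
    """Drop any token that appears in NLTK's stop word list for `language`."""
    stop_words = set(stopwords.words(language))
    return [token for token in tokens if token not in stop_words]

print(remove_stopwords_sketch(['this', 'is', 'a', 'test', 'string']))
# ['test', 'string'] - the two elements the assertions above check for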
Code example #4
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.
        
        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """
        import sys
        import logging
        from chatterbot import utils

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

        # `counter` and `model` are module-level globals in this project:
        # `counter` tracks how many comparisons have been made, and `model`
        # is expected to be a gensim word-embedding model used for the
        # Word Mover's Distance calculation below.
        global counter

        # Use python-Levenshtein if available, otherwise fall back to difflib
        try:
            from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
        except ImportError:
            from difflib import SequenceMatcher

        PYTHON = sys.version_info[0]

        # Return 0 if either statement has a falsy text value
        if not statement.text or not other_statement.text:
            return 0

        # Get the lowercase version of both strings
        if PYTHON < 3:
            statement_text = unicode(statement.text.lower())
            other_statement_text = unicode(other_statement.text.lower())
        else:
            statement_text = str(statement.text.lower())
            other_statement_text = str(other_statement.text.lower())
        
        similarity = SequenceMatcher(
            None,
            statement_text,
            other_statement_text
        )
        counter += 1

        # Calculate a decimal percent of the character-level similarity
        percent = int(round(100 * similarity.ratio())) / 100.0

        # Tokenize both statements; `clean_sent` is a cleaning helper
        # defined elsewhere in this project.
        tokens1 = clean_sent(statement_text).lower().split()
        tokens2 = clean_sent(other_statement_text).lower().split()

        # Remove all stop words from the lists of word tokens
        s1 = utils.remove_stopwords(tokens1, language='english')
        s2 = utils.remove_stopwords(tokens2, language='english')

        # Word Mover's Distance between the two token lists, computed with
        # the module-level gensim model (gensim returns `inf` when one of
        # the documents has no words in the model's vocabulary).
        distance = model.wmdistance(s1, s2)
        # `infinity` is defined elsewhere in this project (presumably
        # float('inf')); if the Word Mover's Distance could not be
        # computed, fall back to the plain string-similarity percent.
        if distance == infinity:
            return percent

        elif percent > distance:
            if percent - distance < 0.25:
                # Decent match: small boost to the score
                return percent + 0.08 + (0.15 * abs(1 - distance))
            else:
                # Close match: large boost to the score
                return percent + 1.0 + (0.15 * abs(1 - distance))
        elif percent > 0.4:
            if distance - percent < 0.15:
                return percent + 0.06 + (0.15 * abs(1 - distance))
            else:
                return (percent - 0.04) - (0.15 * abs(1 - distance))