def test_run_pronouns(self):
     """Compare SplatPronouns output with a fixed expected pronoun table.

     The operation is run on one Corpus whose text has two lines; the
     decoded JSON must equal the expected list below exactly.
     """
     self.op = SplatPronouns()
     corpus_text = (
         "He and she jumped over my fence.\nI saw them do so, and I told you."
     )
     self.test_data = [Corpus("0", "Test", corpus_text)]

     actual = json.loads(self.op.run(self.test_data))
     print(actual)

     # Each entry reads: [count, person, type, number].
     first_person = {
         'I': [2, '1st-Person', 'Personal', 'Singular'],
         'ME': [0, '1st-Person', 'Personal', 'Singular'],
         'MY': [1, '1st-Person', 'Possessive', 'Singular'],
         'MINE': [0, '1st-Person', 'Possessive', 'Singular'],
         'MYSELF': [0, '1st-Person', 'Reflexive', 'Singular'],
         'WE': [0, '1st-Person', 'Personal', 'Plural'],
         'US': [0, '1st-Person', 'Personal', 'Plural'],
         'OUR': [0, '1st-Person', 'Possessive', 'Plural'],
         'OURS': [0, '1st-Person', 'Possessive', 'Plural'],
         'OURSELVES': [0, '1st-Person', 'Reflexive', 'Plural'],
     }
     second_person = {
         'YOU': [1, '2nd-Person', 'Personal', 'Singular/Plural'],
         'YOUR': [0, '2nd-Person', 'Possessive', 'Singular/Plural'],
         'YOURS': [0, '2nd-Person', 'Possessive', 'Singular/Plural'],
         'YOURSELF': [0, '2nd-Person', 'Reflexive', 'Singular'],
         'YOURSELVES': [0, '2nd-Person', 'Reflexive', 'Plural'],
     }
     third_person = {
         'HE': [1, '3rd-Person', 'Personal', 'Singular'],
         'HIM': [0, '3rd-Person', 'Personal', 'Singular'],
         'HIS': [0, '3rd-Person', 'Possessive', 'Singular'],
         'HIMSELF': [0, '3rd-Person', 'Reflexive', 'Singular'],
         'SHE': [1, '3rd-Person', 'Personal', 'Singular'],
         'HER': [0, '3rd-Person', 'Personal/Possessive', 'Singular/Plural'],
         'HERS': [0, '3rd-Person', 'Possessive', 'Singular'],
         'HERSELF': [0, '3rd-Person', 'Reflexive', 'Singular'],
         'IT': [0, '3rd-Person', 'Personal', 'Singular'],
         'ITS': [0, '3rd-Person', 'Possessive', 'Singular'],
         'ITSELF': [0, '3rd-Person', 'Reflexive', 'Singular'],
         'THEY': [0, '3rd-Person', 'Personal', 'Plural'],
         'THEM': [1, '3rd-Person', 'Personal', 'Plural'],
         'THEIR': [0, '3rd-Person', 'Possessive', 'Plural'],
         'THEIRS': [0, '3rd-Person', 'Possessive', 'Plural'],
         'THEMSELVES': [0, '3rd-Person', 'Reflexive', 'Plural'],
     }
     expected = [{
         'corpus_id': '0',
         'first-person': first_person,
         'second-person': second_person,
         'third-person': third_person,
         'sentences': [
             'He and she jumped over my fence.',
             'I saw them do so, and I told you.',
         ],
     }]
     self.assertEqual(actual, expected)
 def test_run_pos_frequencies(self):
     """Compare SplatPOSFrequencies output with fixed expected tags and counts.

     The decoded JSON must list the expected words under each POS tag
     and report the expected number of tokens per tag.
     """
     self.op = SplatPOSFrequencies()
     corpus_text = (
         "The very quick brown fox jumped over the lazy dog.\nI saw it happen."
     )
     self.test_data = [Corpus("0", "Test", corpus_text)]

     actual = json.loads(self.op.run(self.test_data))

     words_by_tag = {
         "DT": ["The", "the"],
         "RB": ["very"],
         "JJ": ["quick", "lazy"],
         "NN": ["brown", "fox", "dog"],
         "VBD": ["jumped", "saw"],
         "IN": ["over"],
         "PRP": ["I", "it"],
         "VB": ["happen"],
         ".": ["."],
     }
     # "." is listed once above but counted twice here.
     count_by_tag = {
         "DT": 2,
         "RB": 1,
         "JJ": 2,
         "NN": 3,
         "VBD": 2,
         "IN": 1,
         "PRP": 2,
         "VB": 1,
         ".": 2,
     }
     expected = [{
         "corpus_id": "0",
         "pos_tags": words_by_tag,
         "pos_counts": count_by_tag,
     }]
     self.assertEqual(actual, expected)
    def test_run_disfluency(self):
        """Compare SplatDisfluency output with fixed expected tallies.

        The input text contains one ``{sl}`` marker and one ``uh``; the
        decoded JSON must report them per sentence and in the totals.
        """
        self.op = SplatDisfluency()
        corpus_text = (
            "The quick brown fox {sl} jumped over the lazy dog.\n"
            "I uh saw it happen."
        )
        self.test_data = [Corpus("0", "Test", corpus_text)]

        actual = json.loads(self.op.run(self.test_data))
        print(actual)

        # Baseline tally with every disfluency category at zero; each
        # expected tally below overrides only the categories that occur.
        zero_tally = {
            'SILENT PAUSE': 0,
            'HM': 0,
            'BREAK': 0,
            'UH': 0,
            'UM': 0,
            'AH': 0,
            'REPETITION': 0,
            'ER': 0,
        }
        per_sentence = {
            'uh saw it happen.': {**zero_tally, 'UH': 1},
            'The quick brown fox {sl} jumped over the lazy dog.I': {
                **zero_tally,
                'SILENT PAUSE': 1,
            },
        }
        totals = {**zero_tally, 'SILENT PAUSE': 1, 'UH': 1, 'TOTAL': 2}

        expected = [{
            'corpus_id': '0',
            'sentences': per_sentence,
            'average_disfluencies_per_sentence': 1.0,
            'total_disfluencies': totals,
        }]
        self.assertEqual(actual, expected)
 def test_run_syllables(self):
     """Compare SplatSyllables output with a fixed expected grouping.

     The decoded JSON must group the lower-cased words of the text
     under the keys '1' and '2' exactly as listed below.
     """
     self.op = SplatSyllables()
     corpus_text = (
         "The very quick brown fox jumped over the lazy dog.\nI saw it happen."
     )
     self.test_data = [Corpus("0", "Test", corpus_text)]

     actual = json.loads(self.op.run(self.test_data))

     one_syllable = [
         'the', 'quick', 'brown', 'fox', 'jumped', 'dog', 'i', 'saw', 'it',
     ]
     two_syllables = ['very', 'over', 'lazy', 'happen']
     expected = [{
         'corpus_id': '0',
         'syllables': {'1': one_syllable, '2': two_syllables},
     }]
     self.assertEqual(actual, expected)
 def test_run_complexity(self):
     """Compare SplatComplexity output with fixed expected metrics.

     Both the actual and the expected structures are passed through
     ``round_json_floats`` before being compared.
     """
     self.op = SplatComplexity()
     corpus_text = (
         "The quick brown fox jumped over the lazy dog.\nI saw it happen."
     )
     self.test_data = [Corpus("0", "Test", corpus_text)]

     actual = json.loads(self.op.run(self.test_data))

     expected_metrics = {
         'corpus_id': '0',
         'content_density': [2.0, 2.0, 2.0],
         'idea_density': 0.5,
         'flesch_score': 96.1,
         'kincaid_score': 1.5,
         'types': 12,
         'tokens': 13,
         'type_token_ratio': 0.9230769230769231,
     }
     expected = [expected_metrics]

     rounded_actual = round_json_floats(actual)
     rounded_expected = round_json_floats(expected)
     self.assertEqual(rounded_actual, rounded_expected)
 def test_run_ngrams(self):
     """Compare SplatNGrams output with fixed uni-, bi- and trigram counts.

     Every expected n-gram occurs once, except the unigram "the",
     which occurs twice.
     """
     self.op = SplatNGrams()
     corpus_text = "The quick brown fox jumped over the lazy dog.\n"
     self.test_data = [Corpus("0", "Test", corpus_text)]

     actual = json.loads(self.op.run(self.test_data))

     unigram_counts = dict.fromkeys(
         ["quick", "brown", "fox", "jumped", "over", "lazy", "dog"], 1)
     unigram_counts["the"] = 2

     bigram_counts = dict.fromkeys([
         "the quick",
         "quick brown",
         "brown fox",
         "fox jumped",
         "jumped over",
         "over the",
         "the lazy",
         "lazy dog",
     ], 1)

     trigram_counts = dict.fromkeys([
         "the quick brown",
         "quick brown fox",
         "brown fox jumped",
         "fox jumped over",
         "jumped over the",
         "over the lazy",
         "the lazy dog",
     ], 1)

     expected = [{
         "corpus_id": "0",
         "unigrams": unigram_counts,
         "bigrams": bigram_counts,
         "trigrams": trigram_counts,
     }]
     self.assertEqual(actual, expected)
def get_operation_handler(operation):
    """Build the handler object for a named operation.

    Args:
        operation: Name of the requested operation, e.g. ``'tfidf'``.

    Returns:
        A newly constructed handler instance for ``operation``.

    Raises:
        TransactionException: If ``operation`` is not a known name; the
            message quotes the requested name.

    Note:
        Requesting any ``splat-*`` operation also prints
        ``YOU GOT SPLATTED`` to stdout before the handler is built.
    """
    # Factories are lambdas so that each class name is looked up only when
    # its operation is actually requested.
    plain_factories = {
        'lemmatize_wordnet': lambda: LemmatizerWordNet(),
        'removecapsgreedy': lambda: RemoveCapsGreedy(),
        'removecapsnnp': lambda: RemoveCapsPreserveNNP(),
        'removepunct': lambda: RemovePunct(),
        'removesilence': lambda: RemoveSilence(),
        'remove_stopwords': lambda: RemoveStopwords(),
        'sentence_tokenize': lambda: SentenceTokenize(),
        'removehashtags': lambda: RemoveHashtags(),
        'removequotes': lambda: RemoveQuotes(),
        'stem_porter': lambda: StemmerPorter(),
        # 'stop_words' builds the same handler as 'remove_stopwords'.
        'stop_words': lambda: RemoveStopwords(),
        'tfidf': lambda: Tfidf(),
        'wordcloudop': lambda: WordCloudOp(),
        'word_tokenize_treebank': lambda: WordTokenizeTreebank(),
        'word_tokenize_whitespace_punct':
            lambda: WordTokenizeWhitespacePunct(),
        'word_tokenize_stanford': lambda: WordTokenizeStanford(),
        'nlp-pos': lambda: StanfordCoreNLP('pos'),
        'nlp-ner': lambda: StanfordCoreNLP('ner'),
        'nlp-sentiment': lambda: StanfordCoreNLP('sentiment'),
        'nlp-coref': lambda: StanfordCoreNLP('coref'),
        'nlp-relation': lambda: StanfordCoreNLP('relation'),
        'char-ngrams': lambda: CharNgrams(),
        'length-stats': lambda: LengthStatistics(),
        'topic-model-10': lambda: TopicModel(10),
        'topic-model-30': lambda: TopicModel(30),
        'word-vector': lambda: WordVector(),
        'unsup-morph': lambda: UnsupervisedMorphology(),
        'bigram-array': lambda: BigramArray(),
        'speech-token-stats': lambda: SpeechTokenStatistics(),
        'extract_transcript': lambda: ExtractTranscript(),
        'noop': lambda: NoOp(),
    }
    # Kept separate because these operations print a banner first.
    splat_factories = {
        'splat-disfluency': lambda: SplatDisfluency(),
        'splat-ngrams': lambda: SplatNGrams(),
        'splat-complexity': lambda: SplatComplexity(),
        'splat-pos': lambda: SplatPOSFrequencies(),
        'splat-syllables': lambda: SplatSyllables(),
        'splat-pronouns': lambda: SplatPronouns(),
    }

    # Only strings can name an operation; the guard keeps unhashable
    # arguments on the "does not exist" path instead of failing the lookup.
    if isinstance(operation, str):
        if operation in splat_factories:
            print("YOU GOT SPLATTED")
            return splat_factories[operation]()
        if operation in plain_factories:
            return plain_factories[operation]()

    raise TransactionException(
        f'The requested operation "{operation}" does not exist.')
Beispiel #8
0
def get_operation_handler(operation):
    """Build the handler object for a named operation.

    Args:
        operation: Name of the requested operation, e.g. ``'pos_tag'``.

    Returns:
        A newly constructed handler instance for ``operation``.

    Raises:
        TransactionException: If ``operation`` is not a known name.

    Note:
        Requesting any ``splat-*`` operation also prints
        ``YOU GOT SPLATTED`` to stdout before the handler is built.
    """
    # Factories are lambdas so that each class name is looked up only when
    # its operation is actually requested.
    plain_factories = {
        'lemmatize_wordnet': lambda: LemmatizerWordNet(),
        'pos_tag': lambda: PosTag(),
        'removecapsgreedy': lambda: RemoveCapsGreedy(),
        'removecapsnnp': lambda: RemoveCapsPreserveNNP(),
        'removepunct': lambda: RemovePunct(),
        'removesilence': lambda: RemoveSilence(),
        'remove_stopwords': lambda: RemoveStopwords(),
        'sentence_tokenize': lambda: SentenceTokenize(),
        'stem_porter': lambda: StemmerPorter(),
        # 'stop_words' builds the same handler as 'remove_stopwords'.
        'stop_words': lambda: RemoveStopwords(),
        'tfidf': lambda: Tfidf(),
        'topic_model': lambda: TopicModel(),
        'wordcloudop': lambda: WordCloudOp(),
        'word_tokenize_treebank': lambda: WordTokenizeTreebank(),
        'word_tokenize_whitespace_punct':
            lambda: WordTokenizeWhitespacePunct(),
        'word_tokenize_stanford': lambda: WordTokenizeStanford(),
        'word_tokenize_spaces': lambda: WordTokenizeSpaces(),
        'word_tokenize_tabs': lambda: WordTokenizeTabs(),
        # StanfordCoreNLP receives a list of names for each operation.
        'nlp-pos': lambda: StanfordCoreNLP(['pos']),
        'nlp-ner': lambda: StanfordCoreNLP(['pos', 'ner']),
        'nlp-sentiment': lambda: StanfordCoreNLP(['parse', 'sentiment']),
        'nlp-parse': lambda: StanfordCoreNLP(['parse']),
        'nlp-coref': lambda: StanfordCoreNLP(['tokenize', 'ssplit', 'coref']),
        'nlp-relation': lambda: StanfordCoreNLP(['parse', 'relation']),
        'noop': lambda: NoOp(),
    }
    # Kept separate because these operations print a banner first.
    splat_factories = {
        'splat-disfluency': lambda: SplatDisfluency(),
        'splat-ngrams': lambda: SplatNGrams(),
        'splat-complexity': lambda: SplatComplexity(),
        'splat-pos': lambda: SplatPOSFrequencies(),
        'splat-syllables': lambda: SplatSyllables(),
        'splat-pronouns': lambda: SplatPronouns(),
    }

    # Only strings can name an operation; the guard keeps unhashable
    # arguments on the "does not exist" path instead of failing the lookup.
    if isinstance(operation, str):
        if operation in splat_factories:
            print("YOU GOT SPLATTED")
            return splat_factories[operation]()
        if operation in plain_factories:
            return plain_factories[operation]()

    raise TransactionException("The requested operation does not exist.")
class SplatTest(unittest.TestCase):
    """Checks the JSON output of each Splat* operation against fixed values."""

    def _run_op(self, op, text):
        """Run ``op`` on a single test Corpus holding ``text``.

        Stores the operation and the corpus list on the test instance
        (``self.op`` / ``self.test_data``) and returns the operation's
        output decoded from JSON.
        """
        self.op = op
        self.test_data = [Corpus("0", "Test", text)]
        return json.loads(self.op.run(self.test_data))

    def test_run_disfluency(self):
        """SplatDisfluency reports one silent pause and one 'uh'."""
        actual = self._run_op(
            SplatDisfluency(),
            "The quick brown fox {sl} jumped over the lazy dog.\n"
            "I uh saw it happen.",
        )
        print(actual)

        # Baseline tally with every disfluency category at zero; each
        # expected tally below overrides only the categories that occur.
        zero_tally = {
            'SILENT PAUSE': 0,
            'HM': 0,
            'BREAK': 0,
            'UH': 0,
            'UM': 0,
            'AH': 0,
            'REPETITION': 0,
            'ER': 0,
        }
        expected = [{
            'corpus_id': '0',
            'sentences': {
                'uh saw it happen.': {**zero_tally, 'UH': 1},
                'The quick brown fox {sl} jumped over the lazy dog.I': {
                    **zero_tally,
                    'SILENT PAUSE': 1,
                },
            },
            'average_disfluencies_per_sentence': 1.0,
            'total_disfluencies': {
                **zero_tally,
                'SILENT PAUSE': 1,
                'UH': 1,
                'TOTAL': 2,
            },
        }]
        self.assertEqual(actual, expected)

    def test_run_ngrams(self):
        """SplatNGrams returns the expected uni-, bi- and trigram counts."""
        actual = self._run_op(
            SplatNGrams(),
            "The quick brown fox jumped over the lazy dog.\n",
        )

        # Every n-gram occurs once, except the unigram "the" (twice).
        unigram_counts = dict.fromkeys(
            ["quick", "brown", "fox", "jumped", "over", "lazy", "dog"], 1)
        unigram_counts["the"] = 2

        bigram_counts = dict.fromkeys([
            "the quick",
            "quick brown",
            "brown fox",
            "fox jumped",
            "jumped over",
            "over the",
            "the lazy",
            "lazy dog",
        ], 1)

        trigram_counts = dict.fromkeys([
            "the quick brown",
            "quick brown fox",
            "brown fox jumped",
            "fox jumped over",
            "jumped over the",
            "over the lazy",
            "the lazy dog",
        ], 1)

        expected = [{
            "corpus_id": "0",
            "unigrams": unigram_counts,
            "bigrams": bigram_counts,
            "trigrams": trigram_counts,
        }]
        self.assertEqual(actual, expected)

    def test_run_complexity(self):
        """SplatComplexity metrics match after ``round_json_floats``."""
        actual = self._run_op(
            SplatComplexity(),
            "The quick brown fox jumped over the lazy dog.\nI saw it happen.",
        )

        expected = [{
            'corpus_id': '0',
            'content_density': [2.0, 2.0, 2.0],
            'idea_density': 0.5,
            'flesch_score': 96.1,
            'kincaid_score': 1.5,
            'types': 12,
            'tokens': 13,
            'type_token_ratio': 0.9230769230769231,
        }]

        rounded_actual = round_json_floats(actual)
        rounded_expected = round_json_floats(expected)
        self.assertEqual(rounded_actual, rounded_expected)

    def test_run_pos_frequencies(self):
        """SplatPOSFrequencies returns the expected tags and counts."""
        actual = self._run_op(
            SplatPOSFrequencies(),
            "The very quick brown fox jumped over the lazy dog.\n"
            "I saw it happen.",
        )

        words_by_tag = {
            "DT": ["The", "the"],
            "RB": ["very"],
            "JJ": ["quick", "lazy"],
            "NN": ["brown", "fox", "dog"],
            "VBD": ["jumped", "saw"],
            "IN": ["over"],
            "PRP": ["I", "it"],
            "VB": ["happen"],
            ".": ["."],
        }
        # "." is listed once above but counted twice here.
        count_by_tag = {
            "DT": 2,
            "RB": 1,
            "JJ": 2,
            "NN": 3,
            "VBD": 2,
            "IN": 1,
            "PRP": 2,
            "VB": 1,
            ".": 2,
        }
        expected = [{
            "corpus_id": "0",
            "pos_tags": words_by_tag,
            "pos_counts": count_by_tag,
        }]
        self.assertEqual(actual, expected)

    def test_run_syllables(self):
        """SplatSyllables groups words under the keys '1' and '2'."""
        actual = self._run_op(
            SplatSyllables(),
            "The very quick brown fox jumped over the lazy dog.\n"
            "I saw it happen.",
        )

        one_syllable = [
            'the', 'quick', 'brown', 'fox', 'jumped', 'dog', 'i', 'saw', 'it',
        ]
        two_syllables = ['very', 'over', 'lazy', 'happen']
        expected = [{
            'corpus_id': '0',
            'syllables': {'1': one_syllable, '2': two_syllables},
        }]
        self.assertEqual(actual, expected)

    def test_run_pronouns(self):
        """SplatPronouns returns the expected pronoun table and sentences."""
        actual = self._run_op(
            SplatPronouns(),
            "He and she jumped over my fence.\n"
            "I saw them do so, and I told you.",
        )
        print(actual)

        # Each entry reads: [count, person, type, number].
        first_person = {
            'I': [2, '1st-Person', 'Personal', 'Singular'],
            'ME': [0, '1st-Person', 'Personal', 'Singular'],
            'MY': [1, '1st-Person', 'Possessive', 'Singular'],
            'MINE': [0, '1st-Person', 'Possessive', 'Singular'],
            'MYSELF': [0, '1st-Person', 'Reflexive', 'Singular'],
            'WE': [0, '1st-Person', 'Personal', 'Plural'],
            'US': [0, '1st-Person', 'Personal', 'Plural'],
            'OUR': [0, '1st-Person', 'Possessive', 'Plural'],
            'OURS': [0, '1st-Person', 'Possessive', 'Plural'],
            'OURSELVES': [0, '1st-Person', 'Reflexive', 'Plural'],
        }
        second_person = {
            'YOU': [1, '2nd-Person', 'Personal', 'Singular/Plural'],
            'YOUR': [0, '2nd-Person', 'Possessive', 'Singular/Plural'],
            'YOURS': [0, '2nd-Person', 'Possessive', 'Singular/Plural'],
            'YOURSELF': [0, '2nd-Person', 'Reflexive', 'Singular'],
            'YOURSELVES': [0, '2nd-Person', 'Reflexive', 'Plural'],
        }
        third_person = {
            'HE': [1, '3rd-Person', 'Personal', 'Singular'],
            'HIM': [0, '3rd-Person', 'Personal', 'Singular'],
            'HIS': [0, '3rd-Person', 'Possessive', 'Singular'],
            'HIMSELF': [0, '3rd-Person', 'Reflexive', 'Singular'],
            'SHE': [1, '3rd-Person', 'Personal', 'Singular'],
            'HER': [
                0, '3rd-Person', 'Personal/Possessive', 'Singular/Plural',
            ],
            'HERS': [0, '3rd-Person', 'Possessive', 'Singular'],
            'HERSELF': [0, '3rd-Person', 'Reflexive', 'Singular'],
            'IT': [0, '3rd-Person', 'Personal', 'Singular'],
            'ITS': [0, '3rd-Person', 'Possessive', 'Singular'],
            'ITSELF': [0, '3rd-Person', 'Reflexive', 'Singular'],
            'THEY': [0, '3rd-Person', 'Personal', 'Plural'],
            'THEM': [1, '3rd-Person', 'Personal', 'Plural'],
            'THEIR': [0, '3rd-Person', 'Possessive', 'Plural'],
            'THEIRS': [0, '3rd-Person', 'Possessive', 'Plural'],
            'THEMSELVES': [0, '3rd-Person', 'Reflexive', 'Plural'],
        }
        expected = [{
            'corpus_id': '0',
            'first-person': first_person,
            'second-person': second_person,
            'third-person': third_person,
            'sentences': [
                'He and she jumped over my fence.',
                'I saw them do so, and I told you.',
            ],
        }]
        self.assertEqual(actual, expected)