def test_run(self):
     """Check per-term importance records produced by the op under test.

     Two tiny corpora are tokenized, run through ``self.op``, and every
     emitted record must appear in the expected set (order-independent).
     """
     test_data = [
         Corpus("0", "hello", "hello world"),
         Corpus("1", "goodbye", "goodbye world")
     ]
     test_data = WordTokenizeWhitespacePunct().run(test_data)
     desired_results = [{
         "term": "hello",
         "importance": 0.0,
         "corpus_id": "0"
     }, {
         "term": "world",
         "importance": -0.4054651081081644,
         "corpus_id": "0"
     }, {
         "term": "goodbye",
         "importance": 0.0,
         "corpus_id": "1"
     }, {
         "term": "world",
         "importance": -0.4054651081081644,
         "corpus_id": "1"
     }]
     # Round floats on both sides so the comparison tolerates
     # floating-point representation noise.
     desired_results = round_json_floats(desired_results)
     results = round_json_floats(self.op.run(test_data))
     for result in results:
         # assertIn reports the offending record and the expected set on
         # failure, unlike assertTrue(result in desired_results).
         self.assertIn(result, desired_results)
 def test_run(self):
     """The op reports the expected frequency for each distinct term."""
     corpora = [Corpus("0", "hello", "hello world hello hello world test")]
     WordTokenizeWhitespacePunct().run(corpora)
     expected = [
         {"term": "hello", "frequency": 3},
         {"term": "world", "frequency": 2},
         {"term": "test", "frequency": 1},
     ]
     results = self.op.run(corpora)
     self.assertEqual(results["sentences"], expected)
# Example #3
# 0
 def test_run(self):
     """Corpus contents match the expected stopword-free text after the op."""
     corpora = WordTokenizeWhitespacePunct().run([
         Corpus("0", "", "the quick brown fox jumps over the lazy dog")
     ])
     expected = {"0": "quick brown fox jumps lazy dog"}
     results = self.op.run(corpora)
     self.assertIsNotNone(results)
     for corpus in results:
         self.assertEqual(corpus.contents, expected[corpus.id])
# Example #4
# 0
 def test_run_whitespace_punct(self):
     """WordTokenizeWhitespacePunct splits each corpus into word tokens."""
     self.op = WordTokenizeWhitespacePunct()
     corpora = [
         Corpus("0", "hello", "hello world"),
         Corpus("1", "goodbye", "goodbye world"),
     ]
     expected = {"0": ["hello", "world"], "1": ["goodbye", "world"]}
     results = self.op.run(corpora)
     self.assertIsNotNone(results)
     for corpus in results:
         self.assertEqual(corpus.tokenized_contents, expected[corpus.id])
 def test_run(self):
     """Corpus contents match the expected normalized (lemma) text."""
     corpora = WordTokenizeWhitespacePunct().run([
         Corpus(
             "0", "",
             "strange women lying in ponds distributing swords is no basis for a system of government"
         )
     ])
     expected = {
         "0":
         "strange woman lie in pond distribute sword be no basis for a system of government"
     }
     results = self.op.run(corpora)
     self.assertIsNotNone(results)
     for corpus in results:
         self.assertEqual(corpus.contents, expected[corpus.id])
# Example #6
# 0
def get_operation_handler(operation):
    """Return a freshly-constructed handler for the named operation.

    Args:
        operation: string key identifying the requested operation.

    Returns:
        A new operation-handler instance.

    Raises:
        TransactionException: if ``operation`` is not a known key.
    """
    # Dispatch table of zero-argument factories: O(1) lookup and far easier
    # to scan and extend than the original long if/elif chain.  Values are
    # classes or lambdas, so nothing is instantiated until requested.
    handlers = {
        'lemmatize_wordnet': LemmatizerWordNet,
        'pos_tag': PosTag,
        'removecapsgreedy': RemoveCapsGreedy,
        'removecapsnnp': RemoveCapsPreserveNNP,
        'removepunct': RemovePunct,
        'remove_stopwords': RemoveStopwords,
        'sentence_tokenize': SentenceTokenize,
        'stem_porter': StemmerPorter,
        'stem_lancaster': StemmerLancaster,
        'stem_snowball': StemmerSnowball,
        'tfidf': Tfidf,
        'topic_model': TopicModel,
        'wordcloudop': WordCloudOp,
        'word_tokenize_treebank': WordTokenizeTreebank,
        'word_tokenize_whitespace_punct': WordTokenizeWhitespacePunct,
        'word_tokenize_stanford': WordTokenizeStanford,
        'word_tokenize_spaces': WordTokenizeSpaces,
        'word_tokenize_tabs': WordTokenizeTabs,
        'nlp-pos': lambda: StanfordCoreNLP(['pos']),
        'nlp-ner': lambda: StanfordCoreNLP(['pos', 'ner']),
        'noop': NoOp,
    }
    try:
        factory = handlers[operation]
    except KeyError:
        # Suppress the KeyError context: the domain exception is the answer.
        raise TransactionException(
            "The requested operation does not exist.") from None
    return factory()
# Example #7
# 0
 def test_porter(self):
     """Each token is reduced to its expected Porter stem."""
     words = [
         'strange', 'women', 'lying', 'ponds', 'distributing',
         'swords', 'no', 'basis', 'system', 'government'
     ]
     corpora = WordTokenizeWhitespacePunct().run(
         [Corpus("0", "", ' '.join(words))])
     expected = {
         "0": [
             'strang', 'women', 'lie', 'pond', 'distribut', 'sword', 'no',
             'basi', 'system', 'govern'
         ]
     }
     results = self.op.run(corpora)
     self.assertIsNotNone(results)
     for corpus in results:
         self.assertEqual(corpus.tokenized_contents, expected[corpus.id])
def get_operation_handler(operation):
    """Return a freshly-constructed handler for the named operation.

    Args:
        operation: string key identifying the requested operation.

    Returns:
        A new operation-handler instance.

    Raises:
        TransactionException: if ``operation`` is not a known key.
    """

    def _splat(factory):
        """Wrap a SPLAT factory, keeping the pre-existing debug print."""
        def _make():
            print("YOU GOT SPLATTED")
            return factory()
        return _make

    # Dispatch table of zero-argument factories: O(1) lookup and far easier
    # to scan and extend than the original long if/elif chain.  Values are
    # classes or lambdas, so nothing is instantiated until requested.
    handlers = {
        'lemmatize_wordnet': LemmatizerWordNet,
        'removecapsgreedy': RemoveCapsGreedy,
        'removecapsnnp': RemoveCapsPreserveNNP,
        'removepunct': RemovePunct,
        'removesilence': RemoveSilence,
        'remove_stopwords': RemoveStopwords,
        'sentence_tokenize': SentenceTokenize,
        'removehashtags': RemoveHashtags,
        'removequotes': RemoveQuotes,
        'stem_porter': StemmerPorter,
        # Legacy alias for 'remove_stopwords' kept for compatibility.
        'stop_words': RemoveStopwords,
        'tfidf': Tfidf,
        'wordcloudop': WordCloudOp,
        'word_tokenize_treebank': WordTokenizeTreebank,
        'word_tokenize_whitespace_punct': WordTokenizeWhitespacePunct,
        'word_tokenize_stanford': WordTokenizeStanford,
        'nlp-pos': lambda: StanfordCoreNLP('pos'),
        'nlp-ner': lambda: StanfordCoreNLP('ner'),
        'nlp-sentiment': lambda: StanfordCoreNLP('sentiment'),
        'nlp-coref': lambda: StanfordCoreNLP('coref'),
        'nlp-relation': lambda: StanfordCoreNLP('relation'),
        'splat-disfluency': _splat(SplatDisfluency),
        'splat-ngrams': _splat(SplatNGrams),
        'splat-complexity': _splat(SplatComplexity),
        'splat-pos': _splat(SplatPOSFrequencies),
        'splat-syllables': _splat(SplatSyllables),
        'splat-pronouns': _splat(SplatPronouns),
        'char-ngrams': CharNgrams,
        'length-stats': LengthStatistics,
        'topic-model-10': lambda: TopicModel(10),
        'topic-model-30': lambda: TopicModel(30),
        'word-vector': WordVector,
        'unsup-morph': UnsupervisedMorphology,
        'bigram-array': BigramArray,
        'speech-token-stats': SpeechTokenStatistics,
        'extract_transcript': ExtractTranscript,
        'noop': NoOp,
    }
    try:
        factory = handlers[operation]
    except KeyError:
        # Suppress the KeyError context: the domain exception is the answer.
        raise TransactionException(
            f'The requested operation "{operation}" does not exist.') from None
    return factory()
# Example #9
# 0
def get_operation_handler(operation):
    """Return a freshly-constructed handler for the named operation.

    Args:
        operation: string key identifying the requested operation.

    Returns:
        A new operation-handler instance.

    Raises:
        TransactionException: if ``operation`` is not a known key.
    """

    def _splat(factory):
        """Wrap a SPLAT factory, keeping the pre-existing debug print."""
        def _make():
            print("YOU GOT SPLATTED")
            return factory()
        return _make

    # Dispatch table of zero-argument factories: O(1) lookup and far easier
    # to scan and extend than the original long if/elif chain.  Values are
    # classes or lambdas, so nothing is instantiated until requested.
    handlers = {
        'lemmatize_wordnet': LemmatizerWordNet,
        'pos_tag': PosTag,
        'removecapsgreedy': RemoveCapsGreedy,
        'removecapsnnp': RemoveCapsPreserveNNP,
        'removepunct': RemovePunct,
        'removesilence': RemoveSilence,
        'remove_stopwords': RemoveStopwords,
        'sentence_tokenize': SentenceTokenize,
        'stem_porter': StemmerPorter,
        # Legacy alias for 'remove_stopwords' kept for compatibility.
        'stop_words': RemoveStopwords,
        'tfidf': Tfidf,
        'topic_model': TopicModel,
        'wordcloudop': WordCloudOp,
        'word_tokenize_treebank': WordTokenizeTreebank,
        'word_tokenize_whitespace_punct': WordTokenizeWhitespacePunct,
        'word_tokenize_stanford': WordTokenizeStanford,
        'word_tokenize_spaces': WordTokenizeSpaces,
        'word_tokenize_tabs': WordTokenizeTabs,
        'nlp-pos': lambda: StanfordCoreNLP(['pos']),
        'nlp-ner': lambda: StanfordCoreNLP(['pos', 'ner']),
        'nlp-sentiment': lambda: StanfordCoreNLP(['parse', 'sentiment']),
        'nlp-parse': lambda: StanfordCoreNLP(['parse']),
        'nlp-coref': lambda: StanfordCoreNLP(['tokenize', 'ssplit', 'coref']),
        'nlp-relation': lambda: StanfordCoreNLP(['parse', 'relation']),
        'splat-disfluency': _splat(SplatDisfluency),
        'splat-ngrams': _splat(SplatNGrams),
        'splat-complexity': _splat(SplatComplexity),
        'splat-pos': _splat(SplatPOSFrequencies),
        'splat-syllables': _splat(SplatSyllables),
        'splat-pronouns': _splat(SplatPronouns),
        'noop': NoOp,
    }
    try:
        factory = handlers[operation]
    except KeyError:
        # Suppress the KeyError context: the domain exception is the answer.
        raise TransactionException(
            "The requested operation does not exist.") from None
    return factory()