Code Example #1
class TestWordCounter(unittest.TestCase):
    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = ('ToyA', ['::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::'])
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
Code Example #2
class TestWordCounter(unittest.TestCase):
    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = [
            "1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"
        ]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = [
            "1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"
        ]
        result = (('ToyA', [
            '::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::'
        ]))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
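
The two test classes above only pin down the contract of WordCounter.getMaxValues: given an RDD of "<id>::<title>::<genres>" lines, return the most frequent title word(s) together with the titles they appear in. The project's actual implementation is not shown on this page; the following is only a rough sketch of that contract (tie ordering and the exact return shape of the real class are glossed over):

class WordCounter(object):
    """Hypothetical sketch only; not the implementation the tests were written against."""

    def getMaxValues(self, movies):
        def title_word_pairs(line):
            # "<id>::<title>::<genres>" -> one (word, "::<title>::") pair per word,
            # skipping the "(1995)" year token
            title = line.split('::')[1]
            return [(word, '::%s::' % title)
                    for word in title.split() if not word.startswith('(')]

        def summarize(word_and_titles):
            word, titles = word_and_titles[0], list(word_and_titles[1])
            return (word, (len(titles), sorted(set(titles))))

        # (word, (occurrence count, titles containing it)) for every title word
        stats = movies.flatMap(title_word_pairs).groupByKey().map(summarize).cache()
        top = stats.map(lambda item: item[1][0]).max()
        winners = stats.filter(lambda item: item[1][0] == top)
        return tuple((word, titles) for word, (_count, titles) in winners.collect())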
Code Example #3
    def testWordTally(self):
        wc = WordCounter()
        basicString = "I am bad bad bad at python, its amazing how quickly you can forget python"
        wc.processString(basicString)

        wordList = wc.getWordList()

        self.assertEqual(3, wordList['bad'])
        self.assertEqual(2, wordList['python'])
Code Example #4
 def testCountWords(self):
     """
     Function that tests the countWords function within the
     Word Counter class.
     """
     # Number of words in text4.txt
     num = 200
     tester = WordCounter("tests/text4.txt")
     self.assertEqual(tester.countWords(), num)
Code Example #5
File: document_process.py  Project: Wilfongjt/soke2
    def __init__(self, table_name_key):
        DataProcessInterface.__init__(self, table_name_key)

        #self.output_filename = parameters.get('output_filename')
        #self.output_folder = parameters.get('output_folder')
        util = Util()

        self.input_folder = util.getInputFolder(table_name_key)

        self.doc_no = 1  # assuming all docs are read from the beginning to the end
        self.para_no = 1
        self.sentance_no = 1
        self.word_no = 1
        self.wordCounter = WordCounter()
        #self.bufferedWriter = BufferedWriter(self.output_folder,
        #                                     self.output_filename)

        self.bufferedWriter = BufferedWriter(
            util.getOutputFolder(table_name_key),
            '{}.json'.format(table_name_key))

        self.dict_ = {
            'documents': {},
            'paragraphs': {},
            'words': {},
            'word-documents': {}
        }

        self.current_doc_name = None
Code Example #6
class WordCounterFixtureTests(TestCase):
    def setUp(self):
        """Prepare before each test"""
        MOBYDICK_SUMMARY = open('mobydick_summary.txt').read()
        self.text = TextBody(MOBYDICK_SUMMARY)
        self.counter = WordCounter(self.text)

    def test_count_months(self):
        self.assertEqual(self.counter.count_word("months"), 1)

    def test_count_the(self):
        """Count word in a longer text"""
        self.assertEqual(self.counter.count_word("the"), 6)

    def tearDown(self):
        """Clean up after a test has passed or failed."""
        pass
Code Example #7
class WordCounterFixtureTests(TestCase):

    def setUp(self):
        """Prepare before each test"""
        MOBYDICK_SUMMARY = open('mobydick_summary.txt').read()
        self.text = TextBody(MOBYDICK_SUMMARY)
        self.counter = WordCounter(self.text)

    def test_count_months(self):
        self.assertEqual(self.counter.count_word("months"), 1)

    def test_count_the(self):
        """Count word in a longer text"""
        self.assertEqual(self.counter.count_word("the"), 6)

    def tearDown(self):
        """Clean up after a test has passed or failed."""
        pass
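
Several examples on this page (including the two fixture classes above and the nose generator in Code Example #12) pair a TextBody with a WordCounter that exposes count_word. Neither class is shown here; a minimal stand-in, assuming TextBody simply wraps the raw text, could look like this:

import re


class TextBody:
    """Minimal stand-in that only wraps the raw text string."""

    def __init__(self, text):
        self.text = text


class WordCounter:
    """Counts how often a single word occurs in a TextBody."""

    def __init__(self, textbody):
        # naive tokenization; the original's rules (case, punctuation) are unknown
        self.words = re.findall(r"[\w']+", textbody.text)

    def count_word(self, word):
        return self.words.count(word)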
Code Example #8
    def testGetTopOccurences(self):
        """
        Function that tests the getTopOccurrences method
        from WordCounter class. Takes a test .txt file and
        checks it against pre-checked dict

        """
        object = WordCounter("tests/testFile.txt")
        trueValue = {
            "quick": 7,
            "jumps": 7,
            "brown": 5,
            "zebra": 3,
            "table": 3,
            "over": 4,
            "dog": 4,
            "layer": 5,
            "crazy": 3,
            "productive": 4
        }
        self.assertDictEqual(object.getTopOccurrences(), trueValue)
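
Code Example #4 (countWords) and this one (getTopOccurrences) both construct the counter from a path to a text file. A hypothetical sketch consistent with those two tests, assuming getTopOccurrences returns the ten most frequent words as a dict:

import re
from collections import Counter


class WordCounter:
    """Hypothetical file-backed counter matching the method names used in the tests."""

    def __init__(self, path):
        with open(path) as handle:
            self.words = re.findall(r"[\w']+", handle.read().lower())

    def countWords(self):
        return len(self.words)

    def getTopOccurrences(self, n=10):
        return dict(Counter(self.words).most_common(n))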
Code Example #9
    def testAppostrophesHandledOk(self):
        wc = WordCounter()
        basicString = "I'm a Quick brown fox jumped over 9 the lazy dog"
        wc.processString(basicString)
        self.assertEqual(11, wc.getWordCount())

        wl = wc.getWordList()
         self.assertTrue("I'm" in wl)
Code Example #10
class WordCounterTest(unittest.TestCase):
    def set_up(self, filename):
        self.f = open(filename)
        self.fr = self.f.read()
        self.wc = WordCounter(self.fr)

    def test_empty(self):
        self.set_up("samples/empty.txt")
        self.assertEqual(self.wc.get_count(), 0)
        self.assertEqual(self.wc.get_most_frequent(), {})

    def test_base(self):
        self.set_up("samples/test.txt")
        self.assertEqual(self.wc.get_count(), 9)
        self.assertEqual(len(self.wc.get_most_frequent().keys()), 8)

    def test_long(self):
        self.set_up("samples/long.txt")
        self.assertEqual(self.wc.get_count(), 188)
        self.assertEqual(len(self.wc.get_most_frequent().keys()), 10)

    def tearDown(self):
        self.wc = None
        self.f.close()
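
This test class feeds the counter the raw contents of a file and checks get_count plus get_most_frequent, which appears to cap its result at the ten most frequent words. A sketch under that reading (hypothetical, not the tested implementation):

from collections import Counter


class WordCounter:
    """Hypothetical counter behind get_count/get_most_frequent."""

    def __init__(self, text):
        self._words = text.split()

    def get_count(self):
        return len(self._words)

    def get_most_frequent(self, n=10):
        # at most the n most frequent words, as a word -> count dict
        return dict(Counter(self._words).most_common(n))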
Code Example #11
def test_when_input_is_one_word_return_count_for_one_word():
    assert WordCounter.count("hello") == "hello, 1"
Code Example #12
#!/usr/bin/env python
#
# example of a test generator
#

from nose.tools import assert_equal
from word_counter import TextBody, WordCounter

MOBYDICK_SUMMARY = open('mobydick_summary.txt').read()
TEXT = TextBody(MOBYDICK_SUMMARY)
COUNTER = WordCounter(TEXT)

WORD_PAIRS = [('months', 1), ('whale', 5), ('captain', 4), ('white', 2),
              ('harpoon', 1), ('Ahab', 1)]


def check_word(word, number):
    assert_equal(COUNTER.count_word(word), number)


def test_word_pairs():
    # Tests a series of example words
    # creates one test for each word
    # --- no docstring so that parameters are visible ---
    for word, number in WORD_PAIRS:
        yield check_word, word, number


# nose runs the generator itself, roughly equivalent to:
for check, word, number in test_word_pairs():
    check(word, number)
Code Example #13
def test_count_word_simple():
    """Count word in a short text"""
    text = TextBody("the white white whale")
    counter = WordCounter(text)
    assert_equal(counter.count_word("white"), 2)
Code Example #14
def test_count_word_simple():
    """Count word in a short text"""
    text = TextBody("the white white whale")
    counter = WordCounter(text)
    assert_equal(counter.count_word("white"), 2)
Code Example #15
 def testBasicSentence(self):
     wc = WordCounter()
     basicString = "Quick brown fox jumped over the lazy dog."
     wc.processString(basicString)
     self.assertEqual(8, wc.getWordCount())
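
Code Examples #3, #9 and #15 use yet another interface: processString feeds text in, getWordCount returns the running total, and getWordList returns a word-to-count mapping. A small sketch that satisfies those three tests (the real tokenization rules are only guessed at; keeping apostrophes inside tokens makes "I'm" a single word):

import re
from collections import Counter


class WordCounter:
    """Hypothetical accumulator behind processString/getWordCount/getWordList."""

    def __init__(self):
        self._counts = Counter()

    def processString(self, text):
        # apostrophes stay inside tokens, other punctuation is dropped
        self._counts.update(re.findall(r"[\w']+", text))

    def getWordCount(self):
        return sum(self._counts.values())

    def getWordList(self):
        return dict(self._counts)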
Code Example #16
def run():
    word_counter = WordCounter('kennedy.txt')
    word_counter.display()
Code Example #17
from word_counter import WordCounter
from ui import ui

if __name__ == "__main__":
    _ui = ui()
    _ui.handle_input()
    counter = WordCounter(_ui.file_name)
    success = counter.run()
    if success:
        _ui.display_word_count(counter)
        _ui.display_exit_message()
    else:
        _ui.display_error_message()
        exit()
Code Example #18
 def test_get_most_common_words(self):
     obj = WordCounter()
     obj.text_content = self.content
     self.assertEqual(obj.get_most_common_words(2), [('Four', 4),
                                                     ('Three', 3)])
Code Example #19
import re
from word_counter import WordCounter

# pattern for splitting words
PATTERN = r"[\d\W\s_]+"

wc = WordCounter()

with open('README.md', 'r') as file:
    for line in file:
        for word in re.split(PATTERN, line.strip()):
            if word:
                wc.add_word(word)
               
# display the word counts
for word in sorted(wc):
    print(f"{wc[word]:3}  {word}")
Code Example #20
 def test_get_total_words(self):
     obj = WordCounter()
     obj.text_content = self.content
     self.assertEqual(obj.get_total_words(), 10)
Code Example #21
    def test_initialize_word_count_has_default_content(self):
        obj = WordCounter()

        self.assertIn('Lady Gaga', obj.text_content)
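
Code Examples #18, #20 and #21 drive a counter through an overridable text_content attribute whose default text mentions 'Lady Gaga'. A sketch consistent with those assertions follows; the default text here is only a placeholder for whatever the real class ships with:

from collections import Counter


class WordCounter:
    """Hypothetical counter driven by a text_content attribute."""

    # placeholder default; the real default text (which mentions Lady Gaga) is not shown
    text_content = "Lady Gaga placeholder text"

    def _words(self):
        return self.text_content.split()

    def get_total_words(self):
        return len(self._words())

    def get_most_common_words(self, n):
        # list of (word, count) pairs, most frequent first
        return Counter(self._words()).most_common(n)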
Code Example #22
 print("\nPuede escoger una de las siguientes opciones:")
 print("(1) Relación de las palabras utilizadas en el texto, así como las veces que aparecen.")
 print(f"(2) Las {words_most_used} palabras más utilizadas en el texto, asi como las veces que aparecen.")
 try:
     option = int(input("Escribe el número de una de la opción seleccionada: "))
 except ValueError:
     print(f"\nLo sentimos, la opción seleccionada no es válida.")
     another_try = input("¿Gusta volver a seleccionar una opción?  S=si / N=no: ")
     another_try = another_try.lower()
     if another_try == "s":
         continue
     else:
         print("\nMuchas gracias por haber usado esta aplicación, esperamos que vuelva pronto.")
         exit()
 if option == 1:
     word_counter = WordCounter(words_in_text)
     counted_words = word_counter.count_words()
     print(f"\nLa relación de palabras que aprecen {text.file_name}, con la cantidad de veces que aparece cada una de ellas, es la siguiente:")
     for key, value in counted_words.items():
         if value == 1:
             print(f"{key} aparece {value} vez.")
         else:
             print(f"{key} aparece {value} veces.")
     is_done = True
 elif option == 2:
     word_counter_sorter = CountedWordsSorter(words_in_text, words_to_ignore)
     counted_words = word_counter_sorter.count_words()
     sorted_counted_words = word_counter_sorter.sort_counted_words(counted_words)
     counter = 0
     if words_most_used == 1:
         print(f"\nLa palabra más utilizada en {text.file_name} es la siguiente:")
Code Example #23
def test_when_input_is_two_words_return_count_for_each_word():
    assert WordCounter.count("hello world") == "hello, 1\nworld, 1"
Code Example #24
 def test_count_word_simple(self):
     """Count a single word"""
     counter = WordCounter(MockText)
     self.assertEqual(counter.count_word("white"), 2)
Code Example #25
 def set_up(self, filename):
     self.f = open(filename)
     self.fr = self.f.read()
     self.wc = WordCounter(self.fr)
Code Example #26
 def test_count_word_simple(self):
     """Count a single word"""
     counter = WordCounter(MockText)
     self.assertEqual(counter.count_word("white"), 2)
Code Example #27
 def setUp(self):
     """Prepare before each test"""
     MOBYDICK_SUMMARY = open('mobydick_summary.txt').read()
     self.text = TextBody(MOBYDICK_SUMMARY)
     self.counter = WordCounter(self.text)
Code Example #28
File: analyzer.py  Project: Ageneh/trumpy

if __name__ == '__main__':
	started = datetime.now()

	keywords = ["Familie"]  # erstelle eine liste mit keywords
	analyzer = Analyzer(keywords).start()  # erstelle eine Analyzer-Instanz und starte diese

	ended = datetime.now()
	elapsed = ended - started
	print("Analyzer time: {}".format(elapsed))

	if input("WordCount errechnen? (y/n) - ").lower() == "y":
		# if the user asks for it, compute the word counts
		ended = None

		# create a WordCounter instance and start it
		counter = WordCounter()
		counter.start()

		# create a WeeklyCounter instance
		weekly = WeeklyCounter(counter)
		weekly.start()

	if not ended: ended = datetime.now()

	elapsed = ended - started
	print("Time started: {}".format(started))
	print("Time ended: {}".format(ended))
	print("Total time running: {}".format(elapsed))
Code Example #29
 def setUp(self):
     """Prepare before each test"""
     MOBYDICK_SUMMARY = open('mobydick_summary.txt').read()
     self.text = TextBody(MOBYDICK_SUMMARY)
     self.counter = WordCounter(self.text)
Code Example #30
def test_when_input_has_a_word_that_repeats_return_correct_count_for_word_that_repeats(
):
    assert WordCounter.count('hello world hello') == "hello, 2\nworld, 1"
Code Example #31
File: main.py  Project: kail/csep517
def main():
    logging.info("Starting...")

    training_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.train.json')
    dev_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.dev.json')
    test_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.test.json')

    # First, count the words!
    counter = WordCounter()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            for i in range(1, len(parsed_sentence) - 1):
                counter.add_word(parsed_sentence[i][0])

    # Finalize counter and separate high frequency from low frequency
    counter.finalize()

    # Initialize the models
    bigram = BigramHMM()
    trigram = TrigramHMM()

    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            bigram.add_sentence(parsed_sentence)
            trigram.add_sentence(parsed_sentence)

    # Models have been initialized at this point, finalize the distributions
    #bigram.finalize()
    trigram.finalize()

    # PICK THE PARSER HERE
    parser = dev_parser

    # Iterate over data and try to predict
    num_correct_bigram = 0
    num_correct_trigram = 0
    total_words = 0
    for parsed_sentence in parser.get_tokenized_sentences():
        if parsed_sentence:
            original_sentence = copy.deepcopy(parsed_sentence)

            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            # Bigram lattice
            #lattice = Lattice(bigram, parsed_sentence)

            # Trigram lattice
            tri_lattice = TrigramLattice(trigram, parsed_sentence)

            # Calculate best POS using viterbi
            #pos_list_bigram = lattice.get_pos()
            pos_list_trigram = tri_lattice.get_pos()

            # Determine how many were correct
            #num_correct_bigram += get_num_correct(parsed_sentence, pos_list_bigram, lattice)
            num_correct_trigram += get_num_correct(parsed_sentence,
                                                   pos_list_trigram,
                                                   tri_lattice,
                                                   original_sentence, counter)

            # Remove the START and STOP chars
            total_words += (len(parsed_sentence) - 2)

            print("Accuracy: %s" % (num_correct_trigram / total_words))
        else:
            print('ERROR! Couldnt parse sentence')

    print("Bigram HMM Accuracy: %s/%s - %s" %
          (num_correct_bigram, total_words,
           (num_correct_bigram / total_words)))
    print("Trigram HMM Accuracy: %s/%s - %s" %
          (num_correct_trigram, total_words,
           (num_correct_trigram / total_words)))
Code Example #32
def test_when_input_has_comma_as_delimiter_return_count_for_words():
    assert WordCounter.count('hello,kitty') == "hello, 1\nkitty, 1"
Code Example #33
def test_count_word_complex():
    """Count word in a longer text"""
    text = TextBody(MOBYDICK_SUMMARY)
    counter = WordCounter(text)
    assert_equal(counter.count_word("white"), 2)
Code Example #34
def test_when_input_has_more_than_one_delimiter_return_count_for_words():
    assert WordCounter.count(
        'merry christmas,kitty') == "merry, 1\nchristmas, 1\nkitty, 1"
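
The pytest-style tests in Code Examples #11, #23, #30, #32 and #34 describe a count() routine that splits on whitespace and commas and returns one "word, count" line per distinct word in first-seen order. A sketch of that behaviour (assuming Python 3.7+, where Counter preserves insertion order):

import re
from collections import Counter


class WordCounter:
    """Hypothetical implementation of the count() behaviour the tests describe."""

    @staticmethod
    def count(text):
        # split on commas and whitespace, dropping empty tokens
        words = [w for w in re.split(r"[,\s]+", text) if w]
        counts = Counter(words)  # insertion-ordered on Python 3.7+
        return "\n".join("%s, %d" % (word, n) for word, n in counts.items())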
Code Example #35
def test_count_word_complex():
    """Count word in a longer text"""
    text = TextBody(MOBYDICK_SUMMARY)
    counter = WordCounter(text)
    assert_equal(counter.count_word("white"), 2)
Code Example #36
	def read(self, filePlace, type):
		movies = self._sc.textFile(filePlace)
		counter = WordCounter()
		if type == 'Years':
			counter = YearsCounter()
		return counter.getMaxValues(movies)
Code Example #37
	def setUp(self):
		conf = SparkConf().setAppName("appTest").setMaster("local[*]")
		self.sc = SparkContext(conf=conf)
		self.counter = WordCounter()
Code Example #38
 def setUp(self):
     conf = SparkConf().setAppName("appTest").setMaster("local[*]")
     self.sc = SparkContext(conf=conf)
     self.counter = WordCounter()