Example #1
0
 def test_word_list_to_fragment_lookup(self):
     """Check the fragment lookup built from a small word list.

     Three three-letter words plus one very long word are passed to
     ``word_list_to_fragment_lookup``; the test verifies both a single
     fragment entry and the complete set of fragment keys.
     """
     word_list = ['and', 'are', 'any',
                  'thisisanabsurdlylongwordthatisfake']
     lookup = word_list_to_fragment_lookup(word_list)
     # check explicitly that 'a__' matches three words
     self.assertEqual(lookup['a__'], ['and', 'are', 'any'])
     # check the whole thing is correct; every expected key is three
     # characters long, so the long word should contribute no fragments
     expected = ['__e', '__d', '_ny', '_n_', 'a__', '_re', 'a_y',
                 'ar_', '_r_', 'a_e', 'a_d', '___', 'an_', '__y', '_nd']
     # Compare sorted copies: dict key order is an implementation detail,
     # and on Python 3 a keys view never compares equal to a list.
     self.assertEqual(sorted(lookup.keys()), sorted(expected))
Example #2
0
 def test_word_list_to_fragment_lookup(self):
     """Verify ``word_list_to_fragment_lookup`` on a four-word list.

     Asserts the words stored under the ``'a__'`` fragment and then the
     full collection of fragment keys produced for the input.
     """
     word_list = ['and', 'are', 'any', 'thisisanabsurdlylongwordthatisfake']
     lookup = word_list_to_fragment_lookup(word_list)
     # check explicitly that 'a__' matches three words
     self.assertEqual(lookup['a__'], ['and', 'are', 'any'])
     # check the whole thing is correct (only three-character fragments
     # are expected, i.e. none derived from the long fake word)
     expected = [
         '__e', '__d', '_ny', '_n_', 'a__', '_re', 'a_y', 'ar_', '_r_',
         'a_e', 'a_d', '___', 'an_', '__y', '_nd'
     ]
     # Order-insensitive comparison: the hard-coded list order only
     # matched one interpreter's dict ordering, and a Python 3 keys view
     # is never equal to a list.
     self.assertEqual(sorted(lookup.keys()), sorted(expected))
Example #3
0
def get_word_data():
    """
    Build the word count from the corpus and a fragment lookup for the
    words whose count reaches the configured minimum frequency.

    :return: dictionary with the full 'word_count' and the
        'fragment_lookup' built from the frequency-filtered words
    """
    counts = build_word_count_from_corpus()
    threshold = parameters['min_frequency_word_to_fragment']

    # Keep only the words seen at least `threshold` times.
    frequent = {}
    for word, count in counts.iteritems():
        if count >= threshold:
            frequent[word] = count

    return {
        'word_count': counts,
        'fragment_lookup': word_list_to_fragment_lookup(frequent.keys()),
    }
Example #4
0
def get_word_data():
    """
    Assemble the word data: the corpus word count plus a fragment lookup
    restricted to words meeting the minimum frequency parameter.

    :return: dictionary that has word_count and fragment lookup
    """
    all_counts = build_word_count_from_corpus()
    min_count = parameters['min_frequency_word_to_fragment']

    # Filter out words that occur fewer than `min_count` times; the
    # unfiltered counts are still returned under 'word_count'.
    kept_pairs = (
        (word, count)
        for word, count in all_counts.iteritems()
        if count >= min_count
    )
    kept_counts = dict(kept_pairs)

    lookup = word_list_to_fragment_lookup(kept_counts.keys())
    return {'word_count': all_counts, 'fragment_lookup': lookup}