def create_calculator(cls, parseset_index):
    """Build a ContextParsingLikelihoodCalculator for one parseset index.

    Connects to MongoDB on 127.0.0.1 and uses the 'trnltk' database. The
    unigram, bigram and trigram collections are looked up by name, with
    ``parseset_index`` appended to each collection name. The query cache
    collection is built with ``drop=False``.

    NOTE(review): ``pymongo.Connection`` was removed in PyMongo 3.0 in favour
    of ``MongoClient`` -- confirm the pymongo version this project pins.

    :param parseset_index: value formatted into the n-gram collection names
    :return: a ContextParsingLikelihoodCalculator wired with a caching counter
    """
    database = pymongo.Connection(host='127.0.0.1')['trnltk']

    # n-gram order -> collection name template
    name_templates = (
        (1, 'wordUnigrams{}'),
        (2, 'wordBigrams{}'),
        (3, 'wordTrigrams{}'),
    )
    collections_by_order = dict(
        (order, database[template.format(parseset_index)])
        for order, template in name_templates
    )

    cache_collection = QueryCacheCollectionCreator(database).build(drop=False)

    return ContextParsingLikelihoodCalculator(
        DatabaseIndexBuilder(collections_by_order),
        CachingTargetFormGivenContextCounter(collections_by_order, cache_collection),
        CachedSimpleGoodTuringNGramFrequencySmoother(),
        UniformSequenceLikelihoodCalculator(),
    )
def setUpClass(cls):
    """Create the class-wide fixtures shared by all tests of this class.

    Sets, in this order:

    * ``cls.root_map`` -- generated from the roots of every lexeme in
      ``resources/master_dictionary.txt`` (path is relative to this file)
    * ``cls.contextless_parser`` -- an upper-case supporting contextless
      parser over the copula/numeral/proper-noun/basic suffix graph
    * ``cls.collection_map`` -- the ``*999`` n-gram collections of the
      'trnltk' MongoDB database on 127.0.0.1
    * ``cls.generator`` -- an InterpolatingLikelihoodCalculator wrapping a
      ContextParsingLikelihoodCalculator with an in-memory caching counter
    """
    super(InterpolatingLikelihoodCalculatorCalculationContextTest, cls).setUpClass()

    # --- lexicon and root map ---
    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        '../../../../../resources/master_dictionary.txt')
    roots = []
    for lexeme in LexiconLoader.load_from_file(dictionary_path):
        roots.extend(RootGenerator.generate(lexeme))
    cls.root_map = RootMapGenerator().generate(roots)

    # --- suffix graph and predefined paths ---
    graph = CopulaSuffixGraph(
        NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
    graph.initialize()

    paths = PredefinedPaths(cls.root_map, graph)
    paths.create_predefined_paths()

    # --- contextless parser ---
    root_finders = [
        WordRootFinder(cls.root_map),
        DigitNumeralRootFinder(),
        TextNumeralRootFinder(cls.root_map),
        ProperNounFromApostropheRootFinder(),
        ProperNounWithoutApostropheRootFinder(),
    ]
    cls.contextless_parser = UpperCaseSupportingContextlessMorphologicalParser(
        graph, paths, root_finders)

    # --- n-gram collections, keyed by n-gram order ---
    database = pymongo.Connection(host='127.0.0.1')['trnltk']
    cls.collection_map = {
        1: database['wordUnigrams999'],
        2: database['wordBigrams999'],
        3: database['wordTrigrams999'],
    }

    # --- calculator under test ---
    cls.generator = InterpolatingLikelihoodCalculator(
        ContextParsingLikelihoodCalculator(
            DatabaseIndexBuilder(cls.collection_map),
            InMemoryCachingTargetFormGivenContextCounter(cls.collection_map),
            CachedSimpleGoodTuringNGramFrequencySmoother(),
            UniformSequenceLikelihoodCalculator(),
        )
    )
def setUpClass(cls):
    """Run the parent class setup, then build the calculator under test.

    ``cls.generator`` becomes a ContextParsingLikelihoodCalculator whose
    index builder and (non-caching) TargetFormGivenContextCounter both use
    ``cls.collection_map``, which must already be set by the parent setup.
    """
    super(ContextParsingLikelihoodCalculatorTest, cls).setUpClass()

    collections = cls.collection_map
    cls.generator = ContextParsingLikelihoodCalculator(
        DatabaseIndexBuilder(collections),
        TargetFormGivenContextCounter(collections),
        CachedSimpleGoodTuringNGramFrequencySmoother(),
        UniformSequenceLikelihoodCalculator(),
    )
def setUp(self):
    """Build a fresh calculator, backed by a caching counter, for each test.

    The query cache collection of the 'trnltk' database is built with
    ``drop=True`` before every test. ``self.mongodb_connection`` and
    ``self.collection_map`` are expected to be provided by the parent setup.
    """
    super(ContextParsingLikelihoodCalculatorCachingTest, self).setUp()

    cache_collection = QueryCacheCollectionCreator(
        self.mongodb_connection['trnltk']).build(drop=True)

    collections = self.collection_map
    self.generator = ContextParsingLikelihoodCalculator(
        DatabaseIndexBuilder(collections),
        CachingTargetFormGivenContextCounter(collections, cache_collection),
        CachedSimpleGoodTuringNGramFrequencySmoother(),
        UniformSequenceLikelihoodCalculator(),
    )
def setUp(self):
    """Create a calculator that is given only an n-gram frequency smoother.

    The other three constructor arguments are passed as None.
    """
    self.generator = ContextParsingLikelihoodCalculator(
        None,
        None,
        CachedSimpleGoodTuringNGramFrequencySmoother(),
        None,
    )
class ParseResultsCartesianProductTest(unittest.TestCase):
    """Tests for ``_get_cartesian_products_of_context_parse_results``.

    Each context is None or a list of parse-result lists; Mock objects stand
    in for the morpheme containers. The expectations pin down that empty
    inner lists are skipped and that products are enumerated with the
    leftmost list varying slowest.
    """

    def setUp(self):
        """Create a calculator that is given only an n-gram frequency smoother."""
        self.generator = ContextParsingLikelihoodCalculator(
            None, None, CachedSimpleGoodTuringNGramFrequencySmoother(), None)

    def _assert_products(self, context_parse_results, expected_products):
        """Assert the products computed for the given context equal the expected ones."""
        actual_products = self.generator._get_cartesian_products_of_context_parse_results(
            context_parse_results)
        assert_that(actual_products, equal_to(expected_products))

    def test_should_get_cartesian_products_of_parse_results_when_context_is_empty(self):
        self._assert_products(None, [])
        self._assert_products([], [])
        self._assert_products([[]], [])
        self._assert_products([[], []], [])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_one_item(self):
        a = Mock()
        b = Mock()

        self._assert_products([[a]], [[a]])
        self._assert_products([[a, b]], [[a], [b]])

        # a trailing empty list is ignored
        self._assert_products([[a], []], [[a]])
        self._assert_products([[a, b], []], [[a], [b]])

        # a leading empty list is ignored
        self._assert_products([[], [a]], [[a]])
        self._assert_products([[], [a, b]], [[a], [b]])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_two_items(self):
        a_0 = Mock()
        a_1 = Mock()
        b_0 = Mock()
        b_1 = Mock()

        self._assert_products(
            [[a_0], [b_0]],
            [[a_0, b_0]])
        self._assert_products(
            [[a_0, a_1], [b_0]],
            [[a_0, b_0], [a_1, b_0]])
        self._assert_products(
            [[a_0], [b_0, b_1]],
            [[a_0, b_0], [a_0, b_1]])
        self._assert_products(
            [[a_0, a_1], [b_0, b_1]],
            [[a_0, b_0], [a_0, b_1], [a_1, b_0], [a_1, b_1]])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_two_items_and_blank_ones(self):
        a_0 = Mock()
        a_1 = Mock()
        b_0 = Mock()
        b_1 = Mock()

        self._assert_products(
            [[a_0], [b_0], []],
            [[a_0, b_0]])
        self._assert_products(
            [[], [a_0, a_1], [b_0]],
            [[a_0, b_0], [a_1, b_0]])
        self._assert_products(
            [[a_0], [], [b_0, b_1]],
            [[a_0, b_0], [a_0, b_1]])
        self._assert_products(
            [[a_0, a_1], [b_0, b_1], []],
            [[a_0, b_0], [a_0, b_1], [a_1, b_0], [a_1, b_1]])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_three(self):
        a_0 = Mock()
        b_0 = Mock()
        c_0 = Mock()
        a_1 = Mock()
        b_1 = Mock()
        c_1 = Mock()

        # one candidate per item, with empty lists in various positions
        self._assert_products(
            [[a_0], [b_0], [c_0]],
            [[a_0, b_0, c_0]])
        self._assert_products(
            [[], [a_0], [b_0], [c_0]],
            [[a_0, b_0, c_0]])
        self._assert_products(
            [[a_0], [], [b_0], [c_0]],
            [[a_0, b_0, c_0]])
        self._assert_products(
            [[a_0], [b_0], [], [c_0]],
            [[a_0, b_0, c_0]])
        self._assert_products(
            [[a_0], [], [b_0], [], [c_0], []],
            [[a_0, b_0, c_0]])

        # several candidates per item
        self._assert_products(
            [[a_0, a_1], [], [b_0], [c_0, c_1], []],
            [
                [a_0, b_0, c_0],
                [a_0, b_0, c_1],
                [a_1, b_0, c_0],
                [a_1, b_0, c_1],
            ])
        self._assert_products(
            [[], [], [a_0], [b_0, b_1], [c_0, c_1]],
            [
                [a_0, b_0, c_0],
                [a_0, b_0, c_1],
                [a_0, b_1, c_0],
                [a_0, b_1, c_1],
            ])
        self._assert_products(
            [[a_0, a_1], [b_0, b_1], [], [c_0]],
            [
                [a_0, b_0, c_0],
                [a_0, b_1, c_0],
                [a_1, b_0, c_0],
                [a_1, b_1, c_0],
            ])
        self._assert_products(
            [[a_0, a_1], [b_0, b_1], [c_0, c_1], [], []],
            [
                [a_0, b_0, c_0],
                [a_0, b_0, c_1],
                [a_0, b_1, c_0],
                [a_0, b_1, c_1],
                [a_1, b_0, c_0],
                [a_1, b_0, c_1],
                [a_1, b_1, c_0],
                [a_1, b_1, c_1],
            ])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_four(self):
        a_0 = Mock()
        b_0 = Mock()
        c_0 = Mock()
        d_0 = Mock()

        self._assert_products(
            [[a_0], [b_0], [c_0], [d_0]],
            [[a_0, b_0, c_0, d_0]])
def setUp(self):
    """Build the calculator under test with a smoother as its only collaborator.

    The first, second and fourth constructor arguments are None.
    """
    smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
    collaborators = (None, None, smoother, None)
    self.generator = ContextParsingLikelihoodCalculator(*collaborators)
class ParseResultsCartesianProductTest(unittest.TestCase):
    """Table-driven tests for ``_get_cartesian_products_of_context_parse_results``.

    Every test lists ``(context_parse_results, expected_products)`` pairs and
    checks them in order. Mock objects stand in for morpheme containers. The
    expectations show that empty inner lists do not contribute to the
    products.
    """

    def setUp(self):
        """Create a calculator whose only non-None collaborator is the smoother."""
        smoother = CachedSimpleGoodTuringNGramFrequencySmoother()
        self.generator = ContextParsingLikelihoodCalculator(
            None, None, smoother, None)

    def _check_cases(self, cases):
        """Check each (context, expected) pair in order, stopping at the first failure."""
        for context_parse_results, expected_products in cases:
            assert_that(
                self.generator._get_cartesian_products_of_context_parse_results(
                    context_parse_results),
                equal_to(expected_products))

    def test_should_get_cartesian_products_of_parse_results_when_context_is_empty(
            self):
        self._check_cases([
            (None, []),
            ([], []),
            ([[]], []),
            ([[], []], []),
        ])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_one_item(
            self):
        first = Mock()
        second = Mock()

        self._check_cases([
            ([[first]], [[first]]),
            ([[first, second]], [[first], [second]]),
            ([[first], []], [[first]]),
            ([[first, second], []], [[first], [second]]),
            ([[], [first]], [[first]]),
            ([[], [first, second]], [[first], [second]]),
        ])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_two_items(
            self):
        a0 = Mock()
        a1 = Mock()
        b0 = Mock()
        b1 = Mock()

        self._check_cases([
            ([[a0], [b0]],
             [[a0, b0]]),
            ([[a0, a1], [b0]],
             [[a0, b0], [a1, b0]]),
            ([[a0], [b0, b1]],
             [[a0, b0], [a0, b1]]),
            ([[a0, a1], [b0, b1]],
             [[a0, b0], [a0, b1], [a1, b0], [a1, b1]]),
        ])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_two_items_and_blank_ones(
            self):
        a0 = Mock()
        a1 = Mock()
        b0 = Mock()
        b1 = Mock()

        self._check_cases([
            ([[a0], [b0], []],
             [[a0, b0]]),
            ([[], [a0, a1], [b0]],
             [[a0, b0], [a1, b0]]),
            ([[a0], [], [b0, b1]],
             [[a0, b0], [a0, b1]]),
            ([[a0, a1], [b0, b1], []],
             [[a0, b0], [a0, b1], [a1, b0], [a1, b1]]),
        ])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_three(
            self):
        a0 = Mock()
        b0 = Mock()
        c0 = Mock()
        a1 = Mock()
        b1 = Mock()
        c1 = Mock()

        single_product = [[a0, b0, c0]]

        self._check_cases([
            ([[a0], [b0], [c0]], single_product),
            ([[], [a0], [b0], [c0]], single_product),
            ([[a0], [], [b0], [c0]], single_product),
            ([[a0], [b0], [], [c0]], single_product),
            ([[a0], [], [b0], [], [c0], []], single_product),
            (
                [[a0, a1], [], [b0], [c0, c1], []],
                [
                    [a0, b0, c0],
                    [a0, b0, c1],
                    [a1, b0, c0],
                    [a1, b0, c1],
                ],
            ),
            (
                [[], [], [a0], [b0, b1], [c0, c1]],
                [
                    [a0, b0, c0],
                    [a0, b0, c1],
                    [a0, b1, c0],
                    [a0, b1, c1],
                ],
            ),
            (
                [[a0, a1], [b0, b1], [], [c0]],
                [
                    [a0, b0, c0],
                    [a0, b1, c0],
                    [a1, b0, c0],
                    [a1, b1, c0],
                ],
            ),
            (
                [[a0, a1], [b0, b1], [c0, c1], [], []],
                [
                    [a0, b0, c0],
                    [a0, b0, c1],
                    [a0, b1, c0],
                    [a0, b1, c1],
                    [a1, b0, c0],
                    [a1, b0, c1],
                    [a1, b1, c0],
                    [a1, b1, c1],
                ],
            ),
        ])

    def test_should_get_cartesian_products_of_parse_results_when_context_has_four(
            self):
        a0 = Mock()
        b0 = Mock()
        c0 = Mock()
        d0 = Mock()

        self._check_cases([
            ([[a0], [b0], [c0], [d0]],
             [[a0, b0, c0, d0]]),
        ])