def _generate_likelihood(self, surface, leading_context=None, following_context=None):
    """Parse ``surface`` and score every parse result against the given context.

    Returns ``None`` when the surface cannot be parsed, a single
    ``(formatted_result, 1.0)`` entry when the parse is unambiguous, and
    otherwise a list of ``(formatted_parse_result, likelihood)`` tuples.
    At least one of the two contexts must be provided.
    """
    assert leading_context or following_context
    parse_results = self.contextless_parser.parse(surface)
    if not parse_results:
        return None
    if len(parse_results) == 1:
        # Unambiguous parse: certain by definition.
        only_formatted = formatter.format_morpheme_container_for_parseset(parse_results[0])
        return [(only_formatted, 1.0)]
    scored = []
    for parse_result in parse_results:
        formatted = formatter.format_morpheme_container_for_parseset(parse_result)
        if leading_context and following_context:
            score = self.generator.calculate_likelihood(parse_result, leading_context, following_context)
        elif leading_context:
            score = self.generator.calculate_oneway_likelihood(parse_result, leading_context, True)
        else:
            # The assert above guarantees following_context is set here.
            score = self.generator.calculate_oneway_likelihood(parse_result, following_context, False)
        scored.append((formatted, score))
    return scored
def _generate_likelihood(self, surface, leading_context=None, following_context=None):
    """Score each contextless parse of ``surface`` with the calculator.

    ``None`` is returned for an unparseable surface; a single unambiguous
    parse is reported with likelihood 1.0; otherwise every parse is paired
    with its context-dependent likelihood.
    """
    assert leading_context or following_context
    results = self.contextless_parser.parse(surface)
    if not results:
        return None
    if len(results) == 1:
        return [(formatter.format_morpheme_container_for_parseset(results[0]), 1.0)]
    likelihoods = []
    for result in results:
        key = formatter.format_morpheme_container_for_parseset(result)
        if leading_context and following_context:
            value = self.calculator.calculate_likelihood(result, leading_context, following_context)
        elif leading_context:
            value = self.calculator.calculate_oneway_likelihood(result, leading_context, True)
        else:
            # Only following_context remains (guaranteed nonempty by the assert).
            value = self.calculator.calculate_oneway_likelihood(result, following_context, False)
        likelihoods.append((key, value))
    return likelihoods
def test_should_format_for_parseset(self):
    """Formatter output for sample surfaces must match the known parseset strings."""
    expectations = [
        (u'kitaba', u'kitap+Noun+A3sg+Pnon+Dat'),
        (u'yaptırtmayı', u'yap+Verb+Verb+Caus+Verb+Caus+Pos+Noun+Inf+A3sg+Pnon+Acc'),
    ]
    for surface, expected in expectations:
        container = self.parser.parse(surface)[0]
        assert_that(formatter.format_morpheme_container_for_parseset(container), equal_to(expected))
def _test_generate_likelihood(self, surface, leading_context=None, following_context=None, create_calculation_context=False):
    """Score every parse of ``surface`` against the contexts and pretty-print the results.

    When ``create_calculation_context`` is true, an empty dict is passed to the
    generator so it can record how each likelihood was derived; the dict is
    included in the printed output.
    """
    self.generator.build_indexes()
    assert leading_context or following_context
    leading_context = self._get_context(leading_context)
    following_context = self._get_context(following_context)
    entries = []
    for parse_result in self.contextless_parser.parse(surface):
        calc_ctx = {} if create_calculation_context else None
        formatted = formatter.format_morpheme_container_for_parseset(parse_result)
        if leading_context and following_context:
            likelihood = self.generator.calculate_likelihood(parse_result, leading_context, following_context, calc_ctx)
        elif leading_context:
            likelihood = self.generator.calculate_oneway_likelihood(parse_result, leading_context, True, calc_ctx)
        else:
            # Guaranteed by the assert: only following_context is present here.
            likelihood = self.generator.calculate_oneway_likelihood(parse_result, following_context, False, calc_ctx)
        entries.append((formatted, likelihood, calc_ctx))
    for entry in entries:
        pprint.pprint(entry)
def _test_generate_likelihood(self, surface, leading_context=None, following_context=None, calculation_context=None):
    """Parse ``surface``, compute a context-sensitive likelihood for every parse
    result, and print the ``(formatted_parse_result, likelihood)`` pairs.

    At least one of ``leading_context`` / ``following_context`` is required.
    ``calculation_context``, when given, is threaded through so the generator
    can record how each likelihood was derived.
    """
    assert leading_context or following_context
    likelihoods = []
    results = self.contextless_parser.parse(surface)
    for result in results:
        formatted_parse_result = formatter.format_morpheme_container_for_parseset(result)
        likelihood = 0.0
        if leading_context and following_context:
            likelihood = self.generator.calculate_likelihood(result, leading_context, following_context, calculation_context)
        elif leading_context:
            likelihood = self.generator.calculate_oneway_likelihood(result, leading_context, True, calculation_context)
        elif following_context:
            likelihood = self.generator.calculate_oneway_likelihood(result, following_context, False, calculation_context)
        likelihoods.append((formatted_parse_result, likelihood))
    for item in likelihoods:
        # FIX: was the Python-2-only statement "print item"; print(item) behaves
        # identically on Python 2 (parenthesized expression) and is valid on
        # Python 3, matching the print-function style used elsewhere in the file.
        print(item)
def add_parse_result(self, uuid_for_parse_result, parse_result, likelihood_value, likelihood_percentage, is_correct_parse_result, calculation_context):
    """Append one parse-result entry to the 'parse_results' list kept in the view context."""
    containers = self._context.get('parse_results') or []
    containers.append({
        'uuid': uuid_for_parse_result,
        'formatted_parse_result': formatter.format_morpheme_container_for_parseset(parse_result, add_space=True),
        'likelihood_value': likelihood_value,
        'likelihood_percentage': likelihood_percentage,
        'likelihood_percentage_color': self._get_likelihood_percentage_color(likelihood_percentage),
        'correct_parse_result': is_correct_parse_result,
        'calculation_context': calculation_context,
    })
    # Write back so a freshly created list is stored on first use.
    self._context['parse_results'] = containers
def _test_calculate(self, surface):
    """Parse ``surface`` contextlessly and pretty-print each parse with its likelihood."""
    parse_results = self.contextless_parser.parse(surface)
    scored = [
        (formatter.format_morpheme_container_for_parseset(parse_result),
         self.calculator.calculate(parse_result))
        for parse_result in parse_results
    ]
    pprint.pprint(scored)
def _get_word_morpheme_container_tuple(self, seq, expected_result=None):
    """Parse ``seq`` and return ``(seq, morpheme_container_or_None)``.

    Without ``expected_result`` the first parse is returned. With it, the
    first parse whose parseset formatting equals ``expected_result`` is
    returned, or ``None`` when no parse matches (or parsing failed).
    """
    parse_results = self.parser.parse(seq)
    if not parse_results:
        return seq, None
    if not expected_result:
        return seq, parse_results[0]
    # FIX: use a list comprehension instead of filter() so the truthiness
    # check below stays correct on Python 3 as well, where filter() returns
    # an always-truthy iterator; on Python 2 the behavior is identical.
    matching_containers = [
        parse_result for parse_result in parse_results
        if formatter.format_morpheme_container_for_parseset(parse_result) == expected_result
    ]
    if matching_containers:
        return seq, matching_containers[0]
    return seq, None
def save_parse_result_for_word(self, word_id, parse_result_uuid):
    """Persist the chosen parse result as the correct parse of a word.

    @type word_id: ObjectId
    @type parse_result_uuid: str or unicode
    """
    parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
    # FIX: the assertion message previously formatted parse_result — which is
    # None/falsy exactly when the assert fires — instead of the uuid looked up.
    assert parse_result, "No parse result found with id {}".format(parse_result_uuid)
    word = self.dbmanager.get_word(word_id)
    if not word:
        raise Exception("Word not found for setting the correct parse result! {}".format(word_id))
    # check if the parse result belongs to the given word
    assert word['surface'] == parse_result.get_surface() or TurkishAlphabet.lower(word['surface']) == parse_result.get_surface()
    self.dbmanager.set_parse_result_for_word(word, formatter.format_morpheme_container_for_parseset(parse_result), parse_result)
def add_parse_result(self, uuid_for_parse_result, parse_result, likelihood_value, likelihood_percentage, is_correct_parse_result, calculation_context):
    """Record one parse result (with its likelihood metadata) in the view context."""
    entry = {}
    entry['uuid'] = uuid_for_parse_result
    entry['formatted_parse_result'] = formatter.format_morpheme_container_for_parseset(parse_result, add_space=True)
    entry['likelihood_value'] = likelihood_value
    entry['likelihood_percentage'] = likelihood_percentage
    entry['likelihood_percentage_color'] = self._get_likelihood_percentage_color(likelihood_percentage)
    entry['correct_parse_result'] = is_correct_parse_result
    entry['calculation_context'] = calculation_context
    existing = self._context.get('parse_results') or []
    existing.append(entry)
    self._context['parse_results'] = existing
def _test_generate_likelihood(self, surface, leading_context=None, following_context=None, calculation_context=None):
    """Parse ``surface``, compute a likelihood for each parse against the given
    context(s), and print the ``(formatted_parse_result, likelihood)`` pairs.

    Requires at least one of ``leading_context`` / ``following_context``.
    """
    assert leading_context or following_context
    likelihoods = []
    results = self.contextless_parser.parse(surface)
    for result in results:
        formatted_parse_result = formatter.format_morpheme_container_for_parseset(result)
        likelihood = 0.0
        if leading_context and following_context:
            likelihood = self.generator.calculate_likelihood(result, leading_context, following_context, calculation_context)
        elif leading_context:
            likelihood = self.generator.calculate_oneway_likelihood(result, leading_context, True, calculation_context)
        elif following_context:
            likelihood = self.generator.calculate_oneway_likelihood(result, following_context, False, calculation_context)
        likelihoods.append((formatted_parse_result, likelihood))
    for item in likelihoods:
        # FIX: was the Python-2-only "print item" statement; print(item) is
        # equivalent on Python 2 and valid on Python 3, consistent with the
        # print-function style already used in this file.
        print(item)
def create_word_binding_from_morpheme_container(self, word_str, morpheme_container):
    """Build a WordBinding (root + ordered suffix bindings) from a parsed
    morpheme container for the surface ``word_str``.

    The surface must equal the container's accumulated surface, either
    exactly or after lowercasing the first letter (title-cased words).
    """
    assert (word_str == morpheme_container.get_surface_so_far()) or (TurkishAlphabet.lower(word_str[0])+word_str[1:] == morpheme_container.get_surface_so_far())
    # Root-level attributes extracted from the container's lexeme.
    root_str = morpheme_container.get_root().str
    lemma = morpheme_container.get_root().lexeme.lemma
    lemma_root = morpheme_container.get_root().lexeme.root
    root_syntactic_category = morpheme_container.get_root().lexeme.syntactic_category
    root_secondary_syntactic_category = morpheme_container.get_root().lexeme.secondary_syntactic_category
    root = RootBinding(root_str, lemma, lemma_root, root_syntactic_category, root_secondary_syntactic_category)
    # Word-level (surface) categories and the formatted full parse string.
    word_syntactic_category = morpheme_container.get_surface_syntactic_category()
    word_secondary_syntactic_category = morpheme_container.get_surface_secondary_syntactic_category()
    parse_result = formatter.format_morpheme_container_for_parseset(morpheme_container)
    word = WordBinding(word_str, parse_result, root, word_syntactic_category, word_secondary_syntactic_category)
    if morpheme_container.get_transitions():
        # Accumulate the surface built so far while walking the suffix transitions.
        so_far = root_str
        for transition in morpheme_container.get_transitions():
            # Free transitions add no surface material and are not bound.
            if isinstance(transition.suffix_form_application.suffix_form.suffix, FreeTransitionSuffix):
                continue
            suffix_name = transition.suffix_form_application.suffix_form.suffix.name
            suffix_pretty_name = transition.suffix_form_application.suffix_form.suffix.pretty_name
            suffix_form = transition.suffix_form_application.suffix_form.form
            suffix_application = transition.suffix_form_application.fitting_suffix_form
            suffix_actual_application = transition.suffix_form_application.actual_suffix_form
            word_with_suffix_application = None
            # NOTE(review): this branch fires when the applied suffix adds no new
            # surface beyond root_str (so_far == root_str and the actual
            # application is empty) — presumably a zero-surface suffix; in that
            # case the lexeme's root form is used instead. TODO confirm.
            if (so_far + suffix_actual_application)==root_str:
                word_with_suffix_application = morpheme_container.get_root().lexeme.root + suffix_application
            else:
                word_with_suffix_application = so_far + suffix_application
            so_far += suffix_actual_application
            # Derivational vs. inflectional transitions map to different binding types.
            if transition.is_derivational():
                suffix = DerivationalSuffixBinding(suffix_name, suffix_pretty_name, suffix_form, suffix_application, suffix_actual_application, word_with_suffix_application, so_far, transition.to_state.syntactic_category)
                word.suffixes.append(suffix)
            else:
                suffix = InflectionalSuffixBinding(suffix_name, suffix_pretty_name, suffix_form, suffix_application, suffix_actual_application, word_with_suffix_application, so_far, transition.to_state.syntactic_category)
                word.suffixes.append(suffix)
    return word
def _test_generate_likelihood(self, surface, leading_context=None, following_context=None, create_calculation_context=False):
    """Generate a likelihood for every parse of ``surface`` and pretty-print
    ``(formatted_result, likelihood, calculation_context)`` triples.

    ``create_calculation_context=True`` hands the generator a dict per parse
    so that it can record its intermediate computation.
    """
    self.generator.build_indexes()
    assert leading_context or following_context
    leading_context = self._get_context(leading_context)
    following_context = self._get_context(following_context)
    triples = []
    for result in self.contextless_parser.parse(surface):
        calc_ctx = {} if create_calculation_context else None
        formatted = formatter.format_morpheme_container_for_parseset(result)
        if leading_context and following_context:
            value = self.generator.calculate_likelihood(result, leading_context, following_context, calc_ctx)
        elif leading_context:
            value = self.generator.calculate_oneway_likelihood(result, leading_context, True, calc_ctx)
        else:
            # By the assert, following_context is the only remaining case.
            value = self.generator.calculate_oneway_likelihood(result, following_context, False, calc_ctx)
        triples.append((formatted, value, calc_ctx))
    for triple in triples:
        pprint.pprint(triple)
def save_parse_result_for_word(self, word_id, parse_result_uuid):
    """Persist the chosen parse result as the correct parse of a word.

    @type word_id: ObjectId
    @type parse_result_uuid: str or unicode
    """
    parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
    # FIX: the assertion message previously formatted parse_result — which is
    # None/falsy exactly when the assert fires — instead of the uuid looked up.
    assert parse_result, "No parse result found with id {}".format(parse_result_uuid)
    word = self.dbmanager.get_word(word_id)
    if not word:
        raise Exception(
            "Word not found for setting the correct parse result! {}".format(word_id))
    # check if the parse result belongs to the given word
    assert word['surface'] == parse_result.get_surface() or \
        TurkishAlphabet.lower(word['surface']) == parse_result.get_surface()
    self.dbmanager.set_parse_result_for_word(
        word, formatter.format_morpheme_container_for_parseset(parse_result),
        parse_result)
def format(self, add_space=False):
    """Return this container formatted as a parseset string.

    ``add_space`` is forwarded to the formatter unchanged.
    """
    formatted = formatter.format_morpheme_container_for_parseset(self, add_space)
    return formatted
def go_to_word(self, word_id):
    """Navigate the learner view to the given word: loads the word, its
    surrounding context words, progress counts, previous/next navigation
    targets, and the likelihood-ranked parse results.

    @type word_id: ObjectId
    """
    assert LearnerController.WORD_COUNT_TO_SHOW_IN_CONTEXT >= LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT
    assert word_id
    # Parse results from any previously viewed word are discarded.
    self.sessionmanager.delete_parse_results()
    # find and set new word in view
    word = self.dbmanager.get_word(word_id)
    assert word
    self.learnerview.set_current_word(word)
    word_index = word["index"]
    corpus_id = word["corpus_id"]
    # set corpus id in the view
    self.learnerview.set_corpus_id(corpus_id)
    # find and set contexts (to be shown) in view
    leading_start_index_to_show = word_index - LearnerController.WORD_COUNT_TO_SHOW_IN_CONTEXT
    leading_end_index_to_show = word_index - 1
    following_start_index_to_show = word_index + 1
    following_end_index_to_show = word_index + LearnerController.WORD_COUNT_TO_SHOW_IN_CONTEXT
    leading_words = self.dbmanager.get_words_in_range(
        corpus_id, leading_start_index_to_show, leading_end_index_to_show
    )
    following_words = self.dbmanager.get_words_in_range(
        corpus_id, following_start_index_to_show, following_end_index_to_show
    )
    self.learnerview.set_leading_words(leading_words)
    self.learnerview.set_following_words(following_words)
    # set counts and indices of the new word within counts in view
    all_nonparsed_count = self.dbmanager.count_all_nonparsed(corpus_id)
    prior_nonparsed_count = self.dbmanager.count_nonparsed_prior_to_index(corpus_id, word_index)
    all_count = self.dbmanager.count_all(corpus_id)
    # NOTE(review): statement grouping reconstructed from collapsed source —
    # confirm that set_all_count is intended to run unconditionally.
    if not word["parsed"]:
        self.learnerview.set_all_nonparsed_count(all_nonparsed_count)
        self.learnerview.set_prior_nonparsed_count(prior_nonparsed_count)
    self.learnerview.set_all_count(all_count)
    # find previous and next nonparsed words and set the stuff on the ui
    previous_nonparsed_word = self.dbmanager.find_previous_nonparsed_word(corpus_id, word_index)
    next_nonparsed_word = self.dbmanager.find_next_nonparsed_word(corpus_id, word_index)
    if previous_nonparsed_word:
        self.learnerview.set_previous_nonparsed_word(previous_nonparsed_word)
    if next_nonparsed_word:
        self.learnerview.set_next_nonparsed_word(next_nonparsed_word)
    next_word = self.dbmanager.find_next_word(corpus_id, word)
    if next_word:
        self.learnerview.set_next_word(next_word)
    # find parse context words
    # Take at most WORD_COUNT_TO_USE_AS_PARSE_CONTEXT words nearest the
    # current word on each side; copies are taken when fewer are available.
    leading_parse_context_words = (
        leading_words[-LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT :]
        if len(leading_words) >= LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT
        else leading_words[:]
    )
    following_parse_context_words = (
        following_words[: LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT]
        if len(following_words) >= LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT
        else following_words[:]
    )
    leading_parse_context = self.parse_context_creator.create(leading_parse_context_words)
    following_parse_context = self.parse_context_creator.create(following_parse_context_words)
    # parse and set parse results in view
    parse_results_with_likelihoods = []
    # calculation_context is filled by parse_with_likelihoods, keyed by the
    # parse result's index.
    calculation_context = {}
    parse_results = self.contextful_morphological_parser.parse_with_likelihoods(
        word["surface"], leading_parse_context, following_parse_context, calculation_context
    )
    if not parse_results:
        return
    for parse_result_index, (parse_result, likelihood) in enumerate(parse_results):
        parse_results_with_likelihoods.append((parse_result, likelihood, calculation_context[parse_result_index]))
    total_likelihood = sum([t[1] for t in parse_results_with_likelihoods])
    # sort by likelihood then "shortness"
    parse_results_with_likelihoods = sorted(
        parse_results_with_likelihoods, key=lambda tup: (tup[1], -len(tup[0].get_transitions())), reverse=True
    )
    for parse_result, likelihood_value, calculation_context in parse_results_with_likelihoods:
        uuid_for_parse_result = self.sessionmanager.put_parse_result_in_session(parse_result, calculation_context)
        # Normalize to a percentage; 0.0 when every parse scored zero.
        likelihood_percent = likelihood_value / total_likelihood * 100.0 if total_likelihood > 0.0 else 0.0
        is_correct_parse_result = (
            word["parsed"] and formatter.format_morpheme_container_for_parseset(parse_result) == word["parse_result"]
        )
        self.learnerview.add_parse_result(
            uuid_for_parse_result,
            parse_result,
            likelihood_value,
            likelihood_percent,
            is_correct_parse_result,
            calculation_context,
        )
def go_to_word(self, word_id):
    """Navigate the learner view to the given word: loads the word, its
    surrounding context words, progress counts, previous/next navigation
    targets, and the likelihood-ranked parse results.

    @type word_id: ObjectId
    """
    assert LearnerController.WORD_COUNT_TO_SHOW_IN_CONTEXT >= LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT
    assert word_id
    # Parse results from any previously viewed word are discarded.
    self.sessionmanager.delete_parse_results()
    # find and set new word in view
    word = self.dbmanager.get_word(word_id)
    assert word
    self.learnerview.set_current_word(word)
    word_index = word['index']
    corpus_id = word['corpus_id']
    # set corpus id in the view
    self.learnerview.set_corpus_id(corpus_id)
    # find and set contexts (to be shown) in view
    leading_start_index_to_show = word_index - LearnerController.WORD_COUNT_TO_SHOW_IN_CONTEXT
    leading_end_index_to_show = word_index - 1
    following_start_index_to_show = word_index + 1
    following_end_index_to_show = word_index + LearnerController.WORD_COUNT_TO_SHOW_IN_CONTEXT
    leading_words = self.dbmanager.get_words_in_range(
        corpus_id, leading_start_index_to_show, leading_end_index_to_show)
    following_words = self.dbmanager.get_words_in_range(
        corpus_id, following_start_index_to_show, following_end_index_to_show)
    self.learnerview.set_leading_words(leading_words)
    self.learnerview.set_following_words(following_words)
    # set counts and indices of the new word within counts in view
    all_nonparsed_count = self.dbmanager.count_all_nonparsed(corpus_id)
    prior_nonparsed_count = self.dbmanager.count_nonparsed_prior_to_index(
        corpus_id, word_index)
    all_count = self.dbmanager.count_all(corpus_id)
    # NOTE(review): statement grouping reconstructed from collapsed source —
    # confirm that set_all_count is intended to run unconditionally.
    if not word['parsed']:
        self.learnerview.set_all_nonparsed_count(all_nonparsed_count)
        self.learnerview.set_prior_nonparsed_count(prior_nonparsed_count)
    self.learnerview.set_all_count(all_count)
    # find previous and next nonparsed words and set the stuff on the ui
    previous_nonparsed_word = self.dbmanager.find_previous_nonparsed_word(
        corpus_id, word_index)
    next_nonparsed_word = self.dbmanager.find_next_nonparsed_word(
        corpus_id, word_index)
    if previous_nonparsed_word:
        self.learnerview.set_previous_nonparsed_word(
            previous_nonparsed_word)
    if next_nonparsed_word:
        self.learnerview.set_next_nonparsed_word(next_nonparsed_word)
    next_word = self.dbmanager.find_next_word(corpus_id, word)
    if next_word:
        self.learnerview.set_next_word(next_word)
    # find parse context words
    # Take at most WORD_COUNT_TO_USE_AS_PARSE_CONTEXT words nearest the
    # current word on each side; copies are taken when fewer are available.
    leading_parse_context_words = leading_words[
        -LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT:] if len(
            leading_words
        ) >= LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT else leading_words[:]
    following_parse_context_words = following_words[:LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT] if len(
        following_words
    ) >= LearnerController.WORD_COUNT_TO_USE_AS_PARSE_CONTEXT else following_words[:]
    leading_parse_context = self.parse_context_creator.create(
        leading_parse_context_words)
    following_parse_context = self.parse_context_creator.create(
        following_parse_context_words)
    # parse and set parse results in view
    parse_results_with_likelihoods = []
    # calculation_context is filled by parse_with_likelihoods, keyed by the
    # parse result's index.
    calculation_context = {}
    parse_results = self.contextful_morphological_parser.parse_with_likelihoods(
        word['surface'], leading_parse_context, following_parse_context,
        calculation_context)
    if not parse_results:
        return
    for parse_result_index, (parse_result, likelihood) in enumerate(parse_results):
        parse_results_with_likelihoods.append(
            (parse_result, likelihood, calculation_context[parse_result_index]))
    total_likelihood = sum([t[1] for t in parse_results_with_likelihoods])
    # sort by likelihood then "shortness"
    parse_results_with_likelihoods = sorted(
        parse_results_with_likelihoods,
        key=lambda tup: (tup[1], -len(tup[0].get_transitions())),
        reverse=True)
    for parse_result, likelihood_value, calculation_context in parse_results_with_likelihoods:
        uuid_for_parse_result = self.sessionmanager.put_parse_result_in_session(
            parse_result, calculation_context)
        # Normalize to a percentage; 0.0 when every parse scored zero.
        likelihood_percent = likelihood_value / total_likelihood * 100.0 if total_likelihood > 0.0 else 0.0
        is_correct_parse_result = word[
            'parsed'] and formatter.format_morpheme_container_for_parseset(
                parse_result) == word['parse_result']
        self.learnerview.add_parse_result(uuid_for_parse_result, parse_result,
                                          likelihood_value,
                                          likelihood_percent,
                                          is_correct_parse_result,
                                          calculation_context)
def format(self, add_space=False):
    """Delegate to the parseset formatter for this container.

    ``add_space`` is passed straight through to the formatter.
    """
    return formatter.format_morpheme_container_for_parseset(self, add_space)
def add_parse_result(self, contextless_parse_result, offsets):
    """Index a contextless parse result and the offsets at which it occurs,
    keyed by its formatted parseset string."""
    key = formatter.format_morpheme_container_for_parseset(contextless_parse_result)
    self.parse_results[key] = contextless_parse_result
    self.parse_result_occurrences[key] = offsets
def add_parse_result(self, contextless_parse_result, offsets):
    """Register ``contextless_parse_result`` under its formatted string,
    together with the occurrence offsets."""
    formatted = formatter.format_morpheme_container_for_parseset(
        contextless_parse_result)
    self.parse_results[formatted] = contextless_parse_result
    self.parse_result_occurrences[formatted] = offsets
def create_word_binding_from_morpheme_container(self, word_str, morpheme_container):
    """Build a WordBinding (root + ordered suffix bindings) from a parsed
    morpheme container for the surface ``word_str``.

    The surface must equal the container's accumulated surface, either
    exactly or after lowercasing the first letter (title-cased words).
    """
    assert (word_str == morpheme_container.get_surface_so_far()) or (
        TurkishAlphabet.lower(word_str[0]) + word_str[1:] ==
        morpheme_container.get_surface_so_far())
    # Root-level attributes extracted from the container's lexeme.
    root_str = morpheme_container.get_root().str
    lemma = morpheme_container.get_root().lexeme.lemma
    lemma_root = morpheme_container.get_root().lexeme.root
    root_syntactic_category = morpheme_container.get_root().lexeme.syntactic_category
    root_secondary_syntactic_category = morpheme_container.get_root().lexeme.secondary_syntactic_category
    root = RootBinding(root_str, lemma, lemma_root, root_syntactic_category,
                       root_secondary_syntactic_category)
    # Word-level (surface) categories and the formatted full parse string.
    word_syntactic_category = morpheme_container.get_surface_syntactic_category()
    word_secondary_syntactic_category = morpheme_container.get_surface_secondary_syntactic_category()
    parse_result = formatter.format_morpheme_container_for_parseset(
        morpheme_container)
    word = WordBinding(word_str, parse_result, root, word_syntactic_category,
                       word_secondary_syntactic_category)
    if morpheme_container.get_transitions():
        # Accumulate the surface built so far while walking the suffix transitions.
        so_far = root_str
        for transition in morpheme_container.get_transitions():
            # Free transitions add no surface material and are not bound.
            if isinstance(
                    transition.suffix_form_application.suffix_form.suffix,
                    FreeTransitionSuffix):
                continue
            suffix_name = transition.suffix_form_application.suffix_form.suffix.name
            suffix_pretty_name = transition.suffix_form_application.suffix_form.suffix.pretty_name
            suffix_form = transition.suffix_form_application.suffix_form.form
            suffix_application = transition.suffix_form_application.fitting_suffix_form
            suffix_actual_application = transition.suffix_form_application.actual_suffix_form
            word_with_suffix_application = None
            # NOTE(review): this branch fires when the applied suffix adds no new
            # surface beyond root_str (so_far == root_str and the actual
            # application is empty) — presumably a zero-surface suffix; in that
            # case the lexeme's root form is used instead. TODO confirm.
            if (so_far + suffix_actual_application) == root_str:
                word_with_suffix_application = morpheme_container.get_root().lexeme.root + suffix_application
            else:
                word_with_suffix_application = so_far + suffix_application
            so_far += suffix_actual_application
            # Derivational vs. inflectional transitions map to different binding types.
            if transition.is_derivational():
                suffix = DerivationalSuffixBinding(
                    suffix_name, suffix_pretty_name, suffix_form,
                    suffix_application, suffix_actual_application,
                    word_with_suffix_application, so_far,
                    transition.to_state.syntactic_category)
                word.suffixes.append(suffix)
            else:
                suffix = InflectionalSuffixBinding(
                    suffix_name, suffix_pretty_name, suffix_form,
                    suffix_application, suffix_actual_application,
                    word_with_suffix_application, so_far,
                    transition.to_state.syntactic_category)
                word.suffixes.append(suffix)
    return word
def test_should_format_for_parseset(self):
    """Formatted parse results must match the expected parseset representations."""
    first = self.parser.parse(u'kitaba')[0]
    assert_that(formatter.format_morpheme_container_for_parseset(first),
                equal_to(u'kitap+Noun+A3sg+Pnon+Dat'))
    second = self.parser.parse(u'yaptırtmayı')[0]
    assert_that(formatter.format_morpheme_container_for_parseset(second),
                equal_to(u'yap+Verb+Verb+Caus+Verb+Caus+Pos+Noun+Inf+A3sg+Pnon+Acc'))
# Build a full contextless morphological parser (copula/numeral/proper-noun
# aware) and print, for each word of the sentence given as the first CLI
# argument, the set of distinct lowercased roots found among its parses.
root_map = root_map_generator.generate(all_roots)
suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(ProperNounSuffixGraph(BasicSuffixGraph())))
suffix_graph.initialize()
predefined_paths = PredefinedPaths(root_map, suffix_graph)
predefined_paths.create_predefined_paths()
# Root finders, tried in order by the parser.
root_finders = [
    WordRootFinder(root_map),
    TextNumeralRootFinder(root_map),
    DigitNumeralRootFinder(),
    ProperNounFromApostropheRootFinder(),
    ProperNounWithoutApostropheRootFinder(),
]
parser = UpperCaseSupportingContextlessMorphologicalParser(suffix_graph, predefined_paths, root_finders)
sentence = sys.argv[1].decode('utf-8')
for word in sentence.split():
    parse_results = parser.parse(word)
    distinct_roots = set()
    for parse_result in parse_results:
        formatted = formatter.format_morpheme_container_for_parseset(parse_result)
        # The root is everything before the first '+' in the parseset string.
        distinct_roots.add(formatted[:formatted.index('+')].lower())
    for root in distinct_roots:
        print(root.encode('utf-8'), end=' ')
    print()