def create_diamond(mode, nom, prop, nested_diamonds_list):
    """
    builds an HLDS ``Diamond`` feature structure from scratch (in contrast
    to convert_diamond_xml2fs, which converts an HLDS XML structure into
    its corresponding feature structure representation).

    NOTE: I'd like to simply put this into __init__, but I don't know how
    to subclass FeatDict properly. FeatDict.__new__ complains about
    Diamond.__init__(self, mode, nom, prop, nested_diamonds_list) having
    too many arguments.

    :type mode: ``str``
    :type nom: ``str``
    :type prop: ``str``
    :type nested_diamonds_list: ``list``
    :rtype: ``Diamond``
    """
    result = Diamond()
    result[Feature('mode')] = mode

    if nom:
        result["nom"] = nom
    if prop:
        result["prop"] = prop

    # each nested diamond gets a zero-padded, position-prefixed key so that
    # sorting the keys later reproduces the original order
    for position, subdiamond in enumerate(nested_diamonds_list or []):
        key = "{0}__{1}".format(str(position).zfill(2),
                                subdiamond[Feature("mode")])
        result[key] = subdiamond
    return result
def abbreviate_textplan(textplan):
    """
    recursive helper function that keeps only the skeleton of a textplan
    (message types and RST relations but not the actual message content).

    :param textplan: a text plan, a constituent set or a message
    :type textplan: ``TextPlan`` or ``ConstituentSet`` or ``Message``
    :return: a message (without the attribute value pairs stored in it)
    :rtype: ``Message``
    """
    if isinstance(textplan, TextPlan):
        book_score = textplan["title"]["book score"]
        children = abbreviate_textplan(textplan["children"])
        return TextPlan(book_score=book_score, children=children)

    if isinstance(textplan, ConstituentSet):
        return ConstituentSet(
            relType=textplan[Feature("relType")],
            nucleus=abbreviate_textplan(textplan[Feature("nucleus")]),
            satellite=abbreviate_textplan(textplan[Feature("satellite")]))

    if isinstance(textplan, Message):
        return Message(msgType=textplan[Feature("msgType")])
def lexicalize_extra(extra_message_block):
    r"""
    lexicalize all the messages contained in an extra message block
    (aka ``Message``)

    :type extra_message_block: ``Message``
    :param extra_message_block: a message (of type "extra")

    :rtype: ``list`` of ``Diamond``s
    :return: a list of lexicalized phrases, which can be realized with
    ``tccg`` directly or turned into sentences beforehand with
    ``lexicalization.phrase2sentence`` to remove ambiguity

    NOTE: "außerdem" works only in a limited number of contexts, e.g. 'das
    Buch ist neu, außerdem ist es auf Deutsch' but not 'das Buch ist neu,
    außerdem ist das Buch auf Deutsch'. therefore, no connective is used
    here so far.
    """
    assert extra_message_block[Feature("msgType")] == "extra"
    # work on a copy so the caller's message block is left untouched
    msg_block = deepcopy(extra_message_block)

    authors = msg_block[Feature("reference_authors")]
    title = msg_block[Feature("reference_title")]
    #author_variations = lexicalize_authors_variations(authors)
    title_variations = lexicalize_title_variations(title, authors)

    lxed_phrses = []
    for msg_name, msg in msg_block.items():
        # string keys name sub-messages; Feature keys hold meta data
        if isinstance(msg_name, str):
            # FIX: look the lexicalization function up in the module
            # namespace instead of eval()ing a constructed string --
            # identical resolution for module-level functions, but no
            # arbitrary-code-execution surface
            lexicalize_function = globals()["lexicalize_" + msg_name]
            random_title = random_variation(title_variations)
            lxed_phrses.append(
                lexicalize_function(msg, lexicalized_title=random_title))
    return lxed_phrses
def __sentence_fs2xml(sentence, mode="test"):
    """
    transforms a sentence (in NLTK feature structure notation) into its
    corresponding HLDS XML <item></item> structure.

    :type sentence: ``Sentence``
    :param sentence: a sentence in NLTK feature structure notation

    :type mode: ``str``
    :param mode: "test", if the sentence will be part of a (regression)
    testbed file (ccg-test). "realize", if the sentence will be put in a
    file on its own (ccg-realize).

    :rtype: ``etree._Element``
    :return: the input sentence in HLDS XML format (represented as an
    etree element)
    """
    # FIX: the original compared strings with 'is', which tests object
    # identity and only works by accident of CPython string interning;
    # '==' is the correct value comparison.
    if mode == "test":
        expected_parses = sentence[Feature("expected_parses")]
        text = sentence[Feature("text")]
        item = etree.Element("item",
                             numOfParses=str(expected_parses),
                             string=ensure_unicode(text))
        xml = etree.SubElement(item, "xml")
        lf = etree.SubElement(xml, "lf")
    else:  # mode == "realize"
        lf = etree.Element("lf")

    root_nom = sentence[Feature("root_nom")]
    satop = etree.SubElement(lf, "satop", nom=root_nom)

    # root_prop is not always present, e.g. when realizing a pronoun
    if Feature("root_prop") in sentence:
        root_prop = sentence[Feature("root_prop")]
        etree.SubElement(satop, "prop", name=root_prop)

    diamonds = []
    for key in sorted(sentence.keys()):
        # keys need to be sorted, otherwise Diamonds within a Sentence will
        # have a different order than before. Diamond keys seem ordered, but
        # they aren't (keys beginning with numbers seem to be in descending
        # order, those beginning with letters in ascending order)
        if isinstance(sentence[key], Diamond):
            diamonds.append(sentence[key])

    etree_diamonds = []
    for diamond in diamonds:
        etree_diamonds.append(__diamond_fs2xml(diamond))

    for diamond in etree_diamonds:
        final_position = len(satop)
        satop.insert(final_position, diamond)

    if mode == "test":
        return item
    else:
        return lf
def add_mode_suffix(diamond, mode="N"):
    """
    appends a 1-based counter to the mode of every direct subdiamond whose
    key ends with *mode* and whose mode equals *mode* (so, e.g., two "N"
    subdiamonds become "N1" and "N2"), then recurses into all subdiamonds.

    :type diamond: ``Diamond``
    :type mode: ``str``
    """
    # collect the keys of all direct subdiamonds carrying the target mode
    matching_keys = [key for key in diamond.keys()
                     if isinstance(key, str) and key.endswith(mode)
                     and diamond[key][Feature("mode")] == mode]

    # sorted key order determines the suffix numbering
    for position, key in enumerate(sorted(matching_keys), start=1):
        diamond[key][Feature("mode")] = "{0}{1}".format(mode, position)

    # recurse into every nested Diamond
    for value in diamond.values():
        if isinstance(value, Diamond):
            add_mode_suffix(value, mode)
def __msgtype_print(message):
    """
    recursive helper function for msgtypes(), which renders message types
    and RST relation types as a string.

    :type message: ``Message`` or ``ConstituentSet``
    :rtype: ``str``
    """
    if isinstance(message, Message):
        return message[Feature("msgType")]
    if isinstance(message, ConstituentSet):
        # render the relation as relType(nucleus, satellite)
        return "{0}({1}, {2})".format(
            message[Feature("relType")],
            __msgtype_print(message[Feature("nucleus")]),
            __msgtype_print(message[Feature("satellite")]))
def check_and_realize_textplan(openccg, textplan, lexicalize_message_block, phrase2sentence): """ realizes a text plan and warns about message blocks that cannot be realized due to current restrictions in the OpenCC grammar. Parameters ---------- openccg : OpenCCG a running OpenCCG instance textplan : TextPlan text plan to be realized """ msg_blocks = linearize_textplan(textplan) for msg_block in msg_blocks: try: lexicalized_msg_block = lexicalize_message_block(msg_block) print "The {0} message block can be realized " \ "as follows:\n".format(msg_block[Feature("msgType")]) for lexicalized_phrase in lexicalized_msg_block: lexicalized_sentence = phrase2sentence(lexicalized_phrase) for realized_sent in openccg.realize(lexicalized_sentence): print realized_sent except NotImplementedError, err: print err print "The message block contains these messages:\n", msg_block, \ "\n\n**********\n\n" print
def lexicalize_lastbook_match(lastbook_match_message_block):
    r"""
    lexicalize all the messages contained in a lastbook_match message block
    (aka ``Message``)

    :type lastbook_match_message_block: ``Message``
    :param lastbook_match_message_block: a message (of type "lastbook_match")

    :rtype: ``list`` of ``Diamond``s
    :return: a list of lexicalized phrases, which can be realized with
    ``tccg`` directly or turned into sentences beforehand with
    ``lexicalization.phrase2sentence`` to remove ambiguity

    possible: sowohl X als auch Y / beide Bücher
    implemented: beide Bücher
    TODO: implement lexicalize_pagerange
    """
    assert lastbook_match_message_block[Feature("msgType")] == "lastbook_match"
    # work on a copy so the caller's message block is left untouched
    msg_block = deepcopy(lastbook_match_message_block)

    # "beide Bücher" realized as a plural AGENS diamond
    num = gen_num("plur")
    art = gen_art("quantbeide")
    agens = create_diamond("AGENS", "artefaktum", "Buch", [num, art])

    lxed_phrses = []
    # these sub-messages are either covered by the AGENS diamond above or
    # not implemented yet (see TODO in the docstring)
    skipped = ("lastbook_authors", "lastbook_title", "pagerange")
    for msg_name, msg in msg_block.items():
        if isinstance(msg_name, str) and msg_name not in skipped:
            # FIX: look the lexicalization function up in the module
            # namespace instead of eval()ing a constructed string --
            # identical resolution for module-level functions, but no
            # arbitrary-code-execution surface
            lexicalize_function = globals()["lexicalize_" + msg_name]
            lxed_phrses.append(
                lexicalize_function(msg, lexicalized_title=agens))
    return lxed_phrses
def __textplan_header2xml(tree_root, textplan):
    """
    helper function for textplan2xml() and textplans2xml(). extracts meta
    data from the text plan (book score etc.), calls __textplantree2xml to
    convert the actual text plan to XML and inserts both into the tree_root
    XML structure.

    :type tree_root: ``etree._Element``
    :param tree_root: the root element of the resulting text plan XML
    structure
    :type textplan: ``TextPlan``
    :rtype: ``etree._Element``
    :return: one <textplan></textplan> XML structure
    """
    xml_textplan = etree.SubElement(tree_root, "textplan")

    # meta data extracted from the plan
    score = str(textplan["title"]["book score"])
    doc_type = textplan[Feature("type")]
    target_text = textplan["title"]["text"]

    header = etree.SubElement(xml_textplan, "header",
                              score=score, type=doc_type)
    target = etree.SubElement(header, "target")
    target.text = target_text

    # the header occupies index 0; the converted RST tree goes right after it
    xml_textplan.insert(1, __textplantree2xml(textplan["children"]))
    return xml_textplan
def lexicalize_id(id_message_block):
    r"""
    lexicalize all the messages contained in an id message block
    (aka ``Message``)

    :type id_message_block: ``Message``
    :param id_message_block: a message (of type "id")

    :rtype: ``list`` of ``Diamond``s
    :return: a list of lexicalized phrases, which can be realized with
    ``tccg`` directly or turned into sentences beforehand with
    ``lexicalization.phrase2sentence`` to remove ambiguity
    """
    assert id_message_block[Feature("msgType")] == "id"
    # work on a copy so the caller's message block is left untouched
    msg_block = deepcopy(id_message_block)

    authors = msg_block["authors"]
    title = msg_block["title"]
    #author_variations = lexicalize_authors_variations(authors)
    title_variations = lexicalize_title_variations(title, authors)

    lxed_phrses = []
    # the title description mentions the publication year iff one is present
    if "year" in msg_block:
        lxed_phrses.append(lexicalize_title_description(msg_block["title"],
                                                        msg_block["authors"],
                                                        msg_block["year"]))
    else:
        lxed_phrses.append(lexicalize_title_description(msg_block["title"],
                                                        msg_block["authors"]))

    # drop the messages that were already handled above plus all meta keys.
    # FIX: iterate over a list copy of the keys -- the original popped from
    # the dict while iterating over its live items() view, which raises
    # under Python 3 and is fragile in general.
    identifiers = set(["title", "authors", "year"])
    for msg_name in list(msg_block.keys()):
        if isinstance(msg_name, Feature) or msg_name in identifiers:
            msg_block.pop(msg_name)

    msg_names = msg_block.keys()
    if "codeexamples" in msg_names:
        if "proglang" in msg_names and msg_block["proglang"][0]:
            # proglang should not be realized if the book doesn't use one
            lexicalized_proglang = lexicalize_proglang(msg_block["proglang"],
                                                       realize="embedded")
            lxed_phrses.append(lexicalize_codeexamples(
                msg_block["codeexamples"],
                lexicalized_proglang,
                random_variation(title_variations),
                lexeme="random"))
            msg_block.pop("proglang")
        else:
            # NOTE(review): if "proglang" is present but empty, it is NOT
            # popped here and will still be lexicalized by the generic loop
            # below -- confirm whether that is intended.
            lxed_phrses.append(
                lexicalize_codeexamples(msg_block["codeexamples"],
                                        random_variation(title_variations),
                                        lexeme="random"))
        msg_block.pop("codeexamples")

    for msg_name, msg in msg_block.items():
        # FIX: look the lexicalization function up in the module namespace
        # instead of eval()ing a constructed string
        lexicalize_function = globals()["lexicalize_" + msg_name]
        lxed_phrses.append(lexicalize_function(
            msg, lexicalized_title=random_variation(title_variations)))
    return lxed_phrses
def insert_subdiamond(self, index, subdiamond_to_insert, mode=None):
    """
    insert a ``Diamond`` into this one before the index, while allowing to
    change the mode of the subdiamond.

    :type index: ``int``
    :type subdiamond_to_insert: ``Diamond``

    :type mode: ``str`` or ``NoneType``
    :param mode: the mode that the subdiamond shall have. this will also be
    used to determine the subdiamonds identifier. if the diamond already
    has two subdiamonds (e.g. "00__AGENS" and "01__PATIENS") and we'll
    insert a third subdiamond at index '1' with mode "TEMP", its identifier
    will be "01__TEMP", while the remaining two subdiamond identifiers will
    be changed accordingly, e.g. "00__AGENS" and "02__PATIENS". if mode is
    None, the subdiamonds mode will be left untouched.
    """
    if mode:  # change mode only if not None
        subdiamond_to_insert.update({Feature("mode"): mode})

    # a featstruct is essentially a dictionary, so its subdiamond items
    # have to be sorted explicitly by key to recover their order
    keyed_subdiamonds = sorted(
        ((key, value) for key, value in self.items()
         if isinstance(value, Diamond)),
        key=itemgetter(0))

    # strip the position-prefixed keys: remove every subdiamond from self,
    # keeping the diamonds themselves in order
    ordered_subdiamonds = []
    for key, subdiamond in keyed_subdiamonds:
        ordered_subdiamonds.append(subdiamond)
        self.pop(key)

    # splice in the new subdiamond, then re-append everything so the
    # identifiers are renumbered consistently
    ordered_subdiamonds.insert(index, subdiamond_to_insert)
    for subdiamond in ordered_subdiamonds:
        self.append_subdiamond(subdiamond)
def change_mode(self, mode):
    """
    changes the mode of a ``Diamond``, which is sometimes needed when
    embedding it into another ``Diamond`` or ``Sentence``.

    :type mode: ``str``
    """
    self.update({Feature("mode"): mode})
def compute_features_dict(productions: List[Production]) -> Dict[str, List[str]]:
    """
    Map each nonterminal's (lowercased) type name to the list of feature
    names observed on it across all given productions ("*type*" itself is
    excluded).
    """
    features_dict: Dict[str, List[str]] = {}
    for production in productions:
        for term in (production.lhs(),) + production.rhs():
            if not isinstance(term, Nonterminal):
                continue  # terminals carry no features
            name = str(term[Feature("type")]).lower()
            # merge the features already recorded with the ones on this term
            known = set(features_dict.get(name, []))
            found = set(str(feature) for feature in term
                        if str(feature) != "*type*")
            features_dict[name] = list(known.union(found))
    return features_dict
def parse_term(term: Union[FeatStructNonterminal, str],
               features_dict: Dict[str, List[str]]) -> str:
    """
    Render a grammar term as a lowercase string. Terminals are lowercased
    as-is; nonterminals become "name(v1, v2, ...)" with one slot per known
    feature ("_" marks a feature absent from this term), or just "name"
    when no features are known for it.
    """
    if not isinstance(term, Nonterminal):
        return term.lower()

    name = str(term[Feature("type")]).lower()
    params = [transform_value(term[feature]) if feature in term else "_"
              for feature in features_dict[name]]

    if not params:
        return name
    return "%s(%s)" % (name, ", ".join(params))
def append_subdiamond(self, subdiamond, mode=None):
    """
    appends a subdiamond structure to an existing diamond structure, while
    allowing to change the mode of the subdiamond.

    :type subdiamond: ``Diamond``

    :type mode: ``str`` or ``NoneType``
    :param mode: the mode that the subdiamond shall have. this will also be
    used to determine the subdiamonds identifier. if the diamond already
    has two subdiamonds (e.g. "00__AGENS" and "01__PATIENS") and add a
    third subdiamond with mode "TEMP", its identifier will be "02__TEMP".
    if mode is None, the subdiamonds mode will be left untouched.
    """
    if mode:  # change mode only if not None
        subdiamond.update({Feature("mode"): mode})

    # next free position after the highest existing subdiamond index
    position = last_diamond_index(self) + 1
    identifier = "{0}__{1}".format(str(position).zfill(2),
                                   subdiamond[Feature("mode")])
    self.update({identifier: subdiamond})
def __textplantree2xml(tree):
    """
    helper function for __textplan_header2xml() which converts the actual
    text plan into XML.

    :type tree: ``ConstituentSet`` or ``Message``
    :rtype: ``etree._Element``
    """
    if isinstance(tree, Message):
        return __message2xml(tree)
    elif isinstance(tree, ConstituentSet):
        # a relation element wraps the recursively converted nucleus and
        # satellite subtrees
        relation = etree.Element("relation",
                                 type=tree[Feature("relType")])
        nucleus = etree.SubElement(relation, "nucleus")
        nucleus.insert(0, __textplantree2xml(tree[Feature("nucleus")]))
        satellite = etree.SubElement(relation, "satellite")
        satellite.insert(0, __textplantree2xml(tree[Feature("satellite")]))
        return relation
def __message2xml(message):
    """
    converts a single ``Message`` into an XML structure.

    :type message: ``Message``
    :rtype: ``etree._Element``
    """
    msg = etree.Element("message", type=message[Feature("msgType")])

    for key, val in message.items():
        if key == Feature("msgType"):
            continue  # already encoded in the element's "type" attribute
        if isinstance(key, str):
            if isinstance(val, FeatDict):
                # relative length or recency
                __message_strkey_featdictval2xml(msg, key, val)
            elif isinstance(val, tuple):
                # (value, rating) tuple
                __message_strkey_tupleval2xml(msg, key, val)
        else:  # key is a Feature
            __message_featurekey2xml(msg, key, val)
    return msg
def create_sentence(self, sent_str, expected_parses, root_nom, root_prop,
                    diamonds):
    """
    wraps all ``Diamond``s that were already constructed by
    HLDSReader.parse_sentences() plus some meta data (root verb etc.) into
    a NLTK feature structure that represents a complete sentence.

    :type sent_str: ``str``
    :param sent_str: the text that should be generated

    :type expected_parses: ``int``
    :param expected_parses: the expected number of parses

    :type root_prop: ``str``
    :param root_prop: the root element of that text (in case we're actually
    generating a sentence: the main verb)

    :type root_nom: ``str``
    :param root_nom: the root (element/verb) category, e.g. "b1:handlung"

    :type diamonds: ``list`` of ``Diamond``s
    :param diamonds: a list of the diamonds that are contained in the
    sentence
    """
    self[Feature("text")] = sent_str
    self[Feature("expected_parses")] = int(expected_parses)
    self[Feature("root_nom")] = root_nom

    if root_prop:  # not always present, e.g. when realizing a pronoun
        self[Feature("root_prop")] = root_prop

    # each diamond gets a zero-padded, position-prefixed key so that
    # sorting the keys later reproduces the original order
    for position, diamond in enumerate(diamonds or []):
        key = "{0}__{1}".format(str(position).zfill(2),
                                diamond[Feature("mode")])
        self[key] = diamond
def __diamond_fs2xml(diamond):
    """
    converts a ``Diamond`` feature structure into its corresponding HLDS
    XML structure (stored in an etree element).

    :type diamond: ``Diamond``
    :param diamond: a Diamond feature structure containing nom? prop?
    diamond* elements

    :rtype: ``etree._Element``
    :return: a Diamond in HLDS XML tree notation, represented as an etree
    element
    """
    E = ElementMaker()
    diamond_etree = E.diamond(mode=ensure_unicode(diamond[Feature("mode")]))

    if "prop" in diamond:
        diamond_etree.insert(0, E.prop(name=ensure_unicode(diamond["prop"])))
    if "nom" in diamond:
        # if present, nom(inal) has to be the first argument/sub tag of a
        # diamond, so it is inserted in front of prop
        diamond_etree.insert(0, E.nom(name=ensure_unicode(diamond["nom"])))

    # keys need to be sorted, otherwise Diamonds within a Sentence will
    # have a different order than before. Diamond keys seem ordered, but
    # they aren't (keys beginning with numbers seem to be in descending
    # order, those beginning with letters in ascending order)
    for key in sorted(diamond.keys()):
        value = diamond[key]
        if isinstance(value, Diamond):
            # recursively convert nested diamonds and append them in order
            diamond_etree.insert(len(diamond_etree), __diamond_fs2xml(value))

    return diamond_etree
def featstruct2avm(featstruct, mode="non-recursive"):
    """
    converts an NLTK feature structure into an attribute-value matrix that
    can be printed with LaTeX's avm environment.

    :type featstruct: ``nltk.featstruct`` or ``Diamond`` or ``Sentence``
    :type mode: ``str``
    :param mode: "non-recursive" (the default, for top-level calls) wraps
    the result in the avm environment and LaTeX-escapes "*" and "_";
    "recursive" is used internally for nested structures.
    :rtype: ``str``
    """
    # FIX: LaTeX backslashes are now written as explicit "\\" escapes.
    # The original used invalid escape sequences such as "\[", which only
    # produce the intended text by accident and raise a SyntaxWarning (and
    # eventually an error) in newer Python versions. The produced strings
    # are byte-identical.
    ret_str = "\\[ "
    for key, val in sorted(featstruct.items()):
        if isinstance(val, Diamond):
            # handles nested Diamond structures: use the diamond's mode as
            # the attribute name
            diamond_key = val[Feature("mode")]
            diamond_val = featstruct2avm(val, mode="recursive")
            ret_str += "{0} & {1} \\\\\n".format(ensure_utf8(diamond_key),
                                                 ensure_utf8(diamond_val))
        elif isinstance(val, nltk.FeatStruct):
            # every other subclass of FeatStruct incl. FeatStruct
            nested_featstruct = featstruct2avm(val, mode="recursive")
            ret_str += "{0} & {1} \\\\\n".format(
                ensure_utf8(key), ensure_utf8(nested_featstruct))
        else:  # normal key, value pairs within a FeatStruct
            if key in (Feature("mode"), Feature("expected_parses")):
                continue  # don't print "mode" or "expected_parses" keys
            elif key == Feature("root_nom"):
                key = Feature("nom")
            elif key == Feature("root_prop"):
                key = Feature("prop")
            ret_str += "{0} & `{1}' \\\\\n".format(ensure_utf8(key),
                                                   ensure_utf8(val))
    ret_str += " \\]\n"

    if mode == "non-recursive":
        # escape characters that are special to LaTeX in the final output
        clean_ret_str = ret_str.replace("*", "$*$").replace("_", "\\_")
        ret_str = "{0}\n{1}{2}".format('\\begin{avm}', clean_ret_str,
                                       '\\end{avm}')
    return ret_str
def _detect_conj_help(self, annotated_tree, root, agenda, extracted_tree):
    '''
    detect conjunction tree, and it requires special treatment.

    Walks *annotated_tree* looking for NP conjunctions (see
    _is_conj_tree). When one is found, the conjunction subtree is replaced
    in *root* by a childless substitution node, each non-comma/non-CC
    conjunct is queued on *agenda* and replaced inside the extracted
    conjunction tree by its own substitution node, and the extracted tree
    is filed into *extracted_tree* under its group. Otherwise the search
    recurses into all children.
    '''
    # leaves and preterminals cannot contain a conjunction -- stop here
    if (annotated_tree.height() == 2 and isinstance(annotated_tree[0], str)) or annotated_tree.height() == 1:
        return
    else:
        if self._is_conj_tree(annotated_tree):
            # keep a deep copy of the conjunction; the original position in
            # root is replaced by a childless substitution node
            conjtree = annotated_tree.copy(deep=True)
            subs_conjtree = ParentedTree(deepcopy(annotated_tree.node), [])
            subs_conjtree.node['subs'] = True
            #print "changing the hole group information"
            #print subs_conjtree.node['group']
            subs_conjtree.node['cand'] = subs_conjtree.node['group']
            subs_conjtree.node['group'] = set()
            root[annotated_tree.treeposition()] = subs_conjtree
            # deal with conjtree
            for i in range(len(conjtree)):
                if not (conjtree[i][0] == ',' or conjtree[i].node[Feature('type')] == 'CC'):
                    # needs to be substitued
                    agenda.append(conjtree[i].copy(deep=True))
                    subs_tree = ParentedTree(deepcopy(conjtree[i].node), [])
                    subs_tree.node['subs'] = True
                    #print "changing the hole group information"
                    #print newadjunctwhole.node['group']
                    # NOTE(review): the two lines below re-assign
                    # subs_conjtree's 'cand'/'group' on every iteration
                    # (after 'group' was already cleared above, 'cand'
                    # becomes the empty set). This looks like it was meant
                    # to operate on subs_tree instead -- confirm against
                    # _detect_adjunct_help, from which the surrounding
                    # code appears to have been adapted.
                    subs_conjtree.node['cand'] = subs_conjtree.node['group']
                    subs_conjtree.node['group'] = set()
                    subs_tree.node['group'] = set()
                    conjtree[i] = subs_tree
            #update the grouping information of conjtree
            self._grouping_help(conjtree)
            extracted_tree[self._get_group(conjtree)].append(conjtree)
        else:
            # no conjunction here: keep searching in all children
            for i in range(len(annotated_tree)):
                self._detect_conj_help(annotated_tree[i], root, agenda, extracted_tree)
def _detect_adjunct_help(self, annotated_tree, root, agenda, extracted_tree):
    """
    Detect an adjunction site in *annotated_tree*: walk down the leftmost
    (then rightmost) spine looking for a head child with the same POS type
    as the root of *annotated_tree*. When a well-partitioned candidate is
    found, three trees are (re)queued on *agenda*: the updated *root*, the
    extracted auxiliary (adjunct) tree and the excised subtree (hole).

    :return: True iff an adjunction site was found and processed
    """
    #ipdb.set_trace()
    pos = annotated_tree.node[Feature('type')]
    # we assume that there is no wrap adjuction tree in our grammar
    # so the adjuction either happens on the most left child or most right childtree
    # and every time we find a new tree, one normal tree and one adjuction tree will be constructed
    # to be fed into the agenda
    isadjunct = False
    #print annotated_tree
    leftmost = annotated_tree[0]
    rightmost = annotated_tree[-1]
    # descend only through non-leaf children
    leftgo = not isinstance(leftmost, str) and leftmost.height() > 1
    rightgo = not isinstance(rightmost, str) and rightmost.height() > 1
    while leftgo or rightgo:
        if leftgo:
            if leftmost.node[Feature('type')] == pos and leftmost.node.get('head') is True:
                # candidate adjuction node
                if self._is_well_partitioned(annotated_tree, leftmost):
                    # it's really adjuction node
                    # 1. deepcopy it and add substitute to the root
                    # 2. deepcopy the whole tree, replace found node as a adjuct hole
                    isadjunct = True
                    holetree = leftmost.copy(deep=True)
                    newadjunctwhole = ParentedTree(deepcopy(holetree.node), [])
                    newadjunctwhole.node['adj'] = True
                    #print "changing the hole group information"
                    #print newadjunctwhole.node['group']
                    newadjunctwhole.node['cand'] = newadjunctwhole.node['group']
                    newadjunctwhole.node['group'] = set()
                    leftmost.parent()[leftmost.parent_index()] = newadjunctwhole
                    # copy the adjuct tree
                    adjuncttree = annotated_tree.copy(deep=True)
                    adjuncttree.node['adj'] = True
                    # when the adjuction is not at the root, subsitute the whole in
                    if not annotated_tree is root:
                        #print "find different"
                        # NOTE(review): here the excised subtree (holetree)
                        # is substituted at the parent position, whereas the
                        # symmetric right-hand branch below substitutes
                        # newadjunctwhole instead -- confirm whether this
                        # asymmetry is intentional.
                        annotated_tree.parent()[annotated_tree.parent_index()] = holetree
                    # put root tree in the agenda
                    self._grouping_help(root)
                    agenda.append(root)
                    # put new adjucttree in the agenda
                    self._grouping_help(adjuncttree)
                    #print "adding adjucttree"
                    #adjuncttree.draw()
                    agenda.append(adjuncttree)
                    self._grouping_help(holetree)
                    #print "adding holetree"
                    #holetree.draw()
                    agenda.append(holetree)
                    break
            # keep descending along the left spine
            leftmost = leftmost[0]
            leftgo = not isinstance(leftmost, str) and leftmost.height() > 1
        if rightgo:
            if rightmost.node[Feature('type')] == pos and rightmost.node.get('head') is True:
                if self._is_well_partitioned(annotated_tree, rightmost):
                    isadjunct = True
                    holetree = rightmost.copy(deep=True)
                    newadjunctwhole = ParentedTree(deepcopy(holetree.node), [])
                    newadjunctwhole.node['adj'] = True
                    #print "changing the hole group information"
                    #print newadjunctwhole.node['group']
                    newadjunctwhole.node['cand'] = newadjunctwhole.node['group']
                    newadjunctwhole.node['group'] = set()
                    rightmost.parent()[rightmost.parent_index()] = newadjunctwhole
                    # copy the adjuct tree
                    adjuncttree = annotated_tree.copy(deep=True)
                    adjuncttree.node['adj'] = True
                    # subsitute the whole in
                    if not annotated_tree is root:
                        annotated_tree.parent()[annotated_tree.parent_index()] = newadjunctwhole
                    # put root tree in the agenda
                    self._grouping_help(root)
                    agenda.append(root)
                    # put new adjucttree in the agenda
                    self._grouping_help(adjuncttree)
                    agenda.append(adjuncttree)
                    self._grouping_help(holetree)
                    agenda.append(holetree)
                    break
            # keep descending along the right spine
            rightmost = rightmost[-1]
            rightgo = not isinstance(rightmost, str) and rightmost.height() > 1
    return isadjunct
import csv import os import pickle from nltk.featstruct import Feature, FeatStructReader from floraparser.FGFeatStructNonterminal import FGFeatStructNonterminal # read_expr = Expression.fromstring lexicon = {} multiwords = {} # Our features with default values (usually False) position = Feature('position', default=False) timing = Feature('timing', default=False) # posit = Feature('posit', default='') makecomp = Feature('makecomp', default=False) compar = Feature('compar', default=False) adjectival = Feature('adjectival', default=False) counted = Feature('counted', default=False) conditional = Feature('conditional', default=False) group = Feature('group', default=False) # nouns defaultfeatures = (position, timing, makecomp, compar, adjectival, counted, conditional, group) def pickle_lexicon():
def main():
    """
    This is the pypolibox commandline interface. It allows you to query
    the database and generate book recommendatins, which will either be
    handed to OpenCCG for generating sentences or printed to stdout in an
    XML format representing the text plans.
    """
    query = Query(sys.argv[1:])

    # validate the requested output format before doing any work
    output_format = query.query_args.output_format
    valid_output_formats = ['openccg', 'hlds', 'textplan-xml', 'textplan-featstruct']
    if output_format not in valid_output_formats:
        sys.stderr.write("Output format must be one of: {}\n".format(valid_output_formats))
        sys.exit(1)

    # the lexicalization modules are language-specific and therefore
    # imported dynamically based on the requested output language
    try:
        lexicalize_messageblocks = \
            __import__("lexicalize_messageblocks_%s" % query.query_args.output_language, globals(), locals(), [], -1)
    except ImportError:
        raise
    try:
        lexicalization = \
            __import__("lexicalization_%s" % query.query_args.output_language, globals(), locals(), [], -1)
    except ImportError:
        raise

    lexicalize_message_block = lexicalize_messageblocks.lexicalize_message_block
    phrase2sentence = lexicalization.phrase2sentence

    textplans = generate_textplans(query)

    if output_format == 'openccg':
        # realize each text plan as sentences via a running OpenCCG instance
        openccg = initialize_openccg(lang=query.query_args.output_language)
        print "{} text plans will be generated.".format(len(textplans.document_plans))
        for i, textplan in enumerate(textplans.document_plans):
            print "Generating text plan #%i:\n" % i
            check_and_realize_textplan(openccg, textplan, lexicalize_message_block, phrase2sentence)
    elif output_format == 'hlds':
        # emit the lexicalized sentences as HLDS XML instead of realizing them
        from copy import deepcopy
        from hlds import (Diamond, Sentence, diamond2sentence, add_nom_prefixes, create_hlds_file)
        for i, textplan in enumerate(textplans.document_plans):
            print "Text plan #%i:\n" % i
            # TODO: refactor to avoid code duplication w/
            # check_and_realize_textplan()
            msg_blocks = linearize_textplan(textplan)
            for msg_block in msg_blocks:
                try:
                    lexicalized_msg_block = lexicalize_message_block(msg_block)
                    print "The {0} message block can be realized " \
                        "as follows:\n".format(msg_block[Feature("msgType")])
                    for lexicalized_phrase in lexicalized_msg_block:
                        lexicalized_sentence = phrase2sentence(lexicalized_phrase)
                        # TODO: refactor to avoid duplication w/ OpenCCG.realize
                        temp_sentence = deepcopy(lexicalized_sentence)
                        if isinstance(lexicalized_sentence, Diamond):
                            temp_sentence = diamond2sentence(temp_sentence)
                        add_nom_prefixes(temp_sentence)
                        print create_hlds_file(temp_sentence, mode="realize", output="xml")
                except NotImplementedError, err:
                    # the grammar can't handle this block yet: report and move on
                    print err
                    print "The message block contains these messages:\n", msg_block, \
                        "\n\n**********\n\n"
from nltk import FeatureChartParser from nltk.featstruct import Feature from nltk.grammar import FeatureGrammar from .. import path from ..core import Span from ..exceptions import ParserTimeout from ._config import get_parser_config logger = logging.getLogger(__name__) START_SYMBOL = "S" HEAD_SYMBOL = "H" TYPE_FEATURE = Feature("type", display="prefix") START_SYMBOLS = frozenset({START_SYMBOL, HEAD_SYMBOL}) MAX_PARSE_TIME = 2.0 class Parser: """ A language parser which is used to extract relations between entities in a given query and group related entities together. The parser uses a context free grammar based on a configuration to generate candidate entity groupings. Heuristics are then used to rank and select a grouping.
def _is_conj_tree(self, annotated_tree):
    """
    Return True iff *annotated_tree* is an NP that has a CC child,
    i.e. a conjunction this extractor can handle.
    """
    # we can only deal with conjuction which happens in NP
    if annotated_tree.node[Feature('type')] != 'NP':
        return False
    # any CC among the direct children marks a conjunction
    return any(annotated_tree[i].node[Feature('type')] == 'CC'
               for i in range(len(annotated_tree)))
def lexicalize_message_block(messageblock):
    """
    dispatches a message block to the lexicalization function matching its
    message type (e.g. a block with msgType "extra" is handled by
    lexicalize_extra).

    :type messageblock: ``Message``
    :rtype: ``list`` of ``Diamond``s
    :return: whatever the type-specific lexicalization function returns
    """
    msg_type = messageblock[Feature("msgType")]
    # FIX: look the function up in the module namespace instead of
    # eval()ing a constructed string -- identical resolution for
    # module-level functions, but no arbitrary-code-execution surface
    lexicalize_function = globals()["lexicalize_" + msg_type]
    return lexicalize_function(messageblock)
from nltk import FeatureChartParser from nltk.grammar import FeatureGrammar from nltk.featstruct import Feature from ._config import get_parser_config from ..core import Span from ..exceptions import ParserTimeout from .. import path logger = logging.getLogger(__name__) START_SYMBOL = 'S' HEAD_SYMBOL = 'H' TYPE_FEATURE = Feature('type', display='prefix') START_SYMBOLS = frozenset({START_SYMBOL, HEAD_SYMBOL}) MAX_PARSE_TIME = 2.0 class Parser: """ A language parser which is used to extract relations between entities in a given query and group related entities together. The parser uses a context free grammar based on a configuration to generate candidate entity groupings. Heuristics are then used to rank and select a grouping.