Ejemplo n.º 1
0
def create_diamond(mode, nom, prop, nested_diamonds_list):
    """
    Build an HLDS ``Diamond`` feature structure from plain values.

    This is the counterpart of convert_diamond_xml2fs, which derives a
    feature structure from an existing HLDS XML structure instead.

    NOTE: this logic would ideally live in ``Diamond.__init__``, but
    subclassing FeatDict that way was not figured out: FeatDict.__new__
    complains that Diamond.__init__(self, mode, nom, prop,
    nested_diamonds_list) takes too many arguments.

    :type mode: ``str``
    :param mode: stored under the ``Feature('mode')`` key
    :type nom: ``str``
    :param nom: stored under the "nom" key; skipped when falsy
    :type prop: ``str``
    :param prop: stored under the "prop" key; skipped when falsy
    :type nested_diamonds_list: ``list``
    :param nested_diamonds_list: diamonds to embed. Each one is stored
        under a key built from its zero-padded list position and its own
        mode, e.g. "00__AGENS". Skipped when falsy.

    :rtype: ``Diamond``
    :return: the newly created diamond
    """
    new_diamond = Diamond()
    new_diamond[Feature('mode')] = mode

    # optional plain-valued entries, added in this fixed order
    for entry_name, entry_value in (("nom", nom), ("prop", prop)):
        if entry_value:
            new_diamond.update({entry_name: entry_value})

    for position, child in enumerate(nested_diamonds_list or []):
        child_key = "{0}__{1}".format(str(position).zfill(2),
                                      child[Feature("mode")])
        new_diamond.update({child_key: child})
    return new_diamond
Ejemplo n.º 2
0
def abbreviate_textplan(textplan):
    """
    Recursively reduce a text plan to its skeleton: message types and RST
    relation types are kept, the actual message content is dropped.

    :param textplan: a text plan, a constituent set or a message
    :type textplan: ``TextPlan`` or ``ConstituentSet`` or ``Message``

    :return: a new structure of the same kind as the input that only
        carries the book score (``TextPlan``), the relation type
        (``ConstituentSet``) or the message type (``Message``).
        ``None`` if the input is of none of these types.
    :rtype: ``TextPlan`` or ``ConstituentSet`` or ``Message`` or ``NoneType``
    """
    if isinstance(textplan, TextPlan):
        return TextPlan(
            book_score=textplan["title"]["book score"],
            children=abbreviate_textplan(textplan["children"]))

    if isinstance(textplan, ConstituentSet):
        return ConstituentSet(
            relType=textplan[Feature("relType")],
            nucleus=abbreviate_textplan(textplan[Feature("nucleus")]),
            satellite=abbreviate_textplan(textplan[Feature("satellite")]))

    if isinstance(textplan, Message):
        return Message(msgType=textplan[Feature("msgType")])

    return None
def lexicalize_extra(extra_message_block):
    r"""
    Lexicalize every message contained in an "extra" message block.

    For each string-keyed message ``<name>`` in the block, the module-level
    function ``lexicalize_<name>`` is looked up and called with the message
    and a randomly chosen title variation.

    :type extra_message_block: ``Message``
    :param extra_message_block: a message whose msgType is "extra"

    :rtype: ``list`` of ``Diamond``s
    :return: a list of lexicalized phrases, which can be realized with
        ``tccg`` directly or turned into sentences beforehand with
        ``lexicalization.phrase2sentence`` to remove ambiguity

    NOTE: "außerdem" works only in a limited number of contexts, e.g. 'das
    Buch ist neu, außerdem ist es auf Deutsch' but not 'das Buch ist neu,
    außerdem ist das Buch auf Deutsch'. Therefore, no connective is used
    here so far.
    """
    assert extra_message_block[Feature("msgType")] == "extra"

    # work on a copy so the caller's message block is left untouched
    block_copy = deepcopy(extra_message_block)
    ref_authors = block_copy[Feature("reference_authors")]
    ref_title = block_copy[Feature("reference_title")]
    title_variations = lexicalize_title_variations(ref_title, ref_authors)

    phrases = []
    for name, message in block_copy.items():
        if not isinstance(name, str):
            continue  # Feature-keyed entries are meta data, not messages
        chosen_title = random_variation(title_variations)
        # NOTE(review): eval() is used for name-based dispatch; this is only
        # acceptable as long as message names never come from untrusted input.
        lexicalizer = eval("lexicalize_" + name)
        phrases.append(lexicalizer(message, lexicalized_title=chosen_title))
    return phrases
Ejemplo n.º 4
0
def __sentence_fs2xml(sentence, mode="test"):
    """
    Transform a sentence (in NLTK feature structure notation) into its
    corresponding HLDS XML structure.

    :type sentence: ``Sentence``
    :param sentence: a sentence in NLTK feature structure notation

    :type mode: ``str``
    :param mode: "test", if the sentence will be part of a (regression)
        testbed file (ccg-test); the result is then an <item> element that
        wraps <xml><lf>...</lf></xml>. Any other value (conventionally
        "realize") yields a bare <lf> element, for a sentence that is put
        in a file on its own (ccg-realize).

    :rtype: ``etree._Element``
    :return: the input sentence in HLDS XML format (represented as an etree
        element)
    """
    # compare by value: identity comparison against a string literal only
    # works by accident of string interning
    is_test_mode = (mode == "test")

    if is_test_mode:
        expected_parses = sentence[Feature("expected_parses")]
        text = sentence[Feature("text")]
        item = etree.Element("item",
                             numOfParses=str(expected_parses),
                             string=ensure_unicode(text))
        xml = etree.SubElement(item, "xml")
        lf = etree.SubElement(xml, "lf")
    else:  # "realize" mode
        lf = etree.Element("lf")

    root_nom = sentence[Feature("root_nom")]
    satop = etree.SubElement(lf, "satop", nom=root_nom)

    if Feature("root_prop") in sentence:
        root_prop = sentence[Feature("root_prop")]
        etree.SubElement(satop, "prop", name=root_prop)

    # Keys need to be sorted, otherwise the Diamonds within a Sentence will
    # come out in a different order than they went in: the key order of the
    # feature structure itself is not reliable.
    for key in sorted(sentence.keys()):
        value = sentence[key]
        if isinstance(value, Diamond):
            satop.append(__diamond_fs2xml(value))

    return item if is_test_mode else lf
Ejemplo n.º 5
0
def add_mode_suffix(diamond, mode="N"):
    """
    Number the modes of equally-named subdiamonds, in place.

    Among the string keys of ``diamond`` that end with ``mode``, every
    entry whose own ``Feature("mode")`` equals ``mode`` is renamed to
    ``mode`` followed by a running number (starting at 1, assigned in
    sorted key order), e.g. "N" becomes "N1", "N2", ... Afterwards the same
    procedure is applied recursively to all ``Diamond`` values.

    :type diamond: ``Diamond``
    :param diamond: the structure to modify (mutated in place)
    :type mode: ``str``
    :param mode: the mode that should be numbered
    """
    to_number = [key for key in diamond.keys()
                 if isinstance(key, str) and key.endswith(mode)
                 and diamond[key][Feature("mode")] == mode]
    to_number.sort()

    for number, key in enumerate(to_number, 1):
        diamond[key][Feature("mode")] = "{0}{1}".format(mode, number)

    for value in diamond.values():
        if isinstance(value, Diamond):
            add_mode_suffix(value, mode)
Ejemplo n.º 6
0
def __msgtype_print(message):
    """
    Recursive helper function for msgtypes(), which renders message types
    and RST relation types as a string.

    A ``Message`` yields its message type; a ``ConstituentSet`` yields
    "<relType>(<nucleus>, <satellite>)" with both parts rendered
    recursively. Any other input yields ``None``.

    :type message: ``Message`` or ``ConstituentSet``
    :rtype: ``str``
    """
    if isinstance(message, Message):
        return message[Feature("msgType")]
    if not isinstance(message, ConstituentSet):
        return None

    nucleus_repr = __msgtype_print(message[Feature("nucleus")])
    relation = message[Feature("relType")]
    satellite_repr = __msgtype_print(message[Feature("satellite")])
    return "{0}({1}, {2})".format(relation, nucleus_repr, satellite_repr)
Ejemplo n.º 7
0
def check_and_realize_textplan(openccg, textplan, lexicalize_message_block, phrase2sentence):
    """
    realizes a text plan and warns about message blocks that cannot be
    realized due to current restrictions in the OpenCC grammar.

    All output goes to stdout. The print calls below are written so that
    they are valid on both Python 2 and Python 3 and emit exactly what the
    former Python 2 ``print`` statements emitted.

    Parameters
    ----------
    openccg : OpenCCG
        a running OpenCCG instance
    textplan : TextPlan
        text plan to be realized
    lexicalize_message_block : callable
        called with one message block of the linearized text plan; its
        result is iterated as a sequence of lexicalized phrases. A
        ``NotImplementedError`` raised while a block is processed is
        printed together with the block instead of being propagated.
    phrase2sentence : callable
        called with one lexicalized phrase; its result is handed to
        ``openccg.realize``
    """
    msg_blocks = linearize_textplan(textplan)
    for msg_block in msg_blocks:
        try:
            lexicalized_msg_block = lexicalize_message_block(msg_block)
            print("The {0} message block can be realized "
                  "as follows:\n".format(msg_block[Feature("msgType")]))
            for lexicalized_phrase in lexicalized_msg_block:
                lexicalized_sentence = phrase2sentence(lexicalized_phrase)
                for realized_sent in openccg.realize(lexicalized_sentence):
                    print(realized_sent)

        except NotImplementedError as err:
            print(err)
            # single pre-formatted string: reproduces the spacing of the
            # original three-item Python 2 print statement
            print("The message block contains these messages:\n{0} "
                  "\n\n**********\n\n".format(msg_block))
        print("")  # blank separator line after each message block
def lexicalize_lastbook_match(lastbook_match_message_block):
    r"""
    Lexicalize every message contained in a "lastbook_match" message block.

    The phrase used to refer to the two books is fixed: a plural "Buch"
    diamond with the "quantbeide" article (i.e. "beide Bücher").
    For each string-keyed message ``<name>`` in the block (except
    "lastbook_authors", "lastbook_title" and "pagerange"), the module-level
    function ``lexicalize_<name>`` is looked up and called with the message
    and that phrase.

    :type lastbook_match_message_block: ``Message``
    :param lastbook_match_message_block: a message whose msgType is
        "lastbook_match"

    :rtype: ``list`` of ``Diamond``s
    :return: a list of lexicalized phrases, which can be realized with
        ``tccg`` directly or turned into sentences beforehand with
        ``lexicalization.phrase2sentence`` to remove ambiguity

    possible: sowohl X als auch Y / beide Bücher
    implemented: beide Bücher

    TODO: implement lexicalize_pagerange
    """
    assert lastbook_match_message_block[Feature("msgType")] == "lastbook_match"

    # work on a copy so the caller's message block is left untouched
    block_copy = deepcopy(lastbook_match_message_block)

    both_books = create_diamond("AGENS", "artefaktum", "Buch",
                                [gen_num("plur"), gen_art("quantbeide")])

    not_lexicalized = ("lastbook_authors", "lastbook_title", "pagerange")
    phrases = []
    for name, message in block_copy.items():
        if not isinstance(name, str) or name in not_lexicalized:
            continue
        # NOTE(review): eval() is used for name-based dispatch; this is only
        # acceptable as long as message names never come from untrusted input.
        lexicalizer = eval("lexicalize_" + name)
        phrases.append(lexicalizer(message, lexicalized_title=both_books))
    return phrases
Ejemplo n.º 9
0
def __textplan_header2xml(tree_root, textplan):
    """
    Helper function for textplan2xml() and textplans2xml().

    Appends a new <textplan> element to ``tree_root``. Its first child is
    a <header> (attributes ``score`` and ``type``, plus a <target> child
    holding the title text), its second child is the XML version of the
    actual text plan as produced by __textplantree2xml.

    :type tree_root: ``etree._Element``
    :param tree_root: the root element of the resulting text plan XML
        structure
    :type textplan: ``TextPlan``

    :rtype: ``etree._Element``
    :return: one <textplan></textplan> XML structure
    """
    plan_element = etree.SubElement(tree_root, "textplan")

    # meta data of the text plan
    score_string = str(textplan["title"]["book score"])
    doc_type = textplan[Feature("type")]
    title_text = textplan["title"]["text"]

    header_element = etree.SubElement(
        plan_element, "header", score=score_string, type=doc_type)
    etree.SubElement(header_element, "target").text = title_text

    # the header is the only child so far, so appending puts the tree at
    # index 1
    plan_element.append(__textplantree2xml(textplan["children"]))
    return plan_element
def lexicalize_id(id_message_block):
    r"""
    lexicalize all the messages contained in an id message block
    (aka ``Message``)

    The first phrase is always the title description (built from title,
    authors and, if present, year). "codeexamples" (optionally combined
    with "proglang") is handled next; every remaining message ``<name>``
    is passed to the module-level function ``lexicalize_<name>`` together
    with a randomly chosen title variation.

    :type id_message_block: ``Message``
    :param id_message_block: a message (of type "id")

    :rtype: ``List`` of ``Diamond``s
    :return: a list of lexicalized phrases, which can be realized with
        ``tccg`` directly or turned into sentences beforehand with
        ``lexicalization.phrase2sentence`` to remove ambiguity
    """
    assert id_message_block[Feature("msgType")] == "id"

    # work on a copy: entries are popped from it once they are handled
    msg_block = deepcopy(id_message_block)
    authors = msg_block["authors"]
    title = msg_block["title"]
    title_variations = lexicalize_title_variations(title, authors)

    lxed_phrses = []
    if "year" in msg_block:
        lxed_phrses.append(
            lexicalize_title_description(title, authors, msg_block["year"]))
    else:
        lxed_phrses.append(lexicalize_title_description(title, authors))

    # Drop meta data (Feature keys) and the entries consumed above.
    # Iterate over a snapshot of the keys: popping from a dict while
    # iterating over a live view of it raises a RuntimeError.
    identifiers = ("title", "authors", "year")
    for msg_name in list(msg_block.keys()):
        if isinstance(msg_name, Feature) or msg_name in identifiers:
            msg_block.pop(msg_name)

    if "codeexamples" in msg_block:
        if "proglang" in msg_block and msg_block["proglang"][0]:
            # proglang should not be realized if the book doesn't use one
            lexicalized_proglang = lexicalize_proglang(msg_block["proglang"],
                                                       realize="embedded")
            lxed_phrses.append(lexicalize_codeexamples(
                msg_block["codeexamples"],
                lexicalized_proglang,
                random_variation(title_variations),
                lexeme="random"))
            msg_block.pop("proglang")
        else:
            # NOTE(review): the title variation is passed as the second
            # positional argument here, i.e. in the slot that receives the
            # lexicalized proglang in the branch above -- confirm against
            # the signature of lexicalize_codeexamples.
            lxed_phrses.append(lexicalize_codeexamples(
                msg_block["codeexamples"],
                random_variation(title_variations),
                lexeme="random"))
        msg_block.pop("codeexamples")

    for msg_name, msg in msg_block.items():
        # NOTE(review): eval() is used for name-based dispatch; this is only
        # acceptable as long as message names never come from untrusted input.
        lexicalize_function = eval("lexicalize_" + msg_name)
        lxed_phrses.append(
            lexicalize_function(
                msg, lexicalized_title=random_variation(title_variations)))
    return lxed_phrses
Ejemplo n.º 11
0
    def insert_subdiamond(self, index, subdiamond_to_insert, mode=None):
        """
        Insert a ``Diamond`` into this one before the given index, while
        allowing to change the mode of the inserted subdiamond.

        All existing subdiamonds are removed, the new one is placed at
        ``index`` among them (in sorted key order), and then all of them
        are re-added via ``append_subdiamond``, which assigns fresh
        identifiers.

        :type index: ``int``
        :type subdiamond_to_insert: ``Diamond``

        :type mode: ``str`` or ``NoneType``
        :param mode: the mode that the subdiamond shall have. This will
            also be used to determine the subdiamond's identifier. If the
            diamond already has two subdiamonds (e.g. "00__AGENS" and
            "01__PATIENS") and a third subdiamond is inserted at index 1
            with mode "TEMP", its identifier will be "01__TEMP", while the
            identifiers of the other two are changed accordingly, e.g.
            "00__AGENS" and "02__PATIENS". If mode is None (or otherwise
            falsy), the subdiamond's mode is left untouched.
        """
        if mode:
            subdiamond_to_insert.update({Feature("mode"): mode})

        # a feature structure is essentially a dictionary, so the keys of
        # the subdiamonds have to be sorted to recover their order
        subdiamond_keys = sorted(
            key for key, value in self.items() if isinstance(value, Diamond))

        # detach all subdiamonds, keeping them in key order
        ordered_subdiamonds = [self.pop(key) for key in subdiamond_keys]
        ordered_subdiamonds.insert(index, subdiamond_to_insert)

        for child in ordered_subdiamonds:
            self.append_subdiamond(child)
Ejemplo n.º 12
0
 def change_mode(self, mode):
     """
     Overwrite the mode of this ``Diamond``. This is sometimes needed when
     the diamond gets embedded into another ``Diamond`` or a ``Sentence``.

     :type mode: ``str``
     :param mode: the new value stored under the ``Feature('mode')`` key
     """
     mode_key = Feature('mode')
     self[mode_key] = mode
Ejemplo n.º 13
0
def compute_features_dict(productions: List[Production]) -> Dict[str, List[str]]:
    """
    Collect, per nonterminal type, the names of all features that occur on
    any nonterminal of that type in the given productions.

    The nonterminal type is taken from the ``Feature("type")`` entry of a
    term and lower-cased; the type feature itself ("*type*") is not listed
    among the features.

    The feature names of each type are listed in the order in which they
    are first encountered, so the result is deterministic for a given list
    of productions (it does not depend on set/hash ordering).

    :param productions: grammar productions whose left- and right-hand
        side terms are inspected; terms that are not ``Nonterminal``s are
        ignored
    :return: mapping from lower-cased type name to its feature names
    """
    features_dict: Dict[str, List[str]] = {}
    for production in productions:
        for term in (production.lhs(),) + production.rhs():
            if not isinstance(term, Nonterminal):
                continue
            name = str(term[Feature("type")]).lower()
            known_features = features_dict.setdefault(name, [])
            for feature in term:
                feature_name = str(feature)
                if feature_name != "*type*" and feature_name not in known_features:
                    known_features.append(feature_name)

    return features_dict
Ejemplo n.º 14
0
def parse_term(term: Union[FeatStructNonterminal, str], features_dict: Dict[str, List[str]]) -> str:
    """
    Render a grammar term as a string.

    A term that is not a ``Nonterminal`` is returned lower-cased. A
    nonterminal is rendered as ``name(arg, ...)`` where ``name`` is its
    lower-cased type and there is one argument per feature listed for that
    type in ``features_dict``: the transformed feature value if the term
    carries the feature, "_" otherwise. If no features are listed for the
    type, only ``name`` is returned.

    :param term: the term to render
    :param features_dict: mapping from lower-cased type name to the
        feature names of that type
    :return: the string representation of the term
    """
    if not isinstance(term, Nonterminal):
        return term.lower()

    name = str(term[Feature("type")]).lower()
    arguments = [
        transform_value(term[feature]) if feature in term else "_"
        for feature in features_dict[name]
    ]
    if not arguments:
        return name
    return "%s(%s)" % (name, ", ".join(arguments))
Ejemplo n.º 15
0
    def append_subdiamond(self, subdiamond, mode=None):
        """
        Append a subdiamond to this diamond, while allowing to change the
        mode of the subdiamond.

        :type subdiamond: ``Diamond``
        :param subdiamond: the structure to append; its mode is
            overwritten if ``mode`` is given

        :type mode: ``str`` or ``NoneType``
        :param mode: the mode that the subdiamond shall have. This will
            also be used to determine the subdiamond's identifier. If the
            diamond already has two subdiamonds (e.g. "00__AGENS" and
            "01__PATIENS") and a third subdiamond with mode "TEMP" is
            added, its identifier will be "02__TEMP". If mode is None (or
            otherwise falsy), the subdiamond's mode is left untouched.
        """
        next_index = last_diamond_index(self) + 1

        if mode:
            subdiamond.update({Feature("mode"): mode})

        # identifier = zero-padded position + "__" + mode of the subdiamond
        prefix = str(next_index).zfill(2)
        identifier = "{0}__{1}".format(prefix, subdiamond[Feature("mode")])
        self.update({identifier: subdiamond})
Ejemplo n.º 16
0
def __textplantree2xml(tree):
    """
    Helper function for __textplan_header2xml() which converts the actual
    text plan into XML.

    A ``ConstituentSet`` becomes a <relation type="..."> element with a
    <nucleus> and a <satellite> child, each wrapping the recursively
    converted subtree. A ``Message`` is converted by __message2xml.
    Any other input yields ``None``.

    :type tree: ``ConstituentSet`` or ``Message``
    :rtype: ``etree._Element``
    """
    if isinstance(tree, ConstituentSet):
        rel_type = tree[Feature("relType")]
        # convert both subtrees before any XML element is created
        converted_parts = [
            ("nucleus", __textplantree2xml(tree[Feature("nucleus")])),
            ("satellite", __textplantree2xml(tree[Feature("satellite")])),
        ]

        relation = etree.Element("relation", type=rel_type)
        for role, subtree in converted_parts:
            # the wrapper element is new and empty, so append == insert at 0
            etree.SubElement(relation, role).append(subtree)
        return relation

    if isinstance(tree, Message):
        return __message2xml(tree)

    return None
Ejemplo n.º 17
0
def __message2xml(message):
    """
    Convert a single ``Message`` into an XML structure.

    The result is a <message> element whose ``type`` attribute is the
    message type. Every other entry of the message is handed to a helper
    that adds it to this element, chosen by the kind of key and value:
    ``Feature`` keys, string keys with a ``FeatDict`` value, and string
    keys with a tuple value. String-keyed entries with any other kind of
    value are skipped.

    :type message: ``Message``
    :rtype: ``etree._Element``
    """
    msgtype_key = Feature("msgType")
    msg = etree.Element("message", type=message[msgtype_key])

    content = [entry for entry in message.items() if entry[0] != msgtype_key]
    for key, val in content:
        if not isinstance(key, str):  # i.e. the key is a Feature
            __message_featurekey2xml(msg, key, val)
        elif isinstance(val, FeatDict):  # relative length or recency
            __message_strkey_featdictval2xml(msg, key, val)
        elif isinstance(val, tuple):  # (value, rating) tuple
            __message_strkey_tupleval2xml(msg, key, val)
    return msg
Ejemplo n.º 18
0
    def create_sentence(self, sent_str, expected_parses, root_nom, root_prop,
                        diamonds):
        """
        Fill this feature structure so that it represents a complete
        sentence: the ``Diamond``s that were already constructed by
        HLDSReader.parse_sentences() plus some meta data (root verb etc.).

        :type sent_str: ``str``
        :param sent_str: the text that should be generated

        :type expected_parses: ``int``
        :param expected_parses: the expected number of parses (converted
            with ``int()`` before it is stored)

        :type root_nom: ``str``
        :param root_nom: the root (element/verb) category, e.g.
            "b1:handlung"

        :type root_prop: ``str``
        :param root_prop: the root element of that text (in case we're
            actually generating a sentence: the main verb). Only stored if
            truthy.

        :type diamonds: ``list`` of ``Diamond``s
        :param diamonds: a list of the diamonds that are contained in the
            sentence; each one is stored under a key built from its
            zero-padded list position and its mode
        """
        def set_feature(name, value):
            # meta data is stored under Feature keys, not plain strings
            self.update({Feature(name): value})

        set_feature("text", sent_str)
        set_feature("expected_parses", int(expected_parses))
        set_feature("root_nom", root_nom)
        if root_prop:  # not always present, e.g. when realizing a pronoun
            set_feature("root_prop", root_prop)

        for position, diamond in enumerate(diamonds or []):
            identifier = "{0}__{1}".format(str(position).zfill(2),
                                           diamond[Feature("mode")])
            self.update({identifier: diamond})
Ejemplo n.º 19
0
def __diamond_fs2xml(diamond):
    """
    Convert a ``Diamond`` feature structure into its corresponding HLDS
    XML structure (stored in an etree element).

    The resulting <diamond> element carries the mode as an attribute and
    contains, in this order: <nom> (if present), <prop> (if present) and
    the recursively converted subdiamonds in sorted key order.

    :type diamond: ``Diamond``
    :param diamond: a Diamond feature structure containing nom? prop?
        diamond* elements

    :rtype: ``etree._Element``
    :return: a Diamond in HLDS XML tree notation, represented as an etree
        element
    """
    maker = ElementMaker()
    diamond_etree = maker.diamond(
        mode=ensure_unicode(diamond[Feature("mode")]))

    if "prop" in diamond:
        diamond_etree.insert(
            0, maker.prop(name=ensure_unicode(diamond["prop"])))
    if "nom" in diamond:
        # if present, nom(inal) has to be the first sub tag of a diamond,
        # so it is put in front of a possibly existing prop
        diamond_etree.insert(
            0, maker.nom(name=ensure_unicode(diamond["nom"])))

    # Keys need to be sorted, otherwise the subdiamonds will come out in a
    # different order than they went in: the key order of the feature
    # structure itself is not reliable.
    nested = [diamond[key] for key in sorted(diamond.keys())
              if isinstance(diamond[key], Diamond)]

    # convert all subdiamonds first, then attach them at the end
    converted_children = [__diamond_fs2xml(child) for child in nested]
    for converted in converted_children:
        diamond_etree.append(converted)

    return diamond_etree
Ejemplo n.º 20
0
def featstruct2avm(featstruct, mode="non-recursive"):
    """
    converts an NLTK feature structure into an attribute-value matrix
    that can be printed with LaTeX's avm environment.

    Entries are emitted in sorted order, one "key & value" row each.
    Nested ``Diamond``s are labelled with their mode, other nested feature
    structures with their key. The "mode" and "expected_parses" features
    are not printed; "root_nom" and "root_prop" are printed as "nom" and
    "prop".

    :type featstruct: ``nltk.featstruct`` or ``Diamond`` or ``Sentence``
    :type mode: ``str``
    :param mode: "non-recursive" for the outermost call: the result is
        then escaped for LaTeX ("*" and "_") and wrapped into
        \\begin{avm} ... \\end{avm}. Nested structures are converted with
        mode "recursive", which returns the bare matrix.
    :rtype: ``str``
    """
    hidden_keys = (Feature("mode"), Feature("expected_parses"))
    row_template = "{0} & {1} \\\\\n"

    rows = []
    for key, val in sorted(featstruct.items()):

        if isinstance(val, Diamond):  # handles nested Diamond structures
            diamond_key = val[Feature("mode")]
            diamond_val = featstruct2avm(val, mode="recursive")
            rows.append(row_template.format(ensure_utf8(diamond_key),
                                            ensure_utf8(diamond_val)))

        elif isinstance(val, nltk.FeatStruct):
            # every other subclass of FeatStruct incl. FeatStruct
            nested_featstruct = featstruct2avm(val, mode="recursive")
            rows.append(row_template.format(ensure_utf8(key),
                                            ensure_utf8(nested_featstruct)))

        else:  # normal key, value pairs within a FeatStruct
            if key in hidden_keys:
                continue  # don't print "mode" or "expected_parses" keys
            elif key == Feature("root_nom"):
                key = Feature("nom")
            elif key == Feature("root_prop"):
                key = Feature("prop")

            rows.append("{0} & `{1}' \\\\\n".format(ensure_utf8(key),
                                                    ensure_utf8(val)))

    # backslashes are written as explicit "\\": "\[", "\]" and "\_" are
    # invalid escape sequences in a non-raw string literal
    ret_str = "\\[ " + "".join(rows) + " \\]\n"

    if mode == "non-recursive":
        clean_ret_str = ret_str.replace("*", "$*$").replace("_", "\\_")
        ret_str = "{0}\n{1}{2}".format('\\begin{avm}', clean_ret_str,
                                       '\\end{avm}')
    return ret_str
Ejemplo n.º 21
0
    def _detect_conj_help(self, annotated_tree, root, agenda, extracted_tree):
        '''
        Recursively search ``annotated_tree`` for conjunction trees (as
        decided by ``self._is_conj_tree``) and split them off, since they
        require special treatment.

        The recursion stops at trees of height 1 and at trees of height 2
        whose first child is a string.

        For a conjunction tree:

        * a deep copy of it (``conjtree``) is kept as the extracted tree;
        * in ``root``, the tree is replaced by a childless node carrying a
          copy of its label with 'subs' set to True, its former 'group'
          moved to 'cand' and 'group' reset to an empty set;
        * every child of the copy, except those whose first element is ','
          or whose type feature is 'CC', is appended to ``agenda`` (as a
          deep copy) and replaced in the copy by a childless substitution
          node;
        * the copy is regrouped via ``self._grouping_help`` and appended to
          ``extracted_tree[self._get_group(conjtree)]``.

        Otherwise the search continues in all children.

        Nothing is returned; ``root``, ``agenda`` and ``extracted_tree``
        are modified in place.
        '''
        if (annotated_tree.height() == 2 and isinstance(annotated_tree[0], str)) or annotated_tree.height() == 1:
            return
        else:
            if self._is_conj_tree(annotated_tree):
                conjtree = annotated_tree.copy(deep=True)
                # childless placeholder that takes the place of the
                # conjunction tree inside root
                subs_conjtree = ParentedTree(deepcopy(annotated_tree.node), [])
                subs_conjtree.node['subs'] = True
                #print "changing the hole group information"
                #print subs_conjtree.node['group']
                subs_conjtree.node['cand'] = subs_conjtree.node['group']
                subs_conjtree.node['group'] = set()
                root[annotated_tree.treeposition()] = subs_conjtree

                # deal with conjtree
                for i in range(len(conjtree)):
                    if not (conjtree[i][0] == ',' or conjtree[i].node[Feature('type')] == 'CC'):
                        # needs to be substituted
                        agenda.append(conjtree[i].copy(deep=True))
                        subs_tree = ParentedTree(deepcopy(conjtree[i].node), [])
                        subs_tree.node['subs'] = True
                        #print "changing the hole group information"
                        #print newadjunctwhole.node['group']
                        # NOTE(review): the next two lines modify
                        # subs_conjtree, not subs_tree. At this point
                        # subs_conjtree's 'group' is already an empty set,
                        # so its 'cand' gets overwritten with an empty set.
                        # Presumably subs_tree was meant -- confirm.
                        subs_conjtree.node['cand'] = subs_conjtree.node['group']
                        subs_conjtree.node['group'] = set()
                        subs_tree.node['group'] = set()
                        conjtree[i] = subs_tree
                #update the grouping information of conjtree
                self._grouping_help(conjtree)
                extracted_tree[self._get_group(conjtree)].append(conjtree)
            else:
                for i in range(len(annotated_tree)):
                    self._detect_conj_help(annotated_tree[i], root, agenda, extracted_tree)
Ejemplo n.º 22
0
    def _detect_adjunct_help(self, annotated_tree, root, agenda, extracted_tree):
        """
        Check whether ``annotated_tree`` contains an adjunction along its
        leftmost or rightmost spine and, if so, split it up.

        Starting at the first and the last child, the search descends
        through first children (left) and last children (right) for as
        long as they are non-string subtrees of height > 1. A subtree is
        an adjunction site if it has the same type feature as
        ``annotated_tree``, its node has 'head' set to True and
        ``self._is_well_partitioned(annotated_tree, subtree)`` holds.

        At the first site found:

        * ``holetree``, a deep copy of the site, is created;
        * the site is replaced in its parent by a childless node carrying
          a copy of its label with 'adj' set to True, its former 'group'
          moved to 'cand' and 'group' reset to an empty set;
        * ``adjuncttree``, a deep copy of the (now modified)
          ``annotated_tree`` with 'adj' set to True, is created;
        * if ``annotated_tree`` is not ``root``, it is replaced in its
          parent and ``root`` is regrouped and appended to ``agenda``;
        * ``adjuncttree`` and ``holetree`` are regrouped and appended to
          ``agenda``.

        ``extracted_tree`` is not used in this method.

        :return: True if an adjunction was found (and the trees were
            modified as described), False otherwise
        """
        #ipdb.set_trace()
        pos = annotated_tree.node[Feature('type')]
        # we assume that there is no wrapping adjunction tree in our grammar,
        # so the adjunction either happens on the leftmost or on the
        # rightmost child tree,
        # and every time we find a new tree, one normal tree and one
        # adjunction tree will be constructed
        # to be fed into the agenda
        isadjunct = False
        #print annotated_tree
        leftmost = annotated_tree[0]
        rightmost = annotated_tree[-1]
        # a spine can be followed further only while its current element is
        # a real subtree (not a leaf string) of height > 1
        leftgo = not isinstance(leftmost, str) and leftmost.height() > 1
        rightgo = not isinstance(rightmost, str) and rightmost.height() > 1
        while  leftgo or rightgo:
            if leftgo:
                if leftmost.node[Feature('type')] == pos and leftmost.node.get('head') is True:
                    # candidate adjunction node
                    if self._is_well_partitioned(annotated_tree, leftmost):
                        # it really is an adjunction node
                        # 1. deepcopy it and add a substitute to the root
                        # 2. deepcopy the whole tree, replace the found node by an adjunction hole
                        isadjunct = True

                        holetree = leftmost.copy(deep=True)
                        newadjunctwhole = ParentedTree(deepcopy(holetree.node), [])
                        newadjunctwhole.node['adj'] = True
                        #print "changing the hole group information"
                        #print newadjunctwhole.node['group']
                        newadjunctwhole.node['cand'] = newadjunctwhole.node['group']
                        newadjunctwhole.node['group'] = set()
                        leftmost.parent()[leftmost.parent_index()] = newadjunctwhole
                        # copy the adjunct tree
                        adjuncttree = annotated_tree.copy(deep=True)
                        adjuncttree.node['adj'] = True
                        # when the adjunction is not at the root, substitute the hole tree in
                        if not annotated_tree is root:
                            #print "find different"
                            annotated_tree.parent()[annotated_tree.parent_index()] = holetree
                            # put root tree in the agenda
                            self._grouping_help(root)
                            agenda.append(root)
                        # put new adjunct tree in the agenda
                        self._grouping_help(adjuncttree)
                        #print "adding adjucttree"
                        #adjuncttree.draw()
                        agenda.append(adjuncttree)
                        self._grouping_help(holetree)
                        #print "adding holetree"
                        #holetree.draw()
                        agenda.append(holetree)
                        break
                # no adjunction here: descend one level along the left spine
                leftmost = leftmost[0]
                leftgo = not isinstance(leftmost, str) and leftmost.height() > 1
            if rightgo:
                if rightmost.node[Feature('type')] == pos and rightmost.node.get('head') is True:
                    if self._is_well_partitioned(annotated_tree, rightmost):
                        isadjunct = True
                        holetree = rightmost.copy(deep=True)
                        newadjunctwhole = ParentedTree(deepcopy(holetree.node), [])
                        newadjunctwhole.node['adj'] = True
                        #print "changing the hole group information"
                        #print newadjunctwhole.node['group']
                        newadjunctwhole.node['cand'] = newadjunctwhole.node['group']
                        newadjunctwhole.node['group'] = set()
                        rightmost.parent()[rightmost.parent_index()] = newadjunctwhole
                        # copy the adjunct tree
                        adjuncttree = annotated_tree.copy(deep=True)
                        adjuncttree.node['adj'] = True
                        # when the adjunction is not at the root, substitute in
                        # NOTE(review): the left-hand branch above puts
                        # holetree into the parent at this point, whereas
                        # this branch puts newadjunctwhole (which was
                        # already placed in rightmost's parent) there --
                        # confirm that this asymmetry is intended.
                        if not annotated_tree is root:
                            annotated_tree.parent()[annotated_tree.parent_index()] = newadjunctwhole
                            # put root tree in the agenda
                            self._grouping_help(root)
                            agenda.append(root)
                        # put new adjunct tree in the agenda
                        self._grouping_help(adjuncttree)
                        agenda.append(adjuncttree)
                        self._grouping_help(holetree)
                        agenda.append(holetree)
                        break
                # no adjunction here: descend one level along the right spine
                rightmost = rightmost[-1]
                rightgo = not isinstance(rightmost, str) and rightmost.height() > 1
        return isadjunct
Ejemplo n.º 23
0
import csv
import os
import pickle

from nltk.featstruct import Feature, FeatStructReader

from floraparser.FGFeatStructNonterminal import FGFeatStructNonterminal

# read_expr = Expression.fromstring

# module-level lexicon table, initially empty
# (presumably filled when the lexicon is loaded -- TODO confirm)
lexicon = {}

# module-level table for multi-word entries, initially empty
# (presumably keyed by the first word -- TODO confirm)
multiwords = {}

# Our features with default values (usually False).
# Each one is an nltk Feature object whose default is False.
position = Feature('position', default=False)
timing = Feature('timing', default=False)
# posit    =  Feature('posit', default='')
makecomp = Feature('makecomp', default=False)
compar = Feature('compar', default=False)
adjectival = Feature('adjectival', default=False)
counted = Feature('counted', default=False)
conditional = Feature('conditional', default=False)
group = Feature('group', default=False)  # nouns

# all of the features defined above, collected in one tuple
defaultfeatures = (position, timing, makecomp, compar, adjectival, counted,
                   conditional, group)


def pickle_lexicon():
Ejemplo n.º 24
0
def main():
    """
    This is the pypolibox commandline interface. It allows you to query
    the database and generate book recommendations, which will either be
    handed to OpenCCG for generating sentences or printed to stdout in
    an XML format representing the text plans.

    The command-line arguments (``sys.argv[1:]``) are turned into a
    ``Query``. If the requested output format is not one of the known
    formats, an error message is written to stderr and the process exits
    with status 1. The language-specific lexicalization modules are
    imported by name, using ``query.query_args.output_language`` as the
    module-name suffix; an ``ImportError`` propagates to the caller.

    NOTE: this function is Python 2 code (print statements,
    ``except X, err`` syntax, ``__import__`` with ``level=-1``).
    """
    # Build the query from the command-line arguments (program name excluded).
    query = Query(sys.argv[1:])

    # Reject unknown output formats up front: report on stderr, exit status 1.
    output_format = query.query_args.output_format
    valid_output_formats = ['openccg', 'hlds', 'textplan-xml', 'textplan-featstruct']
    if output_format not in valid_output_formats:
        sys.stderr.write("Output format must be one of: {}\n".format(valid_output_formats))
        sys.exit(1)

    # Import the language-specific modules by name. level=-1 is the
    # Python 2 "try a relative import first, then an absolute one" mode.
    # NOTE(review): ``except ImportError: raise`` re-raises the exception
    # unchanged, so these try blocks do not alter behaviour.
    try:
        lexicalize_messageblocks = \
            __import__("lexicalize_messageblocks_%s" % query.query_args.output_language, globals(), locals(), [], -1)
    except ImportError:
        raise

    try:
        lexicalization = \
            __import__("lexicalization_%s" % query.query_args.output_language, globals(), locals(), [], -1)
    except ImportError:
        raise

    # Pull out the two entry points that the output branches below use.
    lexicalize_message_block = lexicalize_messageblocks.lexicalize_message_block
    phrase2sentence = lexicalization.phrase2sentence


    # Both branches below iterate over textplans.document_plans.
    textplans = generate_textplans(query)

    # NOTE(review): 'textplan-xml' and 'textplan-featstruct' pass the
    # validation above, but no branch handling them is visible here.
    if output_format == 'openccg':
        # Realize every text plan through an OpenCCG instance set up for
        # the requested output language.
        openccg = initialize_openccg(lang=query.query_args.output_language)
        print "{} text plans will be generated.".format(len(textplans.document_plans))
        for i, textplan in enumerate(textplans.document_plans):
            print "Generating text plan #%i:\n" % i
            check_and_realize_textplan(openccg, textplan, lexicalize_message_block, phrase2sentence)
    elif output_format == 'hlds':
        # Print the output of create_hlds_file(..., output="xml") for each
        # lexicalized sentence instead of passing it to OpenCCG.
        from copy import deepcopy
        from hlds import (Diamond, Sentence, diamond2sentence,
            add_nom_prefixes, create_hlds_file)

        for i, textplan in enumerate(textplans.document_plans):
            print "Text plan #%i:\n" % i

            # TODO: refactor to avoid code duplication w/
            # check_and_realize_textplan()
            msg_blocks = linearize_textplan(textplan)
            for msg_block in msg_blocks:
                try:
                    lexicalized_msg_block = lexicalize_message_block(msg_block)
                    print "The {0} message block can be realized " \
                          "as follows:\n".format(msg_block[Feature("msgType")])
                    for lexicalized_phrase in lexicalized_msg_block:
                        lexicalized_sentence = phrase2sentence(lexicalized_phrase)

                        # TODO: refactor to avoid duplication w/ OpenCCG.realize
                        # Work on a deep copy so that the conversions below
                        # operate on temp_sentence, not lexicalized_sentence.
                        temp_sentence = deepcopy(lexicalized_sentence)

                        # A bare Diamond is converted into a sentence first.
                        if isinstance(lexicalized_sentence, Diamond):
                            temp_sentence = diamond2sentence(temp_sentence)

                        # The return value is discarded; presumably this
                        # modifies temp_sentence in place -- TODO confirm.
                        add_nom_prefixes(temp_sentence)
                        print create_hlds_file(temp_sentence,
                            mode="realize", output="xml")

                # Python 2 except syntax. The error and the offending message
                # block are printed, then the loop moves on to the next block.
                except NotImplementedError, err:
                    print err
                    print "The message block contains these messages:\n", msg_block, \
                          "\n\n**********\n\n"
Ejemplo n.º 25
0
from nltk import FeatureChartParser
from nltk.featstruct import Feature
from nltk.grammar import FeatureGrammar

from .. import path
from ..core import Span
from ..exceptions import ParserTimeout
from ._config import get_parser_config

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names of the start and head symbols, plus an immutable set holding both.
START_SYMBOL = 'S'
HEAD_SYMBOL = 'H'
START_SYMBOLS = frozenset((START_SYMBOL, HEAD_SYMBOL))

# NLTK feature named "type", created with display="prefix".
TYPE_FEATURE = Feature('type', display='prefix')

# Parse time limit; presumably in seconds -- confirm where it is enforced.
MAX_PARSE_TIME = 2.0


class Parser:
    """
    A language parser which is used to extract relations between entities in a
    given query and group related entities together.

    The parser uses a context free grammar based on a configuration to generate
    candidate entity groupings. Heuristics are then used to rank and select a
    grouping.
Ejemplo n.º 26
0
 def _is_conj_tree(self, annotated_tree):
     """
     Tell whether ``annotated_tree`` is an NP with a CC among its children.

     Only the node of ``annotated_tree`` itself and the nodes of its
     direct children are inspected; deeper descendants are not visited.

     :param annotated_tree: tree whose ``node`` maps ``Feature('type')``
         to a category label, and whose children expose the same mapping
     :return: ``True`` if the tree is typed ``'NP'`` and at least one
         direct child is typed ``'CC'``, otherwise ``False``
     :rtype: ``bool``
     """
     type_feature = Feature('type')
     # Conjunctions are only handled when they occur inside an NP.
     if annotated_tree.node[type_feature] != 'NP':
         return False
     return any(child.node[type_feature] == 'CC' for child in annotated_tree)
def lexicalize_message_block(messageblock):
    """
    Dispatch a message block to the lexicalization function for its type.

    The message type is read from ``messageblock[Feature("msgType")]`` and
    the module-level function named ``"lexicalize_" + <message type>`` is
    called with the message block as its only argument.

    The handler is looked up in this module's global namespace rather
    than via ``eval()``, so a message type can only ever select an
    existing module-level name and is never evaluated as an expression.

    :param messageblock: the message block to lexicalize; must map
        ``Feature("msgType")`` to a string
    :return: whatever the selected ``lexicalize_*`` function returns
    :raises NameError: if no module-level name matches the message type
        (the same exception type ``eval()`` raised for an unknown name)
    """
    msg_type = messageblock[Feature("msgType")]
    lexicalize_function_name = "lexicalize_" + msg_type

    module_namespace = globals()
    if lexicalize_function_name not in module_namespace:
        # Keep the exception type callers saw from the eval()-based lookup.
        raise NameError(
            "name '{0}' is not defined".format(lexicalize_function_name))
    return module_namespace[lexicalize_function_name](messageblock)
Ejemplo n.º 28
0
from nltk import FeatureChartParser
from nltk.grammar import FeatureGrammar
from nltk.featstruct import Feature

from ._config import get_parser_config

from ..core import Span
from ..exceptions import ParserTimeout
from .. import path

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names of the start and head symbols, plus an immutable set holding both.
START_SYMBOL = "S"
HEAD_SYMBOL = "H"
START_SYMBOLS = frozenset([START_SYMBOL, HEAD_SYMBOL])

# NLTK feature named "type", created with display="prefix".
TYPE_FEATURE = Feature("type", display="prefix")

# Parse time limit; presumably in seconds -- confirm where it is enforced.
MAX_PARSE_TIME = 2.0


class Parser:
    """
    A language parser which is used to extract relations between entities in a
    given query and group related entities together.

    The parser uses a context free grammar based on a configuration to generate
    candidate entity groupings. Heuristics are then used to rank and select a
    grouping.