Beispiel #1
0
class AddCoordPunct(Block):
    """
    Add comma to coordinated lists of 3 and more elements, as well as before
    some Czech coordination conjunctions ('ale', 'ani').

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        """Initialize the block and check that a language has been set.

        Raises:
            LoadingException: if `self.language` is None after the base
                class constructor has run.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_anode(self, anode):
        """Add coordination punctuation to the given anode, if applicable.

        Nodes whose afun is not 'Coord', or that have no children, are
        left untouched.
        """
        if anode.afun != 'Coord':
            return
        # Children requested in order; this snapshot is taken before any
        # comma node is attached below, so the new commas never appear in it.
        achildren = anode.get_children(ordered=True)
        if not achildren:
            return
        # add comma before certain conjunctions
        if self.lexicon.is_coord_conj(anode.lemma) == 'Y' and \
                self.is_at_clause_boundary(anode):
            acomma = self.add_comma_node(anode)
            acomma.shift_before_node(anode)
        # add comma in lists with multiple members (before every member
        # except the first one and the last one, which is connected with
        # the conjunction)
        # Skipping "the first one" via [1:] is only meaningful on an ordered
        # list, hence the ordered snapshot is filtered here.
        aprec_members = [an for an in achildren
                         if an.is_member and an < anode]
        for aprec_member in aprec_members[1:]:
            acomma = self.add_comma_node(anode)
            acomma.shift_before_subtree(aprec_member)

    def add_comma_node(self, anode):
        """Create a comma AuxX node as a child of the given node.

        Returns the value returned by `anode.create_child`.
        """
        comma_data = {
            'form': ',',
            'lemma': ',',
            'afun': 'AuxX',
            'morphcat': {'pos': 'Z'},
            'clause_number': 0,
        }
        return anode.create_child(data=comma_data)

    def is_at_clause_boundary(self, anode):
        """Return True if the given node is at a clause boundary (i.e. the
        nodes immediately before and after it belong to different clauses).

        Returns False if either neighbouring node is missing.
        """
        prev_node = anode.get_prev_node()
        next_node = anode.get_next_node()
        if not (prev_node and next_node):
            return False
        return prev_node.clause_number != next_node.clause_number
Beispiel #2
0
class AddCoordPunct(Block):
    """
    Add comma to coordinated lists of 3 and more elements, as well as before
    some Czech coordination conjunctions ('ale', 'ani').

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        """Initialize the block and check that a language has been set.

        Raises:
            LoadingException: if `self.language` is None after the base
                class constructor has run.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_anode(self, anode):
        """Add coordination punctuation to the given anode, if applicable.

        Nodes whose afun is not 'Coord', or that have no children, are
        left untouched.
        """
        if anode.afun != 'Coord':
            return
        # Children requested in order; this snapshot is taken before any
        # comma node is attached below, so the new commas never appear in it.
        achildren = anode.get_children(ordered=True)
        if not achildren:
            return
        # add comma before certain conjunctions
        if self.lexicon.is_coord_conj(anode.lemma) == 'Y' and \
                self.is_at_clause_boundary(anode):
            acomma = self.add_comma_node(anode)
            acomma.shift_before_node(anode)
        # add comma in lists with multiple members (before every member
        # except the first one and the last one, which is connected with
        # the conjunction)
        # Skipping "the first one" via [1:] is only meaningful on an ordered
        # list, hence the ordered snapshot is filtered here.
        aprec_members = [an for an in achildren
                         if an.is_member and an < anode]
        for aprec_member in aprec_members[1:]:
            acomma = self.add_comma_node(anode)
            acomma.shift_before_subtree(aprec_member)

    def add_comma_node(self, anode):
        """Create a comma AuxX node as a child of the given node.

        Returns the value returned by `anode.create_child`.
        """
        comma_data = {
            'form': ',',
            'lemma': ',',
            'afun': 'AuxX',
            'morphcat': {'pos': 'Z'},
            'clause_number': 0,
        }
        return anode.create_child(data=comma_data)

    def is_at_clause_boundary(self, anode):
        """Return True if the given node is at a clause boundary (i.e. the
        nodes immediately before and after it belong to different clauses).

        Returns False if either neighbouring node is missing.
        """
        prev_node = anode.get_prev_node()
        next_node = anode.get_next_node()
        if not (prev_node and next_node):
            return False
        return prev_node.clause_number != next_node.clause_number
Beispiel #3
0
class AddSubordClausePunct(AddClausalPunct):
    """
    Add commas separating subordinate clauses.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        """Initialize the block and check that a language has been set.

        Raises:
            LoadingException: if `self.language` is None after the base
                class constructor has run.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_atree(self, aroot):
        """Add subordinate clause punctuation to the given sentence.

        Every pair of neighbouring nodes (in the order returned by
        `get_descendants(ordered=True)`) is examined; a comma is inserted
        between them unless one of the exclusion rules below applies.
        """
        anodes = aroot.get_descendants(ordered=True)
        # examine all places between two nodes
        for (aleft, aright) in zip(anodes[:-1], anodes[1:]):
            # exclude all places where we don't want a comma
            # within the same clause
            if aleft.clause_number == aright.clause_number:
                continue
            # clause boundaries, such as brackets
            if aright.clause_number == 0:
                continue
            # some punctuation is here already
            if any(re.match(r'^[,:;.?!-]', an.lemma)
                   for an in (aleft, aright)):
                continue
            # coordinating conjunctions or nodes in clauses belonging
            # to the same coordination
            if any(self.lexicon.is_coord_conj(an.lemma)
                   for an in (aleft, aright)):
                continue
            if self.are_in_coord_clauses(aleft, aright):
                continue
            # left token is an opening quote or bracket
            if re.match(r'^[„(]', aleft.lemma):
                continue
            # right token is a closing bracket or quote followed by a period
            if aright.lemma == ')' or \
                    (aright.lemma == '“' and not aright.is_last_node() and
                     aright.get_next_node().lemma == '.'):
                continue
            # left token is a closing quote or bracket preceded by a comma
            # (which has been inserted in the last step); the node to
            # inspect is the one before aleft, which is why the guard is
            # on aleft not being the first node
            if re.match(r'^[“)]', aleft.lemma) and \
                    not aleft.is_first_node() and \
                    aleft.get_prev_node().lemma == ',':
                continue
            # now we know we want to insert a comma
            acomma = self.insert_comma_between(aleft, aright)
            # move the comma if the left token marks
            # the end of an enquoted clause
            if self.is_clause_in_quotes(aleft):
                acomma.shift_before_node(aleft)
            # move the comma after clausal expletives in expression "poté co"
            if aright.lemma == 'poté':
                acomma.shift_after_node(aright)

    def are_in_coord_clauses(self, aleft, aright):
        """Check if the given nodes are in two coordinated clauses.

        The result is truthy when both nodes share the same clause parent,
        that parent is not the root, and the lexicon reports its lemma as
        a coordinating conjunction.
        """
        alparent = self.get_clause_parent(aleft)
        arparent = self.get_clause_parent(aright)
        # use the lexicon instance, as process_atree does
        return alparent == arparent and \
            not alparent.is_root and \
            self.lexicon.is_coord_conj(alparent.lemma)

    def get_clause_parent(self, anode):
        """Return the parent of the clause the given node belongs to;
        the result may be the root of the tree."""
        if anode.clause_number == 0:
            parent = anode
        else:
            parent = anode.get_clause_root().parent
        # climb over coordination/apposition roots that are themselves
        # members of a higher coordination/apposition
        while parent.is_coap_root() and parent.is_member:
            parent = parent.parent
        return parent

    def insert_comma_between(self, aleft, aright):
        """Insert a comma node between these two nodes,
        find out where to hang it.

        The comma is created under whichever of the two clause roots has
        the greater depth (the right one on a tie), then shifted right
        after `aleft`. Returns the new comma node.
        """
        # find out the parent
        aleft_clause_root = aleft.get_clause_root()
        aright_clause_root = aright.get_clause_root()
        if aleft_clause_root.get_depth() > aright_clause_root.get_depth():
            ahigher_clause_root = aleft_clause_root
        else:
            ahigher_clause_root = aright_clause_root
        # insert the new node
        acomma = ahigher_clause_root.create_child(
            data={'form': ',', 'lemma': ',', 'afun': 'AuxX',
                  'morphcat': {'pos': 'Z'}, 'clause_number': 0})
        # shift the new node to its rightful place
        acomma.shift_after_node(aleft)
        return acomma
Beispiel #4
0
class AddSubordClausePunct(AddClausalPunct):
    """
    Add commas separating subordinate clauses.

    Arguments:
        language: the language of the target tree
        selector: the selector of the target tree
    """

    def __init__(self, scenario, args):
        """Initialize the block and check that a language has been set.

        Raises:
            LoadingException: if `self.language` is None after the base
                class constructor has run.
        """
        Block.__init__(self, scenario, args)
        if self.language is None:
            raise LoadingException('Language must be defined!')
        self.lexicon = Lexicon()

    def process_atree(self, aroot):
        """Add subordinate clause punctuation to the given sentence.

        Every pair of neighbouring nodes (in the order returned by
        `get_descendants(ordered=True)`) is examined; a comma is inserted
        between them unless one of the exclusion rules below applies.
        """
        anodes = aroot.get_descendants(ordered=True)
        # examine all places between two nodes
        for (aleft, aright) in zip(anodes[:-1], anodes[1:]):
            # exclude all places where we don't want a comma
            # within the same clause
            if aleft.clause_number == aright.clause_number:
                continue
            # clause boundaries, such as brackets
            if aright.clause_number == 0:
                continue
            # some punctuation is here already
            if any(re.match(r'^[,:;.?!-]', an.lemma)
                   for an in (aleft, aright)):
                continue
            # coordinating conjunctions or nodes in clauses belonging
            # to the same coordination
            if any(self.lexicon.is_coord_conj(an.lemma)
                   for an in (aleft, aright)):
                continue
            if self.are_in_coord_clauses(aleft, aright):
                continue
            # left token is an opening quote or bracket
            if re.match(r'^[„(]', aleft.lemma):
                continue
            # right token is a closing bracket or quote followed by a period
            if aright.lemma == ')' or \
                    (aright.lemma == '“' and not aright.is_last_node() and
                     aright.get_next_node().lemma == '.'):
                continue
            # left token is a closing quote or bracket preceded by a comma
            # (which has been inserted in the last step); the node to
            # inspect is the one before aleft, which is why the guard is
            # on aleft not being the first node
            if re.match(r'^[“)]', aleft.lemma) and \
                    not aleft.is_first_node() and \
                    aleft.get_prev_node().lemma == ',':
                continue
            # now we know we want to insert a comma
            acomma = self.insert_comma_between(aleft, aright)
            # move the comma if the left token marks
            # the end of an enquoted clause
            if self.is_clause_in_quotes(aleft):
                acomma.shift_before_node(aleft)
            # move the comma after clausal expletives in expression "poté co"
            if aright.lemma == 'poté':
                acomma.shift_after_node(aright)

    def are_in_coord_clauses(self, aleft, aright):
        """Check if the given nodes are in two coordinated clauses.

        The result is truthy when both nodes share the same clause parent,
        that parent is not the root, and the lexicon reports its lemma as
        a coordinating conjunction.
        """
        alparent = self.get_clause_parent(aleft)
        arparent = self.get_clause_parent(aright)
        # use the lexicon instance, as process_atree does
        return alparent == arparent and \
            not alparent.is_root and \
            self.lexicon.is_coord_conj(alparent.lemma)

    def get_clause_parent(self, anode):
        """Return the parent of the clause the given node belongs to;
        the result may be the root of the tree."""
        if anode.clause_number == 0:
            parent = anode
        else:
            parent = anode.get_clause_root().parent
        # climb over coordination/apposition roots that are themselves
        # members of a higher coordination/apposition
        while parent.is_coap_root() and parent.is_member:
            parent = parent.parent
        return parent

    def insert_comma_between(self, aleft, aright):
        """Insert a comma node between these two nodes,
        find out where to hang it.

        The comma is created under whichever of the two clause roots has
        the greater depth (the right one on a tie), then shifted right
        after `aleft`. Returns the new comma node.
        """
        # find out the parent
        aleft_clause_root = aleft.get_clause_root()
        aright_clause_root = aright.get_clause_root()
        if aleft_clause_root.get_depth() > aright_clause_root.get_depth():
            ahigher_clause_root = aleft_clause_root
        else:
            ahigher_clause_root = aright_clause_root
        # insert the new node
        acomma = ahigher_clause_root.create_child(
            data={'form': ',', 'lemma': ',', 'afun': 'AuxX',
                  'morphcat': {'pos': 'Z'}, 'clause_number': 0})
        # shift the new node to its rightful place
        acomma.shift_after_node(aleft)
        return acomma