Exemple #1
0
class SuperParser:
    def __init__(self,
                 string,
                 bracket='',
                 html=None,
                 latex=None,
                 dot_label=None):
        """ SuperParser attempts to use some (really) rough heuristics to guess how the input is
        formatted: does it use parenthesis, brackets, curly braces etc to express embedding,
        does it use html, latex or nothing for internal formatting of each node and does it use
        dot-labeling. Heuristics can be avoided by giving these things as parameters.

        SuperParser can also accept inputs that are already well-formatted python iterables.

        :param string:
        :param bracket:
        :param html:
        :param latex:
        :param dot_label:
        """
        try:
            parts_list = ast.literal_eval(string)
        except SyntaxError:
            parts_list = []
        except ValueError:
            parts_list = []
        ss = string.strip()
        #print('stripped input:', ss)
        if not bracket:
            if ss[0] in '[({':
                bracket = ss[0]
            elif ss[-1] == ']':
                bracket = '['
            elif ss[-1] == '}':
                bracket = '{'
            elif ss[-1] == ')':
                bracket = '('
        self.lbracket = bracket
        if bracket == '[':
            self.rbracket = ']'
        elif bracket == '(':
            self.rbracket = ')'
        elif bracket == '{':
            self.rbracket = '}'
        else:
            self.rbracket = ''
        #print('assuming brackets to be ', self.lbracket, self.rbracket)
        self.latex = latex
        self.html = html
        if self.latex is None and self.html is None:
            latex_prob = 0.1
            html_prob = 0
            if "</" in ss:
                html_prob += 0.5
            if "<i>" in ss:
                html_prob += 0.5
            if "<a href" in ss:
                html_prob += 1.0
            if self.lbracket != "{" and "{" in ss:
                latex_prob += 0.5
            if ss.count('$') > 2:
                latex_prob += 0.5
            if latex_prob < 0.5 and html_prob < 0.5:
                self.latex = False
                self.html = False
            elif latex_prob > html_prob:
                self.latex = True
                self.html = False
            else:
                self.latex = False
                self.html = True
        if dot_label is None:
            if ss.count('.') > 4 or ss.count('[.') > 2:
                self.dot_label = True
            else:
                self.dot_label = False
        else:
            self.dot_label = dot_label
        #print('assuming to be LaTeX: %s, HTML: %s, dot_label: %s ' % (self.latex, self.html,
        #                                                              self.dot_label))
        #print(ss)

        if self.html:
            self.ltag = "<"
            self.rtag = ">"
            self.bitag = ""
            self.escape = ""
            self.divider = " "
            self.row_parser = HTMLToINode(rows_mode=True)
        elif self.latex:
            self.ltag = "{"
            self.rtag = "}"
            self.bitag = "$"
            self.escape = "\\"
            self.divider = " "
            self.row_parser = LatexToINode(rows_mode=True)
        else:
            self.ltag = ""
            self.rtag = ""
            self.bitag = '"'
            self.escape = ""
            self.divider = " "
            self.row_parser = None
        if self.lbracket == '(' and not self.html and not self.latex:  # treebank format
            self.treebank = True
            self.dot_label = False
            #print('** assuming treebank format **')
        else:
            self.treebank = False

        if not parts_list:
            string_lists = self.chop_into_parts(ss)
            parts_list = self.stringify_and_split(string_lists)
            if len(parts_list) == 1 and isinstance(
                    parts_list[0], list) and len(parts_list[0]) == 1:
                parts_list = parts_list[
                    0]  # treebank trees have unnecessary () around them
        self.tree = self.parse_parts(parts_list)
        self.nodes = self.tree

    def chop_into_parts(self, string):
        """ First thing in actual parsing is to find the separate constituents from the string.
        The challenge is that the brackets that sign the start and end of constituent can also be
        found within style definitions, e.g. inside html tags, or as escaped characters that
        should be used literally.

        A structure of embedded lists is built that respects the structure given as string.
        :param string:
        :return:
        """
        open_tags = 0
        open_bitag = False
        stack = []
        current = []
        escaped = False
        for char in string:
            if escaped:
                escaped = False
                current.append(char)
            elif char == self.escape:
                escaped = True
                current.append(char)
            elif char == self.bitag:
                open_bitag = not open_bitag
                current.append(char)
            elif char == self.ltag:
                open_tags += 1
                current.append(char)
            elif char == self.rtag:
                open_tags -= 1
                if open_tags < 0:
                    open_tags = 0
                current.append(char)
            elif not (open_tags or open_bitag):
                if char == self.lbracket:
                    stack.append(current)
                    new = []
                    current.append(new)
                    current = new
                elif char == self.rbracket:
                    if stack:
                        current = stack.pop()
                else:
                    current.append(char)
            else:
                current.append(char)
        return current

    def stringify_and_split(self, charlist):
        """ Stringify and split tries to extract strings that form the labels or and
        other information about the node. Depending on tree style, a word end (space) means end
        of the label and then there are various ways around the rule.
        :param charlist:
        :return:
        """
        s = ''
        parts = []
        open_tags = 0
        open_bitag = False
        escaped = False
        splits = 0
        if self.treebank:
            max_splits = 1
        else:
            max_splits = 0

        for item in charlist:
            if escaped:
                if isinstance(item, str):
                    s += item
                escaped = False
            elif item == self.escape:
                escaped = True
                s += item
            elif item == self.bitag:
                open_bitag = not open_bitag
                s += item
            elif item == self.ltag:
                open_tags += 1
                s += item
            elif item == self.rtag:
                open_tags -= 1
                s += item
            elif isinstance(item, str):
                if open_tags == 0 and (not open_bitag) \
                        and ((self.divider == ' ' and item.isspace()) or item == self.divider) \
                        and (max_splits == 0 or max_splits > splits):
                    splits += 1
                    s = s.strip()
                    if s:
                        parts.append(s)
                    s = ''
                else:
                    s += item

            elif isinstance(item, list):
                s = s.strip()
                if s:
                    parts.append(s)
                s = ''
                parts.append(self.stringify_and_split(item))
        s = s.strip()
        if s:
            parts.append(s)
        return parts

    def parse_parts(self, parts_list, tidy_up=True):
        """ Build an IParserNode out of nested lists of label strings.

        Sublists are converted recursively and stored in the node's parts. Strings are run
        through parse_word; the resulting rows become either the label of the node being
        built or the label of a new child node added to its parts.

            :param parts_list: list of strings or lists
            :param tidy_up: call tidy(keep_node=True) on the finished node; recursive calls
                pass False, so only the outermost call does it
            :return: the IParserNode built from parts_list
        """
        def find_triangle(candidate):
            """ Return the 'qroof' command node found in candidate, or None. """
            if isinstance(candidate, ICommandNode) and candidate.command == 'qroof':
                return candidate
            if isinstance(candidate, ITextNode):
                for child in candidate.parts:
                    hit = find_triangle(child)
                    if hit:
                        return hit
            return None

        result = IParserNode()
        has_sublists = any(isinstance(item, list) for item in parts_list)

        for item in parts_list:
            if isinstance(item, list):
                result.parts.append(self.parse_parts(item, tidy_up=False))
                continue

            rows = self.parse_word(item)
            label_rows = []
            triangle = None
            for row in rows:
                if isinstance(row, ITextNode):
                    row = row.tidy(keep_node=False)
                    # Keep the latest triangle found among the rows.
                    triangle = find_triangle(row) or triangle
                label_rows.append(row)

            # The rows label this very node when they are dot-labelled, or when the node has
            # no label yet and is either in treebank notation or has embedded sublists.
            dotted = rows and self.dot_label and str(label_rows[0]).startswith('.')
            if dotted or (not result.label_rows and label_rows
                          and (self.treebank or has_sublists)):
                result.label_rows = label_rows
                if triangle:
                    result.has_triangle = triangle
            else:
                # Otherwise the rows form a child node of their own.
                child_node = IParserNode()
                child_node.label_rows = label_rows
                child_node.check_for_index()
                if triangle:
                    child_node.has_triangle = triangle
                result.parts.append(child_node)

        if tidy_up:
            result = result.tidy(keep_node=True)
        result.check_for_index()
        return result

    def parse_word(self, string):
        """ Parse string of label text into ITextNodes or ICommandNodes. Relies on HtmlToINode,
        LatexToINode or other row_parser implementation. """
        if self.latex or self.html:
            return self.row_parser.process(string)
        else:
            return string.split('\n')
Exemple #2
0
    def __init__(self,
                 string,
                 bracket='',
                 html=None,
                 latex=None,
                 dot_label=None):
        """ Guess the notation used by the input and parse it into a tree of parser nodes.

        Rough heuristics decide which bracket type expresses embedding, whether node texts
        are formatted with HTML, LaTeX or nothing, and whether labels are dot-prefixed. Each
        heuristic can be bypassed by giving the corresponding parameter. Bracket type '(' with
        neither HTML nor LaTeX is treated as treebank notation, which also turns dot labels
        off.

        If the input text is a valid Python literal for a list or tuple, that sequence is used
        directly as the parts list and the text is not split by the bracket parser.

        The result is stored in self.tree (self.nodes refers to the same object).

        :param string: text of the tree to parse
        :param bracket: opening bracket that marks embedding ('[', '(' or '{'); when empty it
            is guessed from the first, or failing that the last, character of the input
        :param html: whether node texts use HTML; guessed when both html and latex are None
        :param latex: whether node texts use LaTeX; guessed when both html and latex are None
        :param dot_label: whether labels start with a dot; guessed from the input when None
        """
        # literal_eval reports malformed input with several exception types; any of them
        # just means the text is not a ready-made Python structure.
        try:
            parts_list = ast.literal_eval(string)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            parts_list = []
        # Literals such as numbers or quoted strings are labels, not parts lists: let the
        # bracket parser below handle the raw text instead.
        if not isinstance(parts_list, (list, tuple)):
            parts_list = []

        ss = string.strip()
        # Empty input has no first/last character to inspect, so bracket stays as given.
        if ss and not bracket:
            if ss[0] in '[({':
                bracket = ss[0]
            else:
                bracket = {']': '[', '}': '{', ')': '('}.get(ss[-1], bracket)
        self.lbracket = bracket
        self.rbracket = {'[': ']', '(': ')', '{': '}'}.get(bracket, '')

        self.latex = latex
        self.html = html
        if self.latex is None and self.html is None:
            # Score both markups on telltale substrings; neither is assumed unless one of
            # them reaches 0.5, and HTML wins a tie.
            latex_prob = 0.1
            html_prob = 0
            if "</" in ss:
                html_prob += 0.5
            if "<i>" in ss:
                html_prob += 0.5
            if "<a href" in ss:
                html_prob += 1.0
            if self.lbracket != "{" and "{" in ss:
                latex_prob += 0.5
            if ss.count('$') > 2:
                latex_prob += 0.5
            if latex_prob < 0.5 and html_prob < 0.5:
                self.latex = False
                self.html = False
            elif latex_prob > html_prob:
                self.latex = True
                self.html = False
            else:
                self.latex = False
                self.html = True

        if dot_label is None:
            self.dot_label = ss.count('.') > 4 or ss.count('[.') > 2
        else:
            self.dot_label = dot_label

        # Characters that protect brackets and spaces from being read as structure:
        # ltag/rtag delimit tags, bitag toggles a protected region, escape protects the
        # character that follows it.
        if self.html:
            self.ltag = "<"
            self.rtag = ">"
            self.bitag = ""
            self.escape = ""
            self.divider = " "
            self.row_parser = HTMLToINode(rows_mode=True)
        elif self.latex:
            self.ltag = "{"
            self.rtag = "}"
            self.bitag = "$"
            self.escape = "\\"
            self.divider = " "
            self.row_parser = LatexToINode(rows_mode=True)
        else:
            self.ltag = ""
            self.rtag = ""
            self.bitag = '"'
            self.escape = ""
            self.divider = " "
            self.row_parser = None

        if self.lbracket == '(' and not self.html and not self.latex:  # treebank format
            self.treebank = True
            self.dot_label = False
        else:
            self.treebank = False

        if not parts_list:
            string_lists = self.chop_into_parts(ss)
            parts_list = self.stringify_and_split(string_lists)
            if (len(parts_list) == 1 and isinstance(parts_list[0], list)
                    and len(parts_list[0]) == 1):
                # treebank trees have unnecessary () around them
                parts_list = parts_list[0]
        self.tree = self.parse_parts(parts_list)
        self.nodes = self.tree