Example #1
0
def test_extract():
    """tests the extract function"""
    assert quote.extract("the <quoted> part", "<", ">", "\\", 0) == ("<quoted>", False)
    assert quote.extract("the 'quoted' part", "'", "'", "\\", 0) == ("'quoted'", False)
    assert quote.extract("the 'isn\\'t escaping fun' part", "'", "'", "\\", 0) == (
        "'isn\\'t escaping fun'",
        False,
    )
    assert quote.extract("the 'isn\\'t something ", "'", "'", "\\", 0) == (
        "'isn\\'t something ",
        True,
    )
    assert quote.extract("<quoted>\\", "<", ">", "\\", 0) == ("<quoted>", False)
    assert quote.extract("<quoted><again>", "<", ">", "\\", 0) == (
        "<quoted><again>",
        False,
    )
    assert quote.extract("<quoted>\\\\<again>", "<", ">", "\\", 0) == (
        "<quoted><again>",
        False,
    )
    assert quote.extract("<quoted\\>", "<", ">", "\\", 0) == ("<quoted\\>", True)
    assert quote.extract(' -->\n<!ENTITY blah "Some">', "<!--", "-->", None, 1) == (
        " -->",
        False,
    )
    assert quote.extract('">\n', '"', '"', None, True) == ('"', False)
Example #2
0
def test_extract():
    """tests the extract function"""
    assert quote.extract("the <quoted> part", "<", ">", "\\", 0) == ("<quoted>", False)
    assert quote.extract("the 'quoted' part", "'", "'", "\\", 0) == ("'quoted'", False)
    assert quote.extract("the 'isn\\'t escaping fun' part", "'", "'", "\\", 0) == ("'isn\\'t escaping fun'", False)
    assert quote.extract("the 'isn\\'t something ", "'", "'", "\\", 0) == ("'isn\\'t something ", True)
    assert quote.extract("<quoted>\\", "<", ">", "\\", 0) == ("<quoted>", False)
    assert quote.extract("<quoted><again>", "<", ">", "\\", 0) == ("<quoted><again>", False)
    assert quote.extract("<quoted>\\\\<again>", "<", ">", "\\", 0) == ("<quoted><again>", False)
    assert quote.extract("<quoted\\>", "<", ">", "\\", 0) == ("<quoted\\>", True)
    assert quote.extract(' -->\n<!ENTITY blah "Some">', "<!--", "-->", None, 1) == (" -->", False)
    assert quote.extract('">\n', '"', '"', None, True) == ('"', False)
Example #3
0
    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed"""
        self.comments = []
        # make all the lists the same
        self._locfilenotes = self.comments
        self._locgroupstarts = self.comments
        self._locgroupends = self.comments
        self._locnotes = self.comments
        # self._locfilenotes = []
        # self._locgroupstarts = []
        # self._locgroupends = []
        # self._locnotes = []
        # self.comments = []
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            # print "line(%d,%d): " % (self.incomment,self.inentity),line[:-1]
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None,
                                                     0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        l = quote.findend(comment, 'LOCALIZATION NOTE')
                        while (comment[l] == ' '):
                            l += 1
                        if comment.find('FILE', l) == l:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                #FIXME: bloody entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    self.comments.append(("comment", line))
                    line = ""
                    continue

            if self.incomment:
                # some kind of comment
                (comment,
                 self.incomment) = quote.extract(line, "<!--", "-->", None,
                                                 self.continuecomment)
                # print "comment(%d,%d): " % (self.incomment,self.continuecomment),comment
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add a end of line of this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # check if there's actually an entity definition that's commented out
                # TODO: parse these, store as obsolete messages
                # if comment.find('<!ENTITY') != -1:
                #     # remove the entity from the comment
                #     comment, dummy = quote.extractwithoutquotes(comment, ">", "<!ENTITY", None, 1)
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self._locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self._locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self._locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self._locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = True
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, '<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    s = 0
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.space_pre_entity = ' ' * (e - s)
                    s = e
                    self.entity = ''
                    if (e < len(line) and line[e] == '%'):
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    s = e

                    assert quote.rstripeol(self.entity) == self.entity
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.space_pre_definition = ' ' * (e - s)
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()):
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        e = 0
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring, rather than using it as dummy
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(
                            line[e:],
                            "'",
                            "'",
                            startinstring=self.instring,
                            allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(
                            line[e:],
                            '"',
                            '"',
                            startinstring=self.instring,
                            allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" %
                                         (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        self.closing = line[e + len(defpart):].rstrip("\n\r")
                        self.inentity = False
                        break

        # uncomment this line to debug processing
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60:
                    r = r[:57] + "..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r)))
        return linesprocessed
Example #4
0
    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed"""
        self.comments = []
        # make all the lists the same
        self._locfilenotes = self.comments
        self._locgroupstarts = self.comments
        self._locgroupends = self.comments
        self._locnotes = self.comments
        # self._locfilenotes = []
        # self._locgroupstarts = []
        # self._locgroupends = []
        # self._locnotes = []
        # self.comments = []
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        l = quote.findend(comment, 'LOCALIZATION NOTE')
                        while (comment[l] == ' '):
                            l += 1
                        if comment.find('FILE', l) == l:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                #FIXME: bloody entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    self.comments.append(("comment", line))
                    line = ""
                    continue

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add a end of line of this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # check if there's actually an entity definition that's commented out
                # TODO: parse these, store as obsolete messages
                # if comment.find('<!ENTITY') != -1:
                #     # remove the entity from the comment
                #     comment, dummy = quote.extractwithoutquotes(comment, ">", "<!ENTITY", None, 1)
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self._locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self._locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self._locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self._locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = True
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, '<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    s = 0
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.space_pre_entity = ' ' * (e - s)
                    s = e
                    self.entity = ''
                    if (e < len(line) and line[e] == '%'):
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    s = e

                    assert quote.rstripeol(self.entity) == self.entity
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.space_pre_definition = ' ' * (e - s)
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()):
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        e = 0
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring, rather than using it as dummy
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        self.closing = line[e+len(defpart):].rstrip("\n\r")
                        self.inentity = False
                        break

        # uncomment this line to debug processing
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60:
                    r = r[:57] + "..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r)))
        return linesprocessed