def test_extract():
    """Run quote.extract over a table of (arguments, expected result) pairs.

    Each argument tuple is passed positionally as
    (source, start delimiter, end delimiter, escape, start-in-string flag);
    each expected value is the (extracted text, still-in-string flag) pair
    that the call must return.
    """
    cases = [
        # plain delimited sections
        (("the <quoted> part", "<", ">", "\\", 0), ("<quoted>", False)),
        (("the 'quoted' part", "'", "'", "\\", 0), ("'quoted'", False)),
        # an escaped end delimiter does not close the section
        (
            ("the 'isn\\'t escaping fun' part", "'", "'", "\\", 0),
            ("'isn\\'t escaping fun'", False),
        ),
        # section left open at the end of the source
        (
            ("the 'isn\\'t something ", "'", "'", "\\", 0),
            ("'isn\\'t something ", True),
        ),
        # escape characters outside the delimited sections
        (("<quoted>\\", "<", ">", "\\", 0), ("<quoted>", False)),
        (("<quoted><again>", "<", ">", "\\", 0), ("<quoted><again>", False)),
        (
            ("<quoted>\\\\<again>", "<", ">", "\\", 0),
            ("<quoted><again>", False),
        ),
        (("<quoted\\>", "<", ">", "\\", 0), ("<quoted\\>", True)),
        # starting inside a section, with no escape character
        (
            (' -->\n<!ENTITY blah "Some">', "<!--", "-->", None, 1),
            (" -->", False),
        ),
        (('">\n', '"', '"', None, True), ('"', False)),
    ]
    for arguments, expected in cases:
        assert quote.extract(*arguments) == expected
def test_extract():
    """Exercise quote.extract on bracketed, quoted, escaped and pre-opened input."""
    extract = quote.extract
    backslash = "\\"

    # Sections that open and close inside the source text.
    assert extract("the <quoted> part", "<", ">", backslash, 0) == ("<quoted>", False)
    assert extract("the 'quoted' part", "'", "'", backslash, 0) == ("'quoted'", False)

    # An escaped end delimiter stays part of the section.
    result = extract("the 'isn\\'t escaping fun' part", "'", "'", backslash, 0)
    assert result == ("'isn\\'t escaping fun'", False)

    # The section never closes, so the second element reports "still inside".
    result = extract("the 'isn\\'t something ", "'", "'", backslash, 0)
    assert result == ("'isn\\'t something ", True)

    # Escape characters around or between sections.
    assert extract("<quoted>\\", "<", ">", backslash, 0) == ("<quoted>", False)
    assert extract("<quoted><again>", "<", ">", backslash, 0) == ("<quoted><again>", False)
    assert extract("<quoted>\\\\<again>", "<", ">", backslash, 0) == ("<quoted><again>", False)
    assert extract("<quoted\\>", "<", ">", backslash, 0) == ("<quoted\\>", True)

    # Calls that start already inside a section and use no escape character.
    result = extract(' -->\n<!ENTITY blah "Some">', "<!--", "-->", None, 1)
    assert result == (" -->", False)
    assert extract('">\n', '"', '"', None, True) == ('"', False)
def parse(self, dtdsrc):
    """Read the first DTD element from ``dtdsrc`` into this object.

    The source is consumed line by line. Comments are collected as
    ``(commenttype, text)`` tuples, lines that belong to neither a comment
    nor an entity are appended to ``self.unparsedlines``, and the first
    ``<!ENTITY`` declaration fills in ``self.entity``, ``self.definition``
    and the related spacing/closing attributes. Parsing stops as soon as
    that entity's quoted definition has been closed.

    The parser state (``self.incomment``, ``self.inentity`` and, when they
    are true, ``self.commenttype`` / ``self.entitypart`` and friends) plus
    ``self.unparsedlines`` is read here but not initialised here.
    NOTE(review): presumably the constructor sets them -- confirm.

    Args:
        dtdsrc: DTD source text; lines are separated by ``"\\n"``.

    Returns:
        The number of lines consumed (0 when ``dtdsrc`` is empty).

    Raises:
        ValueError: if an entity definition does not start with ``'`` or
            ``"``.
    """

    def skip_space(text, pos):
        """Return the index of the first non-whitespace char at or after pos."""
        while pos < len(text) and text[pos].isspace():
            pos += 1
        return pos

    self.comments = []
    # Every comment category deliberately aliases the same list, so all
    # comments end up in one sequence in source order.
    self._locfilenotes = self.comments
    self._locgroupstarts = self.comments
    self._locgroupends = self.comments
    self._locnotes = self.comments
    self.entity = None
    self.definition = ''
    if not dtdsrc:
        return 0

    # Where a comment of each type is recorded; unknown types are dropped.
    comment_lists = {
        "locfile": self._locfilenotes,
        "locgroupstart": self._locgroupstarts,
        "locgroupend": self._locgroupends,
        "locnote": self._locnotes,
        "comment": self.comments,
    }

    linesprocessed = 0
    comment = ""
    # Scan position inside the current line. It is always 0 when a
    # multi-line entity carries over to the next line, so starting at 0
    # also covers a call that resumes in the "parameter" state.
    pos = 0
    for line in dtdsrc.split("\n"):
        line += "\n"
        linesprocessed += 1

        if not self.incomment:
            if '<!--' in line:
                self.incomment = True
                self.continuecomment = False
                # Peek at the comment only to classify it; it is extracted
                # for real in the ``self.incomment`` branch below.
                comment, _ = quote.extract(line, "<!--", "-->", None, 0)
                if 'LOCALIZATION NOTE' in comment:
                    notepos = quote.findend(comment, 'LOCALIZATION NOTE')
                    # Bounds-checked like every other scan in this method.
                    while notepos < len(comment) and comment[notepos] == ' ':
                        notepos += 1
                    if comment.startswith('FILE', notepos):
                        self.commenttype = "locfile"
                    elif comment.startswith('BEGIN', notepos):
                        self.commenttype = "locgroupstart"
                    elif comment.startswith('END', notepos):
                        self.commenttype = "locgroupend"
                    else:
                        self.commenttype = "locnote"
                else:
                    # plain comment
                    self.commenttype = "comment"
            # FIXME: the entity reference might share a line with something important
            elif not self.inentity and re.search(r"%.*;", line):
                # A line matching ``%...;`` (looks like a parameter entity
                # reference) is stored verbatim as a plain comment.
                self.comments.append(("comment", line))
                continue

        if self.incomment:
            (comment, self.incomment) = quote.extract(
                line, "<!--", "-->", None, self.continuecomment)
            self.continuecomment = self.incomment
            # Strip the comment out of what is still to be parsed.
            line = line.replace(comment, "", 1)
            if not self.incomment:
                # The comment ended on this line: give it a line ending,
                # swallowing the rest of the line if that is only whitespace.
                if line.isspace():
                    comment += line
                    line = ''
                else:
                    comment += '\n'
            # TODO: parse commented-out entity definitions and store them
            # as obsolete messages.
            target = comment_lists.get(self.commenttype)
            if target is not None:
                target.append((self.commenttype, comment))

        if not self.inentity and not self.incomment:
            entitypos = line.find('<!ENTITY')
            if entitypos != -1:
                self.inentity = True
                beforeentity = line[:entitypos].strip()
                if beforeentity.startswith("#"):
                    self.hashprefix = beforeentity
                self.entitypart = "start"
            else:
                self.unparsedlines.append(line)

        if self.inentity:
            if self.entitypart == "start":
                # Drop everything up to and including the ``<!ENTITY`` keyword.
                line = line[quote.findend(line, '<!ENTITY'):]
                self.entitypart = "name"
                self.entitytype = "internal"

            if self.entitypart == "name":
                pos = skip_space(line, 0)
                self.space_pre_entity = ' ' * pos
                if pos < len(line) and line[pos] == '%':
                    # ``<!ENTITY % name ...`` declares an external entity.
                    self.entitytype = "external"
                    self.entityparameter = ""
                    pos = skip_space(line, pos + 1)
                namestart = pos
                while pos < len(line) and not line[pos].isspace():
                    pos += 1
                self.entity = line[namestart:pos]
                nameend = pos

                assert quote.rstripeol(self.entity) == self.entity
                pos = skip_space(line, pos)
                self.space_pre_definition = ' ' * (pos - nameend)
                if self.entity:
                    if self.entitytype == "external":
                        self.entitypart = "parameter"
                    else:
                        self.entitypart = "definition"
                    if pos == len(line):
                        # Nothing else on this line; carry on with the next.
                        self.entityhelp = None
                        pos = 0
                        continue
                    elif self.entitypart == "definition":
                        # Remember the start position and the quote character.
                        self.entityhelp = (pos, line[pos])
                        self.instring = False

            if self.entitypart == "parameter":
                pos = skip_space(line, pos)
                paramstart = pos
                while pos < len(line) and line[pos].isalnum():
                    pos += 1
                self.entityparameter += line[paramstart:pos]
                line = line[skip_space(line, pos):]
                pos = 0
                if not line:
                    continue
                if line[0] in ('"', "'"):
                    self.entitypart = "definition"
                    self.entityhelp = (0, line[0])
                    self.instring = False

            if self.entitypart == "definition":
                if self.entityhelp is None:
                    # The definition starts on a later line than the name.
                    pos = skip_space(line, 0)
                    if pos == len(line):
                        continue
                    self.entityhelp = (pos, line[pos])
                    self.instring = False
                pos, quotechar = self.entityhelp
                if quotechar not in ("'", '"'):
                    raise ValueError("Unexpected quote character... %r" % (quotechar))
                (defpart, self.instring) = quote.extract(
                    line[pos:], quotechar, quotechar,
                    startinstring=self.instring, allowreentry=False)
                # Any following line continues from its first column with
                # the same quote character.
                self.entityhelp = (0, quotechar)
                self.definition += defpart
                if not self.instring:
                    # Closing quote reached: keep whatever trails the
                    # definition and stop after this first entity.
                    self.closing = line[pos + len(defpart):].rstrip("\n\r")
                    self.inentity = False
                    break

    return linesprocessed
def parse(self, dtdsrc):
    """Read the first DTD element from ``dtdsrc`` into this object.

    The source is consumed line by line. Comments are collected as
    ``(commenttype, text)`` tuples, lines that belong to neither a comment
    nor an entity are appended to ``self.unparsedlines``, and the first
    ``<!ENTITY`` declaration fills in ``self.entity``, ``self.definition``
    and the related spacing/closing attributes. Parsing stops as soon as
    that entity's quoted definition has been closed.

    The parser state (``self.incomment``, ``self.inentity`` and, when they
    are true, ``self.commenttype`` / ``self.entitypart`` and friends) plus
    ``self.unparsedlines`` is read here but not initialised here.
    NOTE(review): presumably the constructor sets them -- confirm.

    Args:
        dtdsrc: DTD source text; lines are separated by ``"\\n"``.

    Returns:
        The number of lines consumed (0 when ``dtdsrc`` is empty).

    Raises:
        ValueError: if an entity definition does not start with ``'`` or
            ``"``.
    """

    def skip_space(text, pos):
        """Return the index of the first non-whitespace char at or after pos."""
        while pos < len(text) and text[pos].isspace():
            pos += 1
        return pos

    self.comments = []
    # Every comment category deliberately aliases the same list, so all
    # comments end up in one sequence in source order.
    self._locfilenotes = self.comments
    self._locgroupstarts = self.comments
    self._locgroupends = self.comments
    self._locnotes = self.comments
    self.entity = None
    self.definition = ''
    if not dtdsrc:
        return 0

    # Where a comment of each type is recorded; unknown types are dropped.
    comment_lists = {
        "locfile": self._locfilenotes,
        "locgroupstart": self._locgroupstarts,
        "locgroupend": self._locgroupends,
        "locnote": self._locnotes,
        "comment": self.comments,
    }

    linesprocessed = 0
    comment = ""
    # Scan position inside the current line. It is always 0 when a
    # multi-line entity carries over to the next line, so starting at 0
    # also covers a call that resumes in the "parameter" state.
    pos = 0
    for line in dtdsrc.split("\n"):
        line += "\n"
        linesprocessed += 1

        if not self.incomment:
            if '<!--' in line:
                self.incomment = True
                self.continuecomment = False
                # Peek at the comment only to classify it; it is extracted
                # for real in the ``self.incomment`` branch below.
                comment, _ = quote.extract(line, "<!--", "-->", None, 0)
                if 'LOCALIZATION NOTE' in comment:
                    notepos = quote.findend(comment, 'LOCALIZATION NOTE')
                    # Bounds-checked like every other scan in this method.
                    while notepos < len(comment) and comment[notepos] == ' ':
                        notepos += 1
                    if comment.startswith('FILE', notepos):
                        self.commenttype = "locfile"
                    elif comment.startswith('BEGIN', notepos):
                        self.commenttype = "locgroupstart"
                    elif comment.startswith('END', notepos):
                        self.commenttype = "locgroupend"
                    else:
                        self.commenttype = "locnote"
                else:
                    # plain comment
                    self.commenttype = "comment"
            # FIXME: the entity reference might share a line with something important
            elif not self.inentity and re.search(r"%.*;", line):
                # A line matching ``%...;`` (looks like a parameter entity
                # reference) is stored verbatim as a plain comment.
                self.comments.append(("comment", line))
                continue

        if self.incomment:
            (comment, self.incomment) = quote.extract(
                line, "<!--", "-->", None, self.continuecomment)
            self.continuecomment = self.incomment
            # Strip the comment out of what is still to be parsed.
            line = line.replace(comment, "", 1)
            if not self.incomment:
                # The comment ended on this line: give it a line ending,
                # swallowing the rest of the line if that is only whitespace.
                if line.isspace():
                    comment += line
                    line = ''
                else:
                    comment += '\n'
            # TODO: parse commented-out entity definitions and store them
            # as obsolete messages.
            target = comment_lists.get(self.commenttype)
            if target is not None:
                target.append((self.commenttype, comment))

        if not self.inentity and not self.incomment:
            entitypos = line.find('<!ENTITY')
            if entitypos != -1:
                self.inentity = True
                beforeentity = line[:entitypos].strip()
                if beforeentity.startswith("#"):
                    self.hashprefix = beforeentity
                self.entitypart = "start"
            else:
                self.unparsedlines.append(line)

        if self.inentity:
            if self.entitypart == "start":
                # Drop everything up to and including the ``<!ENTITY`` keyword.
                line = line[quote.findend(line, '<!ENTITY'):]
                self.entitypart = "name"
                self.entitytype = "internal"

            if self.entitypart == "name":
                pos = skip_space(line, 0)
                self.space_pre_entity = ' ' * pos
                if pos < len(line) and line[pos] == '%':
                    # ``<!ENTITY % name ...`` declares an external entity.
                    self.entitytype = "external"
                    self.entityparameter = ""
                    pos = skip_space(line, pos + 1)
                namestart = pos
                while pos < len(line) and not line[pos].isspace():
                    pos += 1
                self.entity = line[namestart:pos]
                nameend = pos

                assert quote.rstripeol(self.entity) == self.entity
                pos = skip_space(line, pos)
                self.space_pre_definition = ' ' * (pos - nameend)
                if self.entity:
                    if self.entitytype == "external":
                        self.entitypart = "parameter"
                    else:
                        self.entitypart = "definition"
                    if pos == len(line):
                        # Nothing else on this line; carry on with the next.
                        self.entityhelp = None
                        pos = 0
                        continue
                    elif self.entitypart == "definition":
                        # Remember the start position and the quote character.
                        self.entityhelp = (pos, line[pos])
                        self.instring = False

            if self.entitypart == "parameter":
                pos = skip_space(line, pos)
                paramstart = pos
                while pos < len(line) and line[pos].isalnum():
                    pos += 1
                self.entityparameter += line[paramstart:pos]
                line = line[skip_space(line, pos):]
                pos = 0
                if not line:
                    continue
                if line[0] in ('"', "'"):
                    self.entitypart = "definition"
                    self.entityhelp = (0, line[0])
                    self.instring = False

            if self.entitypart == "definition":
                if self.entityhelp is None:
                    # The definition starts on a later line than the name.
                    pos = skip_space(line, 0)
                    if pos == len(line):
                        continue
                    self.entityhelp = (pos, line[pos])
                    self.instring = False
                pos, quotechar = self.entityhelp
                if quotechar not in ("'", '"'):
                    raise ValueError("Unexpected quote character... %r" % (quotechar))
                (defpart, self.instring) = quote.extract(
                    line[pos:], quotechar, quotechar,
                    startinstring=self.instring, allowreentry=False)
                # Any following line continues from its first column with
                # the same quote character.
                self.entityhelp = (0, quotechar)
                self.definition += defpart
                if not self.instring:
                    # Closing quote reached: keep whatever trails the
                    # definition and stop after this first entity.
                    self.closing = line[pos + len(defpart):].rstrip("\n\r")
                    self.inentity = False
                    break

    return linesprocessed