Exemple #1
0
 def parse_string(self, text):
     """Wrap a text string in a TarsqiDocument and return it.

     The string is stored unchanged in the text variable of a fresh SourceDoc
     (created with None where other parsers hand in a filename). That
     SourceDoc is handed to a new TarsqiDocument, together with an empty
     dictionary as the second argument."""
     # TODO: do we need to ensure the text is unicode?
     document = SourceDoc(None)
     document.text = text
     tarsqidoc = TarsqiDocument(document, {})
     return tarsqidoc
Exemple #2
0
 def parse_file(self, filename, tarsqidoc):
     """Parse the XML file and store the result on tarsqidoc.

     A new SourceDoc is created for filename and kept in self.sourcedoc, then
     the file is run through the ParseFile routine of the expat parser, where
     all the handlers are set up to fill in the text and tags in the
     SourceDoc. When parsing is done the SourceDoc is finished and assigned
     to tarsqidoc.sourcedoc. Nothing is returned.

     Arguments:
        filename - path of the XML file to parse
        tarsqidoc - object whose sourcedoc attribute is set to the result"""
     self.sourcedoc = SourceDoc(filename)
     # Open in binary mode so that expat gets the raw bytes and does the
     # decoding itself; under Python 3 ParseFile rejects a file whose read()
     # returns text. The with statement makes sure the file handle is closed,
     # also when the parser raises an error.
     with open(filename, 'rb') as fh:
         self.parser.ParseFile(fh)
     self.sourcedoc.finish()
     tarsqidoc.sourcedoc = self.sourcedoc
Exemple #3
0
 def parse_string(self, text, tarsqidoc):
     """Parse an XML string and store the result on tarsqidoc.

     Creates a new SourceDoc (with None instead of a filename), keeps it in
     self.sourcedoc and feeds the text to the Parse routine of the expat
     parser, where all the handlers are set up to fill in the text and tags
     in the SourceDoc. The SourceDoc is then finished and assigned to
     tarsqidoc.sourcedoc. Nothing is returned."""
     # TODO: do we need to make sure that text is unicode?
     # NOTE(review): Parse() is called without the isfinal flag, so expat is
     # never told that the input is complete -- confirm this is intended.
     self.sourcedoc = SourceDoc(None)
     self.parser.Parse(text)
     parsed = self.sourcedoc
     parsed.finish()
     tarsqidoc.sourcedoc = parsed
Exemple #4
0
 def parse_file(self, filename, tarsqidoc):
     """Parse the XML file and store the result on tarsqidoc.

     A new SourceDoc is created for filename and kept in self.sourcedoc. The
     whole file is read into memory and handed to the Parse routine of the
     expat parser, where all the handlers are set up to fill in the text and
     tags in the SourceDoc. Afterwards the SourceDoc is finished and
     assigned to tarsqidoc.sourcedoc. Nothing is returned.

     Arguments:
        filename - path of the XML file to parse
        tarsqidoc - object whose sourcedoc attribute is set to the result"""
     self.sourcedoc = SourceDoc(filename)
     # NOTE: this used to be self.parser.ParseFile(open(filename)), which
     # needed to be replaced with reading the content and calling Parse()
     # while preparing to port the code to Python3.
     # TODO: no encoding is handed to codecs.open(), should there be one for
     # non-ascii input?
     # The with statement closes the file handle as soon as the content has
     # been read; it used to be left open.
     with codecs.open(filename) as fh:
         content = fh.read()
     self.parser.Parse(content)
     self.sourcedoc.finish()
     tarsqidoc.sourcedoc = self.sourcedoc
Exemple #5
0
 def parse_string(self, text, tarsqidoc):
     """Build a LIF object from the JSON string and put its contents in the
     appropriate parts of a new SourceDoc.

     The LIF object is kept in self.lif. The new SourceDoc is assigned to
     tarsqidoc.sourcedoc, its text variable is set to the value of the text
     of the LIF object and its lif variable to the LIF object itself.
     Nothing is returned."""
     self.lif = LIF(json_string=text)
     sourcedoc = SourceDoc()
     tarsqidoc.sourcedoc = sourcedoc
     sourcedoc.text = self.lif.text.value
     sourcedoc.lif = self.lif
Exemple #6
0
 def _parse(self, tarsqidoc):
     """Fill in a new SourceDoc from the nodes in self.topnodes and attach it
     to tarsqidoc.

     The SourceDoc and the tarsqidoc are kept in self.sourcedoc and
     self.tarsqidoc. The text is taken from the data of the first child of
     the 'text' node, after which the source tags, tarsqi tags, comments and
     metadata are added, in that order, by the respective helper methods.
     Nothing is returned."""
     sourcedoc = SourceDoc(None)
     self.sourcedoc = sourcedoc
     self.tarsqidoc = tarsqidoc
     tarsqidoc.sourcedoc = sourcedoc
     text_node = self.topnodes['text']
     sourcedoc.text = text_node.firstChild.data
     # the helpers are run one after another, the order is kept as it was
     for add_part in (self._add_source_tags,
                      self._add_tarsqi_tags,
                      self._add_comments,
                      self._add_metadata):
         add_part()
Exemple #7
0
 def parse_string(self, text, tarsqidoc):
     """Run the expat parser over an XML string and hand the resulting
     SourceDoc to tarsqidoc.

     Uses the Parse routine of the expat parser, where all the handlers are
     set up to fill in the text and tags in the SourceDoc stored in
     self.sourcedoc (created here with None instead of a filename). After
     parsing, the SourceDoc is finished and assigned to tarsqidoc.sourcedoc.
     Nothing is returned."""
     # TODO: do we need to make sure that text is unicode?
     # NOTE(review): no isfinal flag is handed to Parse(), so the parser is
     # not told that this is the last chunk of input -- confirm.
     parser = self.parser
     self.sourcedoc = SourceDoc(None)
     parser.Parse(text)
     result = self.sourcedoc
     result.finish()
     tarsqidoc.sourcedoc = result
Exemple #8
0
 def parse_file(self, filename, tarsqidoc):
     """Run the expat parser over the XML file and hand the resulting
     SourceDoc to tarsqidoc.

     Uses the ParseFile routine of the expat parser, where all the handlers
     are set up to fill in the text and tags in the SourceDoc stored in
     self.sourcedoc. After parsing, the SourceDoc is finished and assigned
     to tarsqidoc.sourcedoc. Nothing is returned.

     Arguments:
        filename - path of the XML file to parse
        tarsqidoc - object whose sourcedoc attribute is set to the result"""
     self.sourcedoc = SourceDoc(filename)
     # The file is opened in binary mode because ParseFile wants read() to
     # return bytes under Python 3 (expat does the decoding); the with
     # statement closes the handle, which used to be left open, even when
     # parsing fails.
     with open(filename, 'rb') as fh:
         self.parser.ParseFile(fh)
     self.sourcedoc.finish()
     tarsqidoc.sourcedoc = self.sourcedoc
Exemple #9
0
 def parse_file(self, filename):
     """Load the TTK file and put its contents in the appropriate parts of a
     new SourceDoc, returning the TarsqiDocument wrapped around it.

     The file is first handed to _load_dom (presumably this is what fills
     self.topnodes, it is defined elsewhere). The text of the SourceDoc is
     taken from the data of the first child of the 'text' node, after which
     the source tags, tarsqi tags, comments and metadata are added. The
     SourceDoc and TarsqiDocument are kept in self.sourcedoc and
     self.tarsqidoc."""
     self._load_dom(filename)
     sourcedoc = SourceDoc(filename)
     self.sourcedoc = sourcedoc
     self.tarsqidoc = TarsqiDocument(sourcedoc, {})
     text_node = self.topnodes['text']
     sourcedoc.text = text_node.firstChild.data
     self._add_source_tags()
     self._add_tarsqi_tags()
     self._add_comments()
     self._add_metadata()
     return self.tarsqidoc
Exemple #10
0
 def parse_file(self, filename, tarsqidoc):
     """Load the LIF file and put its contents in the appropriate parts of a
     new SourceDoc, which is assigned to tarsqidoc.sourcedoc.

     If is_container() accepts the file it is loaded as a Container and the
     LIF object is taken from the payload of that container, otherwise the
     file is loaded as a LIF object directly and the container is None. Both
     are kept in self.container and self.lif. The SourceDoc gets the value
     of the text of the LIF object as its text, and references to the LIF
     object and the container. Nothing is returned."""
     if not self.is_container(filename):
         self.container = None
         self.lif = LIF(json_file=filename)
     else:
         self.container = Container(json_file=filename)
         self.lif = self.container.payload
     sourcedoc = SourceDoc(filename)
     tarsqidoc.sourcedoc = sourcedoc
     sourcedoc.text = self.lif.text.value
     sourcedoc.lif = self.lif
     sourcedoc.lif_container = self.container
Exemple #11
0
 def parse_file(self, filename):
     """Read the file and return a TarsqiDocument for it.

     Simply dumps the full file content, decoded as utf8, into the text
     variable of a new SourceDoc and wraps that SourceDoc in a new
     TarsqiDocument (with an empty dictionary as the second argument).

     Arguments:
        filename - path of the text file to read"""
     sourcedoc = SourceDoc(filename)
     # The with statement closes the file handle right after reading; it
     # used to be left open.
     with codecs.open(filename, encoding='utf8') as fh:
         sourcedoc.text = fh.read()
     return TarsqiDocument(sourcedoc, {})
Exemple #12
0
class SourceParserXML(SourceParser):

    """Simple XML parser, using the Expat parser.

    All Expat handlers forward what they find to the SourceDoc in the
    sourcedoc variable, which is created anew by parse_file() and
    parse_string().

    Instance variables
       encoding - a string
       sourcedoc - an instance of SourceDoc
       parser - an Expat parser """

    # TODO: may need to add other handlers for completeness, see
    # http://docs.python.org/library/pyexpat.html, note however that if we
    # change our notion of primary data then we may not need to do that.

    # TODO. The way this is set up now requires the SourceDoc to know a lot
    # about the internal workings of the Expat parser (for example the notion
    # that begin and end tags are found separately). It is probably better to
    # keep that knowledge here, by building lists of tags here and only export
    # them after all elements are gathered (see note in parse_file).

    # NOTE(review): the Expat parser is created once in __init__ and an Expat
    # parser object processes a single document, so presumably an instance of
    # this class can be used for one parse only -- confirm against callers.

    def __init__(self, encoding='utf-8'):
        """Set up the Expat parser, using the encoding handed in, and hook up
        all the handlers defined on this class."""
        self.encoding = encoding
        self.sourcedoc = None
        self.parser = xml.parsers.expat.ParserCreate(encoding=encoding)
        # have Expat hand in character data in larger chunks where it can
        self.parser.buffer_text = 1
        self.parser.XmlDeclHandler = self._handle_xmldecl
        self.parser.ProcessingInstructionHandler = \
            self._handle_processing_instruction
        self.parser.CommentHandler = self._handle_comment
        self.parser.StartElementHandler = self._handle_start
        self.parser.EndElementHandler = self._handle_end
        self.parser.CharacterDataHandler = self._handle_characters
        self.parser.DefaultHandler = self._handle_default

    def parse_file(self, filename):
        """Parses filename and returns a TarsqiDocument wrapped around the
        SourceDoc created for the file. Uses the ParseFile routine of the
        expat parser, where all the handlers are set up to fill in the text
        and tags in SourceDoc."""
        self.sourcedoc = SourceDoc(filename)
        # Open in binary mode so that expat gets the raw bytes and does the
        # decoding itself; under Python 3 ParseFile rejects a file whose
        # read() returns text. The with statement makes sure the file handle
        # is closed, also when the parser raises an error.
        with open(filename, 'rb') as fh:
            self.parser.ParseFile(fh)
        self.sourcedoc.finish()
        tarsqidoc = TarsqiDocument(self.sourcedoc, {})
        return tarsqidoc

    def parse_string(self, text):
        """Parses a text string and returns a TarsqiDocument wrapped around
        the SourceDoc created for the string. Uses the Parse routine of the
        expat parser, where all the handlers are set up to fill in the text
        and tags in SourceDoc."""
        self.sourcedoc = SourceDoc(None)
        # TODO: do we need to make sure that text is unicode?
        # NOTE(review): Parse() is called without the isfinal flag, so expat
        # is never told that the input is complete -- confirm this is
        # intended.
        self.parser.Parse(text)
        self.sourcedoc.finish()
        tarsqidoc = TarsqiDocument(self.sourcedoc, {})
        return tarsqidoc

    def _handle_xmldecl(self, version, encoding, standalone):
        """Store the XML declaration as a (version, encoding, standalone)
        tuple on the SourceDoc."""
        self._debug('xmldec')
        self.sourcedoc.xmldecl = (version, encoding, standalone)

    def _handle_processing_instruction(self, target, data):
        """Store processing instructions."""
        self._debug('proc', target, len(data))
        self.sourcedoc.add_processing_instruction(target, data)

    def _handle_comment(self, data):
        """Store comments."""
        self._debug('comment', len(data))
        self.sourcedoc.add_comment(data)

    def _handle_start(self, name, attrs):
        """Handle opening tags. Takes two arguments: a tag name and a dictionary
        of attributes. Asks the SourceDoc instance in the sourcedoc variable to
        add an opening tag."""
        self._debug('start', name, attrs)
        self.sourcedoc.add_opening_tag(name, attrs)

    def _handle_end(self, name):
        """Add closing tags to the SourceDoc."""
        self._debug('end', name)
        self.sourcedoc.add_closing_tag(name)

    def _handle_characters(self, string):
        """Handle character data by asking the SourceDoc to add the data. This
        will not necessarily add a contiguous string of character data as one
        data element. This should include ignorable whitespace, but see the
        comment in the method below, I apparently had reason to think
        otherwise."""
        self._debug('chars', len(string), string)
        self.sourcedoc.add_characters(string)

    def _handle_default(self, string):
        """Handle default data by asking the SourceDoc to add it as
        characters. This is here to get the 'ignorable' whitespace, which I do
        not want to ignore."""
        # TODO: maybe ignore that whitespace after all, it does not seem to
        # matter though
        self._debug('default', len(string), string)
        self.sourcedoc.add_characters(string)

    def _debug(self, *rest):
        """Print the current position of the parser (line-column and byte
        index) followed by the arguments handed in, but only when
        SourceParser.DEBUG is set."""
        if SourceParser.DEBUG:
            p1 = "%s-%s" % (self.parser.CurrentLineNumber,
                            self.parser.CurrentColumnNumber)
            p2 = "%s" % self.parser.CurrentByteIndex
            print("%-5s  %-4s    %s" %
                  (p1, p2, "  ".join(["%-8s" % replace_newline(x) for x in rest])))
Exemple #13
0
 def parse_string(self, text, tarsqidoc):
     """Store a text string on tarsqidoc as a new SourceDoc.

     A fresh SourceDoc (created with None where other parsers hand in a
     filename) is assigned to tarsqidoc.sourcedoc and the full string is
     simply dumped into its text variable. Nothing is returned."""
     # TODO: do we need to ensure the text is unicode?
     sourcedoc = SourceDoc(None)
     tarsqidoc.sourcedoc = sourcedoc
     sourcedoc.text = text