def parse(self, input, output):
    """Run the buffered shlax event stream and build an object per start tag.

    `input` is kept as self.reader and `output` as self.writer. Every
    "StartTag" event read from the stream is handed to self.make_object
    as a "doc" object, together with a fresh {"filename": ...} dict.

    Returns the tuple (self.objects_max, self.counts).
    """
    # Filtering for bad encoding belongs in the reader; sorting or filtering
    # of the output belongs in whatever the writer is piped to.
    self.reader, self.writer = input, output
    self.event_stream = buffer_stream(shlax.parser(self.reader))
    for event in self.event_stream:
        if event.type != "StartTag":
            continue
        self.make_object(event.name, "doc", event, {"filename": self.filename})
    # Objects are emitted in the order in which they END, due to the
    # recursion, so the document finishes last. Sort as necessary/desired.
    return (self.objects_max, self.counts)
Example #2
0
    def parse(self, input, output):
        self.reader = input
        self.writer = output
        p = shlax.parser(self.reader)
        for n in p:
            if n.type == "StartTag":
                self.parallel["byte"] = n.start
                self.context.append(n.name)
                #match metadata after you append: you want to see if you're entering a metadata context.
                for pattern in self.metamap:
                    if context_match(self.context,pattern):
                        self.metahandler = self.metamap[pattern]
                        self.context_memory = self.context
#                meta_result = self.match_metapaths()
#                if meta_result: print meta_result
                if n.name in self.mapping:
                    type = self.mapping[n.name]
                    self.push_object(type)
                    attlist = ""
                    for k,v in n.attributes.items():
                        attlist += " %s=\"%s\"" % (k,v)
                    try:
                        emit_object(self.writer,type,"<" + n.name + attlist + ">",self.objects.v,self.parallel["byte"],self.parallel["line"])
                    except UnicodeDecodeError:
                        print >> sys.stderr, "bad encoding at %s byte %s" % (self.filename,n.start)
                    if type == "doc":
                        print >> self.writer, "meta %s %s" % ("filename", self.filename)
                if n.name == "l":
                    if "n" in n.attributes.keys():
                        self.parallel["line"] = int(n.attributes["n"])
                    else:
                        self.parallel["line"] += 1
                    print >> self.writer, "line %d %d" % (self.parallel["byte"],self.parallel["line"])
                    self.line_max = max(self.parallel["line"],self.line_max)
            elif n.type == "EndTag":
                self.parallel["byte"] = n.start
                #match metadata before you pop: you want to see if you're leaving a metadata context.
                for pattern in self.metamap:
                    if self.context_memory and self.context_memory == self.context:
                        self.metahandler = None
                        self.context_memory = None
                if self.context[-1] == n.name:
                    self.context.pop()
                else:
                    print >> sys.stderr, "mismatched tag at %s byte %s" % (self.filename,n.start)
                if n.name in self.mapping:
                    type = self.mapping[n.name]
                    self.pull_object(type)
                    emit_object(self.writer,type,"</" + n.name + ">",self.objects.v,self.parallel["byte"],self.parallel["line"])                           
            elif n.type == "text":
                self.parallel["byte"] = n.start 
                try: # this tokenizer could go into it's own subroutine...
                    text = n.content.decode("UTF-8")
                    tokens = re.finditer(ur"([\w\u2019]+)|([\.;:?!])",text,re.U)
                    offset = self.parallel["byte"]
                    if self.metahandler:
                        cleantext = re.sub("[\n\t]"," ",text)
                        print >> self.writer, "meta %s %s" % (self.metahandler,cleantext)
                    for token in tokens:
                        if token.group(1):
                            self.push_object("word")
                            char_offset = token.start(1)
                            byte_length = len(text[:char_offset].encode("UTF-8"))
                            emit_object(self.writer,"word",token.group(1),self.objects.v,offset + byte_length,self.parallel["line"])                           
                            self.counts[token.group(1)] = self.counts.get(token.group(1),0) + 1
                        if token.group(2):
                            self.push_object("sent")
                            char_offset = token.start(1)
                            byte_length = len(text[:char_offset].encode("UTF-8"))
                            emit_object(self.writer,"sent",token.group(2),self.objects.v,offset + byte_length,self.parallel["line"])                           
                except UnicodeDecodeError:
                    print >> sys.stderr, "bad encoding in %s around byte %s" % (self.filename,n.start)
        #Here [after done parsing] I should see if I still have an object stack, and unwind it, if so.
        max_v = self.objects_max
        max_v.extend((self.parallel["byte"],self.line_max))
        return (max_v,self.counts)