def parse(self, input, output):
    self.reader = input    # filtering for bad encoding should be done in the reader
    self.writer = output   # sorting or filtering of output should be done by piping the writer
    self.event_stream = buffer_stream(shlax.parser(self.reader))
    for n in self.event_stream:
        if n.type == "StartTag":
            self.make_object(n.name, "doc", n, {"filename": self.filename})
    # Note that objects are emitted in the order in which they END, due to
    # the recursion, so the document itself finishes last.  Sort the output
    # as necessary/desired.
    return (self.objects_max, self.counts)
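# buffer_stream is referenced above but not shown here.  A minimal sketch of
# what such a wrapper MIGHT look like, assuming its job is to let a recursive
# consumer like make_object read ahead and then push events back onto the
# shared stream; the push_back name and these semantics are illustrative
# guesses, not the original implementation:
class buffer_stream(object):
    """Iterate over parser events, allowing events to be pushed back."""
    def __init__(self, events):
        self.events = iter(events)
        self.pushed = []                  # pushed-back events, newest last
    def __iter__(self):
        return self
    def next(self):                       # Python 2 iterator protocol
        if self.pushed:
            return self.pushed.pop()
        return self.events.next()
    def push_back(self, event):
        self.pushed.append(event)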
def parse(self, input, output):
    self.reader = input
    self.writer = output
    p = shlax.parser(self.reader)
    for n in p:
        if n.type == "StartTag":
            self.parallel["byte"] = n.start
            self.context.append(n.name)
            # Match metadata after the append: we want to see if we are
            # entering a metadata context.
            for pattern in self.metamap:
                if context_match(self.context, pattern):
                    self.metahandler = self.metamap[pattern]
                    # Store a copy, not a reference: self.context is mutated
                    # in place, so a shared reference would compare equal at
                    # every EndTag and clear the handler too early.
                    self.context_memory = self.context[:]
            # meta_result = self.match_metapaths()
            # if meta_result: print meta_result
            if n.name in self.mapping:
                type = self.mapping[n.name]
                self.push_object(type)
                attlist = ""
                for k, v in n.attributes.items():
                    attlist += " %s=\"%s\"" % (k, v)
                try:
                    emit_object(self.writer, type, "<" + n.name + attlist + ">",
                                self.objects.v, self.parallel["byte"],
                                self.parallel["line"])
                except UnicodeDecodeError:
                    print >> sys.stderr, "bad encoding at %s byte %s" % (self.filename, n.start)
                if type == "doc":
                    print >> self.writer, "meta %s %s" % ("filename", self.filename)
            if n.name == "l":
                # Line milestone: take an explicit n attribute if present,
                # otherwise just increment the running line number.
                if "n" in n.attributes:
                    self.parallel["line"] = int(n.attributes["n"])
                else:
                    self.parallel["line"] += 1
                print >> self.writer, "line %d %d" % (self.parallel["byte"], self.parallel["line"])
                self.line_max = max(self.parallel["line"], self.line_max)
        elif n.type == "EndTag":
            self.parallel["byte"] = n.start
            # Match metadata before the pop: we want to see if we are
            # leaving the metadata context we remembered on the way in.
            if self.context_memory and self.context_memory == self.context:
                self.metahandler = None
                self.context_memory = None
            if self.context and self.context[-1] == n.name:
                self.context.pop()
            else:
                print >> sys.stderr, "mismatched tag at %s byte %s" % (self.filename, n.start)
            if n.name in self.mapping:
                type = self.mapping[n.name]
                self.pull_object(type)
                emit_object(self.writer, type, "</" + n.name + ">",
                            self.objects.v, self.parallel["byte"],
                            self.parallel["line"])
        elif n.type == "text":
            self.parallel["byte"] = n.start
            try:
                # This tokenizer could go into its own subroutine...
                text = n.content.decode("UTF-8")
                tokens = re.finditer(ur"([\w\u2019]+)|([\.;:?!])", text, re.U)
                offset = self.parallel["byte"]
                if self.metahandler:
                    cleantext = re.sub("[\n\t]", " ", text)
                    print >> self.writer, "meta %s %s" % (self.metahandler, cleantext)
                for token in tokens:
                    if token.group(1):  # a word
                        self.push_object("word")
                        char_offset = token.start(1)
                        byte_length = len(text[:char_offset].encode("UTF-8"))
                        emit_object(self.writer, "word", token.group(1),
                                    self.objects.v, offset + byte_length,
                                    self.parallel["line"])
                        self.counts[token.group(1)] = self.counts.get(token.group(1), 0) + 1
                    if token.group(2):  # sentence-final punctuation
                        self.push_object("sent")
                        char_offset = token.start(2)
                        byte_length = len(text[:char_offset].encode("UTF-8"))
                        emit_object(self.writer, "sent", token.group(2),
                                    self.objects.v, offset + byte_length,
                                    self.parallel["line"])
            except UnicodeDecodeError:
                print >> sys.stderr, "bad encoding in %s around byte %s" % (self.filename, n.start)
    # TODO: if an object stack remains after parsing, unwind it here.
    max_v = self.objects_max
    max_v.extend((self.parallel["byte"], self.line_max))
    return (max_v, self.counts)
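# A minimal usage sketch, assuming the method above belongs to a parser class
# defined elsewhere in this module (called "Parser" here purely for
# illustration, with a hypothetical filename argument): parse() takes an open
# file to read and a writable stream for the emitted objects, and returns the
# maximum object vector plus the word-frequency dictionary.
if __name__ == "__main__":
    import sys
    p = Parser(filename=sys.argv[1])      # hypothetical constructor
    f = open(sys.argv[1])
    try:
        max_v, counts = p.parse(f, sys.stdout)
    finally:
        f.close()
    print >> sys.stderr, "max object vector: %s" % (max_v,)
    print >> sys.stderr, "%d distinct words counted" % len(counts)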