class Wikiparser(SGMLParser):
    """SGML parser that writes a page's headings, list items and paragraphs to a PDF.

    Tag handlers toggle per-tag boolean flags and collect character data in
    ``self.buffer``; when a block-level tag closes, the collected text is
    handed to ``self.pdf`` (a ``PDFWriter``) as the matching element.
    ``self.buffer`` is ``None`` whenever no text-collecting element is open.
    """

    def __init__(self, url, verbose=0):
        """Initialise the parser and start the PDF document for ``url``.

        ``verbose`` is passed to the superclass. The output file is named
        after the last ``/``-separated component of ``url`` plus ``.pdf``.
        A header, title (h1), sub-title (h2) and footer are written,
        followed by a page break.
        """
        # SGMLParser.__init__ calls self.reset(), which sets up the flags.
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url = url
        page_name = self.url.split("/")[-1]
        # 595 x 842 is presumably A4 in points -- TODO confirm against PDFWriter.
        self.pdf = PDFWriter(page_name + ".pdf", 595, 842)
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO Alignment not working.
        header.set_text(self.url)
        self.pdf.set_header(header)
        h1 = Text(page_name, font="Dyuthi", font_size=32)
        self.pdf.add_h1(h1)
        # Font name was misspelt "Rachan"; every other use in this class is "Rachana".
        h2 = Text(self.url, font="Rachana", font_size=16)
        self.pdf.add_h2(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the superclass state, the image list, tag flags and text buffer."""
        SGMLParser.reset(self)
        self.images = []
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None

    def handle_data(self, data):
        """Append ``data`` to the open buffer if a text-collecting tag is open.

        Whitespace-only data is ignored, as is any data seen while no buffer
        is open (``self.buffer is None``).
        """
        if data.strip() == "":
            return
        if self.buffer is None:
            return
        # self.li is included so that plain text directly inside <li> is kept;
        # previously only text wrapped in <a>/<span>/<p> survived in list items.
        if self.p or self.h1 or self.h2 or self.li or self.a or self.span:
            self.buffer += data

    def start_img(self, attrs):
        """Record the ``src`` attribute value(s) of an <img> tag in ``self.images``."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def start_h1(self, attrs):
        """Open a fresh buffer for an <h1> heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Write the collected text as an h1 element and close the buffer."""
        self.h1 = False
        h1 = Text(self.buffer, font="Dyuthi", font_size=16)
        self.pdf.add_h1(h1)
        self.buffer = None

    def start_h2(self, attrs):
        """Open a fresh buffer for an <h2> heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Write the collected text as an h2 element, skipping blank headings."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="Rachana", font_size=14)
            self.pdf.add_h2(h2)
        self.buffer = None

    def start_li(self, attrs):
        """Open a fresh buffer for a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Write the collected text as a list item, skipping blank items.

        Items closed while an unordered list is open get a bullet prefix.
        """
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                li = Text("• " + self.buffer, font_size=10)
            else:
                li = Text(self.buffer, font_size=10)
            self.pdf.add_li(li)
        self.buffer = None

    def start_a(self, attrs):
        """Mark that an <a> element is open."""
        self.a = True

    def end_a(self):
        """Mark that the <a> element is closed."""
        self.a = False

    def start_ol(self, attrs):
        """Mark that an ordered list is open."""
        self.ol = True

    def end_ol(self):
        """Mark that the ordered list is closed."""
        self.ol = False

    def start_ul(self, attrs):
        """Mark that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Mark that the unordered list is closed."""
        self.ul = False

    def start_span(self, attrs):
        """Mark a <span> as open, opening a buffer only if none exists yet."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a <span>, appending a separating space to any open buffer."""
        # The buffer may already have been closed by an enclosing handler
        # (e.g. an <h1> ending inside the span); appending to None would raise.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Open a fresh buffer for a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Write the collected text as a justified, hyphenated paragraph."""
        self.p = False
        para = Paragraph(text=self.buffer, font="Rachana", font_size=10,)
        para.set_justify(True)
        para.language = "ml_IN"
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        """Store ``text`` on ``self.header``."""
        self.header = text

    def parse(self):
        """Fetch ``self.url``, pass the page through ``cleanup`` and parse it."""
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(self.url)
        try:
            page = infile.read()
        finally:
            # Release the connection even if the read fails.
            infile.close()
        page = cleanup(page)
        self.feed(page)
        self.close()
class Wikiparser(SGMLParser):
    """SGML parser that writes a page's headings, list items and paragraphs to a PDF.

    Tag handlers toggle per-tag boolean flags and collect character data in
    ``self.buffer``; when a block-level tag closes, the collected text is
    handed to ``self.pdf`` (a ``PDFWriter``) as the matching element.
    ``self.buffer`` is ``None`` whenever no text-collecting element is open.
    """
    # NOTE(review): a class with this same name is defined earlier in this
    # file; this later definition rebinds the name -- confirm which is intended.

    def reset(self):
        """Reset parser state and create a new PDF writer with header and footer."""
        SGMLParser.reset(self)
        self.images = []
        # TODO make the output file configurable- take it from command line
        # 595 x 842 is presumably A4 in points -- TODO confirm against PDFWriter.
        self.pdf = PDFWriter("output.pdf", 595, 842)
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO Alignment not working.
        header.set_text("A wikipedia article")
        self.pdf.set_header(header)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        # TODO Alignment not working.
        self.pdf.set_footer(footer)
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None

    def handle_data(self, data):
        """Append ``data`` to the open buffer if a text-collecting tag is open.

        Whitespace-only data is ignored, as is any data seen while no buffer
        is open (``self.buffer is None``).
        """
        if data.strip() == "":
            return
        if self.buffer is None:
            return
        # self.li is included so that plain text directly inside <li> is kept;
        # previously only text wrapped in <a>/<span>/<p> survived in list items.
        if self.p or self.h1 or self.h2 or self.li or self.a or self.span:
            self.buffer += data

    def start_img(self, attrs):
        """Record the ``src`` attribute value(s) of an <img> tag in ``self.images``."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def start_h1(self, attrs):
        """Open a fresh buffer for an <h1> heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Write the collected text as an h1 element and close the buffer."""
        self.h1 = False
        h1 = Text(self.buffer, font_size=16)
        self.pdf.add_h1(h1)
        self.buffer = None

    def start_h2(self, attrs):
        """Open a fresh buffer for an <h2> heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Write the collected text as an h2 element, skipping blank headings."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font_size=14)
            self.pdf.add_h2(h2)
        self.buffer = None

    def start_li(self, attrs):
        """Open a fresh buffer for a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Write the collected text as a list item, skipping blank items.

        Items closed while an unordered list is open get a bullet prefix.
        """
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                li = Text("• " + self.buffer, font_size=10)
            else:
                li = Text(self.buffer, font_size=10)
            self.pdf.add_li(li)
        self.buffer = None

    def start_a(self, attrs):
        """Mark that an <a> element is open."""
        self.a = True

    def end_a(self):
        """Mark that the <a> element is closed."""
        self.a = False

    def start_ol(self, attrs):
        """Mark that an ordered list is open."""
        self.ol = True

    def end_ol(self):
        """Mark that the ordered list is closed."""
        self.ol = False

    def start_ul(self, attrs):
        """Mark that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Mark that the unordered list is closed."""
        self.ul = False

    def start_span(self, attrs):
        """Mark a <span> as open, opening a buffer only if none exists yet."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a <span>, appending a separating space to any open buffer."""
        # The buffer may already have been closed by an enclosing handler
        # (e.g. an <h1> ending inside the span); appending to None would raise.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Open a fresh buffer for a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Write the collected text as a justified paragraph and close the buffer."""
        self.p = False
        para = Paragraph(text=self.buffer, font="Rachana", font_size=10,)
        para.set_justify(True)
        self.pdf.add_paragraph(para)
        self.buffer = None